8 from commoncode import readDataset
9 from cistematic.genomes import Genome
10 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
12 print "%prog: version 5.6"
19 usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
21 parser = optparse.OptionParser(usage=usage)
22 parser.add_option("--gidField", type="int", dest="fieldID")
23 parser.add_option("--maxLength", type="float", dest="maxLength")
24 parser.add_option("--cache", action="store_true", dest="doCache")
25 parser.add_option("--models", dest="extendGenome")
26 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
27 parser.set_defaults(fieldID=0, maxLength=1000000000., doCache=False, extendGenome="",
30 (options, args) = parser.parse_args(argv[1:])
34 print "\twhere splicecountfile can be set to 'none' to not count splices\n"
39 uniquecountfile = args[2]
40 splicecountfile = args[3]
47 candidatefile = open(args[5])
48 candidateLines = candidatefile.readlines()
50 acceptedfilename = args[6]
54 normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
55 candidateLines, acceptedfilename, options.fieldID,
56 options.maxLength, options.doCache, options.extendGenome,
57 options.replaceModels)
60 def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
61 outfilename, candidateLines=[], acceptedfilename="",
62 fieldID=0, maxLength=1000000000., doCache=False,
63 extendGenome="", replaceModels=False):
65 uniquecountfile = open(uniquecountfilename)
68 acceptedfile = open(acceptedfilename, "w")
71 if splicecountfilename != "none":
73 splicecountfile = open(splicecountfilename)
77 print "will replace gene models with %s" % extendGenome
79 print "will extend gene models with %s" % extendGenome
83 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
84 print "%s cached" % genome
86 hg = Genome(genome, inRAM=True)
88 if extendGenome != "":
89 hg.extendFeatures(extendGenome, replace=replaceModels)
91 RDS = readDataset(hitfile, verbose = True, cache=doCache, reportCount=False)
92 uniqcount = RDS.getUniqsCount()
93 print "%d unique reads" % uniqcount
103 featuresDict = hg.getallGeneFeatures()
104 print "got featuresDict"
106 outfile = open(outfilename, "w")
108 for line in uniquecountfile:
109 fields = line.strip().split()
110 gid = fields[fieldID]
112 countDict[gid] = float(fields[-1])
114 gidToGeneDict[gid] = gene
116 uniquecountfile.close()
119 for line in splicecountfile:
120 fields = line.strip().split()
121 gid = fields[fieldID]
123 countDict[gid] += float(fields[-1])
128 splicecount += float(fields[-1])
130 splicecountfile.close()
132 for line in candidateLines:
136 fields = line.strip().split()
139 if gid not in gidList:
140 if gid not in farList:
142 gidToGeneDict[gid] = gene
144 if gid not in countDict:
147 countDict[gid] += float(fields[6])
149 if gid not in candidateDict:
150 candidateDict[gid] = []
152 candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
154 totalCount = (uniqcount + splicecount) / 1000000.
155 uniqScale = uniqcount / 1000000.
157 gene = gidToGeneDict[gid]
160 featureList = featuresDict[gid]
163 featureList = featuresDict[gene]
169 for (ftype, chrom, start, stop, sense) in featureList:
170 if (start, stop) not in newfeatureList:
171 newfeatureList.append((start, stop))
172 geneLength += (abs(start - stop) + 1.) / 1000.
176 elif geneLength > maxLength:
177 geneLength = maxLength
179 rpm = countDict[gid] / totalCount
180 rpkm = rpm / geneLength
181 if gid in candidateDict:
182 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
183 cratio = cCount / (cLength / 1000.)
184 cratio = (uniqScale * cratio) / totalCount
185 if 10. * cratio < rpkm:
188 countDict[gid] += cCount
189 geneLength += cLength / 1000.
190 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
192 rpm = countDict[gid] / totalCount
193 rpkm = rpm / geneLength
194 outfile.write("%s\t%s\t%.4f\t%.2f\n" % (gid, gene, geneLength, rpkm))
197 gene = gidToGeneDict[gid]
199 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
200 geneLength += cLength / 1000.
205 for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
206 cratio = cCount / (cLength / 1000.)
207 cratio = cratio / totalCount
208 acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
210 rpm = countDict[gid] / totalCount
211 rpkm = rpm / geneLength
212 outfile.write('%s\t%s\t%.4f\t%.2f\n' % (gene, gene, geneLength, rpkm))
221 uncacheGeneDB(genome)
224 if __name__ == "__main__":