5 print 'psyco not running'
8 from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
9 from cistematic.genomes import Genome
10 from cistematic.core.geneinfo import geneinfoDB
11 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
13 print '%s: version 4.1' % sys.argv[0]
20 usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
22 parser = optparse.OptionParser(usage=usage)
23 parser.add_option("--stranded", action="store_false", dest="ignoreSense")
24 parser.add_option("--uniq", action="store_true", dest="withUniqs")
25 parser.add_option("--multi", action="store_true", dest="withMulti")
26 parser.add_option("--record", action="store_true", dest="recording",
27 help="ignored with uniq reads")
28 parser.add_option("--accept", dest="acceptfile")
29 parser.add_option("--cache", type="int", dest="cachePages")
30 parser.add_option("--verbose", action="store_true", dest="doVerbose")
31 parser.add_option("--models", dest="extendGenome")
32 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
33 parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
34 acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
37 (options, args) = parser.parse_args(argv[1:])
48 geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
49 options.withUniqs, options.withMulti, options.recording,
50 options.acceptfile, options.cachePages, options.doVerbose,
51 options.extendGenome, options.replaceModels)
54 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
55 withUniqs=False, withMulti=False, recording=False, acceptfile=None,
56 cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
58 if (not withUniqs and not withMulti) or (withUniqs and withMulti):
59 print "must have either one of -uniq or -multi set. Exiting"
62 if cachePages is not None:
64 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
65 idb = geneinfoDB(cache=True)
66 print "%s cached" % genome
71 hg = Genome(genome, inRAM=True)
74 if acceptfile is not None:
75 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
79 if recording and withUniqs:
84 print "will replace gene models with %s" % extendGenome
86 print "will extend gene models with %s" % extendGenome
90 if extendGenome != "":
91 hg.extendFeatures(extendGenome, replace = replaceModels)
93 hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
94 if cachePages > hitRDS.getDefaultCacheSize():
95 hitRDS.setDBcache(cachePages)
97 readlen = hitRDS.getReadSize()
99 geneinfoDict = idb.getallGeneInfo(genome)
100 geneannotDict = hg.allAnnotInfo()
104 featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
105 gidList = hg.allGIDs()
108 for chrom in acceptDict:
109 for (label, start, stop, length) in acceptDict[chrom]:
110 if label not in gidList:
111 gidList.append(label)
115 gidReadDict[gid] = []
120 uniquecounts = open(countfile)
121 for line in uniquecounts:
122 fields = line.strip().split()
123 # add a pseudo-count here to ease calculations below
124 uniqueCountDict[fields[0]] = float(fields[-1]) + 1
128 outfile = open(outfilename, "w")
131 if withMulti and not withUniqs:
132 chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
134 chromList = hitRDS.getChromosomes(fullChrom=False)
136 for achrom in chromList:
137 if achrom not in featuresByChromDict:
140 print "\n" + achrom + " ",
142 fullchrom = "chr" + achrom
143 hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
144 featList = featuresByChromDict[achrom]
146 for (tagStart, tagReadID) in hitDict[fullchrom]:
148 if index % 100000 == 0:
149 print "read %d" % index,
151 stopPoint = tagStart + readlen
155 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
160 if start > stopPoint:
164 if start <= tagStart <= stop:
166 gidReadDict[gid].append(tagReadID)
167 if tagReadID in read2GidDict:
168 if gid not in read2GidDict[tagReadID]:
169 read2GidDict[tagReadID].append(gid)
171 read2GidDict[tagReadID] = [gid]
175 print "gid %s not in gidReadDict" % gid
179 for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
181 if index % 100000 == 0:
182 print "read %d" % index,
184 stopPoint = tagStart + readlen
188 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
193 if start > stopPoint:
202 if start <= tagStart <= stop and sense == tSense:
204 gidReadDict[gid].append(tagReadID)
205 if tagReadID in read2GidDict:
206 if gid not in read2GidDict[tagReadID]:
207 read2GidDict[tagReadID].append(gid)
209 read2GidDict[tagReadID] = [gid]
213 print "gid %s not in gidReadDict" % gid
222 geneinfo = geneinfoDict[gid]
223 if genome == "celegans":
224 symbol = geneinfo[0][1]
226 symbol = geneinfo[0][0]
229 symbol = geneannotDict[(genome, gid)][0]
236 for readID in gidReadDict[gid]:
238 tagValue = uniqueCountDict[gid]
243 for aGid in read2GidDict[readID]:
245 tagDenom += uniqueCountDict[aGid]
250 tagCount += tagValue / tagDenom
251 except ZeroDivisionError:
255 print "%s %s %f" % (gid, symbol, tagCount)
257 outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
262 uncacheGeneDB(genome)
265 if __name__ == "__main__":