5 print "psyco not running"
9 from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
11 from cistematic.genomes import Genome
12 from commoncode import getGeneInfoDict
13 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
15 print "geneMrnaCountsWeighted: version 4.3"
22 usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
36 geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
37 options.withUniqs, options.withMulti,
38 options.acceptfile, options.cachePages, options.doVerbose,
39 options.extendGenome, options.replaceModels)
42 def makeParser(usage=""):
43 parser = optparse.OptionParser(usage=usage)
44 parser.add_option("--stranded", action="store_false", dest="ignoreSense")
45 parser.add_option("--uniq", action="store_true", dest="withUniqs")
46 parser.add_option("--multi", action="store_true", dest="withMulti")
47 parser.add_option("--accept", dest="acceptfile")
48 parser.add_option("--cache", type="int", dest="cachePages")
49 parser.add_option("--verbose", action="store_true", dest="doVerbose")
50 parser.add_option("--models", dest="extendGenome")
51 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
53 configParser = getConfigParser()
54 section = "geneMrnaCountsWeighted"
55 ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
56 withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
57 withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
58 acceptfile = getConfigOption(configParser, section, "acceptfile", None)
59 cachePages = getConfigOption(configParser, section, "cachePages", None)
60 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
61 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
62 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
64 parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
65 acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
66 replaceModels=replaceModels)
71 #TODO: Reported user performance issue. Long run times in conditions:
72 # small number of reads ~40-50M
73 # all features on single chromosome
75 # User states has been a long time problem.
77 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
78 withUniqs=False, withMulti=False, acceptfile=None,
79 cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
81 if (not withUniqs and not withMulti) or (withUniqs and withMulti):
82 print "must have either one of -uniq or -multi set. Exiting"
85 if cachePages is not None:
87 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
88 print "%s cached" % genome
93 hg = Genome(genome, inRAM=True)
97 print "will replace gene models with %s" % extendGenome
99 print "will extend gene models with %s" % extendGenome
101 hg.extendFeatures(extendGenome, replace=replaceModels)
103 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
104 if cachePages > hitRDS.getDefaultCacheSize():
105 hitRDS.setDBcache(cachePages)
107 allGIDs = set(hg.allGIDs())
108 if acceptfile is not None:
109 regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
110 for chrom in regionDict:
111 for region in regionDict[chrom]:
112 allGIDs.add(region.label)
116 featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
121 gidReadDict[gid] = []
124 if withMulti and not withUniqs:
125 chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
127 chromList = hitRDS.getChromosomes(fullChrom=False)
129 readlen = hitRDS.getReadSize()
130 for chromosome in chromList:
131 if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
134 print "\n%s " % chromosome,
135 fullchrom = "chr%s" % chromosome
136 hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
137 featureList = featuresByChromDict[chromosome]
139 readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
140 index = totalProcessedReads
141 for (tagReadID, gid) in readGidList:
143 gidReadDict[gid].append(tagReadID)
144 if tagReadID in read2GidDict:
145 read2GidDict[tagReadID].add(gid)
147 read2GidDict[tagReadID] = set([gid])
149 print "gid %s not in gidReadDict" % gid
151 writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
153 uncacheGeneDB(genome)
156 def doNotProcessChromosome(chromosome, chromosomeList):
157 return chromosome not in chromosomeList
160 def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
165 for read in hitDict[fullchrom]:
166 tagStart = read["start"]
167 tagReadID = read["readID"]
169 tagSense = read["sense"]
173 if index % 100000 == 0:
174 print "read %d" % index,
176 stopPoint = tagStart + readlen
180 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
185 if start > stopPoint:
195 if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
196 readGidList.append((tagReadID, gid))
199 return readGidList, index
202 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
205 uniquecounts = open(countFilename)
206 for line in uniquecounts:
207 fields = line.strip().split()
208 # add a pseudo-count here to ease calculations below
209 #TODO: figure out why this was done in prior implementation...
210 uniqueCountDict[fields[0]] = float(fields[-1]) + 1
214 genomeName = genome.genome
215 geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
216 geneannotDict = genome.allAnnotInfo()
217 outfile = open(outFilename, "w")
219 symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
220 tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
222 print "%s %s %f" % (gid, symbol, tagCount)
224 outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
229 def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
231 symbol = "LOC%s" % gid
234 geneinfo = geneinfoDict[gid]
235 if genomeName == "celegans":
236 symbol = geneinfo[0][1]
238 symbol = geneinfo[0][0]
239 except (KeyError, IndexError):
241 symbol = geneannotDict[(genomeName, gid)][0]
242 except (KeyError, IndexError):
243 symbol = "LOC%s" % gid
250 def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
252 for readID in gidReadDict[gid]:
254 tagValue = uniqueCountDict[gid]
259 for relatedGID in read2GidDict[readID]:
261 tagDenom += uniqueCountDict[relatedGID]
266 tagCount += tagValue / tagDenom
267 except ZeroDivisionError:
273 if __name__ == "__main__":