5 print "psyco not running"
10 from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
11 from cistematic.genomes import Genome
12 from commoncode import getGeneInfoDict
13 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
15 print "geneMrnaCountsWeighted: version 5.0"
22 usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
35 bamfile = pysam.Samfile(hitfile, "rb")
37 geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
38 options.withUniqs, options.withMulti,
39 options.acceptfile, options.cachePages, options.doVerbose,
40 options.extendGenome, options.replaceModels)
43 def makeParser(usage=""):
44 parser = optparse.OptionParser(usage=usage)
45 parser.add_option("--stranded", action="store_false", dest="ignoreSense")
46 parser.add_option("--uniq", action="store_true", dest="withUniqs")
47 parser.add_option("--multi", action="store_true", dest="withMulti")
48 parser.add_option("--accept", dest="acceptfile")
49 parser.add_option("--cache", type="int", dest="cachePages")
50 parser.add_option("--verbose", action="store_true", dest="doVerbose")
51 parser.add_option("--models", dest="extendGenome")
52 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
54 configParser = getConfigParser()
55 section = "geneMrnaCountsWeighted"
56 ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
57 withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
58 withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
59 acceptfile = getConfigOption(configParser, section, "acceptfile", None)
60 cachePages = getConfigOption(configParser, section, "cachePages", None)
61 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
62 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
63 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
65 parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
66 acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
67 replaceModels=replaceModels)
72 def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
73 withUniqs=False, withMulti=False, acceptfile=None,
74 cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
76 if (not withUniqs and not withMulti) or (withUniqs and withMulti):
77 print "must have either one of -uniq or -multi set. Exiting"
80 if cachePages is not None:
82 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
83 print "%s cached" % genome
87 hg = Genome(genome, inRAM=True)
91 print "will replace gene models with %s" % extendGenome
93 print "will extend gene models with %s" % extendGenome
95 hg.extendFeatures(extendGenome, replace=replaceModels)
97 allGIDs = set(hg.allGIDs())
98 if acceptfile is not None:
99 regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
100 for chrom in regionDict:
101 for region in regionDict[chrom]:
102 allGIDs.add(region.label)
106 featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
111 gidReadDict[gid] = []
114 chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
115 for chromosome in chromosomeList:
116 if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
119 print "\n%s " % chromosome,
120 featureList = featuresByChromDict[chromosome]
122 readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
123 index = totalProcessedReads
124 for (tagReadID, gid) in readGidList:
126 gidReadDict[gid].append(tagReadID)
127 if tagReadID in read2GidDict:
128 read2GidDict[tagReadID].add(gid)
130 read2GidDict[tagReadID] = set([gid])
132 print "gid %s not in gidReadDict" % gid
134 writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
136 uncacheGeneDB(genome)
139 def doNotProcessChromosome(chromosome, chromosomeList):
140 return chromosome not in chromosomeList
143 def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
147 readlen = getHeaderComment(bamfile.header, "ReadLength")
148 for alignedread in bamfile.fetch(chromosome):
149 if doNotProcessRead(alignedread, doUniqs, doMulti):
152 tagStart = alignedread.pos
153 tagReadID = addPairIDtoReadID(alignedread)
156 if alignedread.is_reverse:
162 if index % 100000 == 0:
163 print "read %d" % index,
165 stopPoint = tagStart + readlen
169 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
174 if start > stopPoint:
184 if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
185 readGidList.append((tagReadID, gid))
188 return readGidList, index
191 def doNotProcessRead(read, doUniqs, doMulti):
193 if isSpliceEntry(read.cigar):
195 elif doUniqs and read.opt('NH') > 1:
197 elif doMulti and read.opt('NH') == 1:
203 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
206 uniquecounts = open(countFilename)
207 for line in uniquecounts:
208 fields = line.strip().split()
209 # add a pseudo-count here to ease calculations below
210 #TODO: figure out why this was done in prior implementation...
211 uniqueCountDict[fields[0]] = float(fields[-1]) + 1
215 genomeName = genome.genome
216 geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
217 geneannotDict = genome.allAnnotInfo()
218 outfile = open(outFilename, "w")
220 symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
221 tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
223 print "%s %s %f" % (gid, symbol, tagCount)
225 outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
230 def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
232 symbol = "LOC%s" % gid
235 geneinfo = geneinfoDict[gid]
236 if genomeName == "celegans":
237 symbol = geneinfo[0][1]
239 symbol = geneinfo[0][0]
240 except (KeyError, IndexError):
242 symbol = geneannotDict[(genomeName, gid)][0]
243 except (KeyError, IndexError):
244 symbol = "LOC%s" % gid
251 def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
253 for readID in gidReadDict[gid]:
255 tagValue = uniqueCountDict[gid]
260 for relatedGID in read2GidDict[readID]:
262 tagDenom += uniqueCountDict[relatedGID]
267 tagCount += tagValue / tagDenom
268 except ZeroDivisionError:
274 if __name__ == "__main__":