5 print "psyco not running"
9 from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
11 from cistematic.genomes import Genome
12 from commoncode import getGeneInfoDict
13 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
15 print "geneMrnaCountsWeighted: version 4.3"
22 usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
24 parser = makeParser(usage)
25 (options, args) = parser.parse_args(argv[1:])
36 geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
37 options.withUniqs, options.withMulti,
38 options.acceptfile, options.cachePages, options.doVerbose,
39 options.extendGenome, options.replaceModels)
42 def makeParser(usage=""):
43 parser = optparse.OptionParser(usage=usage)
44 parser.add_option("--stranded", action="store_false", dest="ignoreSense")
45 parser.add_option("--uniq", action="store_true", dest="withUniqs")
46 parser.add_option("--multi", action="store_true", dest="withMulti")
47 parser.add_option("--accept", dest="acceptfile")
48 parser.add_option("--cache", type="int", dest="cachePages")
49 parser.add_option("--verbose", action="store_true", dest="doVerbose")
50 parser.add_option("--models", dest="extendGenome")
51 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
53 configParser = getConfigParser()
54 section = "geneMrnaCountsWeighted"
55 ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
56 withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
57 withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
58 acceptfile = getConfigOption(configParser, section, "acceptfile", None)
59 cachePages = getConfigOption(configParser, section, "cachePages", None)
60 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
61 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
62 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
64 parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
65 acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
66 replaceModels=replaceModels)
71 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
72 withUniqs=False, withMulti=False, acceptfile=None,
73 cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
75 if (not withUniqs and not withMulti) or (withUniqs and withMulti):
76 print "must have either one of -uniq or -multi set. Exiting"
79 if cachePages is not None:
81 hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
82 print "%s cached" % genome
87 hg = Genome(genome, inRAM=True)
91 print "will replace gene models with %s" % extendGenome
93 print "will extend gene models with %s" % extendGenome
95 hg.extendFeatures(extendGenome, replace=replaceModels)
97 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
98 if cachePages > hitRDS.getDefaultCacheSize():
99 hitRDS.setDBcache(cachePages)
101 allGIDs = set(hg.allGIDs())
102 if acceptfile is not None:
103 regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
104 for chrom in regionDict:
105 for region in regionDict[chrom]:
106 allGIDs.add(region.label)
110 featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
115 gidReadDict[gid] = []
118 if withMulti and not withUniqs:
119 chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
121 chromList = hitRDS.getChromosomes(fullChrom=False)
123 readlen = hitRDS.getReadSize()
124 for chromosome in chromList:
125 if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
128 print "\n%s " % chromosome,
129 fullchrom = "chr%s" % chromosome
130 hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
131 featureList = featuresByChromDict[chromosome]
133 readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
134 index = totalProcessedReads
135 for (tagReadID, gid) in readGidList:
137 gidReadDict[gid].append(tagReadID)
138 if tagReadID in read2GidDict:
139 read2GidDict[tagReadID].add(gid)
141 read2GidDict[tagReadID] = set([gid])
143 print "gid %s not in gidReadDict" % gid
145 writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
147 uncacheGeneDB(genome)
150 def doNotProcessChromosome(chromosome, chromosomeList):
151 return chromosome not in chromosomeList
154 def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
159 for read in hitDict[fullchrom]:
160 tagStart = read["start"]
161 tagReadID = read["readID"]
162 if read.has_key("sense"):
163 tagSense = read["sense"]
167 if index % 100000 == 0:
168 print "read %d" % index,
170 stopPoint = tagStart + readlen
174 for (start, stop, gid, sense, ftype) in featList[startFeature:]:
179 if start > stopPoint:
189 if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
190 readGidList.append((tagReadID, gid))
193 return readGidList, index
196 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
199 uniquecounts = open(countFilename)
200 for line in uniquecounts:
201 fields = line.strip().split()
202 # add a pseudo-count here to ease calculations below
203 uniqueCountDict[fields[0]] = float(fields[-1]) + 1
207 genomeName = genome.genome
208 geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
209 geneannotDict = genome.allAnnotInfo()
210 outfile = open(outFilename, "w")
212 symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
213 tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
215 print "%s %s %f" % (gid, symbol, tagCount)
217 outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
222 def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
224 symbol = "LOC%s" % gid
227 geneinfo = geneinfoDict[gid]
228 if genomeName == "celegans":
229 symbol = geneinfo[0][1]
231 symbol = geneinfo[0][0]
232 except (KeyError, IndexError):
234 symbol = geneannotDict[(genomeName, gid)][0]
235 except (KeyError, IndexError):
236 symbol = "LOC%s" % gid
243 def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
245 for readID in gidReadDict[gid]:
247 tagValue = uniqueCountDict[gid]
252 for relatedGID in read2GidDict[readID]:
254 tagDenom += uniqueCountDict[relatedGID]
259 tagCount += tagValue / tagDenom
260 except ZeroDivisionError:
266 if __name__ == "__main__":