X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneMrnaCountsWeighted.py;h=3d02c679d28cf03c8712a3031d5c207ac65bd45d;hp=7acf0b92fd1d93da0c29d4bdbba08af6c7d2a937;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 7acf0b9..3d02c67 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -2,15 +2,17 @@ try: import psyco psyco.full() except: - print 'psyco not running' + print "psyco not running" -import sys, optparse -from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict +import sys +import optparse +import pysam +from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB -print '%s: version 4.1' % sys.argv[0] +print "geneMrnaCountsWeighted: version 5.0" def main(argv=None): @@ -19,21 +21,7 @@ def main(argv=None): usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--stranded", action="store_false", dest="ignoreSense") - parser.add_option("--uniq", action="store_true", dest="withUniqs") - parser.add_option("--multi", action="store_true", dest="withMulti") - parser.add_option("--record", action="store_true", dest="recording", - help="ignored with uniq reads") - parser.add_option("--accept", dest="acceptfile") - parser.add_option("--cache", type="int", dest="cachePages") - parser.add_option("--verbose", action="store_true", dest="doVerbose") - parser.add_option("--models", dest="extendGenome") - parser.add_option("--replacemodels", action="store_true", dest="replaceModels") - parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False, - acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", - replaceModels=False) - + parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 4: @@ -44,15 +32,45 @@ def main(argv=None): hitfile = args[1] countfile = args[2] outfilename = args[3] + bamfile = pysam.Samfile(hitfile, "rb") - geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense, - options.withUniqs, options.withMulti, options.recording, + geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense, + options.withUniqs, options.withMulti, options.acceptfile, options.cachePages, options.doVerbose, options.extendGenome, options.replaceModels) -def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, - withUniqs=False, withMulti=False, recording=False, acceptfile=None, +def makeParser(usage=""): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--stranded", action="store_false", dest="ignoreSense") + parser.add_option("--uniq", action="store_true", dest="withUniqs") + parser.add_option("--multi", action="store_true", dest="withMulti") + parser.add_option("--accept", dest="acceptfile") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--verbose", action="store_true", dest="doVerbose") + parser.add_option("--models", dest="extendGenome") + parser.add_option("--replacemodels", action="store_true", dest="replaceModels") + + configParser = getConfigParser() + section = "geneMrnaCountsWeighted" + ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True) + withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False) + withMulti = getConfigBoolOption(configParser, section, "withMulti", False) + acceptfile = getConfigOption(configParser, section, "acceptfile", None) + cachePages = getConfigOption(configParser, section, "cachePages", None) + doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False) + extendGenome = getConfigOption(configParser, section, "extendGenome", "") + replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False) + + parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti, + acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome, + replaceModels=replaceModels) + + return parser + + +def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True, + withUniqs=False, withMulti=False, acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): if (not withUniqs and not withMulti) or (withUniqs and withMulti): @@ -62,204 +80,195 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= if cachePages is not None: cacheGeneDB(genome) hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True) - idb = geneinfoDB(cache=True) print "%s cached" % genome doCache = True else: doCache = False - cachePages = 0 hg = Genome(genome, inRAM=True) - idb = geneinfoDB() - - if acceptfile is not None: - acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) - else: - acceptDict = {} - - if recording and withUniqs: - recording = False if extendGenome: if replaceModels: print "will replace gene models with %s" % extendGenome else: print "will extend gene models with %s" % extendGenome - else: - replaceModels = False - if extendGenome != "": - hg.extendFeatures(extendGenome, replace = replaceModels) - - hitRDS = readDataset(hitfile, verbose = True, cache=doCache) - if cachePages > hitRDS.getDefaultCacheSize(): - hitRDS.setDBcache(cachePages) + hg.extendFeatures(extendGenome, replace=replaceModels) - readlen = hitRDS.getReadSize() + allGIDs = set(hg.allGIDs()) + if acceptfile is not None: + regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose) + for chrom in regionDict: + for region in regionDict[chrom]: + allGIDs.add(region.label) + else: + regionDict = {} + + featuresByChromDict = getFeaturesByChromDict(hg, regionDict) - geneinfoDict = idb.getallGeneInfo(genome) - geneannotDict = hg.allAnnotInfo() - gidCount = {} gidReadDict = {} + read2GidDict = {} + for gid in allGIDs: + gidReadDict[gid] = [] - featuresByChromDict = getFeaturesByChromDict(hg, acceptDict) - gidList = hg.allGIDs() + index = 0 + chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] + for chromosome in chromosomeList: + if doNotProcessChromosome(chromosome, featuresByChromDict.keys()): + continue - gidList.sort() - for chrom in acceptDict: - for (label, start, stop, length) in acceptDict[chrom]: - if label not in gidList: - gidList.append(label) + print "\n%s " % chromosome, + featureList = featuresByChromDict[chromosome] - for gid in gidList: - gidCount[gid] = 0 - gidReadDict[gid] = [] + readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense) + index = totalProcessedReads + for (tagReadID, gid) in readGidList: + try: + gidReadDict[gid].append(tagReadID) + if tagReadID in read2GidDict: + read2GidDict[tagReadID].add(gid) + else: + read2GidDict[tagReadID] = set([gid]) + except KeyError: + print "gid %s not in gidReadDict" % gid - uniqueCountDict = {} - read2GidDict = {} + writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache) + if doCache: + uncacheGeneDB(genome) - uniquecounts = open(countfile) + +def doNotProcessChromosome(chromosome, chromosomeList): + return chromosome not in chromosomeList + + +def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True): + + startFeature = 0 + readGidList = [] + readlen = getHeaderComment(bamfile.header, "ReadLength") + for alignedread in bamfile.fetch(chromosome): + if doNotProcessRead(alignedread, doUniqs, doMulti): + continue + + tagStart = alignedread.pos + tagReadID = addPairIDtoReadID(alignedread) + tagSense = "" + if not ignoreSense: + if alignedread.is_reverse: + tagSense = "-" + else: + tagSense = "+" + + index += 1 + if index % 100000 == 0: + print "read %d" % index, + + stopPoint = tagStart + readlen + if startFeature < 0: + startFeature = 0 + + for (start, stop, gid, sense, ftype) in featList[startFeature:]: + if tagStart > stop: + startFeature += 1 + continue + + if start > stopPoint: + startFeature -= 100 + break + + if not ignoreSense: + if sense == "R": + sense = "-" + else: + sense = "+" + + if start <= tagStart <= stop and (ignoreSense or tagSense == sense): + readGidList.append((tagReadID, gid)) + stopPoint = stop + + return readGidList, index + + +def doNotProcessRead(read, doUniqs, doMulti): + doNotProcess = False + if isSpliceEntry(read.cigar): + doNotProcess = True + elif doUniqs and read.opt('NH') > 1: + doNotProcess = True + elif doMulti and read.opt('NH') == 1: + doNotProcess = True + + return doNotProcess + + +def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False): + + uniqueCountDict = {} + uniquecounts = open(countFilename) for line in uniquecounts: fields = line.strip().split() # add a pseudo-count here to ease calculations below + #TODO: figure out why this was done in prior implementation... uniqueCountDict[fields[0]] = float(fields[-1]) + 1 uniquecounts.close() - outfile = open(outfilename, "w") + genomeName = genome.genome + geneinfoDict = getGeneInfoDict(genomeName, cache=doCache) + geneannotDict = genome.allAnnotInfo() + outfile = open(outFilename, "w") + for gid in allGIDs: + symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict) + tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict) + if doVerbose: + print "%s %s %f" % (gid, symbol, tagCount) - index = 0 - if withMulti and not withUniqs: - chromList = hitRDS.getChromosomes(table="multi", fullChrom=False) - else: - chromList = hitRDS.getChromosomes(fullChrom=False) + outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount)) - for achrom in chromList: - if achrom not in featuresByChromDict: - continue + outfile.close() - print "\n" + achrom + " ", - startFeature = 0 - fullchrom = "chr" + achrom - hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti) - featList = featuresByChromDict[achrom] - if ignoreSense: - for (tagStart, tagReadID) in hitDict[fullchrom]: - index += 1 - if index % 100000 == 0: - print "read %d" % index, - - stopPoint = tagStart + readlen - if startFeature < 0: - startFeature = 0 - - for (start, stop, gid, sense, ftype) in featList[startFeature:]: - if tagStart > stop: - startFeature += 1 - continue - - if start > stopPoint: - startFeature -= 100 - break - - if start <= tagStart <= stop: - try: - gidReadDict[gid].append(tagReadID) - if tagReadID in read2GidDict: - if gid not in read2GidDict[tagReadID]: - read2GidDict[tagReadID].append(gid) - else: - read2GidDict[tagReadID] = [gid] - - gidCount[gid] += 1 - except: - print "gid %s not in gidReadDict" % gid - - stopPoint = stop - else: - for (tagStart, tSense, tagReadID) in hitDict[fullchrom]: - index += 1 - if index % 100000 == 0: - print "read %d" % index, - - stopPoint = tagStart + readlen - if startFeature < 0: - startFeature = 0 - - for (start, stop, gid, sense, ftype) in featList[startFeature:]: - if tagStart > stop: - startFeature += 1 - continue - - if start > stopPoint: - startFeature -= 100 - break - - if sense == "R": - sense = "-" - else: - sense = "+" - - if start <= tagStart <= stop and sense == tSense: - try: - gidReadDict[gid].append(tagReadID) - if tagReadID in read2GidDict: - if gid not in read2GidDict[tagReadID]: - read2GidDict[tagReadID].append(gid) - else: - read2GidDict[tagReadID] = [gid] - - gidCount[gid] += 1 - except: - print "gid %s not in gidReadDict" % gid - - stopPoint = stop - - for gid in gidList: - if "FAR" not in gid: - symbol = "LOC" + gid - geneinfo = "" - try: - geneinfo = geneinfoDict[gid] - if genome == "celegans": - symbol = geneinfo[0][1] - else: - symbol = geneinfo[0][0] - except: - try: - symbol = geneannotDict[(genome, gid)][0] - except: - symbol = "LOC" + gid - else: - symbol = gid - tagCount = 0. - for readID in gidReadDict[gid]: +def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict): + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + if genomeName == "celegans": + symbol = geneinfo[0][1] + else: + symbol = geneinfo[0][0] + except (KeyError, IndexError): try: - tagValue = uniqueCountDict[gid] - except: - tagValue = 1 + symbol = geneannotDict[(genomeName, gid)][0] + except (KeyError, IndexError): + symbol = "LOC%s" % gid + else: + symbol = gid + + return symbol - tagDenom = 0. - for aGid in read2GidDict[readID]: - try: - tagDenom += uniqueCountDict[aGid] - except: - tagDenom += 1 +def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): + tagCount = 0. + for readID in gidReadDict[gid]: try: - tagCount += tagValue / tagDenom - except ZeroDivisionError: - tagCount = 0 - - if doVerbose: - print "%s %s %f" % (gid, symbol, tagCount) + tagValue = uniqueCountDict[gid] + except KeyError: + pass - outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount)) + tagDenom = 0. + for relatedGID in read2GidDict[readID]: + try: + tagDenom += uniqueCountDict[relatedGID] + except KeyError: + tagDenom += 1 - outfile.close() + try: + tagCount += tagValue / tagDenom + except ZeroDivisionError: + pass - if doCache: - uncacheGeneDB(genome) + return tagCount if __name__ == "__main__":