X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneLocusCounts.py;h=218017e10a0d9a04d0b1cd8adde8449ecb60dcb8;hp=0e8792b740c3c83b524a03c0084027ed0bc7766c;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneLocusCounts.py b/geneLocusCounts.py index 0e8792b..218017e 100755 --- a/geneLocusCounts.py +++ b/geneLocusCounts.py @@ -17,12 +17,14 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, getLocusByChromDict +import sys +import optparse +from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict -print '%s: version 3.0' % sys.argv[0] +print "geneLocusCounts: version 3.1" def main(argv=None): if not argv: @@ -30,19 +32,7 @@ def main(argv=None): usage = "usage: python %prog genome readDB outfilename [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--noUniqs", action="store_false", dest="doUniqs", - help="do not count unique reads") - parser.add_option("--multi", action="store_true", dest="doUniqs", - help="count multi reads") - parser.add_option("--splices", action="store_true", dest="doUniqs", - help="count splice reads") - parser.add_option("--spanTSS", action="store_true", dest="spanTSS") - parser.add_option("--regions", dest="acceptfile") - parser.add_option("--noCDS", action="store_false", dest="useCDS") - parser.add_option("--locusLength", type="int", dest="bplength", - help="number of bases to report") - parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="") + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -68,71 +58,107 @@ def main(argv=None): except ValueError: pass - geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile) + geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, + options.doMulti, options.doSplices, options.useCDS, options.spanTSS, + options.bplength, options.acceptfile) -def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""): - print 'returning only up to %d bp from gene locus' % bplength - print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--noUniqs", action="store_false", dest="doUniqs", + help="do not count unique reads") + parser.add_option("--multi", action="store_true", dest="doUniqs", + help="count multi reads") + parser.add_option("--splices", action="store_true", dest="doUniqs", + help="count splice reads") + parser.add_option("--spanTSS", action="store_true", dest="spanTSS") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--noCDS", action="store_false", dest="useCDS") + parser.add_option("--locusLength", type="int", dest="bplength", + help="number of bases to report") + + configParser = getConfigParser() + section = "geneLocusCounts" + doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True) + doMulti = getConfigBoolOption(configParser, section, "doMulti", False) + doSplices = getConfigBoolOption(configParser, section, "doSplices", False) + useCDS = getConfigBoolOption(configParser, section, "useCDS", True) + spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False) + bplength = getConfigIntOption(configParser, section, "bplength", 0) + acceptfile = getConfigOption(configParser, section, "acceptfile", "") + + parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices, + useCDS=useCDS, spanTSS=spanTSS, bplength=bplength, + acceptfile=acceptfile) + + return parser + + +def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, + doUniqs=True, doMulti=False, doSplices=False, useCDS=True, + spanTSS=False, bplength=0, acceptfile=""): + + print "returning only up to %d bp from gene locus" % bplength + print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS) if acceptfile: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) - hitRDS = readDataset(hitfile, verbose = True) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True) totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices) hg = Genome(genome) - idb = geneinfoDB(cache=True) - - gidCount = {} - gidList = [] - gidLen = {} - geneinfoDict = idb.getallGeneInfo(genome) - locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength) - + gidDict = {} + locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength) locusChroms = locusByChromDict.keys() chromList = hitRDS.getChromosomes(fullChrom=False) chromList.sort() for chrom in chromList: - if chrom == 'M' or chrom not in locusChroms: + if doNotProcessChromosome(chrom, locusChroms): continue - print 'chr' + chrom - fullchrom = 'chr' + chrom + fullchrom = "chr%s" % chrom + print fullchrom hitRDS.memSync(fullchrom, index=True) for (start, stop, gid, length) in locusByChromDict[chrom]: - if gid not in gidList: - gidList.append(gid) - gidCount[gid] = 0 - gidLen[gid] = length + if not gidDict.has_key(gid): + gidDict[gid] = {"count": 0, "length": length} - gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) + gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) - outfile = open(outfilename,'w') + outfile = open(outfilename, "w") totalCount /= 1000000. - outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n') + outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n") + gidList = gidDict.keys() gidList.sort() + geneinfoDict = getGeneInfoDict(genome, cache=True) for gid in gidList: - if 'FAR' not in gid: - symbol = 'LOC' + gid - geneinfo = '' + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except: + except (KeyError, IndexError): pass else: symbol = gid - if gid in gidCount and gid in gidLen: - rpm = gidCount[gid] / totalCount - rpkm = 1000. * rpm / gidLen[gid] - outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm)) + gidCount = gidDict[gid]["count"] + gidLength = gidDict[gid]["length"] + rpm = gidCount / totalCount + rpkm = 1000. * rpm / gidLength + outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm)) outfile.close() + +def doNotProcessChromosome(chrom, locusChroms): + return chrom == "M" or chrom not in locusChroms + + if __name__ == "__main__": main(sys.argv) \ No newline at end of file