except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
-print '%s: version 3.0' % sys.argv[0]
+print "geneLocusCounts: version 3.1"
def main(argv=None):
if not argv:
usage = "usage: python %prog genome readDB outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
- help="do not count unique reads")
- parser.add_option("--multi", action="store_true", dest="doUniqs",
- help="count multi reads")
- parser.add_option("--splices", action="store_true", dest="doUniqs",
- help="count splice reads")
- parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
- parser.add_option("--regions", dest="acceptfile")
- parser.add_option("--noCDS", action="store_false", dest="useCDS")
- parser.add_option("--locusLength", type="int", dest="bplength",
- help="number of bases to report")
- parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
except ValueError:
pass
- geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+ geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
+ options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
+ options.bplength, options.acceptfile)
-def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
- print 'returning only up to %d bp from gene locus' % bplength
- print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+ help="do not count unique reads")
+ parser.add_option("--multi", action="store_true", dest="doUniqs",
+ help="count multi reads")
+ parser.add_option("--splices", action="store_true", dest="doUniqs",
+ help="count splice reads")
+ parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+ parser.add_option("--regions", dest="acceptfile")
+ parser.add_option("--noCDS", action="store_false", dest="useCDS")
+ parser.add_option("--locusLength", type="int", dest="bplength",
+ help="number of bases to report")
+
+ configParser = getConfigParser()
+ section = "geneLocusCounts"
+ doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+ doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+ doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+ useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
+ spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
+ bplength = getConfigIntOption(configParser, section, "bplength", 0)
+ acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+
+ parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
+ useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
+ acceptfile=acceptfile)
+
+ return parser
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
+ doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
+ spanTSS=False, bplength=0, acceptfile=""):
+
+ print "returning only up to %d bp from gene locus" % bplength
+ print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
if acceptfile:
acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
- hitRDS = readDataset(hitfile, verbose = True)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
hg = Genome(genome)
- idb = geneinfoDB(cache=True)
-
- gidCount = {}
- gidList = []
- gidLen = {}
- geneinfoDict = idb.getallGeneInfo(genome)
- locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
-
+ gidDict = {}
+ locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
locusChroms = locusByChromDict.keys()
chromList = hitRDS.getChromosomes(fullChrom=False)
chromList.sort()
for chrom in chromList:
- if chrom == 'M' or chrom not in locusChroms:
+ if doNotProcessChromosome(chrom, locusChroms):
continue
- print 'chr' + chrom
- fullchrom = 'chr' + chrom
+ fullchrom = "chr%s" % chrom
+ print fullchrom
hitRDS.memSync(fullchrom, index=True)
for (start, stop, gid, length) in locusByChromDict[chrom]:
- if gid not in gidList:
- gidList.append(gid)
- gidCount[gid] = 0
- gidLen[gid] = length
+ if not gid in gidDict:
+ gidDict[gid] = {"count": 0, "length": length}
- gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+ gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
- outfile = open(outfilename,'w')
+ outfile = open(outfilename, "w")
totalCount /= 1000000.
- outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+ outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
+ gidList = gidDict.keys()
gidList.sort()
+ geneinfoDict = getGeneInfoDict(genome, cache=True)
for gid in gidList:
- if 'FAR' not in gid:
- symbol = 'LOC' + gid
- geneinfo = ''
+ if "FAR" not in gid:
+ symbol = "LOC%s" % gid
+ geneinfo = ""
try:
geneinfo = geneinfoDict[gid]
symbol = geneinfo[0][0]
- except:
+ except (KeyError, IndexError):
pass
else:
symbol = gid
- if gid in gidCount and gid in gidLen:
- rpm = gidCount[gid] / totalCount
- rpkm = 1000. * rpm / gidLen[gid]
- outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+ gidCount = gidDict[gid]["count"]
+ gidLength = gidDict[gid]["length"]
+ rpm = gidCount / totalCount
+ rpkm = 1000. * rpm / gidLength
+ outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
outfile.close()
+
+def doNotProcessChromosome(chrom, locusChroms):
+ return chrom == "M" or chrom not in locusChroms
+
+
if __name__ == "__main__":
main(sys.argv)
\ No newline at end of file