5 """ usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
6 where upstream and downstream are in bp and and optional
7 using noCDS requires either upstream or downstream (but not both)
8 to be nonzero. Using -locuslength will report the first bplength
9 or the last bplength of the gene region depending on whether it
10 is positive or negative.
11 will by default only count the uniq reads (use -noUniqs to turn off)
12 but can also count multi and splice reads given the appropriate flags
22 from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
24 from cistematic.genomes import Genome
25 from commoncode import getGeneInfoDict
27 print "geneLocusCounts: version 3.1"
33 usage = "usage: python %prog genome readDB outfilename [options]"
35 parser = getParser(usage)
36 (options, args) = parser.parse_args(argv[1:])
49 upstream = int(args[3])
56 if "-" not in args[3]:
57 downstream = int(args[4])
61 geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
62 options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
63 options.bplength, options.acceptfile)
67 parser = optparse.OptionParser(usage=usage)
68 parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
69 help="do not count unique reads")
70 parser.add_option("--multi", action="store_true", dest="doUniqs",
71 help="count multi reads")
72 parser.add_option("--splices", action="store_true", dest="doUniqs",
73 help="count splice reads")
74 parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
75 parser.add_option("--regions", dest="acceptfile")
76 parser.add_option("--noCDS", action="store_false", dest="useCDS")
77 parser.add_option("--locusLength", type="int", dest="bplength",
78 help="number of bases to report")
80 configParser = getConfigParser()
81 section = "geneLocusCounts"
82 doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
83 doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
84 doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
85 useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
86 spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
87 bplength = getConfigIntOption(configParser, section, "bplength", 0)
88 acceptfile = getConfigOption(configParser, section, "acceptfile", "")
90 parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
91 useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
92 acceptfile=acceptfile)
97 def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
98 doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
99 spanTSS=False, bplength=0, acceptfile=""):
101 print "returning only up to %d bp from gene locus" % bplength
102 print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
105 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
107 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
109 totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
113 locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
114 locusChroms = locusByChromDict.keys()
115 chromList = hitRDS.getChromosomes(fullChrom=False)
117 for chrom in chromList:
118 if doNotProcessChromosome(chrom, locusChroms):
121 fullchrom = "chr%s" % chrom
123 hitRDS.memSync(fullchrom, index=True)
124 for (start, stop, gid, length) in locusByChromDict[chrom]:
125 if not gidDict.has_key(gid):
126 gidDict[gid] = {"count": 0, "length": length}
128 gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
130 outfile = open(outfilename, "w")
132 totalCount /= 1000000.
134 outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
135 gidList = gidDict.keys()
137 geneinfoDict = getGeneInfoDict(genome, cache=True)
140 symbol = "LOC%s" % gid
143 geneinfo = geneinfoDict[gid]
144 symbol = geneinfo[0][0]
145 except (KeyError, IndexError):
150 gidCount = gidDict[gid]["count"]
151 gidLength = gidDict[gid]["length"]
152 rpm = gidCount / totalCount
153 rpkm = 1000. * rpm / gidLength
154 outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
159 def doNotProcessChromosome(chrom, locusChroms):
160 return chrom == "M" or chrom not in locusChroms
163 if __name__ == "__main__":