5 """ usage: python geneLocusCounts genome readDB outfilename [upstream] [downstream] [--noCDS] [--spanTSS] [--locusLength bplength] [--regions acceptfile] [--noUniqs] [--multi] [--splices]
6 where upstream and downstream are in bp and and optional
7 using noCDS requires either upstream or downstream (but not both)
8 to be nonzero. Using -locuslength will report the first bplength
9 or the last bplength of the gene region depending on whether it
10 is positive or negative.
11 will by default only count the uniq reads (use -noUniqs to turn off)
12 but can also count multi and splice reads given the appropriate flags
21 from commoncode import readDataset, getMergedRegions, getLocusByChromDict
22 from cistematic.genomes import Genome
23 from cistematic.core.geneinfo import geneinfoDB
25 print '%s: version 3.0' % sys.argv[0]
31 usage = "usage: python %prog genome readDB outfilename [options]"
33 parser = optparse.OptionParser(usage=usage)
34 parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
35 help="do not count unique reads")
36 parser.add_option("--multi", action="store_true", dest="doUniqs",
37 help="count multi reads")
38 parser.add_option("--splices", action="store_true", dest="doUniqs",
39 help="count splice reads")
40 parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
41 parser.add_option("--regions", dest="acceptfile")
42 parser.add_option("--noCDS", action="store_false", dest="useCDS")
43 parser.add_option("--locusLength", type="int", dest="bplength",
44 help="number of bases to report")
45 parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
46 (options, args) = parser.parse_args(argv[1:])
59 upstream = int(args[3])
66 if "-" not in args[3]:
67 downstream = int(args[4])
71 geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
74 def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
75 print 'returning only up to %d bp from gene locus' % bplength
76 print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
79 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
81 hitRDS = readDataset(hitfile, verbose = True)
83 totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
86 idb = geneinfoDB(cache=True)
91 geneinfoDict = idb.getallGeneInfo(genome)
92 locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
94 locusChroms = locusByChromDict.keys()
95 chromList = hitRDS.getChromosomes(fullChrom=False)
97 for chrom in chromList:
98 if chrom == 'M' or chrom not in locusChroms:
102 fullchrom = 'chr' + chrom
103 hitRDS.memSync(fullchrom, index=True)
104 for (start, stop, gid, length) in locusByChromDict[chrom]:
105 if gid not in gidList:
110 gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
112 outfile = open(outfilename,'w')
114 totalCount /= 1000000.
116 outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
123 geneinfo = geneinfoDict[gid]
124 symbol = geneinfo[0][0]
130 if gid in gidCount and gid in gidLen:
131 rpm = gidCount[gid] / totalCount
132 rpkm = 1000. * rpm / gidLen[gid]
133 outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
137 if __name__ == "__main__":