12 from commoncode import readDataset, getMergedRegions, findPeak, getLocusByChromDict
13 from cistematic.genomes import Genome
14 from cistematic.core.geneinfo import geneinfoDB
17 print "%prog: version 2.0"
23 usage = "usage: python %prog genome rdsfile outfilename [--up upstream] [--down downstream] [--regions acceptfile] [--raw]"
25 parser = optparse.OptionParser(usage=usage)
26 parser.add_option("--up", type="int", dest="upstream")
27 parser.add_option("--down", type="int", dest="downstream")
28 parser.add_option("--regions", dest="acceptfile")
29 parser.add_option("--raw", action="store_false", dest="normalize")
30 parser.add_option("--cache", action="store_true", dest="doCache")
31 parser.set_defaults(upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False)
32 (options, args) = parser.parse_args(argv[1:])
36 print "\twhere upstream and downstream are in bp and and optional"
43 geneLocusPeaks(genome, hitfile, outfilename, options.upstream, options.downstream, options.acceptfile, options.normalize, options.doCache)
46 def geneLocusPeaks(genome, hitfile, outfilename, upstream=0, downstream=0, acceptfile="", normalize=True, doCache=False):
50 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
52 print "upstream = %d downstream = %d" % (upstream, downstream)
54 hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
55 readlen = hitRDS.getReadSize()
56 normalizationFactor = 1.0
58 totalCount = len(hitRDS)
59 normalizationFactor = totalCount / 1000000.
61 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
64 idb = geneinfoDB(cache=True)
68 geneinfoDict = idb.getallGeneInfo(genome)
69 locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS=True, additionalRegionsDict=acceptDict)
71 gidList = hg.allGIDs()
73 for chrom in acceptDict:
74 for (label, start, stop, length) in acceptDict[chrom]:
75 if label not in gidList:
82 if chrom not in locusByChromDict:
86 for (start, stop, gid, glen) in locusByChromDict[chrom]:
88 (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[chrom], start, glen, readlen)
90 gidCount[gid] = smoothArray[topPos[0]]
91 gidPos[gid] = (chrom, start + topPos[0])
93 gidPos[gid] = (chrom, start)
95 outfile = open(outfilename, "w")
102 geneinfo = geneinfoDict[gid]
103 symbol = geneinfo[0][0]
109 if gid in gidCount and gid in gidPos:
110 (chrom, pos) = gidPos[gid]
111 outfile.write("%s\t%s\tchr%s\t%d\t%.2f\n" % (gid, symbol, chrom, pos, gidCount[gid]))
116 if __name__ == "__main__":