10 print 'psyco not running'
12 import sys, string, optparse
13 from commoncode import readDataset, getMergedRegions, findPeak, writeLog
15 versionString = "%prog: version 3.9"
22 usage = "usage: python %prog regionfile rdsfile outfilename [options]"
24 parser = optparse.OptionParser(usage=usage)
25 parser.add_option("--markRDS", action="store_true", dest="flagRDS")
26 parser.add_option("--chromField", type="int", dest="cField")
27 parser.add_option("--fullchrom", action="store_true", dest="useFullchrom")
28 parser.add_option("--raw", action="store_false", dest="normalize")
29 parser.add_option("--padregion", type="int", dest="padregion")
30 parser.add_option("--mergeregion", type="int", dest="mergeregion")
31 parser.add_option("--nomerge", action="store_false", dest="merging")
32 parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
33 parser.add_option("--noMulti", action="store_false", dest="doMulti")
34 parser.add_option("--splices", action="store_true", dest="doSplices")
35 parser.add_option("--peak", action="store_true", dest="usePeak")
36 parser.add_option("--cache", type="int", dest="cachePages")
37 parser.add_option("--log", dest="logfilename")
38 parser.add_option("--rpkm", action="store_true", dest="doRPKM")
39 parser.add_option("--length", action="store_true", dest="doLength")
40 parser.add_option("--force", action="store_true", dest="forceRegion")
41 parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
42 padregion=0, mergeregion=0, merging=True, doUniqs=True,
43 doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
44 logfilename="regionCounts.log", doRPKM=False, doLength=False,
47 (options, args) = parser.parse_args(argv[1:])
53 regionfilename = args[0]
57 regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
58 options.useFullchrom, options.normalize, options.padregion,
59 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
60 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
61 options.doRPKM, options.doLength, options.forceRegion)
64 def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
65 useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
66 merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
67 cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
70 print "padding %d bp on each side of a region" % padregion
71 print "merging regions closer than %d bp" % mergeregion
72 print "will use peak values"
84 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
86 regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
87 fullChrom=useFullchrom, verbose=True, chromField=cField,
88 doMerge=merging, pad=padregion)
91 labeltoRegionDict = {}
94 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
95 readlen = hitRDS.getReadSize()
96 if cachePages > hitRDS.getDefaultCacheSize():
97 hitRDS.setDBcache(cachePages)
99 totalCount = len(hitRDS)
101 normalizationFactor = totalCount / 1000000.
103 chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
104 if len(chromList) == 0 and doSplices:
105 chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
110 hitRDS.setSynchronousPragma("OFF")
112 for rchrom in regionDict:
113 if forceRegion and rchrom not in chromList:
115 for (label, start, stop, length) in regionDict[rchrom]:
116 regionCount[label] = 0
117 labelList.append(label)
118 labeltoRegionDict[label] = (rchrom, start, stop)
120 for rchrom in chromList:
122 if rchrom not in regionDict:
129 fullchrom = "chr%s" % rchrom
132 readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
134 dictLen = len(readDict[fullchrom])
136 for (label, start, stop, length) in regionDict[rchrom]:
137 regionCount[label] = 0
138 labelList.append(label)
139 labeltoRegionDict[label] = (rchrom, start, stop)
144 fullchrom = "chr%s" % rchrom
146 for (label, rstart, rstop, length) in regionDict[rchrom]:
147 regionList.append((label, fullchrom, rstart, rstop))
150 for localIndex in xrange(rindex, dictLen):
151 read = readDict[fullchrom][localIndex]
154 elif rstart <= read[0] <= rstop:
155 readList.append(read)
159 if len(readList) < 1:
163 (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
165 topValue = smoothArray[topPos[0]]
167 print "problem with %s %s" % (str(topPos), str(smoothArray))
170 regionCount[label] += topValue
172 regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
175 hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
178 hitRDS.setSynchronousPragma("ON")
181 for label in regionCount:
182 regionCount[label] = float(regionCount[label]) / normalizationFactor
184 outfile = open(outfilename, "w")
189 for label in labelList:
190 (chrom, start, stop) = labeltoRegionDict[label]
194 fullchrom = "chr%s" % chrom
198 length = abs(stop - start) / 1000.
205 outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
207 outfile.write("\t%.1f" % length)
209 outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
214 if doCache and flagRDS:
215 hitRDS.saveCacheDB(hitfile)
217 writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
220 if __name__ == "__main__":