10 print 'psyco not running'
15 from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
18 versionString = "regionCounts: version 3.10"
25 usage = "usage: python %prog regionfile rdsfile outfilename [options]"
27 parser = getParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
34 regionfilename = args[0]
38 regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
39 options.useFullchrom, options.normalize, options.padregion,
40 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
41 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
42 options.doRPKM, options.doLength, options.forceRegion)
46 parser = optparse.OptionParser(usage=usage)
47 parser.add_option("--markRDS", action="store_true", dest="flagRDS")
48 parser.add_option("--chromField", type="int", dest="cField")
49 parser.add_option("--fullchrom", action="store_true", dest="useFullchrom")
50 parser.add_option("--raw", action="store_false", dest="normalize")
51 parser.add_option("--padregion", type="int", dest="padregion")
52 parser.add_option("--mergeregion", type="int", dest="mergeregion")
53 parser.add_option("--nomerge", action="store_false", dest="merging")
54 parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
55 parser.add_option("--noMulti", action="store_false", dest="doMulti")
56 parser.add_option("--splices", action="store_true", dest="doSplices")
57 parser.add_option("--peak", action="store_true", dest="usePeak")
58 parser.add_option("--cache", type="int", dest="cachePages")
59 parser.add_option("--log", dest="logfilename")
60 parser.add_option("--rpkm", action="store_true", dest="doRPKM")
61 parser.add_option("--length", action="store_true", dest="doLength")
62 parser.add_option("--force", action="store_true", dest="forceRegion")
64 configParser = getConfigParser()
65 section = "regionCounts"
66 flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
67 cField = getConfigIntOption(configParser, section, "cField", 1)
68 useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
69 normalize = getConfigBoolOption(configParser, section, "normalize", True)
70 padregion = getConfigIntOption(configParser, section, "padregion", 0)
71 mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
72 merging = getConfigBoolOption(configParser, section, "merging", True)
73 doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
74 doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
75 doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
76 usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
77 cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
78 logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
79 doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
80 doLength = getConfigBoolOption(configParser, section, "doLength", False)
81 forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
83 parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
84 padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
85 doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
86 logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
87 forceRegion=forceRegion)
92 def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
93 useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
94 merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
95 cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
98 print "padding %d bp on each side of a region" % padregion
99 print "merging regions closer than %d bp" % mergeregion
100 print "will use peak values"
112 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
114 regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
115 fullChrom=useFullchrom, verbose=True, chromField=cField,
116 doMerge=merging, pad=padregion)
119 labeltoRegionDict = {}
122 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
123 readlen = hitRDS.getReadSize()
124 if cachePages > hitRDS.getDefaultCacheSize():
125 hitRDS.setDBcache(cachePages)
127 totalCount = len(hitRDS)
129 normalizationFactor = totalCount / 1000000.
131 chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
132 if len(chromList) == 0 and doSplices:
133 chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
138 hitRDS.setSynchronousPragma("OFF")
140 for rchrom in regionDict:
141 if forceRegion and rchrom not in chromList:
143 for region in regionDict[rchrom]:
144 regionCount[region.label] = 0
145 labelList.append(region.label)
146 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
148 for rchrom in chromList:
150 if rchrom not in regionDict:
157 fullchrom = "chr%s" % rchrom
160 readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
162 dictLen = len(readDict[fullchrom])
164 for region in regionDict[rchrom]:
168 regionCount[label] = 0
169 labelList.append(label)
170 labeltoRegionDict[label] = (rchrom, start, stop)
171 regionList.append((label, fullchrom, start, stop))
174 for localIndex in xrange(rindex, dictLen):
175 read = readDict[fullchrom][localIndex]
176 if read["start"] < start:
178 elif start <= read["start"] <= stop:
179 readList.append(read)
183 if len(readList) < 1:
187 peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
189 topValue = peak.smoothArray[peak.topPos[0]]
191 print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
194 regionCount[label] += topValue
196 regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
199 hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
202 hitRDS.setSynchronousPragma("ON")
205 for label in regionCount:
206 regionCount[label] = float(regionCount[label]) / normalizationFactor
208 outfile = open(outfilename, "w")
213 for label in labelList:
214 (chrom, start, stop) = labeltoRegionDict[label]
218 fullchrom = "chr%s" % chrom
222 length = abs(stop - start) / 1000.
229 outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
231 outfile.write("\t%.1f" % length)
233 outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
238 if doCache and flagRDS:
239 hitRDS.saveCacheDB(hitfile)
241 writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
244 if __name__ == "__main__":