10 print 'psyco not running'
16 from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
18 versionString = "regionCounts: version 3.10"
25 usage = "usage: python %prog regionfile rdsfile outfilename [options]"
27 parser = getParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
34 regionfilename = args[0]
38 bamfile = pysam.Samfile(bamfilename, "rb")
40 regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
41 options.useFullchrom, options.normalize, options.padregion,
42 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
43 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
44 options.doRPKM, options.doLength, options.forceRegion)
48 parser = optparse.OptionParser(usage=usage)
49 parser.add_option("--markRDS", action="store_true", dest="flagRDS")
50 parser.add_option("--chromField", type="int", dest="cField")
51 parser.add_option("--fullchrom", action="store_true", dest="useFullchrom")
52 parser.add_option("--raw", action="store_false", dest="normalize")
53 parser.add_option("--padregion", type="int", dest="padregion")
54 parser.add_option("--mergeregion", type="int", dest="mergeregion")
55 parser.add_option("--nomerge", action="store_false", dest="merging")
56 parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
57 parser.add_option("--noMulti", action="store_false", dest="doMulti")
58 parser.add_option("--splices", action="store_true", dest="doSplices")
59 parser.add_option("--peak", action="store_true", dest="usePeak")
60 parser.add_option("--cache", type="int", dest="cachePages")
61 parser.add_option("--log", dest="logfilename")
62 parser.add_option("--rpkm", action="store_true", dest="doRPKM")
63 parser.add_option("--length", action="store_true", dest="doLength")
64 parser.add_option("--force", action="store_true", dest="forceRegion")
66 configParser = getConfigParser()
67 section = "regionCounts"
68 flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
69 cField = getConfigIntOption(configParser, section, "cField", 1)
70 useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
71 normalize = getConfigBoolOption(configParser, section, "normalize", True)
72 padregion = getConfigIntOption(configParser, section, "padregion", 0)
73 mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
74 merging = getConfigBoolOption(configParser, section, "merging", True)
75 doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
76 doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
77 doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
78 usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
79 cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
80 logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
81 doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
82 doLength = getConfigBoolOption(configParser, section, "doLength", False)
83 forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
85 parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
86 padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
87 doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
88 logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
89 forceRegion=forceRegion)
94 def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
95 useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
96 merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
97 cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
100 print "padding %d bp on each side of a region" % padregion
101 print "merging regions closer than %d bp" % mergeregion
102 print "will use peak values"
109 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
111 regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
112 fullChrom=useFullchrom, verbose=True, chromField=cField,
113 doMerge=merging, pad=padregion)
116 labeltoRegionDict = {}
119 readlen = getHeaderComment(bamfile.header, "ReadLength")
120 totalCount = getHeaderComment(bamfile.header, "Total")
122 normalizationFactor = totalCount / 1000000.
124 chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
126 for rchrom in regionDict:
127 if forceRegion and rchrom not in chromList:
129 for region in regionDict[rchrom]:
130 regionCount[region.label] = 0
131 labelList.append(region.label)
132 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
134 for rchrom in chromList:
136 if rchrom not in regionDict:
143 fullchrom = "chr%s" % rchrom
145 for region in regionDict[rchrom]:
149 regionCount[label] = 0
150 labelList.append(label)
151 labeltoRegionDict[label] = (rchrom, start, stop)
152 #regionList.append((label, fullchrom, start, stop))
155 for alignedread in bamfile.fetch(fullchrom, start, stop):
156 weight = 1.0/alignedread.opt('NH')
157 readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
159 if len(readList) < 1:
163 peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
165 topValue = peak.smoothArray[peak.topPos[0]]
167 print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
170 regionCount[label] += topValue
172 regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
175 for label in regionCount:
176 regionCount[label] = float(regionCount[label]) / normalizationFactor
178 outfile = open(outfilename, "w")
183 for label in labelList:
184 (chrom, start, stop) = labeltoRegionDict[label]
188 fullchrom = "chr%s" % chrom
192 length = abs(stop - start) / 1000.
199 outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
201 outfile.write("\t%.1f" % length)
203 outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
208 writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
211 def getRegionReadCounts(bamfile, chr, start, end, doUniqs=True, doMulti=False, doSplices=False):
216 for alignedread in bamfile.fetch(chr, start, end):
217 readMultiplicity = alignedread.opt('NH')
218 if doSplices and isSpliceEntry(alignedread.cigar):
219 if readMultiplicity == 1 and doUniqs:
222 multiSplice += 1.0/readMultiplicity
223 elif readMultiplicity == 1 and doUniqs:
226 multis += 1.0/readMultiplicity
228 totalReads = uniques + multis + uniqueSplice + multiSplice
233 if __name__ == "__main__":