first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / regionCounts.py
1 #
2 #  regionCounts.py
3 #  ENRAGE
4 #
5
6 try:
7     import psyco
8     psyco.full()
9 except:
10     print 'psyco not running'
11
12 import sys
13 import string
14 import optparse
15 import pysam
16 from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
17
18 versionString = "regionCounts: version 3.10"
19 print versionString
20
21 def main(argv=None):
22     if not argv:
23         argv = sys.argv
24
25     usage = "usage: python %prog regionfile rdsfile outfilename [options]"
26
27     parser = getParser(usage)
28     (options, args) = parser.parse_args(argv[1:])
29
30     if len(args) < 3:
31         print usage
32         sys.exit(1)
33
34     regionfilename = args[0]
35     bamfilename =  args[1]
36     outfilename = args[2]
37
38     bamfile = pysam.Samfile(bamfilename, "rb")
39
40     regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
41                  options.useFullchrom, options.normalize, options.padregion,
42                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
43                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
44                  options.doRPKM, options.doLength, options.forceRegion)
45
46
47 def getParser(usage):
48     parser = optparse.OptionParser(usage=usage)
49     parser.add_option("--markRDS", action="store_true", dest="flagRDS")
50     parser.add_option("--chromField", type="int", dest="cField")
51     parser.add_option("--fullchrom", action="store_true", dest="useFullchrom")
52     parser.add_option("--raw", action="store_false", dest="normalize")
53     parser.add_option("--padregion", type="int", dest="padregion")
54     parser.add_option("--mergeregion", type="int", dest="mergeregion")
55     parser.add_option("--nomerge", action="store_false", dest="merging")
56     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
57     parser.add_option("--noMulti", action="store_false", dest="doMulti")
58     parser.add_option("--splices", action="store_true", dest="doSplices")
59     parser.add_option("--peak", action="store_true", dest="usePeak")
60     parser.add_option("--cache", type="int", dest="cachePages")
61     parser.add_option("--log", dest="logfilename")
62     parser.add_option("--rpkm", action="store_true", dest="doRPKM")
63     parser.add_option("--length", action="store_true", dest="doLength")
64     parser.add_option("--force", action="store_true", dest="forceRegion")
65
66     configParser = getConfigParser()
67     section = "regionCounts"
68     flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
69     cField = getConfigIntOption(configParser, section, "cField", 1)
70     useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
71     normalize = getConfigBoolOption(configParser, section, "normalize", True)
72     padregion = getConfigIntOption(configParser, section, "padregion", 0)
73     mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
74     merging = getConfigBoolOption(configParser, section, "merging", True)
75     doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
76     doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
77     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
78     usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
79     cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
80     logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
81     doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
82     doLength = getConfigBoolOption(configParser, section, "doLength", False)
83     forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
84
85     parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
86                         padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
87                         doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
88                         logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
89                         forceRegion=forceRegion)
90
91     return parser
92
93
94 def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
95                  useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
96                  merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
97                  cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
98                  forceRegion=False):
99
100     print "padding %d bp on each side of a region" % padregion
101     print "merging regions closer than %d bp" % mergeregion
102     print "will use peak values"
103
104     normalize = True
105     doRPKM = False
106     if doRPKM == True:
107         normalize = True
108
109     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
110
111     regionDict = getMergedRegions(regionfilename, maxDist=mergeregion, minHits=-1, keepLabel=True,
112                                   fullChrom=useFullchrom, verbose=True, chromField=cField,
113                                   doMerge=merging, pad=padregion)
114
115     labelList = []
116     labeltoRegionDict = {}
117     regionCount = {}
118
119     readlen = getHeaderComment(bamfile.header, "ReadLength")
120     totalCount = getHeaderComment(bamfile.header, "Total")
121     if normalize:
122         normalizationFactor = totalCount / 1000000.
123
124     chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
125     chromList.sort()
126     for rchrom in regionDict:
127         if forceRegion and rchrom not in chromList:
128             print rchrom
129             for region in regionDict[rchrom]:
130                 regionCount[region.label] = 0
131                 labelList.append(region.label)
132                 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
133
134     for rchrom in chromList:
135         #regionList = []
136         if rchrom not in regionDict:
137             continue
138
139         print rchrom
140         if useFullchrom:
141             fullchrom = rchrom
142         else:
143             fullchrom = "chr%s" % rchrom
144
145         for region in regionDict[rchrom]:
146             label = region.label
147             start = region.start
148             stop = region.stop
149             regionCount[label] = 0
150             labelList.append(label)
151             labeltoRegionDict[label] = (rchrom, start, stop)
152             #regionList.append((label, fullchrom, start, stop))
153             if usePeak:
154                 readList = []
155                 for alignedread in bamfile.fetch(fullchrom, start, stop):
156                     weight = 1.0/alignedread.opt('NH')
157                     readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
158
159                 if len(readList) < 1:
160                     continue
161
162                 readList.sort()
163                 peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
164                 try:
165                     topValue = peak.smoothArray[peak.topPos[0]]
166                 except:
167                     print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
168                     continue
169
170                 regionCount[label] += topValue
171             else:
172                 regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
173
174     if normalize:
175         for label in regionCount:
176             regionCount[label] = float(regionCount[label]) / normalizationFactor
177
178     outfile = open(outfilename, "w")
179
180     if forceRegion:
181         labelList.sort()
182
183     for label in labelList:
184         (chrom, start, stop) = labeltoRegionDict[label]
185         if useFullchrom:
186             fullchrom = chrom
187         else:
188             fullchrom = "chr%s" % chrom
189
190         if normalize:
191             if doRPKM:
192                 length = abs(stop - start) / 1000.
193             else:
194                 length = 1.
195
196             if length < 0.001:
197                 length = 0.001
198
199             outfile.write("%s\t%s\t%d\t%d\t%.2f" % (label, fullchrom, start, stop, regionCount[label]/length))
200             if doLength:
201                 outfile.write("\t%.1f" % length)
202         else:
203             outfile.write('%s\t%s\t%d\t%d\t%d' % (label, fullchrom, start, stop, regionCount[label]))
204
205         outfile.write("\n")
206
207     outfile.close()
208     writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
209
210
211 def getRegionReadCounts(bamfile, chr, start, end, doUniqs=True, doMulti=False, doSplices=False):
212     uniques = 0
213     multis = 0.0
214     uniqueSplice = 0
215     multiSplice = 0.0
216     for alignedread in bamfile.fetch(chr, start, end):
217         readMultiplicity = alignedread.opt('NH')
218         if doSplices and isSpliceEntry(alignedread.cigar):
219             if readMultiplicity == 1 and doUniqs:
220                 uniqueSplice += 1
221             elif doMulti:
222                 multiSplice += 1.0/readMultiplicity
223         elif readMultiplicity == 1 and doUniqs:
224             uniques += 1
225         elif doMulti:
226             multis += 1.0/readMultiplicity
227
228     totalReads = uniques + multis + uniqueSplice + multiSplice
229
230     return totalReads
231
232
233 if __name__ == "__main__":
234     main(sys.argv)