rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...
[erange.git] / farPairs.py
1 #
2 #  farPairs.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 7/13/10.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import time
16 import optparse
17 import string
18 import ReadDataset
19 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
20
21 print "farPairs: version 1.4"
22
23
24 def main(argv=None):
25     if not argv:
26         argv = sys.argv
27
28     usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
29
30     parser = getParser(usage)
31     (options, args) = parser.parse_args(argv[1:])
32
33     if len(args) < 3:
34         print usage
35         print "\tIs both slow and takes up large amount of RAM"
36         sys.exit(1)
37
38     rdsfile = args[0]
39     outfilename = args[1]
40     outbedname = args[2]
41
42     farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
43              options.cachePages, options.minDist, options.maxDist, options.minCount,
44              options.label)
45
46
47 def getParser(usage):
48     parser = optparse.OptionParser(usage=usage)
49     parser.add_option("--cache", type="int", dest="cachePages")
50     parser.add_option("--verbose", action="store_true", dest="doVerbose")
51     parser.add_option("--minDist", type="int", dest="minDist")
52     parser.add_option("--maxDist", type="int", dest="maxDist")
53     parser.add_option("--minCount", type="int", dest="minCount")
54     parser.add_option("--label", dest="label")
55
56     configParser = getConfigParser
57     section = "farPairs"
58     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
59     cachePages = getConfigOption(configParser, section, "cachePages", None)
60     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
61     maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
62     minCount = getConfigIntOption(configParser, section, "minCount", 2)
63     label = getConfigOption(configParser, section, "label", None)
64
65     parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
66                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
67
68     return parser
69
70
71 def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
72              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
73
74     flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
75
76     outfile = open(outfilename, "w")
77     for region in flagDict:
78         regionConnections = countDuplicatesInList(flagDict[region])
79         for (connectedRegion, count) in regionConnections:
80             if count >= minCount:
81                 outline = "%s\t%s\t%d" % (region, connectedRegion, count)
82                 print >> outfile, outline
83                 if doVerbose:
84                     print outline
85
86     outfile.close()
87     if doVerbose:
88         print "finished: ", time.ctime()
89
90
91 def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
92     doCache = False
93     if cachePages is not None:
94         doCache = True
95     else:
96         cachePages = 0
97
98     if label is None:
99         label = rdsfile
100
101     RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
102     rdsChromList = RDS.getChromosomes()
103
104     if doVerbose:
105         print time.ctime()
106     
107     outbed = open(outbedname, "w")
108     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
109     outbed.close()
110
111     readlen = RDS.getReadSize()
112     flagDict = {}
113     for chromosome in rdsChromList:
114         if doNotProcessChromosome(chromosome):
115             continue
116
117         writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
118
119     print "%d connected regions" % len(flagDict)
120
121     return flagDict
122
123
124 def doNotProcessChromosome(chrom):
125     return chrom == "chrM"
126
127
128 def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
129     outbed = open(outbedname, "a")
130     print chromosome
131     uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
132     if doVerbose:
133         print len(uniqDict), time.ctime()
134
135     for readID in uniqDict:
136         readList = uniqDict[readID]
137         if readsAreFarPair(readList, minDist, maxDist):
138             start1 = readList[0]["start"]
139             start2 = readList[1]["start"]
140             startList = [start1, start2]
141             startList.sort()
142             outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
143             outbed.write(outputLine)
144             flag1 = readList[0]["flag"]
145             flag2 = readList[1]["flag"]
146             if doVerbose:
147                 print flag1, flag2, abs(start1 - start2)
148
149             try:
150                 flagDict[flag1].append(flag2)
151             except KeyError:
152                 flagDict[flag1] = [flag2]
153
154             try:
155                 flagDict[flag2].append(flag1)
156             except KeyError:
157                 flagDict[flag2] = [flag1]
158
159     outbed.close()
160
161
162 def readsAreFarPair(readList, minDist, maxDist):
163     isFarPair = False
164     if len(readList) == 2:
165         flag1 = readList[0]["flag"]
166         flag2 = readList[1]["flag"]
167         if flag1 != flag2 and flag1 != "" and flag2 != "":
168             start1 = readList[0]["start"]
169             start2 = readList[1]["start"]
170             dist = abs(start1 - start2)
171             if minDist < dist < maxDist:
172                 isFarPair = True
173
174     return isFarPair
175
176
177 def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
178     sizes = ["%d" % readlen]
179     coords = ["0"]
180     leftStart = startList[0] - 1
181     rightStop = startList[-1]
182     for index in range(1, numPieces):
183         sizes.append("%d" % (readlen + 1))
184         coords.append("%d" % (startList[index] - startList[0]))
185
186     if rsense == "+":
187         senseCode = plusSenseColor
188     else:
189         senseCode = minusSenseColor
190
191     readSizes = string.join(sizes, ",")
192     readCoords = string.join(coords, ",")
193     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
194
195     return outline
196
197
198 if __name__ == "__main__":
199     main(sys.argv)