erange version 4.0a dev release
[erange.git] / farPairs.py
1 #
2 #  farPairs.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 7/13/10.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import time
16 import optparse
17 import ReadDataset
18 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
19
20 print "farPairs: version 1.4"
21
22
23 def main(argv=None):
24     if not argv:
25         argv = sys.argv
26
27     usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
28
29     parser = getParser(usage)
30     (options, args) = parser.parse_args(argv[1:])
31
32     if len(args) < 3:
33         print usage
34         print "\tIs both slow and takes up large amount of RAM"
35         sys.exit(1)
36
37     rdsfile = args[0]
38     outfilename = args[1]
39     outbedname = args[2]
40
41     farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
42              options.cachePages, options.minDist, options.maxDist, options.minCount,
43              options.label)
44
45
46 def getParser(usage):
47     parser = optparse.OptionParser(usage=usage)
48     parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
49     parser.add_option("--cache", type="int", dest="cachePages")
50     parser.add_option("--verbose", action="store_true", dest="doVerbose")
51     parser.add_option("--minDist", type="int", dest="minDist")
52     parser.add_option("--maxDist", type="int", dest="maxDist")
53     parser.add_option("--minCount", type="int", dest="minCount")
54     parser.add_option("--label", dest="label")
55
56     configParser = getConfigParser
57     section = "farPairs"
58     sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
59     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
60     cachePages = getConfigOption(configParser, section, "cachePages", None)
61     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
62     maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
63     minCount = getConfigIntOption(configParser, section, "minCount", 2)
64     label = getConfigOption(configParser, section, "label", None)
65
66     parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
67                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
68
69     return parser
70
71
72 def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
73              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
74
75     doCache = False
76     if cachePages is not None:
77         doCache = True
78     else:
79         cachePages = 0
80
81     if label is None:
82         label = rdsfile
83
84     RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
85     rdsChromList = RDS.getChromosomes()
86
87     if doVerbose:
88         print time.ctime()
89
90     total = 0
91     outfile = open(outfilename, "w")
92     outbed = open(outbedname, "w")
93     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
94
95     readlen = RDS.getReadSize()
96     flagDict = {}
97     for chromosome in rdsChromList:
98         if doNotProcessChromosome(chromosome):
99             continue
100
101         print chromosome
102         uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
103         if doVerbose:
104             print len(uniqDict), time.ctime()
105
106         for readID in uniqDict:
107             readList = uniqDict[readID]
108             if len(readList) == 2:
109                 total += 1
110                 start1 = readList[0]["start"]
111                 flag1 = readList[0]["flag"]
112                 start2 = readList[1]["start"]
113                 flag2 = readList[1]["flag"]
114
115                 if flag1 != flag2:
116                     dist = abs(start1 - start2)
117                     startList = [start1, start2]
118                     stopList = [start1 + readlen, start2 + readlen]
119                     startList.sort()
120                     stopList.sort()
121                     if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
122                         outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
123                         outbed.write(outputLine)
124                         if doVerbose:
125                             print flag1, flag2, dist
126
127                         try:
128                             flagDict[flag1].append((flag2, start1, start2))
129                         except KeyError:
130                             flagDict[flag1] = [(flag2, start1, start2)]
131
132                         try:
133                             flagDict[flag2].append((flag1, start1, start2))
134                         except KeyError:
135                             flagDict[flag2] = [(flag2, start1, start2)]
136
137     print "%d connected regions" % len(flagDict)
138
139     for region in flagDict:
140         flagDict[region].sort()
141         regionConnections = {}
142         for (region2, start1, start2) in flagDict[region]:
143             try:
144                 regionConnections[region2] += 1
145             except KeyError:
146                 regionConnections[region2] = 1
147
148         for region2 in regionConnections:
149             if regionConnections[region2] >= minCount:
150                 outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
151                 if doVerbose:
152                     print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
153
154     outfile.close()
155     outbed.close()
156     if doVerbose:
157         print "finished: ", time.ctime()
158
159
160 def doNotProcessChromosome(chrom):
161     return chrom == "chrM"
162
163
164 def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
165     readSizes = "%d" % (stopList[0] - startList[0])
166     readCoords = "0"
167     leftStart = startList[0] - 1
168     rightStop = stopList[-1]
169     for index in range(1, numPieces):
170         readSizes += ",%d" % (stopList[index] - startList[index] + 1)
171         readCoords += ",%d" % (startList[index] - startList[0])
172
173     if rsense == "+":
174         senseCode = plusSense
175     else:
176         senseCode = minusSense
177
178     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
179     return outline
180
181
182 if __name__ == "__main__":
183     main(sys.argv)