5 # Created by Ali Mortazavi on 7/13/10.
19 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
21 print "farPairs: version 1.4"
28 usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
30 parser = getParser(usage)
31 (options, args) = parser.parse_args(argv[1:])
35 print "\tIs both slow and takes up large amount of RAM"
42 farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
43 options.cachePages, options.minDist, options.maxDist, options.minCount,
48 parser = optparse.OptionParser(usage=usage)
49 parser.add_option("--cache", type="int", dest="cachePages")
50 parser.add_option("--verbose", action="store_true", dest="doVerbose")
51 parser.add_option("--minDist", type="int", dest="minDist")
52 parser.add_option("--maxDist", type="int", dest="maxDist")
53 parser.add_option("--minCount", type="int", dest="minCount")
54 parser.add_option("--label", dest="label")
56 configParser = getConfigParser
58 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
59 cachePages = getConfigOption(configParser, section, "cachePages", None)
60 minDist = getConfigIntOption(configParser, section, "minDist", 1000)
61 maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
62 minCount = getConfigIntOption(configParser, section, "minCount", 2)
63 label = getConfigOption(configParser, section, "label", None)
65 parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
66 minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
71 def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
72 cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
74 flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
76 outfile = open(outfilename, "w")
77 for region in flagDict:
78 regionConnections = countDuplicatesInList(flagDict[region])
79 for (connectedRegion, count) in regionConnections:
81 outline = "%s\t%s\t%d" % (region, connectedRegion, count)
82 print >> outfile, outline
88 print "finished: ", time.ctime()
91 def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
93 if cachePages is not None:
101 RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
102 rdsChromList = RDS.getChromosomes()
107 outbed = open(outbedname, "w")
108 outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
111 readlen = RDS.getReadSize()
113 for chromosome in rdsChromList:
114 if doNotProcessChromosome(chromosome):
117 writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
119 print "%d connected regions" % len(flagDict)
124 def doNotProcessChromosome(chrom):
125 return chrom == "chrM"
128 def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
129 outbed = open(outbedname, "a")
131 uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
133 print len(uniqDict), time.ctime()
135 for readID in uniqDict:
136 readList = uniqDict[readID]
137 if readsAreFarPair(readList, minDist, maxDist):
138 start1 = readList[0]["start"]
139 start2 = readList[1]["start"]
140 startList = [start1, start2]
142 outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
143 outbed.write(outputLine)
144 flag1 = readList[0]["flag"]
145 flag2 = readList[1]["flag"]
147 print flag1, flag2, abs(start1 - start2)
150 flagDict[flag1].append(flag2)
152 flagDict[flag1] = [flag2]
155 flagDict[flag2].append(flag1)
157 flagDict[flag2] = [flag1]
162 def readsAreFarPair(readList, minDist, maxDist):
164 if len(readList) == 2:
165 flag1 = readList[0]["flag"]
166 flag2 = readList[1]["flag"]
167 if flag1 != flag2 and flag1 != "" and flag2 != "":
168 start1 = readList[0]["start"]
169 start2 = readList[1]["start"]
170 dist = abs(start1 - start2)
171 if minDist < dist < maxDist:
177 def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
178 sizes = ["%d" % readlen]
180 leftStart = startList[0] - 1
181 rightStop = startList[-1]
182 for index in range(1, numPieces):
183 sizes.append("%d" % (readlen + 1))
184 coords.append("%d" % (startList[index] - startList[0]))
187 senseCode = plusSenseColor
189 senseCode = minusSenseColor
191 readSizes = string.join(sizes, ",")
192 readCoords = string.join(coords, ",")
193 outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
198 if __name__ == "__main__":