5 # Created by Ali Mortazavi on 7/13/10.
18 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
20 print "farPairs: version 1.4"
27 usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
34 print "\tIs both slow and takes up large amount of RAM"
41 farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
42 options.cachePages, options.minDist, options.maxDist, options.minCount,
47 parser = optparse.OptionParser(usage=usage)
48 parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
49 parser.add_option("--cache", type="int", dest="cachePages")
50 parser.add_option("--verbose", action="store_true", dest="doVerbose")
51 parser.add_option("--minDist", type="int", dest="minDist")
52 parser.add_option("--maxDist", type="int", dest="maxDist")
53 parser.add_option("--minCount", type="int", dest="minCount")
54 parser.add_option("--label", dest="label")
56 configParser = getConfigParser
58 sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
59 doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
60 cachePages = getConfigOption(configParser, section, "cachePages", None)
61 minDist = getConfigIntOption(configParser, section, "minDist", 1000)
62 maxDist = getConfigIntOption(configParser, section, "maxDist", 500000)
63 minCount = getConfigIntOption(configParser, section, "minCount", 2)
64 label = getConfigOption(configParser, section, "label", None)
66 parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
67 minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
72 def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
73 cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
76 if cachePages is not None:
84 RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=doCache)
85 rdsChromList = RDS.getChromosomes()
91 outfile = open(outfilename, "w")
92 outbed = open(outbedname, "w")
93 outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
95 readlen = RDS.getReadSize()
97 for chromosome in rdsChromList:
98 if doNotProcessChromosome(chromosome):
102 uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
104 print len(uniqDict), time.ctime()
106 for readID in uniqDict:
107 readList = uniqDict[readID]
108 if len(readList) == 2:
110 start1 = readList[0]["start"]
111 flag1 = readList[0]["flag"]
112 start2 = readList[1]["start"]
113 flag2 = readList[1]["flag"]
116 dist = abs(start1 - start2)
117 startList = [start1, start2]
118 stopList = [start1 + readlen, start2 + readlen]
121 if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
122 outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
123 outbed.write(outputLine)
125 print flag1, flag2, dist
128 flagDict[flag1].append((flag2, start1, start2))
130 flagDict[flag1] = [(flag2, start1, start2)]
133 flagDict[flag2].append((flag1, start1, start2))
135 flagDict[flag2] = [(flag2, start1, start2)]
137 print "%d connected regions" % len(flagDict)
139 for region in flagDict:
140 flagDict[region].sort()
141 regionConnections = {}
142 for (region2, start1, start2) in flagDict[region]:
144 regionConnections[region2] += 1
146 regionConnections[region2] = 1
148 for region2 in regionConnections:
149 if regionConnections[region2] >= minCount:
150 outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
152 print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
157 print "finished: ", time.ctime()
160 def doNotProcessChromosome(chrom):
161 return chrom == "chrM"
164 def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
165 readSizes = "%d" % (stopList[0] - startList[0])
167 leftStart = startList[0] - 1
168 rightStop = stopList[-1]
169 for index in range(1, numPieces):
170 readSizes += ",%d" % (stopList[index] - startList[index] + 1)
171 readCoords += ",%d" % (startList[index] - startList[0])
174 senseCode = plusSense
176 senseCode = minusSense
178 outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
182 if __name__ == "__main__":