5 # Created by Ali Mortazavi on 7/13/10.
16 from commoncode import readDataset
18 print "%prog: version 1.3"
25 usage = "usage: python %prog rdsfile outfile bedfile [--verbose] [--cache numPages] [--minDist bp] [--maxDist bp] [--minCount count] [--label string]"
27 parser = optparse.OptionParser(usage=usage)
28 parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
29 parser.add_option("--cache", type="int", dest="cachePages")
30 parser.add_option("--verbose", action="store_true", dest="doVerbose")
31 parser.add_option("--minDist", type="int", dest="minDist")
32 parser.add_option("--maxDist", type="int", dest="maxDist")
33 parser.add_option("--minCount", type="int", dest="minCount")
34 parser.add_option("--label", dest="label")
35 parser.set_defaults(sameChromOnly=False, doVerbose=False, cachePages=None,
36 minDist=1000, maxDist=500000, minCount=2, label=None)
37 (options, args) = parser.parse_args(argv[1:])
41 print "\tIs both slow and takes up large amount of RAM"
48 farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
49 options.cachePages, options.minDist, options.maxDist, options.minCount,
53 def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
54 cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
57 if cachePages is not None:
65 RDS = readDataset(rdsfile, verbose=True, cache=doCache)
66 rdsChromList = RDS.getChromosomes()
72 outfile = open(outfilename, "w")
73 outbed = open(outbedname, "w")
74 outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
76 readlen = RDS.getReadSize()
78 for chromosome in rdsChromList:
79 if doNotProcessChromosome(chromosome):
83 uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, withPairID=True, doUniqs=True, readIDDict=True)
85 print len(uniqDict), time.ctime()
87 for readID in uniqDict:
88 readList = uniqDict[readID]
89 if len(readList) == 2:
91 (start1, flag1, pair1) = readList[0]
92 (start2, flag2, pair2) = readList[1]
95 dist = abs(start1 - start2)
96 startList = [start1, start2]
97 stopList = [start1 + readlen, start2 + readlen]
100 if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
101 outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
102 outbed.write(outputLine)
104 print flag1, flag2, dist
107 flagDict[flag1].append((flag2, start1, start2))
109 flagDict[flag1] = [(flag2, start1, start2)]
112 flagDict[flag2].append((flag1, start1, start2))
114 flagDict[flag2] = [(flag2, start1, start2)]
116 print "%d connected regions" % len(flagDict)
118 for region in flagDict:
119 flagDict[region].sort()
120 regionConnections = {}
121 for (region2, start1, start2) in flagDict[region]:
123 regionConnections[region2] += 1
125 regionConnections[region2] = 1
127 for region2 in regionConnections:
128 if regionConnections[region2] >= minCount:
129 outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
131 print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
136 print "finished: ", time.ctime()
139 def doNotProcessChromosome(chrom):
140 return chrom == "chrM"
143 def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
144 readSizes = "%d" % (stopList[0] - startList[0])
146 leftStart = startList[0] - 1
147 rightStop = stopList[-1]
148 for index in range(1, numPieces):
149 readSizes += ",%d" % (stopList[index] - startList[index] + 1)
150 readCoords += ",%d" % (startList[index] - startList[0])
153 senseCode = plusSense
155 senseCode = minusSense
157 outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
161 if __name__ == "__main__":