5 # Created by Ali Mortazavi on 10/14/08.
15 from commoncode import readDataset
16 import sys, time, optparse
23 print "%prog: version 3.3"
24 print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
25 usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
27 parser = optparse.OptionParser(usage=usage)
28 parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
29 parser.add_option("--splices", action="store_true", dest="doSplices")
30 parser.add_option("--verbose", action="store_true", dest="doVerbose")
31 parser.add_option("--maxDist", type="int", dest="maxDist")
32 parser.add_option("--cache", type="int", dest="cachePages")
33 parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None)
34 (options, args) = parser.parse_args(argv[1:])
40 minDist = int(args[0])
44 distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
47 def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
48 if cachePages is not None:
54 RDS = readDataset(rdsfile, verbose = True, cache=doCache)
55 if not RDS.hasIndex():
56 print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
59 if cachePages > RDS.getDefaultCacheSize():
60 RDS.setDBcache(cachePages)
65 print "getting splices"
66 splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
69 print "getting uniq reads"
70 uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
74 for readID in splicesDict:
75 theRead = splicesDict[readID]
79 uniqDict[readID].append(read0)
84 uniqDict[readID] = [read0,read2]
87 print len(uniqDict), time.ctime()
89 outfile = open(outfilename,"w")
94 for readID in uniqDict:
95 readList = uniqDict[readID]
96 if len(readList) == 2:
98 (start1, sense1, chrom1, pair1) = readList[0]
99 (start2, sense2, chrom2, pair2) = readList[1]
106 outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
107 outfile.write(outline + "\n")
109 print diffChrom, outline
111 dist = abs(start1 - start2)
113 if minDist < dist < maxDist:
115 outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
116 outfile.write(outline + "\n")
118 print distal, outline
120 outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total))
125 outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)))
127 print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total))
128 print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total))
132 if __name__ == "__main__":