snapshot of 4.0a development. initial git repo commit
[erange.git] / distalPairs.py
1 #
2 #  distalPairs.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 10/14/08.
6 #
7
8
9 try:
10     import psyco
11     psyco.full()
12 except:
13     pass
14
15 from commoncode import readDataset
16 import sys, time, optparse
17
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     print "%prog: version 3.3"
24     print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
25     usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
26
27     parser = optparse.OptionParser(usage=usage)
28     parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
29     parser.add_option("--splices", action="store_true", dest="doSplices")
30     parser.add_option("--verbose", action="store_true", dest="doVerbose")
31     parser.add_option("--maxDist", type="int", dest="maxDist")
32     parser.add_option("--cache", type="int", dest="cachePages")
33     parser.set_defaults(sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None)
34     (options, args) = parser.parse_args(argv[1:])
35
36     if len(args) < 3:
37         print usage
38         sys.exit(1)
39
40     minDist = int(args[0])
41     rdsfile = args[1]
42     outfilename = args[2]
43
44     distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
45
46
47 def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
48     if cachePages is not None:
49         doCache = True
50     else:
51         doCache = False
52         cachePages = -1
53
54     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
55     if not RDS.hasIndex():
56         print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
57         sys.exit(1)
58
59     if cachePages > RDS.getDefaultCacheSize():
60         RDS.setDBcache(cachePages)
61
62     print time.ctime()
63
64     if doSplices:
65         print "getting splices"
66         splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
67         print "got splices"
68
69     print "getting uniq reads"    
70     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
71     print "got uniqs"
72
73     if doSplices:
74         for readID in splicesDict:
75             theRead = splicesDict[readID]
76             read0 = theRead[0]
77             del read0[1]
78             try:
79                 uniqDict[readID].append(read0)
80             except:
81                 if len(theRead) == 4:
82                     read2 = theRead[2]
83                     del read2[1]
84                     uniqDict[readID] = [read0,read2]
85
86     if doVerbose:
87         print len(uniqDict), time.ctime()
88
89     outfile = open(outfilename,"w")
90
91     diffChrom = 0
92     distal = 0
93     total = 0
94     for readID in uniqDict:
95         readList = uniqDict[readID]
96         if len(readList) == 2:
97             total += 1
98             (start1, sense1, chrom1, pair1) = readList[0]
99             (start2, sense2, chrom2, pair2) = readList[1]
100
101             if chrom1 != chrom2:
102                 diffChrom += 1
103                 if sameChromOnly:
104                     continue
105                 else:
106                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
107                     outfile.write(outline + "\n")
108                     if doVerbose:
109                         print diffChrom, outline
110             else:
111                 dist = abs(start1 - start2)
112
113                 if minDist < dist < maxDist:
114                     distal += 1
115                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
116                     outfile.write(outline + "\n")
117                     if doVerbose:
118                         print distal, outline
119
120     outfile.write("#distal: %d\tdiffChrom: %d\tpossible: %d\n" % (distal, diffChrom, total))
121     total = float(total)
122     if total < 1:
123         total = 1.
124
125     outfile.write("#distal %2.2f pct\tdiffChrom %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total)))
126     outfile.close()
127     print "distal: %d\tdiffChrom: %d\tpossible: %d" % (distal, diffChrom, int(total))
128     print "distal: %2.2f pct\tdiffChrom: %2.2f pct\n" % ((100. * distal/total), (100. * diffChrom/total))
129     print time.ctime()
130
131
132 if __name__ == "__main__":
133     main(sys.argv)