snapshot of 4.0a development. initial git repo commit
[erange.git] / altSpliceCounts.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 print 'version 3.6'
8
9 import sys, optparse
10 from commoncode import readDataset
11
12 def main(argv=None):
13     if not argv:
14         argv = sys.argv
15
16     usage = "usage: python %s rdsfile outfilename [--cache pages]"
17
18     parser = optparse.OptionParser(usage=usage)
19     parser.add_option("--cache", type="int", dest="numCachePages",
20                       help="number of cache pages to use [default: 100000]")
21     parser.set_defaults(numCachePages=None)
22     (options, args) = parser.parse_args(argv[1:])
23
24     if len(args) < 2:
25         print usage
26         sys.exit(1)
27
28     hitfile =  args[0]
29     outfilename = args[1]
30
31     if options.numCachePages is not None:
32         doCache = True
33         cachePages = options.numCachePages
34     else:
35         doCache = False
36         cachePages = 100000
37
38     altSpliceCounts(hitfile, outfilename, doCache, cachePages)
39
40
41 def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
42     startDict = {}
43     stopDict = {}
44     resultDict = {}
45
46     hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
47     if cachePages > hitRDS.getDefaultCacheSize():
48         hitRDS.setDBcache(cachePages)
49
50     readlen = hitRDS.getReadSize()
51     hitDict = hitRDS.getSplicesDict(noSense=True)
52     outfile = open(outfilename,'w')
53
54     for chrom in hitDict:
55         startDict[chrom] = []
56         stopDict[chrom] = []
57         resultDict[chrom] = []
58
59     index = 0
60     for chrom in hitDict:
61         for (tagStart, lstop, rstart, tagStop) in hitDict[chrom]:
62             index += 1
63             length = tagStop - tagStart
64             if length < readlen + 5:
65                 continue
66
67             startDict[chrom].append((tagStart, length))
68             stopDict[chrom].append((tagStop, length))
69
70         startDict[chrom].sort()
71         stopDict[chrom].sort()
72
73     spliceEvent = 0
74     altSpliceEvent = 0
75     alternative = 1
76     for chrom in startDict:
77         firstIndex = 0
78         maxIndex = len(startDict[chrom])
79         while firstIndex < maxIndex:
80             (fstart, flen) = startDict[chrom][firstIndex]
81             (start, length) = (fstart, flen)
82             secondIndex = firstIndex
83             secondLengths = []
84             while (start - fstart) < readlen:
85                 if secondIndex >= maxIndex:
86                     break
87
88                 (start, length) = startDict[chrom][secondIndex]
89                 if (start - fstart) < readlen and abs(length - flen) > readlen:
90                     line =  (chrom, fstart, fstart + flen, chrom, start, start + length)
91                     alreadySeen = False
92                     for slength in secondLengths:
93                         if abs(slength - length) < readlen:
94                             alreadySeen = True
95
96                     if len(resultDict[chrom]) == 0:
97                         resultDict[chrom].append(line)
98                     elif line != resultDict[chrom][-1] and not alreadySeen:
99                         resultDict[chrom].append(line)
100                         secondLengths.append(length)
101                         altSpliceEvent += 1
102                         spliceEvent += 1
103
104                 secondIndex += 1
105
106             firstIndex = secondIndex
107             spliceEvent += 1
108
109         firstIndex = 0
110         maxIndex = len(stopDict[chrom])
111         while firstIndex < maxIndex:
112             (fstop, flen) = stopDict[chrom][firstIndex]
113             (stop, length) = (fstop, flen)
114             secondIndex = firstIndex
115             secondLengths = []
116             while (stop - fstop) < readlen:
117                 if secondIndex >= maxIndex:
118                     break
119                 (stop, length) = stopDict[chrom][secondIndex]
120                 if (stop - fstop) < readlen and abs(length - flen) > readlen:
121                     line = (chrom, fstop - flen, fstop, chrom, stop - length, stop)
122                     alreadySeen = False
123                     for slength in secondLengths:
124                         if abs(slength - length) < readlen:
125                             alreadySeen = True
126
127                     if len(resultDict[chrom]) == 0:
128                         resultDict[chrom].append(line)
129
130                     if line != resultDict[chrom][-1] and not alreadySeen:
131                         resultDict[chrom].append(line)
132                         secondLengths.append(length)
133                         altSpliceEvent += 1
134                         spliceEvent += 1
135
136                 secondIndex += 1
137
138             firstIndex = secondIndex
139             spliceEvent += 1
140
141         resultDict[chrom].sort()
142         for line in resultDict[chrom]:
143             outfile.write('alt%d' % alternative + '\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n'  % line)
144             alternative += 1
145
146         print chrom, maxIndex, spliceEvent, altSpliceEvent
147
148     print spliceEvent, altSpliceEvent
149     outfile.close()
150
151 if __name__ == "__main__":
152     main(sys.argv)