first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / altSpliceCounts.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 print "altSpliceCounts: version 3.7"
8
9 import sys
10 import optparse
11 import ReadDataset
12 from commoncode import getConfigParser, getConfigOption
13
14 def main(argv=None):
15     if not argv:
16         argv = sys.argv
17
18     usage = "usage: python %s rdsfile outfilename [--cache pages]"
19
20     parser = makeParser(usage)
21     (options, args) = parser.parse_args(argv[1:])
22
23     if len(args) < 2:
24         print usage
25         sys.exit(1)
26
27     hitfile =  args[0]
28     outfilename = args[1]
29
30     if options.numCachePages is not None:
31         doCache = True
32         cachePages = options.numCachePages
33     else:
34         doCache = False
35         cachePages = 100000
36
37     altSpliceCounts(hitfile, outfilename, doCache, cachePages)
38
39
40 def makeParser(usage=""):
41     parser = optparse.OptionParser(usage=usage)
42     parser.add_option("--cache", type="int", dest="numCachePages",
43                       help="number of cache pages to use [default: 100000]")
44
45     configParser = getConfigParser()
46     section = "altSpliceCounts"
47     numCachePages = getConfigOption(configParser, section, "numCachePages", None)
48
49     parser.set_defaults(numCachePages=numCachePages)
50
51     return parser
52
53
54 def altSpliceCounts(hitfile, outfilename, doCache=False, cachePages=100000):
55     startDict = {}
56     stopDict = {}
57     resultDict = {}
58
59     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
60     if cachePages > hitRDS.getDefaultCacheSize():
61         hitRDS.setDBcache(cachePages)
62
63     readlen = hitRDS.getReadSize()
64     hitDict = hitRDS.getSplicesDict(noSense=True)
65     outfile = open(outfilename, "w")
66
67     for chrom in hitDict:
68         startDict[chrom] = []
69         stopDict[chrom] = []
70         resultDict[chrom] = []
71
72     index = 0
73     for chrom in hitDict:
74         for read in hitDict[chrom]:
75             tagStart = read["startL"]
76             tagStop = read["stopR"]
77             index += 1
78             length = tagStop - tagStart
79             if length < readlen + 5:
80                 continue
81
82             startDict[chrom].append((tagStart, length))
83             stopDict[chrom].append((tagStop, length))
84
85         startDict[chrom].sort()
86         stopDict[chrom].sort()
87
88     spliceEvent = 0
89     altSpliceEvent = 0
90     alternative = 1
91     for chrom in startDict:
92         firstIndex = 0
93         maxIndex = len(startDict[chrom])
94         while firstIndex < maxIndex:
95             (fstart, flen) = startDict[chrom][firstIndex]
96             (start, length) = (fstart, flen)
97             secondIndex = firstIndex
98             secondLengths = []
99             while (start - fstart) < readlen:
100                 if secondIndex >= maxIndex:
101                     break
102
103                 (start, length) = startDict[chrom][secondIndex]
104                 if (start - fstart) < readlen and abs(length - flen) > readlen:
105                     line =  (chrom, fstart, fstart + flen, chrom, start, start + length)
106                     alreadySeen = False
107                     for slength in secondLengths:
108                         if abs(slength - length) < readlen:
109                             alreadySeen = True
110
111                     if len(resultDict[chrom]) == 0:
112                         resultDict[chrom].append(line)
113                     elif line != resultDict[chrom][-1] and not alreadySeen:
114                         resultDict[chrom].append(line)
115                         secondLengths.append(length)
116                         altSpliceEvent += 1
117                         spliceEvent += 1
118
119                 secondIndex += 1
120
121             firstIndex = secondIndex
122             spliceEvent += 1
123
124         firstIndex = 0
125         maxIndex = len(stopDict[chrom])
126         while firstIndex < maxIndex:
127             (fstop, flen) = stopDict[chrom][firstIndex]
128             (stop, length) = (fstop, flen)
129             secondIndex = firstIndex
130             secondLengths = []
131             while (stop - fstop) < readlen:
132                 if secondIndex >= maxIndex:
133                     break
134                 (stop, length) = stopDict[chrom][secondIndex]
135                 if (stop - fstop) < readlen and abs(length - flen) > readlen:
136                     line = (chrom, fstop - flen, fstop, chrom, stop - length, stop)
137                     alreadySeen = False
138                     for slength in secondLengths:
139                         if abs(slength - length) < readlen:
140                             alreadySeen = True
141
142                     if len(resultDict[chrom]) == 0:
143                         resultDict[chrom].append(line)
144
145                     if line != resultDict[chrom][-1] and not alreadySeen:
146                         resultDict[chrom].append(line)
147                         secondLengths.append(length)
148                         altSpliceEvent += 1
149                         spliceEvent += 1
150
151                 secondIndex += 1
152
153             firstIndex = secondIndex
154             spliceEvent += 1
155
156         resultDict[chrom].sort()
157         for line in resultDict[chrom]:
158             outfile.write("alt%d" % alternative + "\tchr%s\t%d\t%d\tchr%s\t%d\t%d\n"  % line)
159             alternative += 1
160
161         print chrom, maxIndex, spliceEvent, altSpliceEvent
162
163     print spliceEvent, altSpliceEvent
164     outfile.close()
165
166 if __name__ == "__main__":
167     main(sys.argv)