snapshot of 4.0a development. initial git repo commit
[erange.git] / getsplicefa.py
1 import sys
2 import optparse
3 import string
4 try:
5     import psyco
6     psyco.full()
7 except:
8     print "psyco not running"
9 from cistematic.core import complement
10 from cistematic.genomes import Genome
11
12
13 def main(argv=None):
14     if not argv:
15         argv = sys.argv
16
17     verstring = "%prog: version 1.0"
18     print verstring
19     delimiter = "|"
20
21     usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
22             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
23             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
24
25     parser = optparse.OptionParser(usage=usage)
26     parser.add_option("--verbose", action="store_true", dest="doVerbose",
27                       help="show verbose messages [default: False]")
28     parser.add_option("--spacer", type="int", dest="spacer",
29                       help="number of spacer NTs to use [default: 2")
30     parser.set_defaults(doVerbose=False, spacer=2)
31     (options, args) = parser.parse_args(argv[1:])
32
33     try:
34         genome = args[0]
35         datafilename = args[1]
36         outfilename = args[2]
37         maxBorder = args[3]
38     except IndexError:
39         print usage
40         sys.exit(1)
41
42     getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)
43
44
45 def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
46     spacerseq = "N" * spacer
47
48     datafile = open(datafilename)
49     hg = Genome(genome)
50
51     spliceCountDict = {}
52     exonStartDict = {}
53     exonStopDict = {}
54     exonLengthDict = {}
55     nameToChromDict = {}
56     nameToComplementDict = {}
57     alreadySeen = {}
58     counter = 0
59
60     for line in datafile:
61         fields = line.split()
62         name = fields[0]
63         spliceCount = int(fields[7]) - 1
64         if spliceCount < 1:
65             continue
66
67         counter += spliceCount
68         spliceCountDict[name] = spliceCount
69         chrom = fields[1][3:]
70         if chrom == "chrM":
71             continue
72
73         nameToChromDict[name] = chrom
74         if chrom not in alreadySeen:
75             alreadySeen[chrom] = []
76
77         nameToComplementDict[name] = fields[2]
78         exonStarts = []
79         exonStops = []
80         for val in fields[8].split(",")[:-1]:
81             exonStarts.append(int(val))
82
83         for val in fields[9].split(",")[:-1]:
84             exonStops.append(int(val))
85
86         exonStartDict[name] = exonStarts
87         exonStopDict[name] = exonStops
88         exonLengths = []
89         for index in range(spliceCount + 1):
90             exonLengths.append(exonStops[index] - exonStarts[index])
91
92         exonLengthDict[name] = exonLengths
93
94     print len(spliceCountDict)
95     print counter
96
97     missedCount = 0
98     depressedCount = 0
99     splicefileindex = 1
100     spliceCounter = 0
101     outfile = open(outfilename, "w")
102     for name in nameToChromDict:
103         try:
104             spliceCount = spliceCountDict[name]
105         except:
106             continue
107
108         exonStarts = exonStartDict[name]
109         exonStops = exonStopDict[name]
110         exonLengths = exonLengthDict[name]
111         chrom = nameToChromDict[name]
112         for index in range(spliceCount):
113             if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]:
114                 continue
115
116             regionstart = exonStops[index] - maxBorder
117             alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
118             beforeLen = exonLengths[index]
119             afterLen = exonLengths[index + 1]
120             if (beforeLen + afterLen) < maxBorder + spacer:
121                 missedCount += 1
122                 continue
123
124             if (beforeLen + afterLen) < 2 * maxBorder:
125                 depressedCount += 1
126
127             if beforeLen > maxBorder:
128                 beforeLen = maxBorder
129
130             if afterLen > maxBorder:
131                 afterLen = maxBorder
132
133             try:
134                 beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder)
135                 afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
136             except:
137                 if doVerbose:
138                     print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1])
139                 continue
140
141             sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "")
142             spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "")
143             outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence)
144             outfile.write(outstring)
145
146         splicefileindex += 1
147         spliceCounter += 1
148         if spliceCounter > 10000:
149             print "%d genes" % splicefileindex
150             spliceCounter = 0
151
152     outfile.close()
153
154     print "%d splices too short to be seen" % missedCount
155     print "%d splices will be under-reported" % depressedCount
156
157 if __name__ == "__main__":
158     main(sys.argv)