snapshot of 4.0a development. initial git repo commit
[erange.git] / predictSpliceCount.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 import sys
8 from cistematic.genomes import Genome
9
10
11 def main(argv=None):
12     if not argv:
13         argv = sys.argv
14
15     print '%s: version 1.1' % argv[0]
16
17     if len(argv) < 6:
18         print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0]
19         sys.exit(1)
20
21     genome = argv[1]
22     # number of nucleotides at the end of each exon that is affected by splicing
23     splicelead = int(argv[2])
24     uniquefilecount = argv[3]
25     splicefilecount =  argv[4]
26     outfilename = argv[5]
27
28     predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename)
29
30
31 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
32     hg = Genome(genome)
33
34     gidDict = {}
35     gidList = []
36     uniqueCountDict = {}
37     spliceCountDict = {}
38
39     uniquefile = open(uniquefilecount)
40     for line in uniquefile:
41         fields = line.strip().split()
42         gidDict[fields[0]] = fields[1]
43         gidList.append(fields[0])
44         uniqueCountDict[fields[0]] = int(fields[2])
45
46     splicefile = open(splicefilecount)
47     for line in splicefile:
48         fields = line.strip().split()
49         spliceCountDict[fields[0]] = int(fields[2])
50
51     outfile = open(outfilename,'w')
52
53     gidList.sort()
54     for gid in gidList:
55         symbol = gidDict[gid]
56         featureList = hg.getGeneFeatures((genome, gid))
57         newfeatureList = []
58         featuresizesum = 0
59         for (ftype, chrom, start, stop, sense) in featureList:
60             if (start, stop) not in newfeatureList:
61                 newfeatureList.append((start, stop))
62                 featuresizesum += stop - start + 1
63
64         if featuresizesum < 1:
65             featuresizesum = 1
66
67         splicearea = (len(newfeatureList) - 1) * splicelead
68         if splicearea < splicelead:
69             splicearea = 0
70
71         fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
72         expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
73
74         # this p-value is based on the observed unique count, not the expected total count
75         # nor the multi-read adjusted count
76         pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
77         print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
78         outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
79
80     outfile.close()
81
82
83 if __name__ == "__main__":
84     main(sys.argv)