rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...
[erange.git] / predictSpliceCount.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print 'psyco not running'
6
7 import sys
8 from cistematic.genomes import Genome
9
10
11 class GeneSymbolAndCount():
12
13     def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
14         self.symbol = symbol
15         self.uniqueCount = uniqueCount
16         self.spliceCount = spliceCount
17
18
19 def main(argv=None):
20     if not argv:
21         argv = sys.argv
22
23     print "predictSpliceCount: version 1.2"
24
25     if len(argv) < 6:
26         print 'usage: python %s genome maxBorder uniquecountfile splicecountfile outfile' % argv[0]
27         sys.exit(1)
28
29     genome = argv[1]
30     # number of nucleotides at the end of each exon that is affected by splicing
31     splicelead = int(argv[2])
32     uniquefilecount = argv[3]
33     splicefilecount =  argv[4]
34     outfilename = argv[5]
35
36     predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename)
37
38
39 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
40     hg = Genome(genome)
41
42     gidDict = getGeneData(uniquefilecount, splicefilecount)
43     gidList = gidDict.keys()
44     gidList.sort()
45
46     outfile = open(outfilename, "w")
47     for gid in gidList:
48         featureList = hg.getGeneFeatures((genome, gid))
49         featuresizesum, splicearea = getStats(featureList, splicelead)
50         fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
51         geneData = gidDict[gid]
52         uniqueCount = geneData.uniqueCount
53         expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
54
55         # this p-value is based on the observed unique count, not the expected total count
56         # nor the multi-read adjusted count
57         pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
58         symbol = geneData.symbol
59         spliceCount = geneData.spliceCount
60         print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
61         outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
62
63     outfile.close()
64
65
66 def getGeneData(uniquefilecount, splicefilecount):
67     gidDict = {}
68     uniquefile = open(uniquefilecount)
69     for line in uniquefile:
70         fields = line.strip().split()
71         geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
72         gidDict[fields[0]] = geneData
73
74     uniquefile.close()
75     splicefile = open(splicefilecount)
76     for line in splicefile:
77         fields = line.strip().split()
78         gidDict[fields[0]].spliceCount = int(fields[2])
79
80     splicefile.close()
81
82     return gidDict
83
84
85 def getStats(featureList, splicelead):
86     newfeatureList = []
87     featuresizesum = 0
88     for (ftype, chrom, start, stop, sense) in featureList:
89         if (start, stop) not in newfeatureList:
90             newfeatureList.append((start, stop))
91             featuresizesum += stop - start + 1
92
93     if featuresizesum < 1:
94         featuresizesum = 1
95
96     splicearea = (len(newfeatureList) - 1) * splicelead
97     if splicearea < splicelead:
98         splicearea = 0
99
100     return featuresizesum, splicearea
101
102
103 if __name__ == "__main__":
104     main(sys.argv)