Release version for Erange 4.0a
[erange.git] / predictSpliceCount.py
index bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07..2df39a9412ab6794a1c437bead4b476c482e3918 100755 (executable)
@@ -8,6 +8,14 @@ import sys
 from cistematic.genomes import Genome
 
 
+class GeneSymbolAndCount():
+
+    def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+        self.symbol = symbol
+        self.uniqueCount = uniqueCount
+        self.spliceCount = spliceCount
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -31,53 +39,65 @@ def main(argv=None):
 def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
     hg = Genome(genome)
 
-    gidDict = {}
-    gidList = []
-    uniqueCountDict = {}
-    spliceCountDict = {}
+    gidDict = getGeneData(uniquefilecount, splicefilecount)
+    gidList = gidDict.keys()
+    gidList.sort()
+
+    outfile = open(outfilename, "w")
+    for gid in gidList:
+        featureList = hg.getGeneFeatures((genome, gid))
+        featuresizesum, splicearea = getStats(featureList, splicelead)
+        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+        geneData = gidDict[gid]
+        uniqueCount = geneData.uniqueCount
+        expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+        # this p-value is based on the observed unique count, not the expected total count
+        # nor the multi-read adjusted count
+        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+        symbol = geneData.symbol
+        spliceCount = geneData.spliceCount
+        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
 
+    outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+    gidDict = {}
     uniquefile = open(uniquefilecount)
     for line in uniquefile:
         fields = line.strip().split()
-        gidDict[fields[0]] = fields[1]
-        gidList.append(fields[0])
-        uniqueCountDict[fields[0]] = int(fields[2])
+        geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+        gidDict[fields[0]] = geneData
 
+    uniquefile.close()
     splicefile = open(splicefilecount)
     for line in splicefile:
         fields = line.strip().split()
-        spliceCountDict[fields[0]] = int(fields[2])
+        gidDict[fields[0]].spliceCount = int(fields[2])
 
-    outfile = open(outfilename,'w')
+    splicefile.close()
 
-    gidList.sort()
-    for gid in gidList:
-        symbol = gidDict[gid]
-        featureList = hg.getGeneFeatures((genome, gid))
-        newfeatureList = []
-        featuresizesum = 0
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if (start, stop) not in newfeatureList:
-                newfeatureList.append((start, stop))
-                featuresizesum += stop - start + 1
+    return gidDict
 
-        if featuresizesum < 1:
-            featuresizesum = 1
 
-        splicearea = (len(newfeatureList) - 1) * splicelead
-        if splicearea < splicelead:
-            splicearea = 0
+def getStats(featureList, splicelead):
+    newfeatureList = []
+    featuresizesum = 0
+    for (ftype, chrom, start, stop, sense) in featureList:
+        if (start, stop) not in newfeatureList:
+            newfeatureList.append((start, stop))
+            featuresizesum += stop - start + 1
 
-        fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
-        expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+    if featuresizesum < 1:
+        featuresizesum = 1
 
-        # this p-value is based on the observed unique count, not the expected total count
-        # nor the multi-read adjusted count
-        pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
-        print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
-        outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+    splicearea = (len(newfeatureList) - 1) * splicelead
+    if splicearea < splicelead:
+        splicearea = 0
 
-    outfile.close()
+    return featuresizesum, splicearea
 
 
 if __name__ == "__main__":