from cistematic.genomes import Genome
+class GeneSymbolAndCount():
+
+ def __init__(self, symbol="", uniqueCount=0, spliceCount=0):
+ self.symbol = symbol
+ self.uniqueCount = uniqueCount
+ self.spliceCount = spliceCount
+
+
def main(argv=None):
if not argv:
argv = sys.argv
def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename):
hg = Genome(genome)
- gidDict = {}
- gidList = []
- uniqueCountDict = {}
- spliceCountDict = {}
+ gidDict = getGeneData(uniquefilecount, splicefilecount)
+ gidList = gidDict.keys()
+ gidList.sort()
+
+ outfile = open(outfilename, "w")
+ for gid in gidList:
+ featureList = hg.getGeneFeatures((genome, gid))
+ featuresizesum, splicearea = getStats(featureList, splicelead)
+ fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
+ geneData = gidDict[gid]
+ uniqueCount = geneData.uniqueCount
+ expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount
+
+ # this p-value is based on the observed unique count, not the expected total count
+ # nor the multi-read adjusted count
+ pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount)
+ symbol = geneData.symbol
+ spliceCount = geneData.spliceCount
+ print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)
+ outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount))
+ outfile.close()
+
+
+def getGeneData(uniquefilecount, splicefilecount):
+ gidDict = {}
uniquefile = open(uniquefilecount)
for line in uniquefile:
fields = line.strip().split()
- gidDict[fields[0]] = fields[1]
- gidList.append(fields[0])
- uniqueCountDict[fields[0]] = int(fields[2])
+ geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2]))
+ gidDict[fields[0]] = geneData
+ uniquefile.close()
splicefile = open(splicefilecount)
for line in splicefile:
fields = line.strip().split()
- spliceCountDict[fields[0]] = int(fields[2])
+ gidDict[fields[0]].spliceCount = int(fields[2])
- outfile = open(outfilename,'w')
+ splicefile.close()
- gidList.sort()
- for gid in gidList:
- symbol = gidDict[gid]
- featureList = hg.getGeneFeatures((genome, gid))
- newfeatureList = []
- featuresizesum = 0
- for (ftype, chrom, start, stop, sense) in featureList:
- if (start, stop) not in newfeatureList:
- newfeatureList.append((start, stop))
- featuresizesum += stop - start + 1
+ return gidDict
- if featuresizesum < 1:
- featuresizesum = 1
- splicearea = (len(newfeatureList) - 1) * splicelead
- if splicearea < splicelead:
- splicearea = 0
+def getStats(featureList, splicelead):
+ newfeatureList = []
+ featuresizesum = 0
+ for (ftype, chrom, start, stop, sense) in featureList:
+ if (start, stop) not in newfeatureList:
+ newfeatureList.append((start, stop))
+ featuresizesum += stop - start + 1
- fractionCoverage = featuresizesum / float(splicearea + featuresizesum)
- expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid]
+ if featuresizesum < 1:
+ featuresizesum = 1
- # this p-value is based on the observed unique count, not the expected total count
- # nor the multi-read adjusted count
- pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid])
- print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])
- outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]))
+ splicearea = (len(newfeatureList) - 1) * splicelead
+ if splicearea < splicelead:
+ splicearea = 0
- outfile.close()
+ return featuresizesum, splicearea
if __name__ == "__main__":