X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=predictSpliceCount.py;fp=predictSpliceCount.py;h=2df39a9412ab6794a1c437bead4b476c482e3918;hp=bfd6e1c48b3d4a449ea03d8f0191f0abd63e9d07;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/predictSpliceCount.py b/predictSpliceCount.py index bfd6e1c..2df39a9 100755 --- a/predictSpliceCount.py +++ b/predictSpliceCount.py @@ -8,6 +8,14 @@ import sys from cistematic.genomes import Genome +class GeneSymbolAndCount(): + + def __init__(self, symbol="", uniqueCount=0, spliceCount=0): + self.symbol = symbol + self.uniqueCount = uniqueCount + self.spliceCount = spliceCount + + def main(argv=None): if not argv: argv = sys.argv @@ -31,53 +39,65 @@ def main(argv=None): def predictSpliceCount(genome, splicelead, uniquefilecount, splicefilecount, outfilename): hg = Genome(genome) - gidDict = {} - gidList = [] - uniqueCountDict = {} - spliceCountDict = {} + gidDict = getGeneData(uniquefilecount, splicefilecount) + gidList = gidDict.keys() + gidList.sort() + + outfile = open(outfilename, "w") + for gid in gidList: + featureList = hg.getGeneFeatures((genome, gid)) + featuresizesum, splicearea = getStats(featureList, splicelead) + fractionCoverage = featuresizesum / float(splicearea + featuresizesum) + geneData = gidDict[gid] + uniqueCount = geneData.uniqueCount + expectedSpliceCount = int(round(uniqueCount/fractionCoverage)) - uniqueCount + + # this p-value is based on the observed unique count, not the expected total count + # nor the multi-read adjusted count + pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCount) + symbol = geneData.symbol + spliceCount = geneData.spliceCount + print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount) + outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCount)) + outfile.close() + + +def getGeneData(uniquefilecount, splicefilecount): + gidDict = {} uniquefile = open(uniquefilecount) for line in uniquefile: fields = line.strip().split() - gidDict[fields[0]] = fields[1] - gidList.append(fields[0]) - uniqueCountDict[fields[0]] = int(fields[2]) + geneData = GeneSymbolAndCount(symbol=fields[1], uniqueCount=int(fields[2])) + gidDict[fields[0]] = geneData + uniquefile.close() splicefile = open(splicefilecount) for line in splicefile: fields = line.strip().split() - spliceCountDict[fields[0]] = int(fields[2]) + gidDict[fields[0]].spliceCount = int(fields[2]) - outfile = open(outfilename,'w') + splicefile.close() - gidList.sort() - for gid in gidList: - symbol = gidDict[gid] - featureList = hg.getGeneFeatures((genome, gid)) - newfeatureList = [] - featuresizesum = 0 - for (ftype, chrom, start, stop, sense) in featureList: - if (start, stop) not in newfeatureList: - newfeatureList.append((start, stop)) - featuresizesum += stop - start + 1 + return gidDict - if featuresizesum < 1: - featuresizesum = 1 - splicearea = (len(newfeatureList) - 1) * splicelead - if splicearea < splicelead: - splicearea = 0 +def getStats(featureList, splicelead): + newfeatureList = [] + featuresizesum = 0 + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + featuresizesum += stop - start + 1 - fractionCoverage = featuresizesum / float(splicearea + featuresizesum) - expectedSpliceCount = int(round(uniqueCountDict[gid]/fractionCoverage)) - uniqueCountDict[gid] + if featuresizesum < 1: + featuresizesum = 1 - # this p-value is based on the observed unique count, not the expected total count - # nor the multi-read adjusted count - pvalue = 1 - pow(1 - float(splicelead)/featuresizesum, uniqueCountDict[gid]) - print '%s %s %f %d %d' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid]) - outfile.write('%s\t%s\t%f\t%d\t%d\n' % (gid, symbol, pvalue, expectedSpliceCount, spliceCountDict[gid])) + splicearea = (len(newfeatureList) - 1) * splicelead + if splicearea < splicelead: + splicearea = 0 - outfile.close() + return featuresizesum, splicearea if __name__ == "__main__":