X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fcisstat%2Fanalyzego.py;fp=cistematic%2Fcisstat%2Fanalyzego.py;h=10ebe318fbf0de5263c4c810d5cb43f55ec7d2c8;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/cisstat/analyzego.py b/cistematic/cisstat/analyzego.py new file mode 100644 index 0000000..10ebe31 --- /dev/null +++ b/cistematic/cisstat/analyzego.py @@ -0,0 +1,303 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# get Gene Ontology Information and distribution + +import cistematic.core +from cistematic.core.geneinfo import geneinfoDB +from cistematic.cisstat.score import pvalue +from cistematic.cisstat.helper import hyperGeometric +from math import sqrt +from random import shuffle +import string + +def calculateGOStats(geneIDList, fileprefix, excludeIDList = [], roundsRandomization=0, sigLevel = 0.01): + """ calculates GO enrichment (and depletion) statistics for a given list of + geneIDs (assumed to be all from the same genome) using either the + hypergeometric distribution or random sampling for roundsRandomization + rounds if greater than 0. Specific geneID's can be excluded from the + calculations if included in excludeIDList. + + Results are saved in files with the given fileprefix. GO Terms that are + larger than 15 genes and that have pvalues < sigLevel / (#num GO categories) + are reported in fileprefix.gosig, whereas genes that matches the GO term are + listed in fileprefix.ZZZZZZZ where the Z's are the GOID. + """ + print "calculateGOStats: %s" % fileprefix + if roundsRandomization > 0: + goexpfile = open("%s.goexp" % fileprefix, "w") + gostatfile = open("%s.gostat" % fileprefix, "w") + gozscorefile = open("%s.gozscore" % fileprefix, "w") + gosigfile = open("%s.gosig" % fileprefix, "w") + + if len(geneIDList) < 1: + print "Need at the very least one gene!" + return + + firstgid = geneIDList[0] + genome = firstgid[0] + cistematic.core.cacheGeneDB(genome) + idb = geneinfoDB(cache=True) + goBin = {} + goPossible = {} + found = {} + locusList = [] + excludeLocusList = [] + excludeGOBin = {} + zList = [] + sigList = [] + print "Getting GO list" + goDesc = cistematic.core.allGOTerms(genome) + goInfo = cistematic.core.getAllGOInfo(genome) + print "len(goDesc) = %d" % len(goDesc) + for GOID in goDesc: + goBin[GOID] = 0 + goPossible[GOID] = cistematic.core.getGOIDCount(genome, GOID) + + for entry in geneIDList: + if entry not in excludeIDList and entry not in locusList: + locusList.append(entry) + + for (genome, gID) in excludeIDList: + if gID not in excludeLocusList: + excludeLocusList.append(gID) + try: + excludeGOTerms = goInfo[(genome, gID)] + except: + continue + + for entry in excludeGOTerms: + excludeGOTermsFields = entry.split("\t") + GOID = excludeGOTermsFields[0] + if GOID not in excludeGOBin: + excludeGOBin[GOID] = 0 + + excludeGOBin[GOID] += 1 + + print "Getting GO list for locusList" + for locus in locusList: + try: + locusGOTerms = goInfo[locus] + except: + continue + + for entry in locusGOTerms: + locusGOTermsFields = entry.split("\t") + GOID = locusGOTermsFields[0] + if GOID not in found: + found[GOID] = [] + + if locus not in found[GOID]: + found[GOID].append(locus) + goBin[GOID] += 1 + + numGenes = len(locusList) + goSize = {} + print "Arranging by size" + print "numGenes = %s" % str(numGenes) + for GOID in goBin: + goLen = goBin[GOID] + if goLen not in goSize: + goSize[goLen] = [] + + goSize[goLen].append(GOID) + + goLengths = goSize.keys() + goLengths.sort() + goLengths.reverse() + goList = [] + for goLen in goLengths: + for GOID in goSize[goLen]: + goList.append(GOID) + + allGIDs = [] + try: + theGIDs = cistematic.core.getGenomeGeneIDs(genome) + except: + print "could not get gene entries for %s" % genome + + for aGID in theGIDs: + if aGID not in excludeLocusList: + allGIDs.append(aGID) + + gokeys = goBin.keys() + mean = {} + standardDev = {} + if roundsRandomization > 0: + print "Get Random sampling" + sample = {} + for sampleNum in range(roundsRandomization): + print "Round %d" % (sampleNum + 1) + sample[sampleNum] = {} + for GOID in gokeys: + sample[sampleNum][GOID] = 0 + + shuffle(allGIDs) + sampleGenes = allGIDs[:numGenes] + for gid in sampleGenes: + try: + goarray = goInfo[(genome, gid)] + except: + continue + + for entry in goarray: + goarrayFields = entry.split("\t") + GOID = goarrayFields[0] + if GOID in gokeys: + sample[sampleNum][GOID] += 1 + + print "Calculating stats" + for GOID in gokeys: + mean[GOID] = 0 + sumofsquares = 0 + for sampleNum in range(roundsRandomization): + mean[GOID] += sample[sampleNum][GOID] + + mean[GOID] = float(mean[GOID]) / float(roundsRandomization) + for sampleNum in range(roundsRandomization): + sumofsquares += (sample[sampleNum][GOID] - mean[GOID]) * (sample[sampleNum][GOID] - mean[GOID]) + + standardDev[GOID] = sqrt(sumofsquares / float(roundsRandomization - 1)) + goexpfile.write("%s\t%f\t%f\n" % (GOID, mean[GOID], standardDev[GOID])) + else: + # Use hypergeometric + N = float(len(allGIDs)) + for GOID in gokeys: + possible = goPossible[GOID] + if GOID in excludeGOBin: + possible -= excludeGOBin[GOID] + + mean[GOID] = (numGenes * possible) / N + standardDev[GOID] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0)) + + print "Writing out gostat" + for GOID in goList: + count = goBin[GOID] + possible = goPossible[GOID] + if GOID in excludeGOBin: + possible -= excludeGOBin[GOID] + + try: + divisor = float(standardDev[GOID]) + if divisor > 0: + zscore = float(count - mean[GOID]) / float(standardDev[GOID]) + else: + zscore = 0.0 + + if possible > 0: + percentage = float(count) * 100.0 / float(possible) + else: + percentage = 0 + + pval = 1.0 + if roundsRandomization > 0: + if zscore < 0.0: + pval = pvalue(-1.0 * zscore) + else: + pval = pvalue(zscore) + else: + zscore = count - mean[GOID] + pval = hyperGeometric(len(allGIDs), possible, numGenes, count) + + status = "enriched" + if (count - mean[GOID]) < 0: + status = "depleted" + + gostatfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % (GOID, count, possible, percentage, zscore, pval, goDesc[GOID])) + zList.append((zscore, (GOID, count, possible, percentage, zscore, pval, goDesc[GOID]))) + if len(goList) > 0 and pval < (sigLevel / float(len(goList))) and possible > 15 and count > 1: + sigList.append((pval, (GOID, count, possible, percentage, pval, status, goDesc[GOID]))) + except: + gostatfile.write("%s\t%d out of %d\t \t%s\n" % (GOID, count, possible, goDesc[GOID])) + + gostatfile.close() + print "writing gozscore" + zList.sort() + zList.reverse() + for (zscore, entry) in zList: + gozscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % entry) + + gozscorefile.close() + print "writing gosig" + sigList.sort() + annotCache = {} + for (pval, entry) in sigList: + GOID = entry[0] + gosigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\t%s\n" % entry) + goidsigfile = open("%s.%s" % (fileprefix, GOID[3:]), "w") + goidsigfileList = [] + if GOID not in found: + print "could not find %s" % GOID + continue + + print "%s\t%d genes" % (GOID, len(found[GOID])) + for gID in found[GOID]: + geneSymbol = "" + outsyn = "" + outdesc = "" + geneInfo = "" + if gID in annotCache: + (geneSymbol, outsyn, outdesc) = annotCache[gID] + else: + try: + geneInfo = idb.getGeneInfo(gID) + geneSymbol = geneInfo[0] + synonyms = idb.geneIDSynonyms(gID) + description = idb.getDescription(gID) + outsyn = string.join(synonyms) + outdesc = string.join(description, ";") + except: + geneSymbol = str(gID) + try: + if len(gID) == 2: + newGeneID = idb.getGeneID(gID[0], gID[1]) + if len(newGeneID) > 1: + geneInfo = idb.getGeneInfo(newGeneID) + + if len(geneInfo) > 0: + geneSymbol = gID[1] + synonyms = idb.geneIDSynonyms(newGeneID) + description = idb.getDescription(newGeneID) + outsyn = string.join(synonyms) + outdesc = string.join(description, ";") + except: + pass + + goidsigfileList.append("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc)) + if gID not in annotCache: + annotCache[gID] = (geneSymbol, outsyn, outdesc) + + goidsigfileList.sort() + for line in goidsigfileList: + goidsigfile.write(line) + + goidsigfile.close() + + gosigfile.close() + cistematic.core.uncacheGeneDB(genome) \ No newline at end of file