--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# get Gene Ontology Information and distribution
+
+import cistematic.core
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.cisstat.score import pvalue
+from cistematic.cisstat.helper import hyperGeometric
+from math import sqrt
+from random import shuffle
+import string
+
+def calculateGOStats(geneIDList, fileprefix, excludeIDList = [], roundsRandomization=0, sigLevel = 0.01):
+ """ calculates GO enrichment (and depletion) statistics for a given list of
+ geneIDs (assumed to be all from the same genome) using either the
+ hypergeometric distribution or random sampling for roundsRandomization
+ rounds if greater than 0. Specific geneID's can be excluded from the
+ calculations if included in excludeIDList.
+
+ Results are saved in files with the given fileprefix. GO Terms that are
+ larger than 15 genes and that have pvalues < sigLevel / (#num GO categories)
+ are reported in fileprefix.gosig, whereas genes that matches the GO term are
+ listed in fileprefix.ZZZZZZZ where the Z's are the GOID.
+ """
+ print "calculateGOStats: %s" % fileprefix
+ if roundsRandomization > 0:
+ goexpfile = open("%s.goexp" % fileprefix, "w")
+ gostatfile = open("%s.gostat" % fileprefix, "w")
+ gozscorefile = open("%s.gozscore" % fileprefix, "w")
+ gosigfile = open("%s.gosig" % fileprefix, "w")
+
+ if len(geneIDList) < 1:
+ print "Need at the very least one gene!"
+ return
+
+ firstgid = geneIDList[0]
+ genome = firstgid[0]
+ cistematic.core.cacheGeneDB(genome)
+ idb = geneinfoDB(cache=True)
+ goBin = {}
+ goPossible = {}
+ found = {}
+ locusList = []
+ excludeLocusList = []
+ excludeGOBin = {}
+ zList = []
+ sigList = []
+ print "Getting GO list"
+ goDesc = cistematic.core.allGOTerms(genome)
+ goInfo = cistematic.core.getAllGOInfo(genome)
+ print "len(goDesc) = %d" % len(goDesc)
+ for GOID in goDesc:
+ goBin[GOID] = 0
+ goPossible[GOID] = cistematic.core.getGOIDCount(genome, GOID)
+
+ for entry in geneIDList:
+ if entry not in excludeIDList and entry not in locusList:
+ locusList.append(entry)
+
+ for (genome, gID) in excludeIDList:
+ if gID not in excludeLocusList:
+ excludeLocusList.append(gID)
+ try:
+ excludeGOTerms = goInfo[(genome, gID)]
+ except:
+ continue
+
+ for entry in excludeGOTerms:
+ excludeGOTermsFields = entry.split("\t")
+ GOID = excludeGOTermsFields[0]
+ if GOID not in excludeGOBin:
+ excludeGOBin[GOID] = 0
+
+ excludeGOBin[GOID] += 1
+
+ print "Getting GO list for locusList"
+ for locus in locusList:
+ try:
+ locusGOTerms = goInfo[locus]
+ except:
+ continue
+
+ for entry in locusGOTerms:
+ locusGOTermsFields = entry.split("\t")
+ GOID = locusGOTermsFields[0]
+ if GOID not in found:
+ found[GOID] = []
+
+ if locus not in found[GOID]:
+ found[GOID].append(locus)
+ goBin[GOID] += 1
+
+ numGenes = len(locusList)
+ goSize = {}
+ print "Arranging by size"
+ print "numGenes = %s" % str(numGenes)
+ for GOID in goBin:
+ goLen = goBin[GOID]
+ if goLen not in goSize:
+ goSize[goLen] = []
+
+ goSize[goLen].append(GOID)
+
+ goLengths = goSize.keys()
+ goLengths.sort()
+ goLengths.reverse()
+ goList = []
+ for goLen in goLengths:
+ for GOID in goSize[goLen]:
+ goList.append(GOID)
+
+ allGIDs = []
+ try:
+ theGIDs = cistematic.core.getGenomeGeneIDs(genome)
+ except:
+ print "could not get gene entries for %s" % genome
+
+ for aGID in theGIDs:
+ if aGID not in excludeLocusList:
+ allGIDs.append(aGID)
+
+ gokeys = goBin.keys()
+ mean = {}
+ standardDev = {}
+ if roundsRandomization > 0:
+ print "Get Random sampling"
+ sample = {}
+ for sampleNum in range(roundsRandomization):
+ print "Round %d" % (sampleNum + 1)
+ sample[sampleNum] = {}
+ for GOID in gokeys:
+ sample[sampleNum][GOID] = 0
+
+ shuffle(allGIDs)
+ sampleGenes = allGIDs[:numGenes]
+ for gid in sampleGenes:
+ try:
+ goarray = goInfo[(genome, gid)]
+ except:
+ continue
+
+ for entry in goarray:
+ goarrayFields = entry.split("\t")
+ GOID = goarrayFields[0]
+ if GOID in gokeys:
+ sample[sampleNum][GOID] += 1
+
+ print "Calculating stats"
+ for GOID in gokeys:
+ mean[GOID] = 0
+ sumofsquares = 0
+ for sampleNum in range(roundsRandomization):
+ mean[GOID] += sample[sampleNum][GOID]
+
+ mean[GOID] = float(mean[GOID]) / float(roundsRandomization)
+ for sampleNum in range(roundsRandomization):
+ sumofsquares += (sample[sampleNum][GOID] - mean[GOID]) * (sample[sampleNum][GOID] - mean[GOID])
+
+ standardDev[GOID] = sqrt(sumofsquares / float(roundsRandomization - 1))
+ goexpfile.write("%s\t%f\t%f\n" % (GOID, mean[GOID], standardDev[GOID]))
+ else:
+ # Use hypergeometric
+ N = float(len(allGIDs))
+ for GOID in gokeys:
+ possible = goPossible[GOID]
+ if GOID in excludeGOBin:
+ possible -= excludeGOBin[GOID]
+
+ mean[GOID] = (numGenes * possible) / N
+ standardDev[GOID] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
+
+ print "Writing out gostat"
+ for GOID in goList:
+ count = goBin[GOID]
+ possible = goPossible[GOID]
+ if GOID in excludeGOBin:
+ possible -= excludeGOBin[GOID]
+
+ try:
+ divisor = float(standardDev[GOID])
+ if divisor > 0:
+ zscore = float(count - mean[GOID]) / float(standardDev[GOID])
+ else:
+ zscore = 0.0
+
+ if possible > 0:
+ percentage = float(count) * 100.0 / float(possible)
+ else:
+ percentage = 0
+
+ pval = 1.0
+ if roundsRandomization > 0:
+ if zscore < 0.0:
+ pval = pvalue(-1.0 * zscore)
+ else:
+ pval = pvalue(zscore)
+ else:
+ zscore = count - mean[GOID]
+ pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
+
+ status = "enriched"
+ if (count - mean[GOID]) < 0:
+ status = "depleted"
+
+ gostatfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % (GOID, count, possible, percentage, zscore, pval, goDesc[GOID]))
+ zList.append((zscore, (GOID, count, possible, percentage, zscore, pval, goDesc[GOID])))
+ if len(goList) > 0 and pval < (sigLevel / float(len(goList))) and possible > 15 and count > 1:
+ sigList.append((pval, (GOID, count, possible, percentage, pval, status, goDesc[GOID])))
+ except:
+ gostatfile.write("%s\t%d out of %d\t \t%s\n" % (GOID, count, possible, goDesc[GOID]))
+
+ gostatfile.close()
+ print "writing gozscore"
+ zList.sort()
+ zList.reverse()
+ for (zscore, entry) in zList:
+ gozscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % entry)
+
+ gozscorefile.close()
+ print "writing gosig"
+ sigList.sort()
+ annotCache = {}
+ for (pval, entry) in sigList:
+ GOID = entry[0]
+ gosigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\t%s\n" % entry)
+ goidsigfile = open("%s.%s" % (fileprefix, GOID[3:]), "w")
+ goidsigfileList = []
+ if GOID not in found:
+ print "could not find %s" % GOID
+ continue
+
+ print "%s\t%d genes" % (GOID, len(found[GOID]))
+ for gID in found[GOID]:
+ geneSymbol = ""
+ outsyn = ""
+ outdesc = ""
+ geneInfo = ""
+ if gID in annotCache:
+ (geneSymbol, outsyn, outdesc) = annotCache[gID]
+ else:
+ try:
+ geneInfo = idb.getGeneInfo(gID)
+ geneSymbol = geneInfo[0]
+ synonyms = idb.geneIDSynonyms(gID)
+ description = idb.getDescription(gID)
+ outsyn = string.join(synonyms)
+ outdesc = string.join(description, ";")
+ except:
+ geneSymbol = str(gID)
+ try:
+ if len(gID) == 2:
+ newGeneID = idb.getGeneID(gID[0], gID[1])
+ if len(newGeneID) > 1:
+ geneInfo = idb.getGeneInfo(newGeneID)
+
+ if len(geneInfo) > 0:
+ geneSymbol = gID[1]
+ synonyms = idb.geneIDSynonyms(newGeneID)
+ description = idb.getDescription(newGeneID)
+ outsyn = string.join(synonyms)
+ outdesc = string.join(description, ";")
+ except:
+ pass
+
+ goidsigfileList.append("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
+ if gID not in annotCache:
+ annotCache[gID] = (geneSymbol, outsyn, outdesc)
+
+ goidsigfileList.sort()
+ for line in goidsigfileList:
+ goidsigfile.write(line)
+
+ goidsigfile.close()
+
+ gosigfile.close()
+ cistematic.core.uncacheGeneDB(genome)
\ No newline at end of file