1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # get Gene Ontology Information and distribution
32 import cistematic.core
33 from cistematic.core.geneinfo import geneinfoDB
34 from cistematic.cisstat.score import pvalue
35 from cistematic.cisstat.helper import hyperGeometric
37 from random import shuffle
40 def calculateGOStats(geneIDList, fileprefix, excludeIDList = [], roundsRandomization=0, sigLevel = 0.01):
41 """ calculates GO enrichment (and depletion) statistics for a given list of
42 geneIDs (assumed to be all from the same genome) using either the
43 hypergeometric distribution or random sampling for roundsRandomization
44 rounds if greater than 0. Specific geneID's can be excluded from the
45 calculations if included in excludeIDList.
47 Results are saved in files with the given fileprefix. GO Terms that are
48 larger than 15 genes and that have pvalues < sigLevel / (#num GO categories)
49 are reported in fileprefix.gosig, whereas genes that matches the GO term are
50 listed in fileprefix.ZZZZZZZ where the Z's are the GOID.
52 print "calculateGOStats: %s" % fileprefix
53 if roundsRandomization > 0:
54 goexpfile = open("%s.goexp" % fileprefix, "w")
55 gostatfile = open("%s.gostat" % fileprefix, "w")
56 gozscorefile = open("%s.gozscore" % fileprefix, "w")
57 gosigfile = open("%s.gosig" % fileprefix, "w")
59 if len(geneIDList) < 1:
60 print "Need at the very least one gene!"
63 firstgid = geneIDList[0]
65 cistematic.core.cacheGeneDB(genome)
66 idb = geneinfoDB(cache=True)
75 print "Getting GO list"
76 goDesc = cistematic.core.allGOTerms(genome)
77 goInfo = cistematic.core.getAllGOInfo(genome)
78 print "len(goDesc) = %d" % len(goDesc)
81 goPossible[GOID] = cistematic.core.getGOIDCount(genome, GOID)
83 for entry in geneIDList:
84 if entry not in excludeIDList and entry not in locusList:
85 locusList.append(entry)
87 for (genome, gID) in excludeIDList:
88 if gID not in excludeLocusList:
89 excludeLocusList.append(gID)
91 excludeGOTerms = goInfo[(genome, gID)]
95 for entry in excludeGOTerms:
96 excludeGOTermsFields = entry.split("\t")
97 GOID = excludeGOTermsFields[0]
98 if GOID not in excludeGOBin:
99 excludeGOBin[GOID] = 0
101 excludeGOBin[GOID] += 1
103 print "Getting GO list for locusList"
104 for locus in locusList:
106 locusGOTerms = goInfo[locus]
110 for entry in locusGOTerms:
111 locusGOTermsFields = entry.split("\t")
112 GOID = locusGOTermsFields[0]
113 if GOID not in found:
116 if locus not in found[GOID]:
117 found[GOID].append(locus)
120 numGenes = len(locusList)
122 print "Arranging by size"
123 print "numGenes = %s" % str(numGenes)
126 if goLen not in goSize:
129 goSize[goLen].append(GOID)
131 goLengths = goSize.keys()
135 for goLen in goLengths:
136 for GOID in goSize[goLen]:
141 theGIDs = cistematic.core.getGenomeGeneIDs(genome)
143 print "could not get gene entries for %s" % genome
146 if aGID not in excludeLocusList:
149 gokeys = goBin.keys()
152 if roundsRandomization > 0:
153 print "Get Random sampling"
155 for sampleNum in range(roundsRandomization):
156 print "Round %d" % (sampleNum + 1)
157 sample[sampleNum] = {}
159 sample[sampleNum][GOID] = 0
162 sampleGenes = allGIDs[:numGenes]
163 for gid in sampleGenes:
165 goarray = goInfo[(genome, gid)]
169 for entry in goarray:
170 goarrayFields = entry.split("\t")
171 GOID = goarrayFields[0]
173 sample[sampleNum][GOID] += 1
175 print "Calculating stats"
179 for sampleNum in range(roundsRandomization):
180 mean[GOID] += sample[sampleNum][GOID]
182 mean[GOID] = float(mean[GOID]) / float(roundsRandomization)
183 for sampleNum in range(roundsRandomization):
184 sumofsquares += (sample[sampleNum][GOID] - mean[GOID]) * (sample[sampleNum][GOID] - mean[GOID])
186 standardDev[GOID] = sqrt(sumofsquares / float(roundsRandomization - 1))
187 goexpfile.write("%s\t%f\t%f\n" % (GOID, mean[GOID], standardDev[GOID]))
190 N = float(len(allGIDs))
192 possible = goPossible[GOID]
193 if GOID in excludeGOBin:
194 possible -= excludeGOBin[GOID]
196 mean[GOID] = (numGenes * possible) / N
197 standardDev[GOID] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
199 print "Writing out gostat"
202 possible = goPossible[GOID]
203 if GOID in excludeGOBin:
204 possible -= excludeGOBin[GOID]
207 divisor = float(standardDev[GOID])
209 zscore = float(count - mean[GOID]) / float(standardDev[GOID])
214 percentage = float(count) * 100.0 / float(possible)
219 if roundsRandomization > 0:
221 pval = pvalue(-1.0 * zscore)
223 pval = pvalue(zscore)
225 zscore = count - mean[GOID]
226 pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
229 if (count - mean[GOID]) < 0:
232 gostatfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % (GOID, count, possible, percentage, zscore, pval, goDesc[GOID]))
233 zList.append((zscore, (GOID, count, possible, percentage, zscore, pval, goDesc[GOID])))
234 if len(goList) > 0 and pval < (sigLevel / float(len(goList))) and possible > 15 and count > 1:
235 sigList.append((pval, (GOID, count, possible, percentage, pval, status, goDesc[GOID])))
237 gostatfile.write("%s\t%d out of %d\t \t%s\n" % (GOID, count, possible, goDesc[GOID]))
240 print "writing gozscore"
243 for (zscore, entry) in zList:
244 gozscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % entry)
247 print "writing gosig"
250 for (pval, entry) in sigList:
252 gosigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\t%s\n" % entry)
253 goidsigfile = open("%s.%s" % (fileprefix, GOID[3:]), "w")
255 if GOID not in found:
256 print "could not find %s" % GOID
259 print "%s\t%d genes" % (GOID, len(found[GOID]))
260 for gID in found[GOID]:
265 if gID in annotCache:
266 (geneSymbol, outsyn, outdesc) = annotCache[gID]
269 geneInfo = idb.getGeneInfo(gID)
270 geneSymbol = geneInfo[0]
271 synonyms = idb.geneIDSynonyms(gID)
272 description = idb.getDescription(gID)
273 outsyn = string.join(synonyms)
274 outdesc = string.join(description, ";")
276 geneSymbol = str(gID)
279 newGeneID = idb.getGeneID(gID[0], gID[1])
280 if len(newGeneID) > 1:
281 geneInfo = idb.getGeneInfo(newGeneID)
283 if len(geneInfo) > 0:
285 synonyms = idb.geneIDSynonyms(newGeneID)
286 description = idb.getDescription(newGeneID)
287 outsyn = string.join(synonyms)
288 outdesc = string.join(description, ";")
292 goidsigfileList.append("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
293 if gID not in annotCache:
294 annotCache[gID] = (geneSymbol, outsyn, outdesc)
296 goidsigfileList.sort()
297 for line in goidsigfileList:
298 goidsigfile.write(line)
303 cistematic.core.uncacheGeneDB(genome)