1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # get Gene Enrichment Information and distribution
32 import cistematic.core
33 from cistematic.core.geneinfo import geneinfoDB
34 from cistematic.cisstat.helper import hyperGeometric
39 def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10):
40 """ calculates Data enrichment (and depletion) statistics for a given list of
41 geneIDs (assumed to be all from the same genome) using either the
42 hypergeometric distribution. Specific geneID's can be excluded from the
43 calculations if included in excludeIDList.
45 dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s.
47 Results are saved in files with the given fileprefix. Conditions that are larger
48 than minPossible genes and that have pvalues < sigLevel / (# conditions)
49 are reported in fileprefix.sig, whereas genes that matches condition are
50 listed in fileprefix.ZZZZZZZ where the Z's are the condition.
52 print "calculateEnrichmentStats: %s" % fileprefix
53 statfile = open("%s.stat" % fileprefix, "w")
54 zscorefile = open("%s.zscore" % fileprefix, "w")
55 sigfile = open("%s.sig" % fileprefix, "w")
56 if len(geneIDList) < 2:
57 print "Need at the very least two genes!"
60 firstgid = geneIDList[0]
62 cistematic.core.cacheGeneDB(genome)
63 idb = geneinfoDB(cache=True)
73 print "len(dataDict) = %d" % len(dataDict)
74 print "building dataInfo"
75 for condition in dataDict:
76 dataBin[condition] = 0
77 dataPossible[condition] = len(dataDict[condition])
78 for geneID in dataDict[condition]:
79 if geneID not in dataInfo:
82 dataInfo[geneID].append(condition)
84 for entry in geneIDList:
85 if entry not in excludeIDList and entry not in locusList:
86 locusList.append(entry)
88 for (genome, gID) in excludeIDList:
89 if gID not in excludeLocusList:
90 excludeLocusList.append(gID)
92 excludeDataTerms = dataInfo[(genome, gID)]
96 for condition in excludeDataTerms:
97 if condition not in excludeDataBin:
98 excludeDataBin[condition] = 0
100 excludeDataBin[condition] += 1
102 print "Getting condition list for locusList"
103 for locus in locusList:
105 locusDataTerms = dataInfo[locus]
109 for condition in locusDataTerms:
110 if condition not in found:
111 found[condition] = []
113 if locus not in found[condition]:
114 found[condition].append(locus)
115 dataBin[condition] += 1
117 numGenes = len(locusList)
119 print "Arranging by size"
120 print "numGenes = %s" % str(numGenes)
121 for condition in dataBin:
122 dataLen = dataBin[condition]
123 if dataLen not in dataSize:
124 dataSize[dataLen] = []
126 dataSize[dataLen].append(condition)
128 dataLengths = dataSize.keys()
130 dataLengths.reverse()
132 for dataLen in dataLengths:
133 for condition in dataSize[dataLen]:
134 dataList.append(condition)
138 theGIDs = cistematic.core.getGenomeGeneIDs(genome)
140 print "could not get gene entries for %s" % genome
143 if aGID not in excludeLocusList:
146 datakeys = dataBin.keys()
149 N = float(len(allGIDs))
150 for condition in datakeys:
151 possible = dataPossible[condition]
152 if condition in excludeDataBin:
153 possible -= excludeDataBin[condition]
155 mean[condition] = (numGenes * possible) / N
156 standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
158 print "Writing out .stat"
159 for condition in dataList:
160 count = dataBin[condition]
161 possible = dataPossible[condition]
162 if condition in excludeDataBin:
163 possible -= excludeDataBin[condition]
166 divisor = float(standardDev[condition])
168 zscore = float(count - mean[condition]) / float(standardDev[condition])
173 percentage = float(count) * 100.0 / float(possible)
178 zscore = count - mean[condition]
179 pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
181 if (count - mean[condition]) < 0:
184 statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
185 zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
186 if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
187 sigList.append((pval, (condition, count, possible, percentage, pval, status)))
189 statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))
192 print "writing zscore"
195 for (zscore, entry) in zList:
196 zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)
202 for (pval, entry) in sigList:
203 condition = entry[0].replace(" ", "_")
204 sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
205 conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w")
206 if entry[0] not in found:
207 print "could not find %s" % entry[0]
210 for gID in found[entry[0]]:
215 if gID in annotCache:
216 (geneSymbol, outsyn, outdesc) = annotCache[gID]
219 geneInfo = idb.getGeneInfo(gID)
220 geneSymbol = geneInfo[0]
221 synonyms = idb.geneIDSynonyms(gID)
222 description = idb.getDescription(gID)
223 outsyn = string.join(synonyms)
224 outdesc = string.join(description, ";")
226 geneSymbol = str(gID)
229 newGeneID = idb.getGeneID(gID[0], gID[1])
230 if len(newGeneID) > 1:
231 geneInfo = idb.getGeneInfo(newGeneID)
233 if len(geneInfo) > 0:
235 synonyms = idb.geneIDSynonyms(newGeneID)
236 description = idb.getDescription(newGeneID)
237 outsyn = string.join(synonyms)
238 outdesc = string.join(description, ";")
242 conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
243 if gID not in annotCache:
244 annotCache[gID] = (geneSymbol, outsyn, outdesc)
246 conditionsigfile.close()
249 cistematic.core.uncacheGeneDB(genome)