erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / cisstat / enrichment.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # get Gene Enrichment Information and distribution 
31
32 import cistematic.core
33 from cistematic.core.geneinfo import geneinfoDB
34 from cistematic.cisstat.helper import hyperGeometric
35 from math import sqrt
36 import string
37
38
39 def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10):
40     """ calculates Data enrichment (and depletion) statistics for a given list of 
41         geneIDs (assumed to be all from the same genome) using either the 
42         hypergeometric distribution. Specific geneID's can be excluded from the 
43         calculations if included in excludeIDList. 
44  
45         dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s.
46
47         Results are saved in files with the given fileprefix. Conditions that are larger
48         than minPossible genes and that have pvalues < sigLevel / (# conditions) 
49         are reported in fileprefix.sig, whereas genes that matches condition are 
50         listed in fileprefix.ZZZZZZZ where the Z's are the condition.
51     """
52     print "calculateEnrichmentStats: %s" % fileprefix
53     statfile = open("%s.stat" % fileprefix, "w")
54     zscorefile = open("%s.zscore" % fileprefix, "w") 
55     sigfile = open("%s.sig" % fileprefix, "w")
56     if len(geneIDList) < 2:
57         print "Need at the very least two genes!"
58         return
59
60     firstgid = geneIDList[0]
61     genome = firstgid[0]
62     cistematic.core.cacheGeneDB(genome)
63     idb = geneinfoDB(cache=True)
64     dataBin = {}
65     dataInfo = {}
66     dataPossible = {}
67     found = {}
68     locusList = []
69     excludeLocusList = []
70     excludeDataBin = {}
71     zList = []
72     sigList = []
73     print "len(dataDict) = %d" % len(dataDict)
74     print "building dataInfo"
75     for condition in dataDict:
76         dataBin[condition] = 0
77         dataPossible[condition] = len(dataDict[condition])
78         for geneID in dataDict[condition]:
79             if geneID not in dataInfo:
80                 dataInfo[geneID] = []
81
82             dataInfo[geneID].append(condition)
83
84     for entry in geneIDList:
85         if entry not in excludeIDList and entry not in locusList:
86             locusList.append(entry)
87
88     for (genome, gID) in excludeIDList:
89         if gID not in excludeLocusList:
90             excludeLocusList.append(gID)
91             try:
92                 excludeDataTerms = dataInfo[(genome, gID)]
93             except:
94                 continue
95
96             for condition in excludeDataTerms:
97                 if condition not in excludeDataBin:
98                     excludeDataBin[condition] = 0
99
100                 excludeDataBin[condition] += 1
101
102     print "Getting condition list for locusList"
103     for locus in locusList:
104         try:
105             locusDataTerms = dataInfo[locus]
106         except:
107             continue
108
109         for condition in locusDataTerms:
110             if condition not in found:
111                 found[condition] = []
112
113             if locus not in found[condition]:
114                 found[condition].append(locus)
115                 dataBin[condition] += 1
116
117     numGenes = len(locusList)
118     dataSize = {}
119     print "Arranging by size"
120     print "numGenes = %s" % str(numGenes)
121     for condition in dataBin:
122         dataLen = dataBin[condition]
123         if dataLen not in dataSize:
124             dataSize[dataLen] = []
125
126         dataSize[dataLen].append(condition)
127
128     dataLengths = dataSize.keys()
129     dataLengths.sort()
130     dataLengths.reverse()
131     dataList = []
132     for dataLen in dataLengths:
133         for condition in dataSize[dataLen]:
134             dataList.append(condition)
135
136     allGIDs = []
137     try:
138         theGIDs = cistematic.core.getGenomeGeneIDs(genome)
139     except:
140         print "could not get gene entries for %s" % genome
141
142     for aGID in theGIDs:
143         if aGID not in excludeLocusList:
144             allGIDs.append(aGID)
145
146     datakeys = dataBin.keys()
147     mean = {}
148     standardDev = {}
149     N = float(len(allGIDs))
150     for condition in datakeys:
151         possible = dataPossible[condition]
152         if condition in excludeDataBin:
153             possible -= excludeDataBin[condition]
154
155         mean[condition] = (numGenes * possible) / N
156         standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
157
158     print "Writing out .stat"
159     for condition in dataList:
160         count = dataBin[condition]
161         possible = dataPossible[condition]
162         if condition in excludeDataBin:
163             possible -= excludeDataBin[condition]
164
165         try:
166             divisor = float(standardDev[condition])
167             if divisor > 0:
168                 zscore = float(count - mean[condition]) / float(standardDev[condition])
169             else:
170                 zscore = 0.0
171
172             if possible > 0:
173                 percentage = float(count) * 100.0 / float(possible)
174             else:
175                 percentage = 0
176
177             pval = 1.0
178             zscore = count - mean[condition]
179             pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
180             status = "enriched"
181             if (count - mean[condition]) < 0:
182                 status = "depleted"
183
184             statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
185             zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
186             if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
187                 sigList.append((pval, (condition, count, possible, percentage, pval, status)))
188         except:
189             statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))
190
191     statfile.close()
192     print "writing zscore"
193     zList.sort()
194     zList.reverse()
195     for (zscore, entry) in zList:
196         zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)
197
198     zscorefile.close()
199     print "writing sig"
200     sigList.sort()
201     annotCache = {}
202     for (pval, entry) in sigList:
203         condition = entry[0].replace(" ", "_")
204         sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
205         conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w")
206         if entry[0] not in found:
207             print "could not find %s" % entry[0]
208             continue
209
210         for gID in found[entry[0]]:
211             geneSymbol = ""
212             outsyn = ""
213             outdesc = ""
214             geneInfo = ""
215             if gID in annotCache:
216                 (geneSymbol, outsyn, outdesc) = annotCache[gID]
217             else:
218                 try:
219                     geneInfo = idb.getGeneInfo(gID)
220                     geneSymbol = geneInfo[0]
221                     synonyms = idb.geneIDSynonyms(gID)
222                     description = idb.getDescription(gID)
223                     outsyn = string.join(synonyms)
224                     outdesc = string.join(description, ";")
225                 except:
226                     geneSymbol = str(gID)
227                     try:
228                         if len(gID) == 2:
229                             newGeneID = idb.getGeneID(gID[0], gID[1])
230                             if len(newGeneID) > 1:
231                                 geneInfo = idb.getGeneInfo(newGeneID)
232
233                         if len(geneInfo) > 0:
234                             geneSymbol = gID[1]
235                             synonyms = idb.geneIDSynonyms(newGeneID)
236                             description = idb.getDescription(newGeneID)
237                             outsyn = string.join(synonyms)
238                             outdesc = string.join(description, ";")
239                     except:
240                         pass
241
242             conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
243             if gID not in annotCache:
244                 annotCache[gID] = (geneSymbol, outsyn, outdesc)
245
246         conditionsigfile.close()
247
248     sigfile.close()
249     cistematic.core.uncacheGeneDB(genome)