--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# get Gene Enrichment Information and distribution
+
+import cistematic.core
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.cisstat.helper import hyperGeometric
+from math import sqrt
+import string
+
+
+def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10):
+ """ calculates Data enrichment (and depletion) statistics for a given list of
+ geneIDs (assumed to be all from the same genome) using either the
+ hypergeometric distribution. Specific geneID's can be excluded from the
+ calculations if included in excludeIDList.
+
+ dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s.
+
+ Results are saved in files with the given fileprefix. Conditions that are larger
+ than minPossible genes and that have pvalues < sigLevel / (# conditions)
+ are reported in fileprefix.sig, whereas genes that matches condition are
+ listed in fileprefix.ZZZZZZZ where the Z's are the condition.
+ """
+ print "calculateEnrichmentStats: %s" % fileprefix
+ statfile = open("%s.stat" % fileprefix, "w")
+ zscorefile = open("%s.zscore" % fileprefix, "w")
+ sigfile = open("%s.sig" % fileprefix, "w")
+ if len(geneIDList) < 2:
+ print "Need at the very least two genes!"
+ return
+
+ firstgid = geneIDList[0]
+ genome = firstgid[0]
+ cistematic.core.cacheGeneDB(genome)
+ idb = geneinfoDB(cache=True)
+ dataBin = {}
+ dataInfo = {}
+ dataPossible = {}
+ found = {}
+ locusList = []
+ excludeLocusList = []
+ excludeDataBin = {}
+ zList = []
+ sigList = []
+ print "len(dataDict) = %d" % len(dataDict)
+ print "building dataInfo"
+ for condition in dataDict:
+ dataBin[condition] = 0
+ dataPossible[condition] = len(dataDict[condition])
+ for geneID in dataDict[condition]:
+ if geneID not in dataInfo:
+ dataInfo[geneID] = []
+
+ dataInfo[geneID].append(condition)
+
+ for entry in geneIDList:
+ if entry not in excludeIDList and entry not in locusList:
+ locusList.append(entry)
+
+ for (genome, gID) in excludeIDList:
+ if gID not in excludeLocusList:
+ excludeLocusList.append(gID)
+ try:
+ excludeDataTerms = dataInfo[(genome, gID)]
+ except:
+ continue
+
+ for condition in excludeDataTerms:
+ if condition not in excludeDataBin:
+ excludeDataBin[condition] = 0
+
+ excludeDataBin[condition] += 1
+
+ print "Getting condition list for locusList"
+ for locus in locusList:
+ try:
+ locusDataTerms = dataInfo[locus]
+ except:
+ continue
+
+ for condition in locusDataTerms:
+ if condition not in found:
+ found[condition] = []
+
+ if locus not in found[condition]:
+ found[condition].append(locus)
+ dataBin[condition] += 1
+
+ numGenes = len(locusList)
+ dataSize = {}
+ print "Arranging by size"
+ print "numGenes = %s" % str(numGenes)
+ for condition in dataBin:
+ dataLen = dataBin[condition]
+ if dataLen not in dataSize:
+ dataSize[dataLen] = []
+
+ dataSize[dataLen].append(condition)
+
+ dataLengths = dataSize.keys()
+ dataLengths.sort()
+ dataLengths.reverse()
+ dataList = []
+ for dataLen in dataLengths:
+ for condition in dataSize[dataLen]:
+ dataList.append(condition)
+
+ allGIDs = []
+ try:
+ theGIDs = cistematic.core.getGenomeGeneIDs(genome)
+ except:
+ print "could not get gene entries for %s" % genome
+
+ for aGID in theGIDs:
+ if aGID not in excludeLocusList:
+ allGIDs.append(aGID)
+
+ datakeys = dataBin.keys()
+ mean = {}
+ standardDev = {}
+ N = float(len(allGIDs))
+ for condition in datakeys:
+ possible = dataPossible[condition]
+ if condition in excludeDataBin:
+ possible -= excludeDataBin[condition]
+
+ mean[condition] = (numGenes * possible) / N
+ standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
+
+ print "Writing out .stat"
+ for condition in dataList:
+ count = dataBin[condition]
+ possible = dataPossible[condition]
+ if condition in excludeDataBin:
+ possible -= excludeDataBin[condition]
+
+ try:
+ divisor = float(standardDev[condition])
+ if divisor > 0:
+ zscore = float(count - mean[condition]) / float(standardDev[condition])
+ else:
+ zscore = 0.0
+
+ if possible > 0:
+ percentage = float(count) * 100.0 / float(possible)
+ else:
+ percentage = 0
+
+ pval = 1.0
+ zscore = count - mean[condition]
+ pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
+ status = "enriched"
+ if (count - mean[condition]) < 0:
+ status = "depleted"
+
+ statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
+ zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
+ if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
+ sigList.append((pval, (condition, count, possible, percentage, pval, status)))
+ except:
+ statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))
+
+ statfile.close()
+ print "writing zscore"
+ zList.sort()
+ zList.reverse()
+ for (zscore, entry) in zList:
+ zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)
+
+ zscorefile.close()
+ print "writing sig"
+ sigList.sort()
+ annotCache = {}
+ for (pval, entry) in sigList:
+ condition = entry[0].replace(" ", "_")
+ sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
+ conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w")
+ if entry[0] not in found:
+ print "could not find %s" % entry[0]
+ continue
+
+ for gID in found[entry[0]]:
+ geneSymbol = ""
+ outsyn = ""
+ outdesc = ""
+ geneInfo = ""
+ if gID in annotCache:
+ (geneSymbol, outsyn, outdesc) = annotCache[gID]
+ else:
+ try:
+ geneInfo = idb.getGeneInfo(gID)
+ geneSymbol = geneInfo[0]
+ synonyms = idb.geneIDSynonyms(gID)
+ description = idb.getDescription(gID)
+ outsyn = string.join(synonyms)
+ outdesc = string.join(description, ";")
+ except:
+ geneSymbol = str(gID)
+ try:
+ if len(gID) == 2:
+ newGeneID = idb.getGeneID(gID[0], gID[1])
+ if len(newGeneID) > 1:
+ geneInfo = idb.getGeneInfo(newGeneID)
+
+ if len(geneInfo) > 0:
+ geneSymbol = gID[1]
+ synonyms = idb.geneIDSynonyms(newGeneID)
+ description = idb.getDescription(newGeneID)
+ outsyn = string.join(synonyms)
+ outdesc = string.join(description, ";")
+ except:
+ pass
+
+ conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
+ if gID not in annotCache:
+ annotCache[gID] = (geneSymbol, outsyn, outdesc)
+
+ conditionsigfile.close()
+
+ sigfile.close()
+ cistematic.core.uncacheGeneDB(genome)
\ No newline at end of file