X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fcisstat%2Fenrichment.py;fp=cistematic%2Fcisstat%2Fenrichment.py;h=b2fec462c73d13cdd9d8c75de828873cea6b56b8;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/cisstat/enrichment.py b/cistematic/cisstat/enrichment.py new file mode 100644 index 0000000..b2fec46 --- /dev/null +++ b/cistematic/cisstat/enrichment.py @@ -0,0 +1,249 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# get Gene Enrichment Information and distribution + +import cistematic.core +from cistematic.core.geneinfo import geneinfoDB +from cistematic.cisstat.helper import hyperGeometric +from math import sqrt +import string + + +def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10): + """ calculates Data enrichment (and depletion) statistics for a given list of + geneIDs (assumed to be all from the same genome) using either the + hypergeometric distribution. Specific geneID's can be excluded from the + calculations if included in excludeIDList. + + dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s. + + Results are saved in files with the given fileprefix. Conditions that are larger + than minPossible genes and that have pvalues < sigLevel / (# conditions) + are reported in fileprefix.sig, whereas genes that matches condition are + listed in fileprefix.ZZZZZZZ where the Z's are the condition. + """ + print "calculateEnrichmentStats: %s" % fileprefix + statfile = open("%s.stat" % fileprefix, "w") + zscorefile = open("%s.zscore" % fileprefix, "w") + sigfile = open("%s.sig" % fileprefix, "w") + if len(geneIDList) < 2: + print "Need at the very least two genes!" + return + + firstgid = geneIDList[0] + genome = firstgid[0] + cistematic.core.cacheGeneDB(genome) + idb = geneinfoDB(cache=True) + dataBin = {} + dataInfo = {} + dataPossible = {} + found = {} + locusList = [] + excludeLocusList = [] + excludeDataBin = {} + zList = [] + sigList = [] + print "len(dataDict) = %d" % len(dataDict) + print "building dataInfo" + for condition in dataDict: + dataBin[condition] = 0 + dataPossible[condition] = len(dataDict[condition]) + for geneID in dataDict[condition]: + if geneID not in dataInfo: + dataInfo[geneID] = [] + + dataInfo[geneID].append(condition) + + for entry in geneIDList: + if entry not in excludeIDList and entry not in locusList: + locusList.append(entry) + + for (genome, gID) in excludeIDList: + if gID not in excludeLocusList: + excludeLocusList.append(gID) + try: + excludeDataTerms = dataInfo[(genome, gID)] + except: + continue + + for condition in excludeDataTerms: + if condition not in excludeDataBin: + excludeDataBin[condition] = 0 + + excludeDataBin[condition] += 1 + + print "Getting condition list for locusList" + for locus in locusList: + try: + locusDataTerms = dataInfo[locus] + except: + continue + + for condition in locusDataTerms: + if condition not in found: + found[condition] = [] + + if locus not in found[condition]: + found[condition].append(locus) + dataBin[condition] += 1 + + numGenes = len(locusList) + dataSize = {} + print "Arranging by size" + print "numGenes = %s" % str(numGenes) + for condition in dataBin: + dataLen = dataBin[condition] + if dataLen not in dataSize: + dataSize[dataLen] = [] + + dataSize[dataLen].append(condition) + + dataLengths = dataSize.keys() + dataLengths.sort() + dataLengths.reverse() + dataList = [] + for dataLen in dataLengths: + for condition in dataSize[dataLen]: + dataList.append(condition) + + allGIDs = [] + try: + theGIDs = cistematic.core.getGenomeGeneIDs(genome) + except: + print "could not get gene entries for %s" % genome + + for aGID in theGIDs: + if aGID not in excludeLocusList: + allGIDs.append(aGID) + + datakeys = dataBin.keys() + mean = {} + standardDev = {} + N = float(len(allGIDs)) + for condition in datakeys: + possible = dataPossible[condition] + if condition in excludeDataBin: + possible -= excludeDataBin[condition] + + mean[condition] = (numGenes * possible) / N + standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0)) + + print "Writing out .stat" + for condition in dataList: + count = dataBin[condition] + possible = dataPossible[condition] + if condition in excludeDataBin: + possible -= excludeDataBin[condition] + + try: + divisor = float(standardDev[condition]) + if divisor > 0: + zscore = float(count - mean[condition]) / float(standardDev[condition]) + else: + zscore = 0.0 + + if possible > 0: + percentage = float(count) * 100.0 / float(possible) + else: + percentage = 0 + + pval = 1.0 + zscore = count - mean[condition] + pval = hyperGeometric(len(allGIDs), possible, numGenes, count) + status = "enriched" + if (count - mean[condition]) < 0: + status = "depleted" + + statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval)) + zList.append((zscore, (condition, count, possible, percentage, zscore, pval))) + if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1: + sigList.append((pval, (condition, count, possible, percentage, pval, status))) + except: + statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible)) + + statfile.close() + print "writing zscore" + zList.sort() + zList.reverse() + for (zscore, entry) in zList: + zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry) + + zscorefile.close() + print "writing sig" + sigList.sort() + annotCache = {} + for (pval, entry) in sigList: + condition = entry[0].replace(" ", "_") + sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry) + conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w") + if entry[0] not in found: + print "could not find %s" % entry[0] + continue + + for gID in found[entry[0]]: + geneSymbol = "" + outsyn = "" + outdesc = "" + geneInfo = "" + if gID in annotCache: + (geneSymbol, outsyn, outdesc) = annotCache[gID] + else: + try: + geneInfo = idb.getGeneInfo(gID) + geneSymbol = geneInfo[0] + synonyms = idb.geneIDSynonyms(gID) + description = idb.getDescription(gID) + outsyn = string.join(synonyms) + outdesc = string.join(description, ";") + except: + geneSymbol = str(gID) + try: + if len(gID) == 2: + newGeneID = idb.getGeneID(gID[0], gID[1]) + if len(newGeneID) > 1: + geneInfo = idb.getGeneInfo(newGeneID) + + if len(geneInfo) > 0: + geneSymbol = gID[1] + synonyms = idb.geneIDSynonyms(newGeneID) + description = idb.getDescription(newGeneID) + outsyn = string.join(synonyms) + outdesc = string.join(description, ";") + except: + pass + + conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc)) + if gID not in annotCache: + annotCache[gID] = (geneSymbol, outsyn, outdesc) + + conditionsigfile.close() + + sigfile.close() + cistematic.core.uncacheGeneDB(genome) \ No newline at end of file