erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / cisstat / enrichment.py
diff --git a/cistematic/cisstat/enrichment.py b/cistematic/cisstat/enrichment.py
new file mode 100644 (file)
index 0000000..b2fec46
--- /dev/null
@@ -0,0 +1,249 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+# get Gene Enrichment Information and distribution 
+
+import cistematic.core
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.cisstat.helper import hyperGeometric
+from math import sqrt
+import string
+
+
+def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10):
+    """ calculates Data enrichment (and depletion) statistics for a given list of 
+        geneIDs (assumed to be all from the same genome) using either the 
+        hypergeometric distribution. Specific geneID's can be excluded from the 
+        calculations if included in excludeIDList. 
+        dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s.
+
+        Results are saved in files with the given fileprefix. Conditions that are larger
+        than minPossible genes and that have pvalues < sigLevel / (# conditions) 
+        are reported in fileprefix.sig, whereas genes that matches condition are 
+        listed in fileprefix.ZZZZZZZ where the Z's are the condition.
+    """
+    print "calculateEnrichmentStats: %s" % fileprefix
+    statfile = open("%s.stat" % fileprefix, "w")
+    zscorefile = open("%s.zscore" % fileprefix, "w") 
+    sigfile = open("%s.sig" % fileprefix, "w")
+    if len(geneIDList) < 2:
+        print "Need at the very least two genes!"
+        return
+
+    firstgid = geneIDList[0]
+    genome = firstgid[0]
+    cistematic.core.cacheGeneDB(genome)
+    idb = geneinfoDB(cache=True)
+    dataBin = {}
+    dataInfo = {}
+    dataPossible = {}
+    found = {}
+    locusList = []
+    excludeLocusList = []
+    excludeDataBin = {}
+    zList = []
+    sigList = []
+    print "len(dataDict) = %d" % len(dataDict)
+    print "building dataInfo"
+    for condition in dataDict:
+        dataBin[condition] = 0
+        dataPossible[condition] = len(dataDict[condition])
+        for geneID in dataDict[condition]:
+            if geneID not in dataInfo:
+                dataInfo[geneID] = []
+
+            dataInfo[geneID].append(condition)
+
+    for entry in geneIDList:
+        if entry not in excludeIDList and entry not in locusList:
+            locusList.append(entry)
+
+    for (genome, gID) in excludeIDList:
+        if gID not in excludeLocusList:
+            excludeLocusList.append(gID)
+            try:
+                excludeDataTerms = dataInfo[(genome, gID)]
+            except:
+                continue
+
+            for condition in excludeDataTerms:
+                if condition not in excludeDataBin:
+                    excludeDataBin[condition] = 0
+
+                excludeDataBin[condition] += 1
+
+    print "Getting condition list for locusList"
+    for locus in locusList:
+        try:
+            locusDataTerms = dataInfo[locus]
+        except:
+            continue
+
+        for condition in locusDataTerms:
+            if condition not in found:
+                found[condition] = []
+
+            if locus not in found[condition]:
+                found[condition].append(locus)
+                dataBin[condition] += 1
+
+    numGenes = len(locusList)
+    dataSize = {}
+    print "Arranging by size"
+    print "numGenes = %s" % str(numGenes)
+    for condition in dataBin:
+        dataLen = dataBin[condition]
+        if dataLen not in dataSize:
+            dataSize[dataLen] = []
+
+        dataSize[dataLen].append(condition)
+
+    dataLengths = dataSize.keys()
+    dataLengths.sort()
+    dataLengths.reverse()
+    dataList = []
+    for dataLen in dataLengths:
+        for condition in dataSize[dataLen]:
+            dataList.append(condition)
+
+    allGIDs = []
+    try:
+        theGIDs = cistematic.core.getGenomeGeneIDs(genome)
+    except:
+        print "could not get gene entries for %s" % genome
+
+    for aGID in theGIDs:
+        if aGID not in excludeLocusList:
+            allGIDs.append(aGID)
+
+    datakeys = dataBin.keys()
+    mean = {}
+    standardDev = {}
+    N = float(len(allGIDs))
+    for condition in datakeys:
+        possible = dataPossible[condition]
+        if condition in excludeDataBin:
+            possible -= excludeDataBin[condition]
+
+        mean[condition] = (numGenes * possible) / N
+        standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
+
+    print "Writing out .stat"
+    for condition in dataList:
+        count = dataBin[condition]
+        possible = dataPossible[condition]
+        if condition in excludeDataBin:
+            possible -= excludeDataBin[condition]
+
+        try:
+            divisor = float(standardDev[condition])
+            if divisor > 0:
+                zscore = float(count - mean[condition]) / float(standardDev[condition])
+            else:
+                zscore = 0.0
+
+            if possible > 0:
+                percentage = float(count) * 100.0 / float(possible)
+            else:
+                percentage = 0
+
+            pval = 1.0
+            zscore = count - mean[condition]
+            pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
+            status = "enriched"
+            if (count - mean[condition]) < 0:
+                status = "depleted"
+
+            statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
+            zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
+            if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
+                sigList.append((pval, (condition, count, possible, percentage, pval, status)))
+        except:
+            statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))
+
+    statfile.close()
+    print "writing zscore"
+    zList.sort()
+    zList.reverse()
+    for (zscore, entry) in zList:
+        zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)
+
+    zscorefile.close()
+    print "writing sig"
+    sigList.sort()
+    annotCache = {}
+    for (pval, entry) in sigList:
+        condition = entry[0].replace(" ", "_")
+        sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
+        conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w")
+        if entry[0] not in found:
+            print "could not find %s" % entry[0]
+            continue
+
+        for gID in found[entry[0]]:
+            geneSymbol = ""
+            outsyn = ""
+            outdesc = ""
+            geneInfo = ""
+            if gID in annotCache:
+                (geneSymbol, outsyn, outdesc) = annotCache[gID]
+            else:
+                try:
+                    geneInfo = idb.getGeneInfo(gID)
+                    geneSymbol = geneInfo[0]
+                    synonyms = idb.geneIDSynonyms(gID)
+                    description = idb.getDescription(gID)
+                    outsyn = string.join(synonyms)
+                    outdesc = string.join(description, ";")
+                except:
+                    geneSymbol = str(gID)
+                    try:
+                        if len(gID) == 2:
+                            newGeneID = idb.getGeneID(gID[0], gID[1])
+                            if len(newGeneID) > 1:
+                                geneInfo = idb.getGeneInfo(newGeneID)
+
+                        if len(geneInfo) > 0:
+                            geneSymbol = gID[1]
+                            synonyms = idb.geneIDSynonyms(newGeneID)
+                            description = idb.getDescription(newGeneID)
+                            outsyn = string.join(synonyms)
+                            outdesc = string.join(description, ";")
+                    except:
+                        pass
+
+            conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
+            if gID not in annotCache:
+                annotCache[gID] = (geneSymbol, outsyn, outdesc)
+
+        conditionsigfile.close()
+
+    sigfile.close()
+    cistematic.core.uncacheGeneDB(genome)
\ No newline at end of file