erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / cisstat / analyzego.py
diff --git a/cistematic/cisstat/analyzego.py b/cistematic/cisstat/analyzego.py
new file mode 100644 (file)
index 0000000..10ebe31
--- /dev/null
@@ -0,0 +1,303 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+# get Gene Ontology Information and distribution 
+
+import cistematic.core
+from cistematic.core.geneinfo import geneinfoDB
+from cistematic.cisstat.score import pvalue
+from cistematic.cisstat.helper import hyperGeometric
+from math import sqrt
+from random import shuffle
+import string
+
+def calculateGOStats(geneIDList, fileprefix, excludeIDList = [], roundsRandomization=0, sigLevel = 0.01):
+    """ calculates GO enrichment (and depletion) statistics for a given list of 
+        geneIDs (assumed to be all from the same genome) using either the 
+        hypergeometric distribution or random sampling for roundsRandomization 
+        rounds if greater than 0. Specific geneID's can be excluded from the 
+        calculations if included in excludeIDList. 
+        Results are saved in files with the given fileprefix. GO Terms that are 
+        larger than 15 genes and that have pvalues < sigLevel / (#num GO categories) 
+        are reported in fileprefix.gosig, whereas genes that matches the GO term are 
+        listed in fileprefix.ZZZZZZZ where the Z's are the GOID.
+    """
+    print "calculateGOStats: %s" % fileprefix
+    if roundsRandomization > 0:
+        goexpfile = open("%s.goexp" % fileprefix, "w")
+    gostatfile = open("%s.gostat" % fileprefix, "w")
+    gozscorefile = open("%s.gozscore" % fileprefix, "w") 
+    gosigfile = open("%s.gosig" % fileprefix, "w")
+
+    if len(geneIDList) < 1:
+        print "Need at the very least one gene!"
+        return
+
+    firstgid = geneIDList[0]
+    genome = firstgid[0]
+    cistematic.core.cacheGeneDB(genome)
+    idb = geneinfoDB(cache=True)
+    goBin = {}
+    goPossible = {}
+    found = {}
+    locusList = []
+    excludeLocusList = []
+    excludeGOBin = {}
+    zList = []
+    sigList = []
+    print "Getting GO list"
+    goDesc = cistematic.core.allGOTerms(genome)
+    goInfo = cistematic.core.getAllGOInfo(genome)
+    print "len(goDesc) = %d" % len(goDesc)
+    for GOID in goDesc:
+        goBin[GOID] = 0
+        goPossible[GOID] = cistematic.core.getGOIDCount(genome, GOID)
+
+    for entry in geneIDList:
+        if entry not in excludeIDList and entry not in locusList:
+            locusList.append(entry)
+
+    for (genome, gID) in excludeIDList:
+        if gID not in excludeLocusList:
+            excludeLocusList.append(gID)
+            try:
+                excludeGOTerms = goInfo[(genome, gID)]
+            except:
+                continue
+
+            for entry in excludeGOTerms:
+                excludeGOTermsFields = entry.split("\t")
+                GOID = excludeGOTermsFields[0]
+                if GOID not in excludeGOBin:
+                    excludeGOBin[GOID] = 0
+
+                excludeGOBin[GOID] += 1
+
+    print "Getting GO list for locusList"
+    for locus in locusList:
+        try:
+            locusGOTerms = goInfo[locus]
+        except:
+            continue
+
+        for entry in locusGOTerms:
+            locusGOTermsFields = entry.split("\t")
+            GOID = locusGOTermsFields[0]
+            if GOID not in found:
+                found[GOID] = []
+
+            if locus not in found[GOID]:
+                found[GOID].append(locus)
+                goBin[GOID] += 1
+
+    numGenes = len(locusList)
+    goSize = {}
+    print "Arranging by size"
+    print "numGenes = %s" % str(numGenes)
+    for GOID in goBin:
+        goLen = goBin[GOID]
+        if goLen not in goSize:
+            goSize[goLen] = []
+
+        goSize[goLen].append(GOID)
+
+    goLengths = goSize.keys()
+    goLengths.sort()
+    goLengths.reverse()
+    goList = []
+    for goLen in goLengths:
+        for GOID in goSize[goLen]:
+            goList.append(GOID)
+
+    allGIDs = []
+    try:
+        theGIDs = cistematic.core.getGenomeGeneIDs(genome)
+    except:
+        print "could not get gene entries for %s" % genome
+
+    for aGID in theGIDs:
+        if aGID not in excludeLocusList:
+            allGIDs.append(aGID)
+
+    gokeys = goBin.keys()
+    mean = {}
+    standardDev = {}
+    if roundsRandomization > 0:
+        print "Get Random sampling"
+        sample = {}
+        for sampleNum in range(roundsRandomization):
+            print "Round %d" % (sampleNum + 1)
+            sample[sampleNum] = {}
+            for GOID in gokeys:
+                sample[sampleNum][GOID] = 0
+
+            shuffle(allGIDs)
+            sampleGenes = allGIDs[:numGenes]
+            for gid in sampleGenes:
+                try:
+                    goarray = goInfo[(genome, gid)]
+                except:
+                    continue
+
+                for entry in goarray:
+                    goarrayFields = entry.split("\t")
+                    GOID = goarrayFields[0]
+                    if GOID in gokeys:
+                        sample[sampleNum][GOID] += 1
+
+        print "Calculating stats"
+        for GOID in gokeys:
+            mean[GOID] = 0
+            sumofsquares = 0
+            for sampleNum in range(roundsRandomization):
+                mean[GOID] += sample[sampleNum][GOID]
+
+            mean[GOID] = float(mean[GOID]) / float(roundsRandomization)
+            for sampleNum in range(roundsRandomization):
+                sumofsquares += (sample[sampleNum][GOID] - mean[GOID]) * (sample[sampleNum][GOID] - mean[GOID])
+
+            standardDev[GOID] = sqrt(sumofsquares / float(roundsRandomization - 1))
+            goexpfile.write("%s\t%f\t%f\n" % (GOID, mean[GOID], standardDev[GOID]))
+    else:
+        # Use hypergeometric
+        N = float(len(allGIDs))
+        for GOID in gokeys:
+            possible = goPossible[GOID]
+            if GOID in excludeGOBin:
+                possible -= excludeGOBin[GOID]
+
+            mean[GOID] = (numGenes * possible) / N
+            standardDev[GOID] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
+
+    print "Writing out gostat"
+    for GOID in goList:
+        count = goBin[GOID]
+        possible = goPossible[GOID]
+        if GOID in excludeGOBin:
+            possible -= excludeGOBin[GOID]
+
+        try:
+            divisor = float(standardDev[GOID])
+            if divisor > 0:
+                zscore = float(count - mean[GOID]) / float(standardDev[GOID])
+            else:
+                zscore = 0.0
+
+            if possible > 0:
+                percentage = float(count) * 100.0 / float(possible)
+            else:
+                percentage = 0
+
+            pval = 1.0
+            if roundsRandomization > 0:
+                if zscore < 0.0:
+                    pval = pvalue(-1.0 * zscore)
+                else:
+                    pval = pvalue(zscore)
+            else:
+                zscore = count - mean[GOID]
+                pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
+
+            status = "enriched"
+            if (count - mean[GOID]) < 0:
+                status = "depleted"
+
+            gostatfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % (GOID, count, possible, percentage, zscore, pval, goDesc[GOID]))
+            zList.append((zscore, (GOID, count, possible, percentage, zscore, pval, goDesc[GOID])))
+            if len(goList) > 0 and pval < (sigLevel / float(len(goList))) and possible > 15 and count > 1:
+                sigList.append((pval, (GOID, count, possible, percentage, pval, status, goDesc[GOID])))
+        except:
+            gostatfile.write("%s\t%d out of %d\t \t%s\n" % (GOID, count, possible, goDesc[GOID]))
+
+    gostatfile.close()
+    print "writing gozscore"
+    zList.sort()
+    zList.reverse()
+    for (zscore, entry) in zList:
+        gozscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % entry)
+
+    gozscorefile.close()
+    print "writing gosig"
+    sigList.sort()
+    annotCache = {}
+    for (pval, entry) in sigList:
+        GOID = entry[0]
+        gosigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\t%s\n" % entry)
+        goidsigfile = open("%s.%s" % (fileprefix, GOID[3:]), "w")
+        goidsigfileList = []
+        if GOID not in found:
+            print "could not find %s" % GOID
+            continue
+
+        print "%s\t%d genes" % (GOID, len(found[GOID]))
+        for gID in found[GOID]:
+            geneSymbol = ""
+            outsyn = ""
+            outdesc = ""
+            geneInfo = ""
+            if gID in annotCache:
+                (geneSymbol, outsyn, outdesc) = annotCache[gID]
+            else:
+                try:
+                    geneInfo = idb.getGeneInfo(gID)
+                    geneSymbol = geneInfo[0]
+                    synonyms = idb.geneIDSynonyms(gID)
+                    description = idb.getDescription(gID)
+                    outsyn = string.join(synonyms)
+                    outdesc = string.join(description, ";")
+                except:
+                    geneSymbol = str(gID)
+                    try:
+                        if len(gID) == 2:
+                            newGeneID = idb.getGeneID(gID[0], gID[1])
+                            if len(newGeneID) > 1:
+                                geneInfo = idb.getGeneInfo(newGeneID)
+
+                        if len(geneInfo) > 0:
+                            geneSymbol = gID[1]
+                            synonyms = idb.geneIDSynonyms(newGeneID)
+                            description = idb.getDescription(newGeneID)
+                            outsyn = string.join(synonyms)
+                            outdesc = string.join(description, ";")
+                    except:
+                        pass
+
+            goidsigfileList.append("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
+            if gID not in annotCache:
+                annotCache[gID] = (geneSymbol, outsyn, outdesc)
+
+        goidsigfileList.sort()
+        for line in goidsigfileList:
+            goidsigfile.write(line)
+
+        goidsigfile.close()
+
+    gosigfile.close()
+    cistematic.core.uncacheGeneDB(genome)
\ No newline at end of file