cistematic/cisstat/enrichment.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # get Gene Enrichment Information and distribution
  31
  32 import cistematic.core
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from cistematic.cisstat.helper import hyperGeometric
  35 from math import sqrt
  36 import string
  37
  38
  39 def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=[], sigLevel=0.01, minPossible=10):
  40     """ calculates Data enrichment (and depletion) statistics for a given list of
  41         geneIDs (assumed to be all from the same genome) using either the
  42         hypergeometric distribution. Specific geneID's can be excluded from the
  43         calculations if included in excludeIDList.
  44
  45         dataDict is is a dictionary of conditions to respective lists of (genome, gID)'s.
  46
  47         Results are saved in files with the given fileprefix. Conditions that are larger
  48         than minPossible genes and that have pvalues < sigLevel / (# conditions)
  49         are reported in fileprefix.sig, whereas genes that matches condition are
  50         listed in fileprefix.ZZZZZZZ where the Z's are the condition.
  51     """
  52     print "calculateEnrichmentStats: %s" % fileprefix
  53     statfile = open("%s.stat" % fileprefix, "w")
  54     zscorefile = open("%s.zscore" % fileprefix, "w")
  55     sigfile = open("%s.sig" % fileprefix, "w")
  56     if len(geneIDList) < 2:
  57         print "Need at the very least two genes!"
  58         return
  59
  60     firstgid = geneIDList[0]
  61     genome = firstgid[0]
  62     cistematic.core.cacheGeneDB(genome)
  63     idb = geneinfoDB(cache=True)
  64     dataBin = {}
  65     dataInfo = {}
  66     dataPossible = {}
  67     found = {}
  68     locusList = []
  69     excludeLocusList = []
  70     excludeDataBin = {}
  71     zList = []
  72     sigList = []
  73     print "len(dataDict) = %d" % len(dataDict)
  74     print "building dataInfo"
  75     for condition in dataDict:
  76         dataBin[condition] = 0
  77         dataPossible[condition] = len(dataDict[condition])
  78         for geneID in dataDict[condition]:
  79             if geneID not in dataInfo:
  80                 dataInfo[geneID] = []
  81
  82             dataInfo[geneID].append(condition)
  83
  84     for entry in geneIDList:
  85         if entry not in excludeIDList and entry not in locusList:
  86             locusList.append(entry)
  87
  88     for (genome, gID) in excludeIDList:
  89         if gID not in excludeLocusList:
  90             excludeLocusList.append(gID)
  91             try:
  92                 excludeDataTerms = dataInfo[(genome, gID)]
  93             except:
  94                 continue
  95
  96             for condition in excludeDataTerms:
  97                 if condition not in excludeDataBin:
  98                     excludeDataBin[condition] = 0
  99
 100                 excludeDataBin[condition] += 1
 101
 102     print "Getting condition list for locusList"
 103     for locus in locusList:
 104         try:
 105             locusDataTerms = dataInfo[locus]
 106         except:
 107             continue
 108
 109         for condition in locusDataTerms:
 110             if condition not in found:
 111                 found[condition] = []
 112
 113             if locus not in found[condition]:
 114                 found[condition].append(locus)
 115                 dataBin[condition] += 1
 116
 117     numGenes = len(locusList)
 118     dataSize = {}
 119     print "Arranging by size"
 120     print "numGenes = %s" % str(numGenes)
 121     for condition in dataBin:
 122         dataLen = dataBin[condition]
 123         if dataLen not in dataSize:
 124             dataSize[dataLen] = []
 125
 126         dataSize[dataLen].append(condition)
 127
 128     dataLengths = dataSize.keys()
 129     dataLengths.sort()
 130     dataLengths.reverse()
 131     dataList = []
 132     for dataLen in dataLengths:
 133         for condition in dataSize[dataLen]:
 134             dataList.append(condition)
 135
 136     allGIDs = []
 137     try:
 138         theGIDs = cistematic.core.getGenomeGeneIDs(genome)
 139     except:
 140         print "could not get gene entries for %s" % genome
 141
 142     for aGID in theGIDs:
 143         if aGID not in excludeLocusList:
 144             allGIDs.append(aGID)
 145
 146     datakeys = dataBin.keys()
 147     mean = {}
 148     standardDev = {}
 149     N = float(len(allGIDs))
 150     for condition in datakeys:
 151         possible = dataPossible[condition]
 152         if condition in excludeDataBin:
 153             possible -= excludeDataBin[condition]
 154
 155         mean[condition] = (numGenes * possible) / N
 156         standardDev[condition] = sqrt(numGenes * (possible / N) * (N - possible / N ) * (N - numGenes) / (N - 1.0))
 157
 158     print "Writing out .stat"
 159     for condition in dataList:
 160         count = dataBin[condition]
 161         possible = dataPossible[condition]
 162         if condition in excludeDataBin:
 163             possible -= excludeDataBin[condition]
 164
 165         try:
 166             divisor = float(standardDev[condition])
 167             if divisor > 0:
 168                 zscore = float(count - mean[condition]) / float(standardDev[condition])
 169             else:
 170                 zscore = 0.0
 171
 172             if possible > 0:
 173                 percentage = float(count) * 100.0 / float(possible)
 174             else:
 175                 percentage = 0
 176
 177             pval = 1.0
 178             zscore = count - mean[condition]
 179             pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
 180             status = "enriched"
 181             if (count - mean[condition]) < 0:
 182                 status = "depleted"
 183
 184             statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
 185             zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
 186             if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
 187                 sigList.append((pval, (condition, count, possible, percentage, pval, status)))
 188         except:
 189             statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))
 190
 191     statfile.close()
 192     print "writing zscore"
 193     zList.sort()
 194     zList.reverse()
 195     for (zscore, entry) in zList:
 196         zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)
 197
 198     zscorefile.close()
 199     print "writing sig"
 200     sigList.sort()
 201     annotCache = {}
 202     for (pval, entry) in sigList:
 203         condition = entry[0].replace(" ", "_")
 204         sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
 205         conditionsigfile = open("%s.%s" % (fileprefix, condition.replace(" ", "_")), "w")
 206         if entry[0] not in found:
 207             print "could not find %s" % entry[0]
 208             continue
 209
 210         for gID in found[entry[0]]:
 211             geneSymbol = ""
 212             outsyn = ""
 213             outdesc = ""
 214             geneInfo = ""
 215             if gID in annotCache:
 216                 (geneSymbol, outsyn, outdesc) = annotCache[gID]
 217             else:
 218                 try:
 219                     geneInfo = idb.getGeneInfo(gID)
 220                     geneSymbol = geneInfo[0]
 221                     synonyms = idb.geneIDSynonyms(gID)
 222                     description = idb.getDescription(gID)
 223                     outsyn = string.join(synonyms)
 224                     outdesc = string.join(description, ";")
 225                 except:
 226                     geneSymbol = str(gID)
 227                     try:
 228                         if len(gID) == 2:
 229                             newGeneID = idb.getGeneID(gID[0], gID[1])
 230                             if len(newGeneID) > 1:
 231                                 geneInfo = idb.getGeneInfo(newGeneID)
 232
 233                         if len(geneInfo) > 0:
 234                             geneSymbol = gID[1]
 235                             synonyms = idb.geneIDSynonyms(newGeneID)
 236                             description = idb.getDescription(newGeneID)
 237                             outsyn = string.join(synonyms)
 238                             outdesc = string.join(description, ";")
 239                     except:
 240                         pass
 241
 242             conditionsigfile.write("%s\t%s\t%s\n" % (geneSymbol, outsyn, outdesc))
 243             if gID not in annotCache:
 244                 annotCache[gID] = (geneSymbol, outsyn, outdesc)
 245
 246         conditionsigfile.close()
 247
 248     sigfile.close()
 249     cistematic.core.uncacheGeneDB(genome)