cistematic/core/orthomatcher.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 import string
  31 from cistematic.core.homology import homologyDB
  32 from os import environ
  33
  34 if environ.get("CISTEMATIC_TEMP"):
  35     cisTemp = environ.get("CISTEMATIC_TEMP")
  36 else:
  37     cisTemp = "/tmp"
  38
  39
  40 def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False):
  41     """ save to one file per genome the genes that are considred homologs in match files from different species.
  42     """
  43     theGenomes = matchFiles.keys()
  44     outFileNames = {}
  45     genes = {}
  46     entryDict = {}
  47     ignoreList = {}
  48     for aGenome in theGenomes:
  49         outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
  50         genes[aGenome] = []
  51         entryDict[aGenome] = {}
  52         ignoreList[aGenome] = []
  53
  54     for aGenome in theGenomes:
  55         goodLines = []
  56         ignoreLines = []
  57         theFile = open(matchFiles[aGenome], "r")
  58         for entry in theFile:
  59             fields = entry.split("\t")
  60             gid = fields[11]
  61             if filterCDS and fields[6] == "CDS":
  62                 ignoreLines.append(string.join(fields[:4], "-"))
  63                 continue
  64
  65             if fields[6] == "NONE":
  66                 continue
  67
  68             if filterLowerCase and fields[12] != fields[12].upper():
  69                 continue
  70
  71             goodLines.append(entry)
  72
  73         theFile.close()
  74         for entry in goodLines:
  75             fields = entry.split()
  76             gid = fields[11]
  77             geneID = (aGenome, gid)
  78             matchID = string.join(fields[:4], "-")
  79             if matchID in ignoreLines:
  80                 continue
  81
  82             if geneID not in genes[aGenome]:
  83                 genes[aGenome].append(geneID)
  84
  85             if geneID not in entryDict[aGenome]:
  86                 entryDict[aGenome][geneID] = []
  87                 entryDict[aGenome][geneID].append(entry)
  88
  89         print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
  90         doCrossMatch(outFileNames, genes, entryDict, ignoreList)
  91
  92
  93 def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False):
  94     """ save to one file per genome the genes that are considred homologs in match files from different species.
  95     """
  96     theGenomes = matchFiles.keys()
  97     outFileNames = {}
  98     genes = {}
  99     entryDict = {}
 100     ignoreList = {}
 101     for aGenome in theGenomes:
 102         outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
 103         genes[aGenome] = []
 104         entryDict[aGenome] = {}
 105         ignoreList[aGenome] = []
 106
 107     for aGenome in theGenomes:
 108         theList = []
 109         if not fileList:
 110             theList = [matchFiles[aGenome]]
 111         else:
 112             theList = matchFiles[aGenome]
 113
 114         for aFile in theList:
 115             theFile = open(aFile, "r")
 116             for entry in theFile:
 117                 fields = entry.split()
 118                 gid = fields[gidField]
 119                 geneID = (aGenome, gid)
 120                 if geneID not in genes[aGenome]:
 121                     genes[aGenome].append(geneID)
 122
 123                 if geneID not in entryDict[aGenome]:
 124                     entryDict[aGenome][geneID] = []
 125                     entryDict[aGenome][geneID].append(entry)
 126
 127             theFile.close()
 128
 129         print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
 130         doCrossMatch(outFileNames, genes, entryDict, ignoreList)
 131
 132
 133 def doCrossMatch(outFileNames, genes, entryDict, ignoreList):
 134     theGenomes = []
 135     outFile = {}
 136     for genome in outFileNames:
 137         theGenomes.append(genome)
 138
 139     hdb = homologyDB(cache=True)
 140     for aGenome in theGenomes:
 141         outFile = open(outFileNames[aGenome], "w")
 142         counter = 0
 143         for geneID in genes[aGenome]:
 144             try:
 145                 if geneID in ignoreList[aGenome]:
 146                     continue
 147
 148                 res = hdb.getHomologousGenes(geneID)
 149                 newcounter = 0
 150                 for entry in res:
 151                     if entry[0] in theGenomes:
 152                         secondGenome = entry[0]
 153
 154                     if entry in genes[secondGenome]:
 155                         print "%d: %s %s" % (counter, str(geneID), str(entry))
 156                         if newcounter < 1:
 157                             for rec in entryDict[aGenome][geneID]:
 158                                 outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec))
 159
 160                         for rec in entryDict[secondGenome][entry]:
 161                             outFile.write("%d\t%s\t%s" % (counter, str(entry), rec))
 162
 163                         newcounter  += 1
 164
 165                     if secondGenome == aGenome:
 166                         ignoreList[aGenome].append(entry)
 167
 168                 if newcounter > 0:
 169                     counter += 1
 170                     outFile.write("\n")
 171             except:
 172                 print "Problem around %s" % (str(geneID))
 173
 174         outFile.close()