--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+import string
+from cistematic.core.homology import homologyDB
+from os import environ
+
+if environ.get("CISTEMATIC_TEMP"):
+ cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+ cisTemp = "/tmp"
+
+
+def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False):
+ """ save to one file per genome the genes that are considred homologs in match files from different species.
+ """
+ theGenomes = matchFiles.keys()
+ outFileNames = {}
+ genes = {}
+ entryDict = {}
+ ignoreList = {}
+ for aGenome in theGenomes:
+ outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
+ genes[aGenome] = []
+ entryDict[aGenome] = {}
+ ignoreList[aGenome] = []
+
+ for aGenome in theGenomes:
+ goodLines = []
+ ignoreLines = []
+ theFile = open(matchFiles[aGenome], "r")
+ for entry in theFile:
+ fields = entry.split("\t")
+ gid = fields[11]
+ if filterCDS and fields[6] == "CDS":
+ ignoreLines.append(string.join(fields[:4], "-"))
+ continue
+
+ if fields[6] == "NONE":
+ continue
+
+ if filterLowerCase and fields[12] != fields[12].upper():
+ continue
+
+ goodLines.append(entry)
+
+ theFile.close()
+ for entry in goodLines:
+ fields = entry.split()
+ gid = fields[11]
+ geneID = (aGenome, gid)
+ matchID = string.join(fields[:4], "-")
+ if matchID in ignoreLines:
+ continue
+
+ if geneID not in genes[aGenome]:
+ genes[aGenome].append(geneID)
+
+ if geneID not in entryDict[aGenome]:
+ entryDict[aGenome][geneID] = []
+ entryDict[aGenome][geneID].append(entry)
+
+ print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
+ doCrossMatch(outFileNames, genes, entryDict, ignoreList)
+
+
+def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False):
+ """ save to one file per genome the genes that are considred homologs in match files from different species.
+ """
+ theGenomes = matchFiles.keys()
+ outFileNames = {}
+ genes = {}
+ entryDict = {}
+ ignoreList = {}
+ for aGenome in theGenomes:
+ outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
+ genes[aGenome] = []
+ entryDict[aGenome] = {}
+ ignoreList[aGenome] = []
+
+ for aGenome in theGenomes:
+ theList = []
+ if not fileList:
+ theList = [matchFiles[aGenome]]
+ else:
+ theList = matchFiles[aGenome]
+
+ for aFile in theList:
+ theFile = open(aFile, "r")
+ for entry in theFile:
+ fields = entry.split()
+ gid = fields[gidField]
+ geneID = (aGenome, gid)
+ if geneID not in genes[aGenome]:
+ genes[aGenome].append(geneID)
+
+ if geneID not in entryDict[aGenome]:
+ entryDict[aGenome][geneID] = []
+ entryDict[aGenome][geneID].append(entry)
+
+ theFile.close()
+
+ print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
+ doCrossMatch(outFileNames, genes, entryDict, ignoreList)
+
+
+def doCrossMatch(outFileNames, genes, entryDict, ignoreList):
+ theGenomes = []
+ outFile = {}
+ for genome in outFileNames:
+ theGenomes.append(genome)
+
+ hdb = homologyDB(cache=True)
+ for aGenome in theGenomes:
+ outFile = open(outFileNames[aGenome], "w")
+ counter = 0
+ for geneID in genes[aGenome]:
+ try:
+ if geneID in ignoreList[aGenome]:
+ continue
+
+ res = hdb.getHomologousGenes(geneID)
+ newcounter = 0
+ for entry in res:
+ if entry[0] in theGenomes:
+ secondGenome = entry[0]
+
+ if entry in genes[secondGenome]:
+ print "%d: %s %s" % (counter, str(geneID), str(entry))
+ if newcounter < 1:
+ for rec in entryDict[aGenome][geneID]:
+ outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec))
+
+ for rec in entryDict[secondGenome][entry]:
+ outFile.write("%d\t%s\t%s" % (counter, str(entry), rec))
+
+ newcounter += 1
+
+ if secondGenome == aGenome:
+ ignoreList[aGenome].append(entry)
+
+ if newcounter > 0:
+ counter += 1
+ outFile.write("\n")
+ except:
+ print "Problem around %s" % (str(geneID))
+
+ outFile.close()
\ No newline at end of file