erange 4.0a dev release with integrated cistematic

[erange.git] / cistematic / core / orthomatcher.py
diff --git a/cistematic/core/orthomatcher.py b/cistematic/core/orthomatcher.py

new file mode 100644 (file)

index 0000000..36d40da
--- /dev/null
+++ b/cistematic/core/orthomatcher.py
@@ -0,0 +1,174 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+import string
+from cistematic.core.homology import homologyDB
+from os import environ
+
+if environ.get("CISTEMATIC_TEMP"):
+    cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+    cisTemp = "/tmp"
+
+
+def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False):
+    """ save to one file per genome the genes that are considred homologs in match files from different species.
+    """
+    theGenomes = matchFiles.keys()
+    outFileNames = {}
+    genes = {}
+    entryDict = {}
+    ignoreList = {}
+    for aGenome in theGenomes:
+        outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
+        genes[aGenome] = []
+        entryDict[aGenome] = {}
+        ignoreList[aGenome] = []
+
+    for aGenome in theGenomes:
+        goodLines = []
+        ignoreLines = []
+        theFile = open(matchFiles[aGenome], "r")
+        for entry in theFile:
+            fields = entry.split("\t")
+            gid = fields[11]
+            if filterCDS and fields[6] == "CDS":
+                ignoreLines.append(string.join(fields[:4], "-"))
+                continue
+
+            if fields[6] == "NONE":
+                continue
+
+            if filterLowerCase and fields[12] != fields[12].upper():
+                continue
+
+            goodLines.append(entry)
+
+        theFile.close()
+        for entry in goodLines:
+            fields = entry.split()
+            gid = fields[11]
+            geneID = (aGenome, gid)
+            matchID = string.join(fields[:4], "-")
+            if matchID in ignoreLines:
+                continue
+
+            if geneID not in genes[aGenome]:
+                genes[aGenome].append(geneID)
+
+            if geneID not in entryDict[aGenome]:
+                entryDict[aGenome][geneID] = []
+                entryDict[aGenome][geneID].append(entry)
+
+        print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
+        doCrossMatch(outFileNames, genes, entryDict, ignoreList)
+
+
+def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False):
+    """ save to one file per genome the genes that are considred homologs in match files from different species.
+    """
+    theGenomes = matchFiles.keys()
+    outFileNames = {}
+    genes = {}
+    entryDict = {}
+    ignoreList = {}
+    for aGenome in theGenomes:
+        outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
+        genes[aGenome] = []
+        entryDict[aGenome] = {}
+        ignoreList[aGenome] = []
+
+    for aGenome in theGenomes:
+        theList = []
+        if not fileList:
+            theList = [matchFiles[aGenome]]
+        else:
+            theList = matchFiles[aGenome]
+
+        for aFile in theList:
+            theFile = open(aFile, "r")
+            for entry in theFile:
+                fields = entry.split()
+                gid = fields[gidField]
+                geneID = (aGenome, gid)
+                if geneID not in genes[aGenome]:
+                    genes[aGenome].append(geneID)
+
+                if geneID not in entryDict[aGenome]:
+                    entryDict[aGenome][geneID] = []
+                    entryDict[aGenome][geneID].append(entry)
+
+            theFile.close()
+
+        print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
+        doCrossMatch(outFileNames, genes, entryDict, ignoreList)
+
+
+def doCrossMatch(outFileNames, genes, entryDict, ignoreList):
+    theGenomes = []
+    outFile = {}
+    for genome in outFileNames:
+        theGenomes.append(genome)
+
+    hdb = homologyDB(cache=True)
+    for aGenome in theGenomes:
+        outFile = open(outFileNames[aGenome], "w")
+        counter = 0
+        for geneID in genes[aGenome]:
+            try:
+                if geneID in ignoreList[aGenome]:
+                    continue
+
+                res = hdb.getHomologousGenes(geneID)
+                newcounter = 0
+                for entry in res:
+                    if entry[0] in theGenomes:
+                        secondGenome = entry[0]
+
+                    if entry in genes[secondGenome]:
+                        print "%d: %s %s" % (counter, str(geneID), str(entry))
+                        if newcounter < 1:
+                            for rec in entryDict[aGenome][geneID]:
+                                outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec))
+
+                        for rec in entryDict[secondGenome][entry]:
+                            outFile.write("%d\t%s\t%s" % (counter, str(entry), rec))
+
+                        newcounter  += 1
+
+                    if secondGenome == aGenome:
+                        ignoreList[aGenome].append(entry)
+
+                if newcounter > 0:
+                    counter += 1
+                    outFile.write("\n")
+            except:
+                print "Problem around %s" % (str(geneID))
+
+        outFile.close()
+\ No newline at end of file