erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / orthomatcher.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 import string
31 from cistematic.core.homology import homologyDB
32 from os import environ
33
34 if environ.get("CISTEMATIC_TEMP"):
35     cisTemp = environ.get("CISTEMATIC_TEMP")
36 else:
37     cisTemp = "/tmp"
38
39
40 def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False):
41     """ save to one file per genome the genes that are considred homologs in match files from different species.
42     """
43     theGenomes = matchFiles.keys()
44     outFileNames = {}
45     genes = {}
46     entryDict = {}
47     ignoreList = {}
48     for aGenome in theGenomes:
49         outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
50         genes[aGenome] = []
51         entryDict[aGenome] = {}
52         ignoreList[aGenome] = []
53
54     for aGenome in theGenomes:
55         goodLines = []
56         ignoreLines = []
57         theFile = open(matchFiles[aGenome], "r")
58         for entry in theFile:
59             fields = entry.split("\t")
60             gid = fields[11]
61             if filterCDS and fields[6] == "CDS":
62                 ignoreLines.append(string.join(fields[:4], "-"))
63                 continue
64
65             if fields[6] == "NONE":
66                 continue
67
68             if filterLowerCase and fields[12] != fields[12].upper():
69                 continue
70
71             goodLines.append(entry)
72
73         theFile.close()
74         for entry in goodLines:
75             fields = entry.split()
76             gid = fields[11]
77             geneID = (aGenome, gid)
78             matchID = string.join(fields[:4], "-")
79             if matchID in ignoreLines:
80                 continue
81
82             if geneID not in genes[aGenome]:
83                 genes[aGenome].append(geneID)
84
85             if geneID not in entryDict[aGenome]:
86                 entryDict[aGenome][geneID] = []
87                 entryDict[aGenome][geneID].append(entry)
88
89         print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
90         doCrossMatch(outFileNames, genes, entryDict, ignoreList)
91
92
93 def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False):
94     """ save to one file per genome the genes that are considred homologs in match files from different species.
95     """
96     theGenomes = matchFiles.keys()
97     outFileNames = {}
98     genes = {}
99     entryDict = {}
100     ignoreList = {}
101     for aGenome in theGenomes:
102         outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
103         genes[aGenome] = []
104         entryDict[aGenome] = {}
105         ignoreList[aGenome] = []
106
107     for aGenome in theGenomes:
108         theList = []
109         if not fileList:
110             theList = [matchFiles[aGenome]]
111         else:
112             theList = matchFiles[aGenome]
113
114         for aFile in theList:
115             theFile = open(aFile, "r")
116             for entry in theFile:
117                 fields = entry.split()
118                 gid = fields[gidField]
119                 geneID = (aGenome, gid)
120                 if geneID not in genes[aGenome]:
121                     genes[aGenome].append(geneID)
122
123                 if geneID not in entryDict[aGenome]:
124                     entryDict[aGenome][geneID] = []
125                     entryDict[aGenome][geneID].append(entry)
126
127             theFile.close()
128
129         print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
130         doCrossMatch(outFileNames, genes, entryDict, ignoreList)
131
132
133 def doCrossMatch(outFileNames, genes, entryDict, ignoreList):
134     theGenomes = []
135     outFile = {}
136     for genome in outFileNames:
137         theGenomes.append(genome)
138
139     hdb = homologyDB(cache=True)
140     for aGenome in theGenomes:
141         outFile = open(outFileNames[aGenome], "w")
142         counter = 0
143         for geneID in genes[aGenome]:
144             try:
145                 if geneID in ignoreList[aGenome]:
146                     continue
147
148                 res = hdb.getHomologousGenes(geneID)
149                 newcounter = 0
150                 for entry in res:
151                     if entry[0] in theGenomes:
152                         secondGenome = entry[0]
153
154                     if entry in genes[secondGenome]:
155                         print "%d: %s %s" % (counter, str(geneID), str(entry))
156                         if newcounter < 1:
157                             for rec in entryDict[aGenome][geneID]:
158                                 outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec))
159
160                         for rec in entryDict[secondGenome][entry]:
161                             outFile.write("%d\t%s\t%s" % (counter, str(entry), rec))
162
163                         newcounter  += 1
164
165                     if secondGenome == aGenome:
166                         ignoreList[aGenome].append(entry)
167
168                 if newcounter > 0:
169                     counter += 1
170                     outFile.write("\n")
171             except:
172                 print "Problem around %s" % (str(geneID))
173
174         outFile.close()