1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
31 from cistematic.core.homology import homologyDB
32 from os import environ
34 if environ.get("CISTEMATIC_TEMP"):
35 cisTemp = environ.get("CISTEMATIC_TEMP")
40 def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False):
41 """ save to one file per genome the genes that are considred homologs in match files from different species.
43 theGenomes = matchFiles.keys()
48 for aGenome in theGenomes:
49 outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
51 entryDict[aGenome] = {}
52 ignoreList[aGenome] = []
54 for aGenome in theGenomes:
57 theFile = open(matchFiles[aGenome], "r")
59 fields = entry.split("\t")
61 if filterCDS and fields[6] == "CDS":
62 ignoreLines.append(string.join(fields[:4], "-"))
65 if fields[6] == "NONE":
68 if filterLowerCase and fields[12] != fields[12].upper():
71 goodLines.append(entry)
74 for entry in goodLines:
75 fields = entry.split()
77 geneID = (aGenome, gid)
78 matchID = string.join(fields[:4], "-")
79 if matchID in ignoreLines:
82 if geneID not in genes[aGenome]:
83 genes[aGenome].append(geneID)
85 if geneID not in entryDict[aGenome]:
86 entryDict[aGenome][geneID] = []
87 entryDict[aGenome][geneID].append(entry)
89 print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
90 doCrossMatch(outFileNames, genes, entryDict, ignoreList)
93 def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False):
94 """ save to one file per genome the genes that are considred homologs in match files from different species.
96 theGenomes = matchFiles.keys()
101 for aGenome in theGenomes:
102 outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome)
104 entryDict[aGenome] = {}
105 ignoreList[aGenome] = []
107 for aGenome in theGenomes:
110 theList = [matchFiles[aGenome]]
112 theList = matchFiles[aGenome]
114 for aFile in theList:
115 theFile = open(aFile, "r")
116 for entry in theFile:
117 fields = entry.split()
118 gid = fields[gidField]
119 geneID = (aGenome, gid)
120 if geneID not in genes[aGenome]:
121 genes[aGenome].append(geneID)
123 if geneID not in entryDict[aGenome]:
124 entryDict[aGenome][geneID] = []
125 entryDict[aGenome][geneID].append(entry)
129 print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome)
130 doCrossMatch(outFileNames, genes, entryDict, ignoreList)
133 def doCrossMatch(outFileNames, genes, entryDict, ignoreList):
136 for genome in outFileNames:
137 theGenomes.append(genome)
139 hdb = homologyDB(cache=True)
140 for aGenome in theGenomes:
141 outFile = open(outFileNames[aGenome], "w")
143 for geneID in genes[aGenome]:
145 if geneID in ignoreList[aGenome]:
148 res = hdb.getHomologousGenes(geneID)
151 if entry[0] in theGenomes:
152 secondGenome = entry[0]
154 if entry in genes[secondGenome]:
155 print "%d: %s %s" % (counter, str(geneID), str(entry))
157 for rec in entryDict[aGenome][geneID]:
158 outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec))
160 for rec in entryDict[secondGenome][entry]:
161 outFile.write("%d\t%s\t%s" % (counter, str(entry), rec))
165 if secondGenome == aGenome:
166 ignoreList[aGenome].append(entry)
172 print "Problem around %s" % (str(geneID))