X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fcore%2Forthomatcher.py;fp=cistematic%2Fcore%2Forthomatcher.py;h=36d40da8be6b4a94e0c3d806a55ad3664f5fe8d0;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/core/orthomatcher.py b/cistematic/core/orthomatcher.py new file mode 100644 index 0000000..36d40da --- /dev/null +++ b/cistematic/core/orthomatcher.py @@ -0,0 +1,174 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +import string +from cistematic.core.homology import homologyDB +from os import environ + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + + +def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False): + """ save to one file per genome the genes that are considred homologs in match files from different species. + """ + theGenomes = matchFiles.keys() + outFileNames = {} + genes = {} + entryDict = {} + ignoreList = {} + for aGenome in theGenomes: + outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome) + genes[aGenome] = [] + entryDict[aGenome] = {} + ignoreList[aGenome] = [] + + for aGenome in theGenomes: + goodLines = [] + ignoreLines = [] + theFile = open(matchFiles[aGenome], "r") + for entry in theFile: + fields = entry.split("\t") + gid = fields[11] + if filterCDS and fields[6] == "CDS": + ignoreLines.append(string.join(fields[:4], "-")) + continue + + if fields[6] == "NONE": + continue + + if filterLowerCase and fields[12] != fields[12].upper(): + continue + + goodLines.append(entry) + + theFile.close() + for entry in goodLines: + fields = entry.split() + gid = fields[11] + geneID = (aGenome, gid) + matchID = string.join(fields[:4], "-") + if matchID in ignoreLines: + continue + + if geneID not in genes[aGenome]: + genes[aGenome].append(geneID) + + if geneID not in entryDict[aGenome]: + entryDict[aGenome][geneID] = [] + entryDict[aGenome][geneID].append(entry) + + print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome) + doCrossMatch(outFileNames, genes, entryDict, ignoreList) + + +def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False): + """ save to one file per genome the genes that are considred homologs in match files from different species. + """ + theGenomes = matchFiles.keys() + outFileNames = {} + genes = {} + entryDict = {} + ignoreList = {} + for aGenome in theGenomes: + outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome) + genes[aGenome] = [] + entryDict[aGenome] = {} + ignoreList[aGenome] = [] + + for aGenome in theGenomes: + theList = [] + if not fileList: + theList = [matchFiles[aGenome]] + else: + theList = matchFiles[aGenome] + + for aFile in theList: + theFile = open(aFile, "r") + for entry in theFile: + fields = entry.split() + gid = fields[gidField] + geneID = (aGenome, gid) + if geneID not in genes[aGenome]: + genes[aGenome].append(geneID) + + if geneID not in entryDict[aGenome]: + entryDict[aGenome][geneID] = [] + entryDict[aGenome][geneID].append(entry) + + theFile.close() + + print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome) + doCrossMatch(outFileNames, genes, entryDict, ignoreList) + + +def doCrossMatch(outFileNames, genes, entryDict, ignoreList): + theGenomes = [] + outFile = {} + for genome in outFileNames: + theGenomes.append(genome) + + hdb = homologyDB(cache=True) + for aGenome in theGenomes: + outFile = open(outFileNames[aGenome], "w") + counter = 0 + for geneID in genes[aGenome]: + try: + if geneID in ignoreList[aGenome]: + continue + + res = hdb.getHomologousGenes(geneID) + newcounter = 0 + for entry in res: + if entry[0] in theGenomes: + secondGenome = entry[0] + + if entry in genes[secondGenome]: + print "%d: %s %s" % (counter, str(geneID), str(entry)) + if newcounter < 1: + for rec in entryDict[aGenome][geneID]: + outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec)) + + for rec in entryDict[secondGenome][entry]: + outFile.write("%d\t%s\t%s" % (counter, str(entry), rec)) + + newcounter += 1 + + if secondGenome == aGenome: + ignoreList[aGenome].append(entry) + + if newcounter > 0: + counter += 1 + outFile.write("\n") + except: + print "Problem around %s" % (str(geneID)) + + outFile.close() \ No newline at end of file