+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# This class contains the core code for using orthology and sequence-level conservation.
+try:
+ from pysqlite2 import dbapi2 as sqlite
+except:
+ from sqlite3 import dbapi2 as sqlite
+
+import string, cistematic.core
+from cistematic.core.homology import homologyDB, homologeneGenomes
+from cistematic.programs.mafft import Mafft
+from cistematic.programs.paircomp import Paircomp
+from cistematic.programs.fastcomp import Fastcomp
+
+
+class Conservation:
+ """ The conservation class holds the conservation specific code for
+ specifying orthology/paralogy, calling conservation identification code,
+ and manipulating/storing conserved sequences. It is meant to be used
+ in conjuction with the Experiment and AnalyzeMotifs classes.
+ """
+ conservationID = "default"
+ consDBName = ""
+ startingGenome = ""
+ targetGenomes = []
+ refGenes = []
+
+
+ def setTargetGenomes(self, tGenomes):
+ """ restricts homologs to the genomes specified in the tGenomes list.
+ """
+ self.targetGenomes = tGenomes
+
+
+ def setRefGenes(self, rGenes):
+ """ sets the list of loci (not geneIDs) that will be compared to homologs.
+ """
+ self.refGenes = rGenes
+
+
+ def setStartingGenome(self, sGenome):
+ """ sets the genome associated with the refGenes.
+ """
+ self.startingGenome = sGenome
+
+
+ def importHomology(self, inputFile, source="generic"):
+ """ loads homology relationships from a tab-delimited file of the form:
+ homologyGroup orthologyGroup genome locus
+ where orthologyGroup can be left blank.
+ """
+ stmtList = []
+ importFile = open(inputFile, "r")
+ stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
+ for line in importFile:
+ (homGroup, orthGroup, genome, locus) = line.split("\t")
+ stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))
+
+ importFile.close()
+ self.batchsqlcons(stmt, stmtList)
+
+
+ def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
+ """ define the geneIDs in geneIDList as being homologous.
+ """
+ stmtList = []
+
+ if homGroup == "":
+ for tempID in geneIDList:
+ tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
+ if not self.hasHomGroup(tempHomGroup):
+ homGroup = tempHomGroup
+ break
+
+ if homGroup == "":
+ self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
+ return ""
+
+ stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
+ for geneID in geneIDList:
+ stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))
+
+ self.batchsqlCons(stmt, stmtList)
+ return homGroup
+
+
+ def deleteHomolog(self, geneID, homGroup=""):
+ """ delete all entries from the homology table for a given geneID and homGroup.
+ """
+ stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
+ self.sqlcons(stmt, "commit")
+
+
+ def deleteOrtholog(self, geneID, orthGroup):
+ """ delete all entries from the homology table for a given geneID and orthGroup.
+ """
+ stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
+ self.sqlcons(stmt, "commit")
+
+
+ def deleteHomologyGroup(self, homGroup):
+ """ delete all entries from the homology table matching the given homology group.
+ """
+ stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' " % (self.conservationID, homGroup)
+ self.sqlcons(stmt, "commit")
+
+
+ def deleteOrthologyGroup(self, orthGroup):
+ """ delete all entries from the homology table matching the given orthology group.
+ """
+ stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
+ self.sqlcons(stmt, "commit")
+
+
+ def hasHomGroup(self, testGroup):
+ """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table.
+ """
+ stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
+ res = self.sqlcons(stmt)
+ if int(res[0]) > 0:
+ return True
+
+ return False
+
+
+ def returnHomologs(self, geneID):
+ """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in
+ self.targetGenomes, based on entries in the homology table. This function will try to load entries from
+ homologene for supported genomes, if necessary.
+ """
+ homologList = []
+ usedHomologene = False
+ stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
+ groups = self.sqlcons(stmt)
+
+ if len(groups) < 1 and geneID[0] in homologeneGenomes:
+ self.loadFromHomologene(geneID)
+ groups = self.sqlcons(stmt)
+ usedHomologene = True
+
+ for (recID, hIDentry) in groups:
+ stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s' " % (str(hIDentry), str(recID))
+ genes = self.sqlcons(stmt)
+ for gene in genes:
+ strgene = (str(gene[0]), str(gene[1]))
+ if usedHomologene and strgene[0] not in self.targetGenomes:
+ continue
+
+ if strgene not in homologList:
+ homologList.append(strgene)
+
+ return homologList
+
+
+ def returnOrthologs(self, geneID):
+ """ returns a list of genes that are orthologous to a geneID and whose genome are in
+ self.targetGenomes, based on explicit entries in the homology table.
+ """
+ orthologList = []
+ stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
+ groups = self.sqlcons(stmt)
+
+ for (recID, oIDentry) in groups:
+ if oIDentry == "":
+ continue
+ stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID))
+ genes = self.sqlcons(stmt)
+ for gene in genes:
+ strgene = (str(gene[0]), str(gene[1]))
+ if strgene[0] in self.targetGenomes and strgene not in orthologList:
+ orthologList.append(strgene)
+
+ return orthologList
+
+
+ def areOrthologs(self, geneID1, geneID2):
+ """ returns True if genes geneID1 and geneID2 are orthologs.
+ """
+ stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1
+ groups1 = self.sqlcons(stmt)
+
+ stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2
+ groups2 = self.sqlcons(stmt)
+
+ oEntries1 = []
+ oEntries2 = []
+ for (recID, oIDentry) in groups1:
+ oEntries1.append(oIDentry)
+
+ for (recID, oIDentry) in groups2:
+ oEntries2.append(oIDentry)
+
+ for entry in oEntries1:
+ if entry in oEntries2:
+ return True
+
+ return False
+
+
+ def loadFromHomologene(self, geneID):
+ """ load the homologous genes to geneID from homologene.
+ """
+ try:
+ hdb = homologyDB(self.targetGenomes)
+ hGenes = hdb.getHomologousGenes(geneID)
+ except:
+ hdb = homologyDB(self.targetGenomes, cache=True)
+ hGenes = hdb.getHomologousGenes(geneID)
+
+ hGenes.append(geneID)
+ homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
+ self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene")
+
+
+ def computeAlignments(self, geneIDList=[]):
+ """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of
+ all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form
+ genome-locus' from the geneIDs in the starting genome.
+ """
+ if len(geneIDList) < 1:
+ geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+ prog = Mafft()
+ for geneID in geneIDList:
+ homGroup = str(geneID[0]) + '-' + str(geneID[1])
+ hGenes = self.returnHomologs(geneID)
+ hGenes.append(geneID)
+ fastaFile = self.createDataFile(geneIDList = hGenes)
+ prog.inputFile(fastaFile)
+ prog.run()
+ alignedDict = prog.getAlignment()
+ for dictKey in alignedDict:
+ aGeneID = dictKey.split('-')
+ self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey])
+
+
+ def insertAlignedSequence(self, datasetID, geneID, seq):
+ """ save an aligned sequence into alignedSequence.
+ """
+ values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq)
+ stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values
+ self.sqlcons(stmt, "commit")
+
+
+ def getAlignedSequence(self, geneID, datasetID=""):
+ """ retrieve an aligned sequence from alignedSequence using datasetID and geneID.
+ """
+ stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
+ if len(datasetID) > 0:
+ stmt += "and datasetID = '%s' " % (datasetID)
+
+ res = self.sqlcons(stmt)
+
+ return str(res[0][0])
+
+
+ def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
+ """ map regions of sequences where multiple alignments show at least one other (default)
+ or all genes (stritct=True) as lining up.
+ """
+ if len(geneIDList) < 1:
+ geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+ if strict:
+ criteria = "strict"
+ else:
+ criteria = "partial"
+
+ for geneID in geneIDList:
+ homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
+ hGenes = self.returnHomologs(geneID)
+ hGenes.append(geneID)
+ if len(hGenes) < 2:
+ continue
+
+ maskedSequences = self.maskUsingConservation(hGenes, strict)
+ for gID in hGenes:
+ start = 0
+ consLength = 0
+ for pos in range(len(maskedSequences[gID])):
+ if maskedSequences[gID][pos] == "N":
+ if start != 0 and consLength >= minConsLength:
+ self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
+
+ start = 0
+ consLength = 0
+ continue
+
+ if start == 0:
+ start = pos
+
+ consLength += 1
+
+ if start != 0 and consLength >= minConsLength:
+ self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
+
+
+ def maskUsingConservation(self, geneIDList, strict=False):
+ """ mask every gene in geneIDList using conservation amongst themselves, which have
+ already been computed using computeAlignments()
+ """
+ alignmentDict = {}
+ maskedDict = {}
+
+ for gID in geneIDList:
+ seqLen = 0
+ try:
+ alignmentDict[gID] = self.getAlignedSequence(gID)
+ maskedDict[gID] = ""
+ seqLen = len(alignmentDict[gID])
+ except:
+ return maskedDict
+
+ if strict:
+ criteria = len(geneIDList)
+ else:
+ criteria = 2
+
+ for pos in range(seqLen):
+ posDict = {}
+ conserved = 0
+ for geneID in geneIDList:
+ posDict[geneID] = alignmentDict[geneID][pos]
+ if posDict[geneID] != "-" and posDict[geneID] != "N":
+ conserved += 1
+
+ if conserved >= criteria:
+ for geneID in geneIDList:
+ if posDict[geneID] != "-":
+ maskedDict[geneID] += posDict[geneID]
+ else:
+ for geneID in geneIDList:
+ if posDict[geneID] != "-":
+ maskedDict[geneID] += "N"
+
+ return maskedDict
+
+
+ def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False):
+ """ map regions of sequences with seqcomp windows in all genes that have
+ more than threshold (<= 1) conservation in more than minSequences sequences.
+ Can optionally use a higher threshold for orothlogs if specified using
+ orthologyThreshold.
+ """
+ consList = []
+ if orthologyThreshold < threshold:
+ orthologyThreshold = threshold
+
+ if len(geneIDList) < 1:
+ geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+ if useFastcomp:
+ prog = Fastcomp()
+ else:
+ prog = Paircomp()
+
+ prog.setWindowSize(window)
+ # if we have a window on a sequence, then we have at least one match!
+ minSeqNum = minSequences - 1
+ for geneID in geneIDList:
+ genePairs = []
+ homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
+ hGenes = self.returnHomologs(geneID)
+ hGenes.append(geneID)
+ for first in hGenes:
+ for second in hGenes:
+ if first != second and [first, second] not in genePairs and [second, first] not in genePairs:
+ genePairs.append([first, second])
+
+ seqcompWindows = {}
+ print "genepairs = %s" % str(genePairs)
+ for pair in genePairs:
+ fastaFile = self.createDataFile(geneIDList = pair)
+ if self.areOrthologs(pair[0], pair[1]):
+ prog.setThreshold(orthologyThreshold)
+ else:
+ prog.setThreshold(threshold)
+
+ prog.inputFile(fastaFile)
+ prog.run()
+ seqcompWindows[(pair[0], pair[1])] = prog.getWindows()
+
+ resultWindows = {}
+ for gene in hGenes:
+ resultWindows[gene] = {}
+
+ for pair in seqcompWindows:
+ (geneID1, geneID2) = pair
+ for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]:
+ if seq1pos not in resultWindows[geneID1]:
+ resultWindows[geneID1][seq1pos] = []
+
+ if seq2pos not in resultWindows[geneID2]:
+ resultWindows[geneID2][seq2pos] = []
+
+ resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches))
+ resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches))
+
+ for geneID in hGenes:
+ for position in resultWindows[geneID]:
+ otherGeneIDs = []
+ for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
+ if geneID2 not in otherGeneIDs:
+ otherGeneIDs.append(geneID2)
+
+ if len(otherGeneIDs) >= minSeqNum:
+ criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window)
+ for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
+ criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
+
+ consList.append((homGroup, geneID, position, window, "seqcompCons", criteria))
+
+ self.insertConservedSequenceBatch(consList)
+
+
+ def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False):
+ """ map regions of sequences with seqcomp windows in all genes that have
+ more than threshold (<= 1) conservation. Uses transivity.
+ """
+ self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp)
+
+
+ def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False):
+ """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of
+ sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1)
+ conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity.
+ """
+ consList = []
+ if len(geneIDList) < 1:
+ geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+ if useFastcomp:
+ prog = Fastcomp()
+ else:
+ prog = Paircomp()
+
+ prog.setWindowSize(window)
+ for geneID in geneIDList:
+ genePairs = []
+ homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
+ oGenes = self.returnOrthologs(geneID)
+ # homologs should contain orthologs!
+ hGenes = self.returnHomologs(geneID)
+ if len(hGenes) < 1:
+ continue
+
+ seqcompWindows = {}
+ genePairs = [[geneID, gene] for gene in hGenes]
+ for pair in genePairs:
+ if pair[1] in oGenes:
+ prog.setThreshold(orthologyThreshold)
+ else:
+ prog.setThreshold(paralogyThreshold)
+
+ fastaFile = self.createDataFile(geneIDList = pair)
+ prog.inputFile(fastaFile)
+ prog.run()
+ seqcompWindows[pair[1]] = prog.getWindows()
+
+ resultWindows = {}
+ print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows))
+ for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]:
+ resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)]
+
+ for gene in hGenes[1:]:
+ newseqPositions = []
+ for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]:
+ if seq1pos not in resultWindows:
+ continue
+
+ newseqPositions.append(seq1pos)
+ resultWindows[seq1pos].append((gene, seq2pos, sense, matches))
+
+ for pos in resultWindows.keys():
+ if pos not in newseqPositions:
+ del resultWindows[pos]
+
+ for windowPos in resultWindows.keys():
+ windowList = resultWindows[windowPos]
+ criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window)
+ for (geneID2, seq2pos, sense, matches) in windowList:
+ criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
+
+ consList.append((homGroup, geneID, windowPos, window, tag, criteria))
+
+ for windowEntry in windowList:
+ (gID, seq2pos, sense, matches) = windowEntry
+ consList.append((homGroup, gID, seq2pos, window, tag, criteria))
+
+ self.insertConservedSequenceBatch(consList)
+
+
+ def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria):
+ """ insert an entry in conservedSequence.
+ """
+ values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria)
+ stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values
+ self.sqlcons(stmt, 'commit')
+
+
+ def insertConservedSequenceBatch(self, batchlist):
+ """ insert a list of entries in conservedSequence.
+ """
+ sqlList = []
+ stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)"
+ for (datasetID, geneID, pos, length, method, criteria) in batchlist:
+ sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria)))
+
+ self.batchsqlCons(stmt, sqlList)
+
+
+ def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
+ """ retrieve a list of conservation windows of the form (start, length) from
+ conservedSequence using datasetID and geneID, which match a particular method and criteria.
+ """
+ results = []
+ stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
+ if datasetID > 0:
+ stmt += "and datasetID = '%d' " % (datasetID)
+
+ if len(method) > 0:
+ stmt += " and method = '%s' " % (method)
+
+ if len(criteria) > 0:
+ stmt += "and score = '%s' " % (criteria)
+
+ res = self.sqlcons(stmt)
+ for (location, length, criteria) in res:
+ results.append((int(location), int(length), str(criteria)))
+
+ return results
+
+
+ def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""):
+ """ returns True if a particular sequence of position and length falls within a conservation
+ window.
+ """
+ if len(position) == 2:
+ position = position[0]
+ stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
+ if datasetID > 0:
+ stmt += "and datasetID = '%d' " % (datasetID)
+
+ if len(method) > 0:
+ stmt += " and method = '%s' " % (method)
+
+ if len(criteria) > 0:
+ stmt += "and score = '%s' " % (criteria)
+
+ res = self.sqlcons(stmt)
+ for (location, wlen) in res:
+ if int(location) <= position and (int(location) + int(wlen)) >= (position + length):
+ return True
+
+ return False
+
+
+ def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
+ """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns
+ a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.
+ """
+ maskedSeqDict = {}
+ if datasetID < 0:
+ datasetID = self.genepoolID
+
+ settingsList = self.getSettingsID(datasetID)
+ geneIDList = eval(settingsList[1])
+ for geneID in geneIDList:
+ theseq = self.genepool[geneID]
+ seqlen = len(theseq)
+ maskedseq = ["N"] * seqlen
+ conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
+ for (start, length, crit) in conservedWindows:
+ if start < 0:
+ start = 0
+
+ if start + length >= seqlen:
+ length = seqlen - start - 1
+
+ for index in range(start, start + length):
+ maskedseq[index] = theseq[index]
+
+ tempseq = []
+ numN = 0
+ for letter in maskedseq:
+ if letter.upper() == "N":
+ numN += 1
+ else:
+ numN = 0
+
+ if numN > stripNLonger:
+ continue
+
+ tempseq.append(letter)
+
+ maskedSeqDict[geneID] = string.join(tempseq, "")
+
+ return maskedSeqDict
+
+
+ def conservationStat(self, datasetID=-1, method="", criteria=""):
+ """ report conservation level in each of the genes in the dataset.
+ """
+ nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"]
+ totalcons = 0
+ totalsize = 0
+ consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria)
+ for geneID in consGeneDict:
+ ntstat = 0
+ for NT in nucleotides:
+ ntstat += consGeneDict[geneID].count(NT)
+
+ blocks = (len(consGeneDict[geneID]) - ntstat) / 21
+ origSize = len(self.genepool[geneID])
+ consPercentage = 100. * ntstat / float(origSize)
+ print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage)
+ totalcons += ntstat
+ totalsize += origSize
+
+ consPercentage = 100. * totalcons / float(totalsize)
+ print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage)
+
+
+ def motifConservationStat(self, motifList=[]):
+ """ check how many motifs instances for the motifs in the mapped motifs
+ (or only in motifList) are conserved.
+ """
+ if len(motifList) == 0:
+ motifList = self.geneToMotifKeys(thekey="motif")
+
+ for motID in motifList:
+ index = 0
+ matches = self.motifToGene(motID)
+ for (loc, pos) in matches:
+ mot = self.findMotif(motID)
+ if self.isConserved(loc, pos, len(mot)):
+ index += 1
+
+ print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches)))
+
+
+ def checkForConservedSequence(self, datasetID=-1, method="", criteria=""):
+ """ checks that at least two or more sequences in the dataset have conservation.
+ """
+ someHaveConservation = False
+ numCons = 0
+ checkDict = self.maskNonConservedSequence(datasetID, method, criteria)
+ for entry in checkDict:
+ theseq = checkDict[entry].upper()
+ if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq:
+ numCons += 1
+
+ if numCons > 1:
+ someHaveConservation = True
+
+ return someHaveConservation
+
+
+ def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
+ """ exports conserved sequences to a fasta file.
+ """
+ (up, cds, down) = self.getSeqParameters()
+ consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
+ if len(genomes) > 0:
+ consDictEntries = consDict.keys()
+ for entry in consDictEntries:
+ if entry[0] not in genomes:
+ del consDict[entry]
+
+ outfilename = "%s/%s.fsa" % (directory, prefix)
+ consOutfile = open(outfilename, "w")
+ for entry in consDict:
+ entryCoordinates = cistematic.core.geneEntry(entry)
+ entrySense = entryCoordinates[4]
+ theseq = consDict[entry].upper()
+ seqlen = len(theseq)
+ (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
+ if entrySense == "R":
+ theseq = cistematic.core.complement(theseq)
+
+ conservedBlockStart = -1
+ for pos in range(seqlen):
+ if theseq[pos] == "N":
+ if conservedBlockStart >= 0:
+ consStart = start + conservedBlockStart
+ consSeq = theseq[conservedBlockStart:pos]
+ consStop = consStart + len(consSeq) - 1
+ consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
+ conservedBlockStart = -1
+
+ continue
+
+ if conservedBlockStart < 0:
+ conservedBlockStart = pos
+
+ if conservedBlockStart >= 0:
+ consStart = start + conservedBlockStart
+ consSeq = theseq[conservedBlockStart:pos]
+ consStop = consStart + len(consSeq) - 1
+ consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
+
+ consOutfile.close()
+
+
+ def createConservation(self, conservationID="default", dbName=""):
+ """ creates the conservation SQL tables. Should only be run once per database file.
+ """
+ stmtList = []
+ self.loadConservation(conservationID, dbName)
+ try:
+ stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)")
+ stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)")
+ stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)")
+ stmtList.append("CREATE index cons1 on homology(conservationID)")
+ stmtList.append("CREATE index cons2 on alignedSequence(conservationID)")
+ stmtList.append("CREATE index cons3 on conservedSequence(conservationID)")
+ stmtList.append("CREATE index hom1 on homology(homologyGroup)")
+ stmtList.append("CREATE index orth1 on homology(orthologyGroup)")
+ stmtList.append("CREATE index homlocus1 on homology(genome, locus)")
+ stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)")
+ stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)")
+ for stmt in stmtList:
+ self.sqlcons(stmt, commit=True)
+
+ self.mlog("Created conservation tables in database %s" % dbName)
+ except:
+ self.mlog("Could not create conservation tables in database %s" % dbName)
+ self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")
+
+
+ def loadConservation(self, conservationID="default", dbName=""):
+ """ changes the conservationID to use the specified one, or use default. Must be used before reading or
+ writing any data to analysis tables.
+ """
+ self.conservationID = conservationID
+ if len (dbName) > 0:
+ self.consDBName = dbName
+ else:
+ self.consDBName = self.expFile
+
+
+ def sqlcons(self, stmt, commit=""):
+ """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
+ """
+ res = []
+ db = sqlite.connect(self.consDBName)
+ sqlc = db.cursor()
+ try:
+ if self.debugSQL:
+ print "Conservation->sqlcons: %s" % stmt
+
+ sqlc.execute(stmt)
+ res = sqlc.fetchall()
+ try:
+ if commit != "":
+ db.commit()
+ except:
+ self.mlog("Conservation->sqlcons (commit exception)")
+ except:
+ self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)
+
+ sqlc.close()
+ db.close()
+
+ return res
+
+
+ def batchsqlCons(self, stmt, batch):
+ """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
+ """
+ res = []
+ db = sqlite.connect(self.consDBName)
+ sqlc = db.cursor()
+ try:
+ if self.debugSQL:
+ print "batchsqlCons: %s" % stmt
+ print "batchsqlCons: %s" % (str(batch))
+
+ sqlc.executemany(stmt, batch)
+ try:
+ db.commit()
+ except:
+ self.mlog("Conservation->batchsqlCons (commit exception)")
+ except:
+ self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)
+
+ sqlc.close()
+ db.close()
+
+ return res
\ No newline at end of file