X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fexperiments%2Fconservation.py;fp=cistematic%2Fexperiments%2Fconservation.py;h=04ce17cc96a6b80994bf4d84f573ab50c2eaae1e;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/experiments/conservation.py b/cistematic/experiments/conservation.py new file mode 100644 index 0000000..04ce17c --- /dev/null +++ b/cistematic/experiments/conservation.py @@ -0,0 +1,817 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# This class contains the core code for using orthology and sequence-level conservation. +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import string, cistematic.core +from cistematic.core.homology import homologyDB, homologeneGenomes +from cistematic.programs.mafft import Mafft +from cistematic.programs.paircomp import Paircomp +from cistematic.programs.fastcomp import Fastcomp + + +class Conservation: + """ The conservation class holds the conservation specific code for + specifying orthology/paralogy, calling conservation identification code, + and manipulating/storing conserved sequences. It is meant to be used + in conjuction with the Experiment and AnalyzeMotifs classes. + """ + conservationID = "default" + consDBName = "" + startingGenome = "" + targetGenomes = [] + refGenes = [] + + + def setTargetGenomes(self, tGenomes): + """ restricts homologs to the genomes specified in the tGenomes list. + """ + self.targetGenomes = tGenomes + + + def setRefGenes(self, rGenes): + """ sets the list of loci (not geneIDs) that will be compared to homologs. + """ + self.refGenes = rGenes + + + def setStartingGenome(self, sGenome): + """ sets the genome associated with the refGenes. + """ + self.startingGenome = sGenome + + + def importHomology(self, inputFile, source="generic"): + """ loads homology relationships from a tab-delimited file of the form: + homologyGroup orthologyGroup genome locus + where orthologyGroup can be left blank. + """ + stmtList = [] + importFile = open(inputFile, "r") + stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) " + for line in importFile: + (homGroup, orthGroup, genome, locus) = line.split("\t") + stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source)) + + importFile.close() + self.batchsqlcons(stmt, stmtList) + + + def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"): + """ define the geneIDs in geneIDList as being homologous. + """ + stmtList = [] + + if homGroup == "": + for tempID in geneIDList: + tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1])) + if not self.hasHomGroup(tempHomGroup): + homGroup = tempHomGroup + break + + if homGroup == "": + self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList)) + return "" + + stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) " + for geneID in geneIDList: + stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source)) + + self.batchsqlCons(stmt, stmtList) + return homGroup + + + def deleteHomolog(self, geneID, homGroup=""): + """ delete all entries from the homology table for a given geneID and homGroup. + """ + stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1]) + self.sqlcons(stmt, "commit") + + + def deleteOrtholog(self, geneID, orthGroup): + """ delete all entries from the homology table for a given geneID and orthGroup. + """ + stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1]) + self.sqlcons(stmt, "commit") + + + def deleteHomologyGroup(self, homGroup): + """ delete all entries from the homology table matching the given homology group. + """ + stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' " % (self.conservationID, homGroup) + self.sqlcons(stmt, "commit") + + + def deleteOrthologyGroup(self, orthGroup): + """ delete all entries from the homology table matching the given orthology group. + """ + stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup) + self.sqlcons(stmt, "commit") + + + def hasHomGroup(self, testGroup): + """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table. + """ + stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup + res = self.sqlcons(stmt) + if int(res[0]) > 0: + return True + + return False + + + def returnHomologs(self, geneID): + """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in + self.targetGenomes, based on entries in the homology table. This function will try to load entries from + homologene for supported genomes, if necessary. + """ + homologList = [] + usedHomologene = False + stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID + groups = self.sqlcons(stmt) + + if len(groups) < 1 and geneID[0] in homologeneGenomes: + self.loadFromHomologene(geneID) + groups = self.sqlcons(stmt) + usedHomologene = True + + for (recID, hIDentry) in groups: + stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s' " % (str(hIDentry), str(recID)) + genes = self.sqlcons(stmt) + for gene in genes: + strgene = (str(gene[0]), str(gene[1])) + if usedHomologene and strgene[0] not in self.targetGenomes: + continue + + if strgene not in homologList: + homologList.append(strgene) + + return homologList + + + def returnOrthologs(self, geneID): + """ returns a list of genes that are orthologous to a geneID and whose genome are in + self.targetGenomes, based on explicit entries in the homology table. + """ + orthologList = [] + stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID + groups = self.sqlcons(stmt) + + for (recID, oIDentry) in groups: + if oIDentry == "": + continue + stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID)) + genes = self.sqlcons(stmt) + for gene in genes: + strgene = (str(gene[0]), str(gene[1])) + if strgene[0] in self.targetGenomes and strgene not in orthologList: + orthologList.append(strgene) + + return orthologList + + + def areOrthologs(self, geneID1, geneID2): + """ returns True if genes geneID1 and geneID2 are orthologs. + """ + stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1 + groups1 = self.sqlcons(stmt) + + stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2 + groups2 = self.sqlcons(stmt) + + oEntries1 = [] + oEntries2 = [] + for (recID, oIDentry) in groups1: + oEntries1.append(oIDentry) + + for (recID, oIDentry) in groups2: + oEntries2.append(oIDentry) + + for entry in oEntries1: + if entry in oEntries2: + return True + + return False + + + def loadFromHomologene(self, geneID): + """ load the homologous genes to geneID from homologene. + """ + try: + hdb = homologyDB(self.targetGenomes) + hGenes = hdb.getHomologousGenes(geneID) + except: + hdb = homologyDB(self.targetGenomes, cache=True) + hGenes = hdb.getHomologousGenes(geneID) + + hGenes.append(geneID) + homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene") + + + def computeAlignments(self, geneIDList=[]): + """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of + all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form + genome-locus' from the geneIDs in the starting genome. + """ + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + prog = Mafft() + for geneID in geneIDList: + homGroup = str(geneID[0]) + '-' + str(geneID[1]) + hGenes = self.returnHomologs(geneID) + hGenes.append(geneID) + fastaFile = self.createDataFile(geneIDList = hGenes) + prog.inputFile(fastaFile) + prog.run() + alignedDict = prog.getAlignment() + for dictKey in alignedDict: + aGeneID = dictKey.split('-') + self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey]) + + + def insertAlignedSequence(self, datasetID, geneID, seq): + """ save an aligned sequence into alignedSequence. + """ + values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq) + stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values + self.sqlcons(stmt, "commit") + + + def getAlignedSequence(self, geneID, datasetID=""): + """ retrieve an aligned sequence from alignedSequence using datasetID and geneID. + """ + stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1]) + if len(datasetID) > 0: + stmt += "and datasetID = '%s' " % (datasetID) + + res = self.sqlcons(stmt) + + return str(res[0][0]) + + + def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]): + """ map regions of sequences where multiple alignments show at least one other (default) + or all genes (stritct=True) as lining up. + """ + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + if strict: + criteria = "strict" + else: + criteria = "partial" + + for geneID in geneIDList: + homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + hGenes = self.returnHomologs(geneID) + hGenes.append(geneID) + if len(hGenes) < 2: + continue + + maskedSequences = self.maskUsingConservation(hGenes, strict) + for gID in hGenes: + start = 0 + consLength = 0 + for pos in range(len(maskedSequences[gID])): + if maskedSequences[gID][pos] == "N": + if start != 0 and consLength >= minConsLength: + self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria) + + start = 0 + consLength = 0 + continue + + if start == 0: + start = pos + + consLength += 1 + + if start != 0 and consLength >= minConsLength: + self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria) + + + def maskUsingConservation(self, geneIDList, strict=False): + """ mask every gene in geneIDList using conservation amongst themselves, which have + already been computed using computeAlignments() + """ + alignmentDict = {} + maskedDict = {} + + for gID in geneIDList: + seqLen = 0 + try: + alignmentDict[gID] = self.getAlignedSequence(gID) + maskedDict[gID] = "" + seqLen = len(alignmentDict[gID]) + except: + return maskedDict + + if strict: + criteria = len(geneIDList) + else: + criteria = 2 + + for pos in range(seqLen): + posDict = {} + conserved = 0 + for geneID in geneIDList: + posDict[geneID] = alignmentDict[geneID][pos] + if posDict[geneID] != "-" and posDict[geneID] != "N": + conserved += 1 + + if conserved >= criteria: + for geneID in geneIDList: + if posDict[geneID] != "-": + maskedDict[geneID] += posDict[geneID] + else: + for geneID in geneIDList: + if posDict[geneID] != "-": + maskedDict[geneID] += "N" + + return maskedDict + + + def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False): + """ map regions of sequences with seqcomp windows in all genes that have + more than threshold (<= 1) conservation in more than minSequences sequences. + Can optionally use a higher threshold for orothlogs if specified using + orthologyThreshold. + """ + consList = [] + if orthologyThreshold < threshold: + orthologyThreshold = threshold + + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + if useFastcomp: + prog = Fastcomp() + else: + prog = Paircomp() + + prog.setWindowSize(window) + # if we have a window on a sequence, then we have at least one match! + minSeqNum = minSequences - 1 + for geneID in geneIDList: + genePairs = [] + homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + hGenes = self.returnHomologs(geneID) + hGenes.append(geneID) + for first in hGenes: + for second in hGenes: + if first != second and [first, second] not in genePairs and [second, first] not in genePairs: + genePairs.append([first, second]) + + seqcompWindows = {} + print "genepairs = %s" % str(genePairs) + for pair in genePairs: + fastaFile = self.createDataFile(geneIDList = pair) + if self.areOrthologs(pair[0], pair[1]): + prog.setThreshold(orthologyThreshold) + else: + prog.setThreshold(threshold) + + prog.inputFile(fastaFile) + prog.run() + seqcompWindows[(pair[0], pair[1])] = prog.getWindows() + + resultWindows = {} + for gene in hGenes: + resultWindows[gene] = {} + + for pair in seqcompWindows: + (geneID1, geneID2) = pair + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]: + if seq1pos not in resultWindows[geneID1]: + resultWindows[geneID1][seq1pos] = [] + + if seq2pos not in resultWindows[geneID2]: + resultWindows[geneID2][seq2pos] = [] + + resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches)) + resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches)) + + for geneID in hGenes: + for position in resultWindows[geneID]: + otherGeneIDs = [] + for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]: + if geneID2 not in otherGeneIDs: + otherGeneIDs.append(geneID2) + + if len(otherGeneIDs) >= minSeqNum: + criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window) + for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]: + criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches) + + consList.append((homGroup, geneID, position, window, "seqcompCons", criteria)) + + self.insertConservedSequenceBatch(consList) + + + def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False): + """ map regions of sequences with seqcomp windows in all genes that have + more than threshold (<= 1) conservation. Uses transivity. + """ + self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp) + + + def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False): + """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of + sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1) + conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity. + """ + consList = [] + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + if useFastcomp: + prog = Fastcomp() + else: + prog = Paircomp() + + prog.setWindowSize(window) + for geneID in geneIDList: + genePairs = [] + homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + oGenes = self.returnOrthologs(geneID) + # homologs should contain orthologs! + hGenes = self.returnHomologs(geneID) + if len(hGenes) < 1: + continue + + seqcompWindows = {} + genePairs = [[geneID, gene] for gene in hGenes] + for pair in genePairs: + if pair[1] in oGenes: + prog.setThreshold(orthologyThreshold) + else: + prog.setThreshold(paralogyThreshold) + + fastaFile = self.createDataFile(geneIDList = pair) + prog.inputFile(fastaFile) + prog.run() + seqcompWindows[pair[1]] = prog.getWindows() + + resultWindows = {} + print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows)) + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]: + resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)] + + for gene in hGenes[1:]: + newseqPositions = [] + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]: + if seq1pos not in resultWindows: + continue + + newseqPositions.append(seq1pos) + resultWindows[seq1pos].append((gene, seq2pos, sense, matches)) + + for pos in resultWindows.keys(): + if pos not in newseqPositions: + del resultWindows[pos] + + for windowPos in resultWindows.keys(): + windowList = resultWindows[windowPos] + criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window) + for (geneID2, seq2pos, sense, matches) in windowList: + criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches) + + consList.append((homGroup, geneID, windowPos, window, tag, criteria)) + + for windowEntry in windowList: + (gID, seq2pos, sense, matches) = windowEntry + consList.append((homGroup, gID, seq2pos, window, tag, criteria)) + + self.insertConservedSequenceBatch(consList) + + + def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria): + """ insert an entry in conservedSequence. + """ + values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria) + stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values + self.sqlcons(stmt, 'commit') + + + def insertConservedSequenceBatch(self, batchlist): + """ insert a list of entries in conservedSequence. + """ + sqlList = [] + stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)" + for (datasetID, geneID, pos, length, method, criteria) in batchlist: + sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria))) + + self.batchsqlCons(stmt, sqlList) + + + def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""): + """ retrieve a list of conservation windows of the form (start, length) from + conservedSequence using datasetID and geneID, which match a particular method and criteria. + """ + results = [] + stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1]) + if datasetID > 0: + stmt += "and datasetID = '%d' " % (datasetID) + + if len(method) > 0: + stmt += " and method = '%s' " % (method) + + if len(criteria) > 0: + stmt += "and score = '%s' " % (criteria) + + res = self.sqlcons(stmt) + for (location, length, criteria) in res: + results.append((int(location), int(length), str(criteria))) + + return results + + + def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""): + """ returns True if a particular sequence of position and length falls within a conservation + window. + """ + if len(position) == 2: + position = position[0] + stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1]) + if datasetID > 0: + stmt += "and datasetID = '%d' " % (datasetID) + + if len(method) > 0: + stmt += " and method = '%s' " % (method) + + if len(criteria) > 0: + stmt += "and score = '%s' " % (criteria) + + res = self.sqlcons(stmt) + for (location, wlen) in res: + if int(location) <= position and (int(location) + int(wlen)) >= (position + length): + return True + + return False + + + def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21): + """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns + a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger. + """ + maskedSeqDict = {} + if datasetID < 0: + datasetID = self.genepoolID + + settingsList = self.getSettingsID(datasetID) + geneIDList = eval(settingsList[1]) + for geneID in geneIDList: + theseq = self.genepool[geneID] + seqlen = len(theseq) + maskedseq = ["N"] * seqlen + conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria) + for (start, length, crit) in conservedWindows: + if start < 0: + start = 0 + + if start + length >= seqlen: + length = seqlen - start - 1 + + for index in range(start, start + length): + maskedseq[index] = theseq[index] + + tempseq = [] + numN = 0 + for letter in maskedseq: + if letter.upper() == "N": + numN += 1 + else: + numN = 0 + + if numN > stripNLonger: + continue + + tempseq.append(letter) + + maskedSeqDict[geneID] = string.join(tempseq, "") + + return maskedSeqDict + + + def conservationStat(self, datasetID=-1, method="", criteria=""): + """ report conservation level in each of the genes in the dataset. + """ + nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"] + totalcons = 0 + totalsize = 0 + consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria) + for geneID in consGeneDict: + ntstat = 0 + for NT in nucleotides: + ntstat += consGeneDict[geneID].count(NT) + + blocks = (len(consGeneDict[geneID]) - ntstat) / 21 + origSize = len(self.genepool[geneID]) + consPercentage = 100. * ntstat / float(origSize) + print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage) + totalcons += ntstat + totalsize += origSize + + consPercentage = 100. * totalcons / float(totalsize) + print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage) + + + def motifConservationStat(self, motifList=[]): + """ check how many motifs instances for the motifs in the mapped motifs + (or only in motifList) are conserved. + """ + if len(motifList) == 0: + motifList = self.geneToMotifKeys(thekey="motif") + + for motID in motifList: + index = 0 + matches = self.motifToGene(motID) + for (loc, pos) in matches: + mot = self.findMotif(motID) + if self.isConserved(loc, pos, len(mot)): + index += 1 + + print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches))) + + + def checkForConservedSequence(self, datasetID=-1, method="", criteria=""): + """ checks that at least two or more sequences in the dataset have conservation. + """ + someHaveConservation = False + numCons = 0 + checkDict = self.maskNonConservedSequence(datasetID, method, criteria) + for entry in checkDict: + theseq = checkDict[entry].upper() + if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq: + numCons += 1 + + if numCons > 1: + someHaveConservation = True + + return someHaveConservation + + + def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"): + """ exports conserved sequences to a fasta file. + """ + (up, cds, down) = self.getSeqParameters() + consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000) + if len(genomes) > 0: + consDictEntries = consDict.keys() + for entry in consDictEntries: + if entry[0] not in genomes: + del consDict[entry] + + outfilename = "%s/%s.fsa" % (directory, prefix) + consOutfile = open(outfilename, "w") + for entry in consDict: + entryCoordinates = cistematic.core.geneEntry(entry) + entrySense = entryCoordinates[4] + theseq = consDict[entry].upper() + seqlen = len(theseq) + (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates) + if entrySense == "R": + theseq = cistematic.core.complement(theseq) + + conservedBlockStart = -1 + for pos in range(seqlen): + if theseq[pos] == "N": + if conservedBlockStart >= 0: + consStart = start + conservedBlockStart + consSeq = theseq[conservedBlockStart:pos] + consStop = consStart + len(consSeq) - 1 + consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq)) + conservedBlockStart = -1 + + continue + + if conservedBlockStart < 0: + conservedBlockStart = pos + + if conservedBlockStart >= 0: + consStart = start + conservedBlockStart + consSeq = theseq[conservedBlockStart:pos] + consStop = consStart + len(consSeq) - 1 + consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq)) + + consOutfile.close() + + + def createConservation(self, conservationID="default", dbName=""): + """ creates the conservation SQL tables. Should only be run once per database file. + """ + stmtList = [] + self.loadConservation(conservationID, dbName) + try: + stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)") + stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)") + stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)") + stmtList.append("CREATE index cons1 on homology(conservationID)") + stmtList.append("CREATE index cons2 on alignedSequence(conservationID)") + stmtList.append("CREATE index cons3 on conservedSequence(conservationID)") + stmtList.append("CREATE index hom1 on homology(homologyGroup)") + stmtList.append("CREATE index orth1 on homology(orthologyGroup)") + stmtList.append("CREATE index homlocus1 on homology(genome, locus)") + stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)") + stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)") + for stmt in stmtList: + self.sqlcons(stmt, commit=True) + + self.mlog("Created conservation tables in database %s" % dbName) + except: + self.mlog("Could not create conservation tables in database %s" % dbName) + self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?") + + + def loadConservation(self, conservationID="default", dbName=""): + """ changes the conservationID to use the specified one, or use default. Must be used before reading or + writing any data to analysis tables. + """ + self.conservationID = conservationID + if len (dbName) > 0: + self.consDBName = dbName + else: + self.consDBName = self.expFile + + + def sqlcons(self, stmt, commit=""): + """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list. + """ + res = [] + db = sqlite.connect(self.consDBName) + sqlc = db.cursor() + try: + if self.debugSQL: + print "Conservation->sqlcons: %s" % stmt + + sqlc.execute(stmt) + res = sqlc.fetchall() + try: + if commit != "": + db.commit() + except: + self.mlog("Conservation->sqlcons (commit exception)") + except: + self.mlog("Conservation->sqlcons (statement exception): %s" % stmt) + + sqlc.close() + db.close() + + return res + + + def batchsqlCons(self, stmt, batch): + """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit. + """ + res = [] + db = sqlite.connect(self.consDBName) + sqlc = db.cursor() + try: + if self.debugSQL: + print "batchsqlCons: %s" % stmt + print "batchsqlCons: %s" % (str(batch)) + + sqlc.executemany(stmt, batch) + try: + db.commit() + except: + self.mlog("Conservation->batchsqlCons (commit exception)") + except: + self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt) + + sqlc.close() + db.close() + + return res \ No newline at end of file