erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / experiments / conservation.py
diff --git a/cistematic/experiments/conservation.py b/cistematic/experiments/conservation.py
new file mode 100644 (file)
index 0000000..04ce17c
--- /dev/null
@@ -0,0 +1,817 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+# This class contains the core code for using orthology and sequence-level conservation.
+try:
+    from pysqlite2 import dbapi2 as sqlite
+except:
+    from sqlite3 import dbapi2 as sqlite
+
+import string, cistematic.core
+from cistematic.core.homology import homologyDB, homologeneGenomes
+from cistematic.programs.mafft import Mafft
+from cistematic.programs.paircomp import Paircomp
+from cistematic.programs.fastcomp import Fastcomp
+
+
+class Conservation:
+    """ The conservation class holds the conservation specific code for
+        specifying orthology/paralogy, calling conservation identification code, 
+        and manipulating/storing conserved sequences. It is meant to be used 
+        in conjuction with the Experiment and AnalyzeMotifs classes.
+    """
+    conservationID = "default"
+    consDBName = ""
+    startingGenome = ""
+    targetGenomes = []
+    refGenes = []
+
+
+    def setTargetGenomes(self, tGenomes):
+        """ restricts homologs to the genomes specified in the tGenomes list.
+        """
+        self.targetGenomes = tGenomes
+
+
+    def setRefGenes(self, rGenes):
+        """ sets the list of loci (not geneIDs) that will be compared to homologs.
+        """
+        self.refGenes = rGenes
+
+
+    def setStartingGenome(self, sGenome):
+        """ sets the genome associated with the refGenes.
+        """
+        self.startingGenome = sGenome
+
+
+    def importHomology(self, inputFile, source="generic"):
+        """ loads homology relationships from a tab-delimited file of the form:
+            homologyGroup orthologyGroup genome locus
+            where orthologyGroup can be left blank.
+        """
+        stmtList = []
+        importFile = open(inputFile, "r")
+        stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
+        for line in importFile:
+            (homGroup, orthGroup, genome, locus) = line.split("\t")
+            stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))     
+
+        importFile.close()
+        self.batchsqlcons(stmt, stmtList)
+
+
+    def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
+        """ define the geneIDs in geneIDList as being homologous.
+        """
+        stmtList = []
+
+        if homGroup == "":
+            for tempID in geneIDList:
+                tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
+                if not self.hasHomGroup(tempHomGroup):
+                    homGroup = tempHomGroup
+                    break
+
+            if homGroup == "":
+                self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
+                return ""
+
+        stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) " 
+        for geneID in geneIDList:
+            stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))
+
+        self.batchsqlCons(stmt, stmtList)
+        return homGroup
+
+
+    def deleteHomolog(self, geneID, homGroup=""):
+        """ delete all entries from the homology table for a given geneID and homGroup.
+        """
+        stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
+        self.sqlcons(stmt, "commit")
+
+
+    def deleteOrtholog(self, geneID, orthGroup):
+        """ delete all entries from the homology table for a given geneID and orthGroup.
+        """
+        stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
+        self.sqlcons(stmt, "commit")
+
+
+    def deleteHomologyGroup(self, homGroup):
+        """ delete all entries from the homology table matching the given homology group.
+        """
+        stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s'  " % (self.conservationID, homGroup)
+        self.sqlcons(stmt, "commit")
+
+
+    def deleteOrthologyGroup(self, orthGroup):
+        """ delete all entries from the homology table matching the given orthology group.
+        """
+        stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
+        self.sqlcons(stmt, "commit")
+
+
+    def hasHomGroup(self, testGroup):
+        """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table.
+        """
+        stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
+        res = self.sqlcons(stmt)
+        if int(res[0]) > 0:
+            return True
+
+        return False
+
+
+    def returnHomologs(self, geneID):
+        """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in 
+            self.targetGenomes, based on entries in the homology table. This function will try to load entries from 
+            homologene for supported genomes, if necessary.
+        """
+        homologList = []
+        usedHomologene = False
+        stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
+        groups = self.sqlcons(stmt)
+
+        if len(groups) < 1 and geneID[0] in homologeneGenomes:
+            self.loadFromHomologene(geneID)
+            groups = self.sqlcons(stmt)
+            usedHomologene = True
+
+        for (recID, hIDentry) in groups:
+            stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s'  " % (str(hIDentry), str(recID))
+            genes = self.sqlcons(stmt)
+            for gene in genes:
+                strgene = (str(gene[0]), str(gene[1]))
+                if usedHomologene and strgene[0] not in self.targetGenomes:
+                    continue
+
+                if strgene not in homologList:
+                    homologList.append(strgene)
+
+        return homologList
+
+
+    def returnOrthologs(self, geneID):
+        """ returns a list of genes that are orthologous to a geneID and whose genome are in 
+            self.targetGenomes, based on explicit entries in the homology table. 
+        """
+        orthologList = []
+        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
+        groups = self.sqlcons(stmt)
+
+        for (recID, oIDentry) in groups:
+            if oIDentry == "":
+                continue
+            stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID))
+            genes = self.sqlcons(stmt)
+            for gene in genes:
+                strgene = (str(gene[0]), str(gene[1]))
+                if strgene[0] in self.targetGenomes and strgene not in orthologList:
+                    orthologList.append(strgene)
+
+        return orthologList
+
+
+    def areOrthologs(self, geneID1, geneID2):
+        """ returns True if genes geneID1 and geneID2 are orthologs.
+        """
+        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1
+        groups1 = self.sqlcons(stmt)
+
+        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2
+        groups2 = self.sqlcons(stmt)
+
+        oEntries1 = []
+        oEntries2 = []
+        for (recID, oIDentry) in groups1:
+            oEntries1.append(oIDentry)
+
+        for (recID, oIDentry) in groups2:
+            oEntries2.append(oIDentry)
+
+        for entry in oEntries1:
+            if entry in oEntries2:
+                return True
+
+        return False
+
+
+    def loadFromHomologene(self, geneID):
+        """ load the homologous genes to geneID from homologene.
+        """
+        try:
+            hdb = homologyDB(self.targetGenomes)
+            hGenes = hdb.getHomologousGenes(geneID)
+        except:
+            hdb = homologyDB(self.targetGenomes, cache=True)
+            hGenes = hdb.getHomologousGenes(geneID)
+
+        hGenes.append(geneID)
+        homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
+        self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene")
+
+
+    def computeAlignments(self, geneIDList=[]):
+        """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of 
+            all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form
+            genome-locus' from the geneIDs in the starting genome.
+        """
+        if len(geneIDList) < 1:
+            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+        prog = Mafft()
+        for geneID in geneIDList:
+            homGroup =  str(geneID[0]) + '-' + str(geneID[1])
+            hGenes = self.returnHomologs(geneID)
+            hGenes.append(geneID)
+            fastaFile = self.createDataFile(geneIDList = hGenes)
+            prog.inputFile(fastaFile)
+            prog.run()
+            alignedDict = prog.getAlignment()
+            for dictKey in alignedDict:
+                aGeneID = dictKey.split('-')
+                self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey])
+
+
+    def insertAlignedSequence(self, datasetID, geneID, seq):
+        """ save an aligned sequence into alignedSequence.
+        """
+        values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq)
+        stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values
+        self.sqlcons(stmt, "commit")
+
+
+    def getAlignedSequence(self, geneID, datasetID=""):
+        """ retrieve an aligned sequence from alignedSequence using datasetID and geneID.
+        """
+        stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
+        if len(datasetID) > 0:
+            stmt += "and datasetID = '%s' " % (datasetID)
+
+        res = self.sqlcons(stmt)
+
+        return str(res[0][0])
+
+
+    def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
+        """ map regions of sequences where multiple alignments show at least one other (default)
+            or all genes (stritct=True) as lining up.
+        """
+        if len(geneIDList) < 1:
+            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+        if strict:
+            criteria = "strict"
+        else:
+            criteria = "partial"
+
+        for geneID in geneIDList:
+            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
+            hGenes = self.returnHomologs(geneID)
+            hGenes.append(geneID)
+            if len(hGenes) < 2:
+                continue
+
+            maskedSequences = self.maskUsingConservation(hGenes, strict)
+            for gID in hGenes:
+                start = 0
+                consLength = 0
+                for pos in range(len(maskedSequences[gID])):
+                    if maskedSequences[gID][pos] == "N":
+                        if start != 0 and consLength >= minConsLength:
+                            self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
+
+                        start = 0
+                        consLength = 0
+                        continue
+
+                    if start == 0:
+                        start = pos
+
+                    consLength += 1
+
+                if start != 0 and consLength >= minConsLength:
+                    self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
+
+
+    def maskUsingConservation(self, geneIDList, strict=False):
+        """ mask every gene in geneIDList using conservation amongst themselves, which have 
+            already been computed using computeAlignments()
+        """
+        alignmentDict = {}
+        maskedDict = {}
+
+        for gID in geneIDList:
+            seqLen = 0
+            try:
+                alignmentDict[gID] = self.getAlignedSequence(gID)
+                maskedDict[gID] = ""
+                seqLen = len(alignmentDict[gID])
+            except:
+                return maskedDict
+
+        if strict:
+            criteria = len(geneIDList)
+        else:
+            criteria = 2
+
+        for pos in range(seqLen):
+            posDict = {}
+            conserved = 0
+            for geneID in geneIDList:
+                posDict[geneID] = alignmentDict[geneID][pos]
+                if posDict[geneID] != "-" and posDict[geneID] != "N":
+                    conserved += 1
+
+            if conserved >= criteria:
+                for geneID in geneIDList:
+                    if posDict[geneID] != "-":
+                        maskedDict[geneID] += posDict[geneID]
+            else:
+                for geneID in geneIDList:
+                    if posDict[geneID] != "-":
+                        maskedDict[geneID] += "N"
+
+        return maskedDict
+
+
+    def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False):
+        """ map regions of sequences with seqcomp windows in all genes that have 
+            more than threshold (<= 1) conservation in more than minSequences sequences.
+            Can optionally use a higher threshold for orothlogs if specified using 
+            orthologyThreshold.
+        """
+        consList = []
+        if orthologyThreshold < threshold:
+            orthologyThreshold = threshold
+
+        if len(geneIDList) < 1:
+            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+        if useFastcomp:
+            prog = Fastcomp()
+        else:
+            prog = Paircomp()
+
+        prog.setWindowSize(window)
+        # if we have a window on a sequence, then we have at least one match!
+        minSeqNum = minSequences - 1
+        for geneID in geneIDList:
+            genePairs = []
+            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
+            hGenes = self.returnHomologs(geneID)
+            hGenes.append(geneID)
+            for first in hGenes:
+                for second in hGenes:
+                    if first != second and [first, second] not in genePairs and [second, first] not in genePairs:
+                        genePairs.append([first, second])
+
+            seqcompWindows = {}
+            print "genepairs = %s" % str(genePairs)
+            for pair in genePairs:
+                fastaFile = self.createDataFile(geneIDList = pair)
+                if self.areOrthologs(pair[0], pair[1]):
+                    prog.setThreshold(orthologyThreshold)
+                else:
+                    prog.setThreshold(threshold)
+
+                prog.inputFile(fastaFile)
+                prog.run()
+                seqcompWindows[(pair[0], pair[1])] = prog.getWindows()
+
+            resultWindows = {}
+            for gene in hGenes:
+                resultWindows[gene] = {}
+
+            for pair in seqcompWindows:
+                (geneID1, geneID2) = pair
+                for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]:
+                    if seq1pos not in resultWindows[geneID1]:
+                        resultWindows[geneID1][seq1pos] = []
+
+                    if seq2pos not in resultWindows[geneID2]:
+                        resultWindows[geneID2][seq2pos] = []
+
+                    resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches))
+                    resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches))
+
+            for geneID in hGenes:
+                for position in resultWindows[geneID]:
+                    otherGeneIDs = []
+                    for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
+                        if geneID2 not in otherGeneIDs:
+                            otherGeneIDs.append(geneID2)
+
+                    if len(otherGeneIDs) >= minSeqNum:
+                        criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window)
+                        for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
+                            criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
+
+                        consList.append((homGroup, geneID, position, window, "seqcompCons", criteria))
+
+        self.insertConservedSequenceBatch(consList)
+
+
+    def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False):
+        """ map regions of sequences with seqcomp windows in all genes that have 
+            more than threshold (<= 1) conservation. Uses transivity.
+        """
+        self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp)
+
+
+    def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False):
+        """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of 
+            sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1) 
+            conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity.
+        """
+        consList = []
+        if len(geneIDList) < 1:
+            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
+
+        if useFastcomp:
+            prog = Fastcomp()
+        else:
+            prog = Paircomp()
+
+        prog.setWindowSize(window)
+        for geneID in geneIDList:
+            genePairs = []
+            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
+            oGenes = self.returnOrthologs(geneID)
+            # homologs should contain orthologs!
+            hGenes = self.returnHomologs(geneID)
+            if len(hGenes) < 1:
+                continue
+
+            seqcompWindows = {}
+            genePairs = [[geneID, gene] for gene in hGenes]
+            for pair in genePairs:
+                if pair[1] in oGenes:
+                    prog.setThreshold(orthologyThreshold)
+                else:
+                    prog.setThreshold(paralogyThreshold)
+
+                fastaFile = self.createDataFile(geneIDList = pair)
+                prog.inputFile(fastaFile)
+                prog.run()
+                seqcompWindows[pair[1]] = prog.getWindows()
+
+            resultWindows = {}
+            print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows)) 
+            for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]:
+                resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)]
+
+            for gene in hGenes[1:]:
+                newseqPositions = []
+                for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]:
+                    if seq1pos not in resultWindows:
+                        continue
+
+                    newseqPositions.append(seq1pos)
+                    resultWindows[seq1pos].append((gene, seq2pos, sense, matches))
+
+                for pos in resultWindows.keys():
+                    if pos not in newseqPositions:
+                        del resultWindows[pos]
+
+            for windowPos in resultWindows.keys():
+                windowList = resultWindows[windowPos]
+                criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window)
+                for (geneID2, seq2pos, sense, matches) in windowList:
+                    criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
+
+                consList.append((homGroup, geneID, windowPos, window, tag, criteria))
+
+                for windowEntry in windowList:
+                    (gID, seq2pos, sense, matches) = windowEntry
+                    consList.append((homGroup, gID, seq2pos, window, tag, criteria))
+
+        self.insertConservedSequenceBatch(consList)
+
+
+    def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria):
+        """ insert an entry in conservedSequence.
+        """
+        values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria)
+        stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values
+        self.sqlcons(stmt, 'commit')
+
+    
+    def insertConservedSequenceBatch(self, batchlist):
+        """ insert a list of entries in conservedSequence.
+        """
+        sqlList = []
+        stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)" 
+        for (datasetID, geneID, pos, length, method, criteria) in batchlist:
+            sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria)))
+
+        self.batchsqlCons(stmt, sqlList)
+
+
+    def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
+        """ retrieve a list of conservation windows of the form (start, length) from 
+            conservedSequence using datasetID and geneID, which match a particular method and criteria.
+        """
+        results = []
+        stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
+        if datasetID > 0:
+            stmt += "and datasetID = '%d' " % (datasetID)
+
+        if len(method) > 0:
+            stmt += " and method = '%s' " % (method)
+
+        if len(criteria) > 0:
+            stmt += "and score = '%s' " % (criteria)
+
+        res = self.sqlcons(stmt)
+        for (location, length, criteria) in res:
+            results.append((int(location), int(length), str(criteria)))
+
+        return results
+
+
+    def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""):
+        """ returns True if a particular sequence of position and length falls within a conservation
+            window.
+        """
+        if len(position) == 2:
+            position = position[0]
+        stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
+        if datasetID > 0:
+            stmt += "and datasetID = '%d' " % (datasetID)
+
+        if len(method) > 0:
+            stmt += " and method = '%s' " % (method)
+
+        if len(criteria) > 0:
+            stmt += "and score = '%s' " % (criteria)
+
+        res = self.sqlcons(stmt)
+        for (location, wlen) in res:
+            if int(location) <= position and (int(location) + int(wlen)) >= (position + length):
+                return True
+
+        return False
+
+
+    def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
+        """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns
+            a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.
+        """
+        maskedSeqDict = {}
+        if datasetID < 0:
+            datasetID = self.genepoolID
+
+        settingsList =  self.getSettingsID(datasetID)
+        geneIDList = eval(settingsList[1])
+        for geneID in geneIDList:
+            theseq = self.genepool[geneID]
+            seqlen = len(theseq)
+            maskedseq = ["N"] * seqlen
+            conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
+            for (start, length, crit) in conservedWindows:
+                if start < 0:
+                    start = 0
+
+                if start + length >= seqlen:
+                    length = seqlen - start - 1
+
+                for index in range(start, start + length):
+                    maskedseq[index] = theseq[index]
+
+            tempseq = []
+            numN = 0
+            for letter in maskedseq:
+                if letter.upper() == "N":
+                    numN += 1
+                else:
+                    numN = 0
+
+                if numN > stripNLonger:
+                    continue
+
+                tempseq.append(letter)
+
+            maskedSeqDict[geneID] = string.join(tempseq, "")
+
+        return maskedSeqDict
+
+
+    def conservationStat(self, datasetID=-1, method="", criteria=""):
+        """ report conservation level in each of the genes in the dataset.
+        """
+        nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"]
+        totalcons = 0
+        totalsize = 0
+        consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria)
+        for geneID in consGeneDict:
+            ntstat = 0
+            for NT in nucleotides:
+                ntstat += consGeneDict[geneID].count(NT)
+
+            blocks = (len(consGeneDict[geneID]) - ntstat) / 21
+            origSize = len(self.genepool[geneID])
+            consPercentage = 100. * ntstat / float(origSize)
+            print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage)
+            totalcons += ntstat
+            totalsize += origSize
+
+        consPercentage = 100. * totalcons / float(totalsize)
+        print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage)
+
+
+    def motifConservationStat(self, motifList=[]):
+        """ check how many motifs instances for the motifs in the mapped motifs 
+            (or only in motifList) are conserved.
+        """
+        if len(motifList) == 0:
+            motifList = self.geneToMotifKeys(thekey="motif")
+
+        for motID in motifList:
+            index = 0
+            matches = self.motifToGene(motID)
+            for (loc, pos) in matches:
+                mot = self.findMotif(motID)
+                if self.isConserved(loc, pos, len(mot)):
+                    index += 1
+
+            print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches)))
+
+
+    def checkForConservedSequence(self, datasetID=-1, method="", criteria=""):
+        """ checks that at least two or more sequences in the dataset have conservation.
+        """
+        someHaveConservation = False
+        numCons = 0
+        checkDict = self.maskNonConservedSequence(datasetID, method, criteria)
+        for entry in checkDict:
+            theseq = checkDict[entry].upper()
+            if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq:
+                numCons += 1
+
+        if numCons > 1:
+            someHaveConservation = True
+
+        return someHaveConservation
+
+
+    def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
+        """ exports conserved sequences to a fasta file.
+        """
+        (up, cds, down) = self.getSeqParameters()
+        consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
+        if len(genomes) > 0:
+            consDictEntries = consDict.keys()
+            for entry in consDictEntries:
+                if entry[0] not in genomes:
+                    del consDict[entry]
+
+        outfilename = "%s/%s.fsa" % (directory, prefix)
+        consOutfile = open(outfilename, "w")
+        for entry in consDict:
+            entryCoordinates = cistematic.core.geneEntry(entry)
+            entrySense = entryCoordinates[4]
+            theseq = consDict[entry].upper()
+            seqlen = len(theseq)
+            (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
+            if entrySense == "R":
+                theseq = cistematic.core.complement(theseq)
+
+            conservedBlockStart = -1
+            for pos in range(seqlen):
+                if theseq[pos] == "N":
+                    if conservedBlockStart >= 0:
+                        consStart = start + conservedBlockStart
+                        consSeq = theseq[conservedBlockStart:pos]
+                        consStop = consStart + len(consSeq) - 1
+                        consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
+                        conservedBlockStart = -1
+
+                    continue
+
+                if conservedBlockStart < 0:
+                    conservedBlockStart = pos
+
+            if conservedBlockStart >= 0:
+                consStart = start + conservedBlockStart
+                consSeq = theseq[conservedBlockStart:pos]
+                consStop = consStart + len(consSeq) - 1
+                consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
+
+        consOutfile.close()
+
+
+    def createConservation(self, conservationID="default", dbName=""):
+        """ creates the conservation SQL tables. Should only be run once per database file.
+        """
+        stmtList = []
+        self.loadConservation(conservationID, dbName)
+        try:
+            stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)")
+            stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)")
+            stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)")
+            stmtList.append("CREATE index cons1 on homology(conservationID)")
+            stmtList.append("CREATE index cons2 on alignedSequence(conservationID)")
+            stmtList.append("CREATE index cons3 on conservedSequence(conservationID)")
+            stmtList.append("CREATE index hom1 on homology(homologyGroup)")
+            stmtList.append("CREATE index orth1 on homology(orthologyGroup)")
+            stmtList.append("CREATE index homlocus1 on homology(genome, locus)")
+            stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)")
+            stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)")
+            for stmt in stmtList:
+                self.sqlcons(stmt, commit=True)
+
+            self.mlog("Created conservation tables in database %s" % dbName)
+        except:
+            self.mlog("Could not create conservation tables in database %s" % dbName)
+            self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")
+
+
+    def loadConservation(self, conservationID="default", dbName=""):
+        """ changes the conservationID to use the specified one, or use default. Must be used before reading or
+            writing any data to analysis tables.
+        """
+        self.conservationID = conservationID
+        if len (dbName) > 0:
+            self.consDBName = dbName
+        else:
+            self.consDBName = self.expFile
+
+
+    def sqlcons(self, stmt, commit=""):
+        """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
+        """
+        res = []
+        db = sqlite.connect(self.consDBName)
+        sqlc = db.cursor()
+        try:
+            if self.debugSQL:
+                print "Conservation->sqlcons: %s" % stmt
+
+            sqlc.execute(stmt)
+            res = sqlc.fetchall()
+            try:
+                if commit != "":
+                    db.commit()
+            except:
+                self.mlog("Conservation->sqlcons (commit exception)")
+        except:
+            self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)
+
+        sqlc.close()
+        db.close()
+
+        return res
+
+
+    def batchsqlCons(self, stmt, batch):
+        """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
+        """
+        res = []
+        db = sqlite.connect(self.consDBName)
+        sqlc = db.cursor()
+        try:
+            if self.debugSQL:
+                print "batchsqlCons: %s" % stmt
+                print "batchsqlCons: %s" % (str(batch))
+
+            sqlc.executemany(stmt, batch)
+            try:
+                db.commit()
+            except:
+                self.mlog("Conservation->batchsqlCons (commit exception)")
+        except:
+            self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)
+
+        sqlc.close()
+        db.close()
+
+        return res
\ No newline at end of file