X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fexperiments%2FanalyzeMotifs.py;fp=cistematic%2Fexperiments%2FanalyzeMotifs.py;h=b1791b0df3e28edd9e0a4ebe36a86d1e5df58756;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181

diff --git a/cistematic/experiments/analyzeMotifs.py b/cistematic/experiments/analyzeMotifs.py
new file mode 100644
index 0000000..b1791b0
--- /dev/null
+++ b/cistematic/experiments/analyzeMotifs.py
@@ -0,0 +1,1097 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+# basic analysis code
+from cistematic.core.motif import matrixRow
+import cistematic.core
+import string
+
+try:
+    from pysqlite2 import dbapi2 as sqlite
+except:
+    from sqlite3 import dbapi2 as sqlite
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+    cisRoot = environ.get("CISTEMATIC_ROOT") 
+else:
+    cisRoot = "/proj/genome"
+
+knownMotifsDB = "%s/db/known_motifs.db" % cisRoot
+
+
+class AnalyzeMotifs:
+    motifSize = {}
+    coappear  = {}
+    motifToAnnotations = {}
+    annotationEntries = []
+    genomeMatches = {}
+    loadedGenomes = []
+    dbName = ""
+    analysisID = "default"
+
+
+    def initializeKnownMotifs(self, source=""):
+        """  loads known motifs from motif database, optionally limiting by source.
+        """
+        self.annotationEntries = []
+        dbkm = sqlite.connect(knownMotifsDB, timeout=60)
+        sqlkm = dbkm.cursor()
+        if len(source) > 0:
+            sqlkm.execute("select * from motifs where source = '%s' " % source)
+        else:
+            sqlkm.execute("select * from motifs")
+
+        dbEntries = sqlkm.fetchall()
+        dbkm.close()
+
+        for entry in dbEntries:
+            (index, source, name, theseq, reference, other_info) = entry
+            self.annotationEntries.append((index, source, name, theseq, reference, other_info))
+
+
+    def findAnnotations(self, motID, thresh=-1, numberN=0):
+        mot = self.findMotif(motID)
+        if thresh < 0:
+            thresh = self.threshold
+    
+        for entry in self.annotationEntries:
+            (id, source, name, theseq, reference, other) = entry
+            loc = mot.locateMotif(theseq, thresh, numberN)
+            if len(loc) > 0 :
+                self.motifToAnnotations[motID].append((source, name, theseq, reference, other, loc))
+
+            if len(self.motifToAnnotations[motID]) >= 500:
+                break
+
+
+    def findAnnotationsRE(self, motID, numMismatches=0):
+        """ Calls the motif's consensus locator on each entry in annotationEntries.
+        """
+        mot = self.findMotif(motID)
+        if numMismatches > 0:
+            mot.initializeMismatchRE(numMismatches)
+        else:
+            mot.initializeRE()
+
+        for entry in self.annotationEntries:
+            (id, source, name, annotSeq, reference, other) = entry
+            # Must use non-RE optimized version for short annotations
+            if len(annotSeq) < len(mot):
+                pad = "N" * (len(mot) - len(annotSeq))
+                annotSeq = pad + annotSeq + pad
+                loc = mot.locateConsensus(annotSeq)
+            else:
+                loc = mot.locateConsensusRE(annotSeq)
+
+            if len(loc) > 0 :
+                self.motifToAnnotations[motID].append((source, name, annotSeq, reference, other, loc))
+
+            if len(self.motifToAnnotations[motID]) >= 500:
+                break
+
+
+    def printAnnotations(self):
+        """ prints the motif annotations for every motif result.
+        """
+        for mot in self.getResults():
+            annotations =   self.motifToAnnotations[mot.tagID]
+            print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
+            for annotation in annotations:
+                (source, name, annotSeq, reference, other, loc) = annotation
+                print "%s:%s\t%s\t%s\t%s" % (source, name, reference, annotSeq, str(loc))
+                print other
+            print
+
+
+    def printAnnotationsShort(self):
+        """ prints a compressed version of the annotations.
+        """
+        for motID in self.motifToAnnotations.keys():
+            for annotation in self.motifToAnnotations[motID]:
+                print "%s: %s (%s)\t%s - %s" % (annotation[0], annotation[1], annotation[4], annotation[2], annotation[5])
+
+
+    def returnAnnotations(self, motID):
+        """ returns the [annotations] for a particular motID
+        """
+        try:
+            return self.motifToAnnotations[motID]
+        except KeyError:
+            return []
+
+
+    def annotateMotifs(self, thresh=0.0, numberN=0):
+        self.mlog( "Annotating Motifs with threshold %d and %d extra Ns" % (1.0 + thresh, numberN))
+        if len(self.annotationEntries) == 0:
+            self.initializeKnownMotifs()
+
+        for mot in self.getResults():
+            mot.setThreshold(thresh)
+            self.motifToAnnotations[mot.tagID] = []
+            self.findAnnotations(mot.tagID, thresh, numberN)
+
+
+    def annotateConsensus(self, numMismatches=0, source=""):
+        self.mlog( "Annotating Consensus")
+        if len(self.annotationEntries) == 0:
+            self.initializeKnownMotifs(source)
+
+        for mot in self.getResults():
+            self.motifToAnnotations[mot.tagID] = []
+            self.findAnnotationsRE(mot.tagID, numMismatches)
+
+
+    def printConsensus(self):
+        """ print the consensus for every motif result.
+        """
+        for mot in self.getResults():
+            print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
+
+
+    def formatPWM(self, aPWM):
+        """ format a PWM into a printable string.
+        """
+        aRow = ""
+        cRow = ""
+        gRow = ""
+        tRow = ""
+        for col in aPWM:
+            aRow = string.join([aRow, str(round(col[matrixRow["A"]],4))], "\t")
+            cRow = string.join([cRow, str(round(col[matrixRow["C"]],4))], "\t")
+            gRow = string.join([gRow, str(round(col[matrixRow["G"]],4))], "\t")
+            tRow = string.join([tRow, str(round(col[matrixRow["T"]],4))], "\t")
+
+        formattedPWM =  "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
+
+        return formattedPWM
+
+
+    def appendGeneToMotif(self,mTagID, geneID, pos):
+        """ make an entry in the geneToMotif table.
+        """
+        (genome, locus) = geneID
+        (loc, sense) = pos
+        stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, '%s', '%s',  '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense)
+        res = self.sql(stmt, "commit")
+
+
+    def appendGeneToMotifBatch(self, batch):
+        """ make a batch of entries in the geneToMotif table.
+        """
+        batchInserts = []
+        stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, ?, ?,  ?, ?, ?, ?, ?)" 
+        for entry in batch:
+            (mTagID, geneID, pos) = entry
+            (genome, locus) = geneID
+            (loc, sense) = pos
+            batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense))
+
+        res = self.batchsql(stmt, batchInserts)
+
+    
+    def geneToMotifKeys(self, thekey="geneID"):
+        """ return the keys to the geneToMotif table. The default key is geneID, otherwise returns mTagID.
+        """
+        results = []
+        if thekey == "geneID":
+            stmt = "SELECT distinct genome, locus from geneToMotif where expID = '%s' and analysisID = '%s' order by locus" % (self.experimentID, self.analysisID)
+        else:
+            stmt = "SELECT distinct mTagID from geneToMotif where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
+
+        res = self.sql(stmt)
+        for entry in res:
+            if thekey == "geneID":
+                (genome, locus) = entry
+                results.append((str(genome), str(locus)))
+            else:
+                mTagID = entry[0]
+                results.append(str(mTagID))
+
+        return results
+
+
+    def appendMotifNeighbors(self,mTagID, match, condition="", geneEntry=""):
+        """ make an entry in the motifNeighbors table.
+        """
+        (chromo, pos) = match
+        (genome, chromNum) = chromo
+        (loc, mSense) = pos
+        if geneEntry != "":
+            (start, stop, sense, geneID) = geneEntry
+            (genome2, locus) = geneID
+        else:
+            start = "-"
+            stop = "-"
+            sense = "-"
+            locus = "NO GENE IN RADIUS"
+
+        stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, '%s','%s',  '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition)
+        res = self.sql(stmt, "commit")
+
+
+    def appendMotifNeighborsBatch(self, batch):
+        """ make a batch of entries in the motifNeighbors table.
+        """
+        batchInserts = []
+        stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
+        for entry in batch:
+            (mTagID, match, condition, geneEntry) = entry
+            (chromo, pos) = match
+            (genome, chromNum) = chromo
+            (loc, mSense) = pos
+            if geneEntry != "":
+                (start, stop, sense, geneID) = geneEntry
+                (genome2, locus) = geneID
+            else:
+                start = "-"
+                stop = "-"
+                sense = "-"
+                locus = "NO GENE IN RADIUS"
+
+            batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition))
+
+        res = self.batchsql(stmt, batchInserts)
+
+
+    def motifNeighborsKeys(self, condition=""):
+        """ returns a [list of motif ID's] in the motifNeighbor table. Can restrict to a particular condition.
+        """
+        results = []
+        if condition != "":
+            stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' and condition ='%s' order by mTagID" % (self.experimentID, self.analysisID, condition)
+        else:
+            stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
+
+        res = self.sql(stmt)
+        for entry in res:
+            mTagID = entry[0]
+            results.append(mTagID)
+
+        return results
+
+
+    def motifNeighbors(self,mTagID, condition="", discardNullMatches=False):
+        """ get entries in the geneToMotif table that match a particular Motif & condition.
+        """
+        results = []
+        genome = ""
+        chromNum = ""
+        loc = ""
+        mSense = ""
+        start = ""
+        stop = ""
+        sense = ""
+        locus = ""
+        stmt = "select distinct genome, chromNum, location, motifSense, start, stop, sense, locus, condition from motifNeighbors where expID='%s' and analysisID = '%s' and mTagID='%s' " % (self.experimentID, self.analysisID, mTagID)
+        if discardNullMatches:
+            stmt += " and condition <> 'NONE' "
+
+        if condition != '':
+            stmt +=  " and condition = '%s' " % (condition)
+
+        stmt += " order by ID desc "
+        res = self.sql(stmt)
+
+        for entry in res:
+            (genome, chromNum, loc, mSense, start, stop, sense, locus, condition) = entry
+            match = ((genome, chromNum),(int(loc), mSense))
+            geneEntry = (start, stop, sense, (genome, locus))
+            results.append((match, geneEntry, condition))
+
+        return results
+
+
+    def motifToGene(self, mTagID):
+        """ returns a list of matching geneIDs with location and sense found for a given motif. 
+        """
+        results = []
+        stmt = "SELECT distinct genome, locus, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and mTagID = '%s' order by locus" % (self.experimentID, self.analysisID, mTagID)
+        res = self.sql(stmt)
+        for entry in res:
+            (genome, locus, location, sense) = entry
+            results.append(((genome, locus), (int(location), sense)))
+
+        return results
+
+
+    def geneToMotif(self, geneID):
+        """ returns a list of matching motifs with location and sense found for a given geneID
+        """
+        results = []
+        (genome, locus) = geneID
+        stmt = "SELECT distinct mTagID, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and genome = '%s' and locus = '%s' order by location" % (self.experimentID, self.analysisID, genome, locus)
+        res = self.sql(stmt)
+        for entry in res:
+            (mTagID, location, sense) = entry
+            results.append((mTagID, (location, sense)))
+
+        return results
+
+
+    def mapMotifs(self, Threshold=90.0, numberN=0, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, verbose=True):
+        """ find occurences of result motifs in the current genepool using PWMs. Slow.
+        """
+        currentGenome = ""
+        currentDataset = -1
+        genepoolKeys = self.genepool.keys()
+        posResults = []
+        for mot in self.getResults():
+            mTagID = mot.tagID
+            runID = mTagID.split("-")[0]
+            (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
+            if enforceSanity and not mot.isSane():
+                self.mlog("mapMotifs: not mapping %s - failed sanity check" % (mTagID))
+                continue
+
+            if mot.tagID in ignoreList:
+                self.mlog("mapMotifs: not mapping %s" % mot.tagID)
+                continue
+
+            if len(runList) > 0:
+                if int(runID) not in runList:
+                    self.mlog("mapMotifs: not mapping run %s" % runID)
+                    continue
+
+            if mapAllSeqs:
+                dataID = 0
+
+            self.mlog("mapMotifs: mapping %s with threshold %f and at most %d N" % (mTagID, Threshold, numberN))
+            if dataID <> currentDataset:
+                currentDataset = dataID
+                for geneID in self.getSetting(dataID):
+                    if geneID[0] <> currentGenome:
+                        currentGenome = geneID[0]
+
+                    try:
+                        if geneID not in genepoolKeys:
+                            self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstreamself.geneDB, False, self.boundToNextGene)
+                            genepoolKeys.append[geneID]
+                    except:
+                        self.mlog("mapMotifs: could not load %s" % (str(geneID)))
+                        break
+
+            for geneID in genepoolKeys:
+                if verbose:
+                    print "mapMotifs: %s\n" % str(geneID)
+
+                motifPos = mot.locateMotif(self.genepool[geneID], Threshold, numberN)
+
+                if len(motifPos) > 0:
+                    for pos in motifPos:
+                        posResults.append((mTagID, geneID, pos))
+
+        self.appendGeneToMotifBatch(posResults)
+
+
+    def mapConsensus(self, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, numMismatches=0):
+        """ find occurences of result motifs in the current genepool using regular expressions, allowing 
+            for a number of mismatches.
+        """
+        currentGenome = ""
+        currentDataset = -1
+        genepoolKeys = self.genepool.keys()
+        posResults = []
+        for mot in self.getResults():
+            mTagID = mot.tagID
+            runID = mTagID.split("-")[0]
+            (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
+            if mot.tagID in ignoreList:
+                self.mlog("mapConsensus: not mapping %s" % mot.tagID)
+                continue
+
+            if len(runList) > 0:
+                if int(runID) not in runList:
+                    self.mlog("mapConsensus: not mapping run %s" % runID)
+                    continue
+
+            if enforceSanity and not mot.isSane():
+                self.mlog("mapConsensus: not mapping %s - failed sanity check" % (mTagID))
+                continue
+
+            if mapAllSeqs:
+                dataID = 0
+
+            if numMismatches > 0:
+                mot.initializeMismatchRE(numMismatches)
+            else:
+                mot.initializeRE()
+            self.mlog("mapConsensus: mapping %s with %s mismatches" % (mTagID, numMismatches))
+            if dataID <> currentDataset:
+                currentDataset = dataID
+                for geneID in self.getSetting(dataID):
+                    if geneID[0] <> currentGenome:
+                        currentGenome = geneID[0]
+
+                    try:
+                        if geneID not in genepoolKeys:
+                            self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstream)
+                            genepoolKeys.append[geneID]
+                    except:
+                        self.mlog("mapConsensus: could not load %s" % (str(geneID)))
+                        break
+
+            for geneID in genepoolKeys:
+                motifPos = mot.locateConsensusRE(self.genepool[geneID])
+
+                if len(motifPos) > 0:
+                    for pos in motifPos:
+                        posResults.append((mTagID, geneID, pos))
+
+        self.appendGeneToMotifBatch(posResults)
+
+
+    def mapFeatures(self, radius):
+        """ returns features within a certain radius in bp of all matches.
+        """
+        FeatureList = []
+        for mot in self.getResults():
+            mTagID = mot.tagID
+            self.mlog("mapFeatures: mapping %s using a radius of %d bp" % (mTagID, radius))
+            for match in self.motifToGene(mTagID):
+                matchFeatures = cistematic.core.retrieveFeatures(match, radius)
+                for entry in matchFeatures:
+                    FeatureList.append((mTagID, match, entry))
+
+        return FeatureList
+
+
+    def limitGeneEntries(self, geneList, lowerBound, upperBound):
+        results = []
+        for entry in geneList:
+            if entry[1] < lowerBound or entry[0] > upperBound:
+                continue
+
+            results.append(entry)
+
+        return results
+    
+
+    def mapNeighbors(self, radius, annotate=True):
+        """ returns genes on a chromosome within a certain radius in bp.
+        """
+        localGeneList = {}
+        prevChromo = ""
+        neighborResults = []         
+        for mot in self.getResults():
+            mTagID = mot.tagID
+            motLen = len(mot)
+            motRadius = motLen / 2
+            mtgList = self.motifToGene(mTagID)
+            self.mlog("mapNeighbors: mapping %s using a radius of %d bp (%d instances)" % (mTagID, radius, len(mtgList)))
+            index = 0
+            for match in mtgList:
+                (chromo, hit) = match
+                if annotate and chromo != prevChromo:
+                    prevChromo = chromo
+                    localGeneList = cistematic.core.getChromoGeneEntries(chromo)
+
+                index += 1
+                if (index % 1000) == 0:
+                    print "."
+
+                matchCounter = 0
+                matchCDS = []
+                if annotate:
+                    (chromo, hit) = match
+                    matchpos = int(hit[0])
+                    lowerBound = matchpos - radius
+                    upperBound = matchpos + radius
+                    match2 = (chromo, (matchpos + motRadius, hit[1]))
+                    matchFeatures = cistematic.core.retrieveFeatures(match2, motRadius, "CDS")
+                    for entry in matchFeatures:
+                        matchCDS.append((chromo[0], entry[0]))
+
+                    geneEntriesList = self.limitGeneEntries(localGeneList, lowerBound, upperBound)
+                    for geneEntry in geneEntriesList:
+                        beg = int(geneEntry[0])
+                        end = int(geneEntry[1])
+                        sense = geneEntry[2]
+                        gID = geneEntry[3]
+                        if gID in matchCDS: # matching within the coding sequence
+                            neighborResults.append((mTagID, match, "CDS", geneEntry))
+                            matchCounter += 1
+                        elif matchpos >= beg and matchpos <= end: # not in CDS, but in gene
+                            neighborResults.append((mTagID, match, "GENE", geneEntry))
+                            matchCounter += 1
+                        elif matchpos < beg: 
+                            if sense == "F":
+                                neighborResults.append((mTagID, match, "UPSTREAM", geneEntry))
+                            else:
+                                neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
+
+                            matchCounter += 1
+                        else:
+                            if sense == "F":
+                                neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
+                            else:
+                                neighborResults.append((mTagID,match, "UPSTREAM", geneEntry))
+
+                            matchCounter += 1
+
+                if matchCounter < 1:
+                    neighborResults.append((mTagID, match, "NONE", ""))
+
+        self.appendMotifNeighborsBatch(neighborResults)
+
+
+    def printMotifToGene(self, motifList=[], runList=[]):
+        motKeys = self.geneToMotifKeys(thekey="mTagID")
+        if len(motifList) == 0 and len(runList) == 0:
+            for tag in motKeys:
+                print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
+        else:
+            for tag in motKeys:
+                runID = tag.split("-")[0]
+                if len(runList) > 0:
+                    if runID not in runList:
+                        continue
+
+                if tag in motifList:
+                    print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
+
+
+    def motifToGeneStat(self):
+        tags = self.geneToMotifKeys(thekey="mTagID")
+        counter = 0
+        min = len(self.motifToGene(tags[0]))
+        max = 0
+        for tag in tags:
+            numGenes = len(self.motifToGene(tag))
+            print "%s: %d" % (tag, numGenes)
+            if numGenes > max:
+                max = numGenes
+            elif numGenes < min:
+                min = numGenes
+
+            counter += numGenes
+
+        print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
+
+
+    def printGeneToMotif(self, geneList = []):
+        if len(geneList) == 0:
+            for geneID in self.geneToMotifKeys():
+                print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
+        else:
+            for geneID in self.geneToMotifKeys():
+                if geneID in geneList:
+                    print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
+
+
+    def geneToMotifStat(self):
+        genes = self.geneToMotifKeys()
+        counter = 0
+        min = len(self.geneToMotif(genes[0]))
+        max = 0
+        for gene in genes:
+            numMotifs = len(self.geneToMotif(gene))
+            print "%s - %s: %d" % (gene[0], gene[1], numMotifs)
+            if numMotifs > max:
+                max = numMotifs
+            elif numMotifs < min:
+                min = numMotifs
+
+            counter += numMotifs
+
+        print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
+
+    
+    def printGeneProfile(self, geneID):
+        print "\nMotif matches for %s " % (str(geneID))
+        geneProfile = self.makeGeneProfile(geneID)
+        positions = geneProfile.keys()
+        if len(positions) > 0:
+            positions.sort()
+            for pos in positions:
+                print "%s %s" % (str(pos), str(geneProfile[pos]))
+        else:
+            print "Gene had no matching motifs"
+
+
+    def makeGeneProfile(self, geneID):
+        geneBucket = {}
+        geneMotifs = self.geneToMotif(geneID)
+        if len(geneMotifs) > 0:
+            for mot in geneMotifs:
+                (motID, pos) = mot
+                (loc, sense) = pos
+                if int(loc) not in geneBucket.keys():
+                    geneBucket[int(loc)] = []
+
+                geneBucket[int(loc)].append(mot)
+
+        return geneBucket
+
+
+    def getMotifMatches(self, motifIDs=[]):
+        results = {}
+        if len(motifIDs) < 1:
+            motifIDs = self.geneToMotifKeys(thekey="mTagID")
+
+        for motID in motifIDs:
+            results[motID] = []
+            mot = self.findMotif(motID)
+            motLength = len(mot)
+            for match in self.motifToGene(motID):
+                (chrom, hit) = match
+                (pos, sense) = hit
+                if sense == "F":
+                    results[motID].append(self.genepool[chrom][pos:pos + motLength])
+                else:
+                    results[motID].append(cistematic.core.complement(self.genepool[chrom][pos:pos + motLength], motLength))
+
+        return results
+
+
+    def summarizeMotifs(self, fileName):
+        """ saves the number of occurences and actual occurence PWMs of motifs to fileName.
+        """
+        motDict = {}
+        motText = []
+        try:
+            if len(fileName) < 1:
+                raise IOError
+
+            outFile = open(fileName, "a")
+            self.mlog("Saving motif summary")
+            for motID in self.geneToMotifKeys(thekey="mTagID"):
+                matchNum = 0
+                mot = self.findMotif(motID)
+                motLength = len(mot)
+                motDict[motID] = []
+                for index in range(motLength):
+                    motDict[motID].append([0.0, 0.0, 0.0, 0.0])
+
+                for match in self.motifToGene(motID):
+                    matchNum += 1
+                    (chromo, hit) = match
+                    (pos, sense) = hit
+                    if sense == "F":
+                        matchSeq = self.genepool[chromo][pos:pos + motLength]
+                    else:
+                        matchSeq = cistematic.core.complement(self.genepool[chromo][pos: pos + motLength], motLength)
+
+                    for index in range(motLength):
+                        NT = matchSeq[index]
+                        NT = NT.upper()
+                        if NT in ["A", "C", "G", "T"]:
+                            motDict[motID][index][matrixRow[NT]] += 1
+
+                motLine = "motif %s\t %s matches\t'%s'\n %s\n" % (motID, str(matchNum), mot.buildConsensus(), self.formatPWM(motDict[motID]))
+                motText.append(motLine)
+
+            outFile.writelines(motText)
+            outFile.close()
+        except:
+            self.mlog("Could not save motif summary to file %s\n" % fileName)
+
+
+    def printMotifNeighbors(self):
+        for motID in self.motifNeighborsKeys():
+            print "Matches for motif %s" % motID
+            currentHit = ()
+            for entry in self.motifNeighbors(motID):
+                if entry[0] != currentHit:
+                    print "====================="
+                    print "Match %s" % str(entry[0])
+                    currentHit = entry[0]
+
+                print "\tGene %s:" % str(entry[1])
+                try:
+                    goInfo = cistematic.core.getGOInfo(entry[1][3])
+                    for entry in goInfo:
+                        print "\t\t%s" % str(entry)
+                except:
+                    pass
+
+                print "----------------"
+
+
+    def summarizeMotifNeighbors(self, fileName):
+        """ saves the number of occurences and PWMs of motifs within the gene
+            neighborhood radius as mapped using mapNeighbors() to fileName.
+        """
+        motDict = {}
+        motText = []
+        try:
+            if len(fileName) < 1:
+                raise IOError
+            outFile = open(fileName, "a")
+            self.mlog("Saving neighbor summary")
+            for motID in self.motifNeighborsKeys():
+                matchNum = 0
+                mot = self.findMotif(motID)
+                motLength = len(mot)
+                motDict[motID] = []
+                for index in range(motLength):
+                    motDict[motID].append([0.0, 0.0, 0.0, 0.0])
+
+                currentMatch = ""
+                for entry in self.motifNeighbors(motID, discardNullMatches = True):
+                    if currentMatch != entry[0]:
+                        currentMatch = entry[0]
+                        (genechrom, loc) = entry[0]
+                        (pos, sense) = loc
+                        geneID = entry[1][3]
+                        (geno, gene) = geneID
+                        if gene != "NO GENE IN RADIUS":
+                            matchNum += 1
+                            if sense == "F":
+                                matchSeq = self.genepool[genechrom][pos:pos + motLength]
+                            else:
+                                matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
+
+                            for index in range(motLength):
+                                NT = matchSeq[index]
+                                NT = NT.upper()
+                                if NT in ["A", "C", "G", "T"]:
+                                    motDict[motID][index][matrixRow[NT]] += 1
+
+                motLine = "motif %s\t %s matches\n %s\n" % (motID, str(matchNum), self.formatPWM(motDict[motID]))
+                motText.append(motLine)
+
+            outFile.writelines(motText)
+            outFile.close()
+        except:
+            self.mlog("Could not save motif neighbors to file %s\n" % fileName)
+
+
+    def saveMotifNeighbors(self, fileName, fullAnnotations=True):
+        """ save every occurence of the motifs with any adjoining gene(s) to fileName. 
+            Records annotations and GO terms if available when fullAnnotations=True.
+        """
+        goDict = {}
+        annotDict = {}
+        currentGenome = ""
+        neighborList = []
+        self.mlog("Saving motif neighbors to file %s\n" % fileName)
+        if True:
+            if len(fileName) < 1:
+                raise IOError
+            outFile = open(fileName, "a")
+            for motID in self.motifNeighborsKeys():
+                matchNum = 0
+                mot = self.findMotif(motID)
+                motLength = len(mot)
+                currentMatch = ""
+                for entry in self.motifNeighbors(motID):
+                    neighborList = []
+                    if currentMatch != entry[0]:
+                        matchNum += 1
+                        currentMatch = entry[0]
+                        (genechrom, loc) = entry[0]
+                        (genome, chromo) = genechrom
+                        (pos, sense) = loc
+                        if fullAnnotations and genome != currentGenome:
+                            currentGenome = genome
+                            goDict = cistematic.core.getAllGOInfo(genome)
+                            annotDict = cistematic.core.getAllAnnotInfo(genome)
+
+                    start = entry[1][0]
+                    stop = entry[1][1]
+                    geneSense = entry[1][2]
+                    geneID = entry[1][3]
+                    (geno, gene) = geneID
+                    condition = entry[2]
+                    if sense == "F":
+                        matchSeq = self.genepool[genechrom][pos:pos + motLength]
+                    else:
+                        matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
+
+                    currentEntry = "%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (motID, matchNum, genome, chromo, pos, sense, condition, start, stop, geneSense, geno, gene, matchSeq)
+                    if fullAnnotations:
+                        try:
+                            goInfo = goDict.get(geneID, [])
+                            annotInfo = annotDict.get(geneID,[])
+                            annotDescription = ""
+                            prevGoInfo = ""
+                            if self.debugSQL:
+                                print "%s\t%s\t%s" % (str(geneID), str(goInfo), str(annotInfo))
+                            for annot in annotInfo:
+                                annotDescription += annot + " "
+                            if len(goInfo) > 0:
+                                for entry in goInfo:
+                                    if entry == prevGoInfo:
+                                        continue
+                                    neighborList.append("%s\t%s\t%s\n" % (currentEntry, annotDescription, entry))
+                                    prevGoInfo = entry
+                            else:
+                                neighborList.append("%s\t%s\n" % (currentEntry, annotDescription))
+                        except:
+                            neighborList.append("%s\n" % currentEntry)
+                    else:
+                        neighborList.append("%s\n" % currentEntry)
+
+                    outFile.writelines(neighborList)
+
+            outFile.close()
+        else:
+            self.mlog("Could not save motif neighbors to file %s\n" % fileName)
+
+        return neighborList
+
+
+    def buildMotifSize(self):
+        mots = self.getResults()
+        index = 0
+        for mot in mots:
+            motLength = len(mot)
+            if motLength not in self.motifSize.keys():
+                self.motifSize[motLength] =[]
+
+            self.motifSize[motLength].append((index, mot.buildConsensus()))
+            index += 1
+
+
+    def findIdenticalMotifs(self):
+        # FIXME!!!!
+        identicalMotifs = []
+
+        for motLength in self.motifSize.keys():
+            motList = self.motifSize[motLength]
+            motListLen = len(motList)
+            print "doing motif length %d" % motLength
+            for pos in range(motListLen):
+                index = pos + 1
+                while index < motListLen:
+                    if motList[pos][1] == motList[index][1]:
+                        identicalMotifs.append((self.results[pos].tagID, self.results[index].tagID))
+
+                    index += 1
+
+        return identicalMotifs
+
+
+    def createAnalysis(self, dbName=""):
+        """ creates the analysis SQL tables. Should only be run once per database file.
+        """
+        if len(dbName) == 0:
+            dbName = self.expFile
+
+        db = sqlite.connect(dbName, timeout=60)
+        try:    
+            sql = db.cursor()
+            sql.execute("CREATE table geneToMotif(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar)")
+            sql.execute("CREATE table genomeMatches(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar, threshold varchar)")
+            sql.execute("CREATE table motifNeighbors(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, chromNum varchar, location varchar, motifSense varchar, start varchar, stop varchar, sense varchar, locus varchar, condition varchar)")
+            sql.execute("CREATE index mot1 on geneToMotif(expID, analysisID, mTagID)")
+            sql.execute("CREATE index mot2 on genomeMatches(expID, analysisID, mTagID)")
+            sql.execute("CREATE index mot3 on motifNeighbors(expID, analysisID, mTagID)")
+            sql.execute("CREATE index locus1 on geneToMotif(locus)")
+            sql.execute("CREATE index locus2 on motifNeighbors(locus)")
+            sql.execute("CREATE index condition1 on motifNeighbors(condition)")
+            db.commit()
+            sql.close()
+            db.close()
+            self.mlog("Created analysis tables in database %s" % dbName)
+        except:
+            db.close()
+            self.mlog("Could not create tables in database %s" % dbName)
+            self.mlog("WARNING: perhaps you have called createAnalysis twice on the same database?")
+
+
+    def loadAnalysis(self, analysisID="default", dbName=""):
+        """ changes the analysisID to use the specified one, or use default. Must be used before reading or
+            writing any data to analysis tables.
+        """
+        self.analysisID = analysisID
+        if len (dbName) > 0:
+            self.dbName = dbName
+        else:
+            self.dbName = self.expFile
+
+
+    def resetAnalysis(self):
+        """ currently zeros out some of the analysis structures. obsolete ?
+        """
+        self.mlog("resetting analysis")
+        self.motifSize = {}
+        self.motifToAnnotations = {}
+        self.coappear  = {}
+
+
+    def saveAnalysis(self):
+        """ currently obsolete - kept for backward compability.
+        """
+        try:
+            self.mlog("saved analysis %s" % self.analysisID)
+        except:
+            self.mlog("could not save %s" % self.analysisID)
+
+
+    def deleteAnalysis(self, analysisID="default"):
+        """ delete all of the analysis either named aName, or matching
+            an experiment (the current one by default.) Currently a NO-OP. Obsolete.
+        """
+        pass
+
+
+    def buildCoappear(self, motifList=[], distance="100000", strict = False):
+        """ Builds coappear dictionary of geneIDs where motifs occur with others. Can limit to
+            motifs in motifList (default is all motifs), and by distance (default is 100,000 base pairs.) 
+            Results are accessed through printCoappear() and saveCoappear().
+        """
+        occurenceList = {}
+        self.coappear = {}
+        processedMotifs = []
+        motifListLen = len(motifList)
+        if motifListLen == 0:
+            #use all motifs
+            motifList = self.geneToMotifKeys(thekey="mTagID")
+            motifListLen = len(motifList)
+
+        for motif in motifList:
+            if motif not in processedMotifs:
+                matchList = self.motifToGene(motif)
+                for match in matchList:
+                    (geneID, loc) = match
+                    if geneID not in occurenceList:
+                        occurenceList[geneID] = []
+
+                    (location, sense) = loc
+                    occurenceList[geneID].append((location, sense, motif))
+
+            processedMotifs.append(motif)
+
+        for geneID in occurenceList:
+            occurenceList[geneID].sort()
+            if geneID not in self.coappear:
+                self.coappear[geneID] = []    
+
+            coappearing = False
+            differentMotifs = False
+            coappearList = []
+            prevOccurence = occurenceList[geneID][0]
+            del occurenceList[geneID][0]
+            for occurence in occurenceList[geneID]:
+                if occurence[0] < prevOccurence[0] + distance:
+                    coappearing = True
+                    if occurence[2] != prevOccurence[2]:
+                        differentMotifs = True
+
+                    coappearList.append(prevOccurence)
+                elif coappearing:
+                    if strict:
+                        if differentMotifs:
+                            coappearList.append(prevOccurence)
+                            self.coappear[geneID].append(coappearList)
+                    else:
+                        coappearList.append(prevOccurence)
+                        self.coappear[geneID].append(coappearList)
+
+                    coappearing = False
+                    differentMotifs = False
+                    coappearList = []
+
+                prevOccurence = occurence
+
+            if coappearing:
+                if strict:
+                    if differentMotifs:
+                        coappearList.append(prevOccurence)
+                        self.coappear[geneID].append(coappearList)
+                else:
+                    coappearList.append(prevOccurence)
+                    self.coappear[geneID].append(coappearList)
+
+
+    def printCoappear(self):
+        """ prints a formatted version of the coappear dictionary built with buildCoappear()
+        """
+        for geneID in self.coappear:
+            print " ===== %s =====" % str(geneID)
+            for occurence in self.coappear[geneID]:
+                print str(occurence)
+            print " ============="
+
+
+    def saveCoappear(self, fileName):
+        """ save coappear dictionary in tabular format. Returns:
+            index, genome, locus, pos, sense, tag in tab-separated format.
+        """
+        index = 0
+        outLines = []
+        if 1:
+            if len(fileName) < 1:
+                raise IOError
+
+            outFile = open(fileName, "a")
+            for geneID in self.coappear:
+                (genome, locus) = geneID
+                for occurence in self.coappear[geneID]:
+                    index += 1
+                    coappearLine = "%d\t%s\t%s" % (index, genome, locus)
+                    for match in occurence:
+                        (pos, sense, tag) = match
+                        coappearLine += "\t%s\t%s\t%s" % (pos, sense, tag)
+
+                    outLines.append(coappearLine + "\n")
+
+            outFile.writelines(outLines)
+            outFile.close()
+        else:
+            self.mlog("Could not save coappear to file %s\n" % fileName)
+
+
+    def sql(self, stmt, commit=""):
+        """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
+        """
+        db = sqlite.connect(self.dbName, timeout=60)
+        sqlc = db.cursor()
+        if self.debugSQL:
+            print "sql: %s" % stmt
+
+        sqlc.execute(stmt)
+        res = sqlc.fetchall()
+        if commit != "":
+            db.commit()
+
+        sqlc.close()
+        db.close()
+
+        return res
+
+
+    def batchsql(self, stmt, batch):
+        """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
+        """
+        res = []
+        db = sqlite.connect(self.dbName, timeout=60)
+        sqlc = db.cursor()
+        if self.debugSQL:
+            print "batchsql: %s" % stmt
+            print "batchsql: %s" % str(batch)
+
+        sqlc.executemany(stmt, batch)
+        db.commit()
+        sqlc.close()
+        db.close()
+
+        return res
\ No newline at end of file