--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# basic analysis code
+from cistematic.core.motif import matrixRow
+import cistematic.core
+import string
+
+try:
+ from pysqlite2 import dbapi2 as sqlite
+except:
+ from sqlite3 import dbapi2 as sqlite
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+ cisRoot = environ.get("CISTEMATIC_ROOT")
+else:
+ cisRoot = "/proj/genome"
+
+knownMotifsDB = "%s/db/known_motifs.db" % cisRoot
+
+
+class AnalyzeMotifs:
+ motifSize = {}
+ coappear = {}
+ motifToAnnotations = {}
+ annotationEntries = []
+ genomeMatches = {}
+ loadedGenomes = []
+ dbName = ""
+ analysisID = "default"
+
+
+ def initializeKnownMotifs(self, source=""):
+ """ loads known motifs from motif database, optionally limiting by source.
+ """
+ self.annotationEntries = []
+ dbkm = sqlite.connect(knownMotifsDB, timeout=60)
+ sqlkm = dbkm.cursor()
+ if len(source) > 0:
+ sqlkm.execute("select * from motifs where source = '%s' " % source)
+ else:
+ sqlkm.execute("select * from motifs")
+
+ dbEntries = sqlkm.fetchall()
+ dbkm.close()
+
+ for entry in dbEntries:
+ (index, source, name, theseq, reference, other_info) = entry
+ self.annotationEntries.append((index, source, name, theseq, reference, other_info))
+
+
+ def findAnnotations(self, motID, thresh=-1, numberN=0):
+ mot = self.findMotif(motID)
+ if thresh < 0:
+ thresh = self.threshold
+
+ for entry in self.annotationEntries:
+ (id, source, name, theseq, reference, other) = entry
+ loc = mot.locateMotif(theseq, thresh, numberN)
+ if len(loc) > 0 :
+ self.motifToAnnotations[motID].append((source, name, theseq, reference, other, loc))
+
+ if len(self.motifToAnnotations[motID]) >= 500:
+ break
+
+
+ def findAnnotationsRE(self, motID, numMismatches=0):
+ """ Calls the motif's consensus locator on each entry in annotationEntries.
+ """
+ mot = self.findMotif(motID)
+ if numMismatches > 0:
+ mot.initializeMismatchRE(numMismatches)
+ else:
+ mot.initializeRE()
+
+ for entry in self.annotationEntries:
+ (id, source, name, annotSeq, reference, other) = entry
+ # Must use non-RE optimized version for short annotations
+ if len(annotSeq) < len(mot):
+ pad = "N" * (len(mot) - len(annotSeq))
+ annotSeq = pad + annotSeq + pad
+ loc = mot.locateConsensus(annotSeq)
+ else:
+ loc = mot.locateConsensusRE(annotSeq)
+
+ if len(loc) > 0 :
+ self.motifToAnnotations[motID].append((source, name, annotSeq, reference, other, loc))
+
+ if len(self.motifToAnnotations[motID]) >= 500:
+ break
+
+
+ def printAnnotations(self):
+ """ prints the motif annotations for every motif result.
+ """
+ for mot in self.getResults():
+ annotations = self.motifToAnnotations[mot.tagID]
+ print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
+ for annotation in annotations:
+ (source, name, annotSeq, reference, other, loc) = annotation
+ print "%s:%s\t%s\t%s\t%s" % (source, name, reference, annotSeq, str(loc))
+ print other
+ print
+
+
+ def printAnnotationsShort(self):
+ """ prints a compressed version of the annotations.
+ """
+ for motID in self.motifToAnnotations.keys():
+ for annotation in self.motifToAnnotations[motID]:
+ print "%s: %s (%s)\t%s - %s" % (annotation[0], annotation[1], annotation[4], annotation[2], annotation[5])
+
+
+ def returnAnnotations(self, motID):
+ """ returns the [annotations] for a particular motID
+ """
+ try:
+ return self.motifToAnnotations[motID]
+ except KeyError:
+ return []
+
+
+ def annotateMotifs(self, thresh=0.0, numberN=0):
+ self.mlog( "Annotating Motifs with threshold %d and %d extra Ns" % (1.0 + thresh, numberN))
+ if len(self.annotationEntries) == 0:
+ self.initializeKnownMotifs()
+
+ for mot in self.getResults():
+ mot.setThreshold(thresh)
+ self.motifToAnnotations[mot.tagID] = []
+ self.findAnnotations(mot.tagID, thresh, numberN)
+
+
+ def annotateConsensus(self, numMismatches=0, source=""):
+ self.mlog( "Annotating Consensus")
+ if len(self.annotationEntries) == 0:
+ self.initializeKnownMotifs(source)
+
+ for mot in self.getResults():
+ self.motifToAnnotations[mot.tagID] = []
+ self.findAnnotationsRE(mot.tagID, numMismatches)
+
+
+ def printConsensus(self):
+ """ print the consensus for every motif result.
+ """
+ for mot in self.getResults():
+ print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
+
+
+ def formatPWM(self, aPWM):
+ """ format a PWM into a printable string.
+ """
+ aRow = ""
+ cRow = ""
+ gRow = ""
+ tRow = ""
+ for col in aPWM:
+ aRow = string.join([aRow, str(round(col[matrixRow["A"]],4))], "\t")
+ cRow = string.join([cRow, str(round(col[matrixRow["C"]],4))], "\t")
+ gRow = string.join([gRow, str(round(col[matrixRow["G"]],4))], "\t")
+ tRow = string.join([tRow, str(round(col[matrixRow["T"]],4))], "\t")
+
+ formattedPWM = "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
+
+ return formattedPWM
+
+
+ def appendGeneToMotif(self,mTagID, geneID, pos):
+ """ make an entry in the geneToMotif table.
+ """
+ (genome, locus) = geneID
+ (loc, sense) = pos
+ stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense)
+ res = self.sql(stmt, "commit")
+
+
+ def appendGeneToMotifBatch(self, batch):
+ """ make a batch of entries in the geneToMotif table.
+ """
+ batchInserts = []
+ stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, ?, ?, ?, ?, ?, ?, ?)"
+ for entry in batch:
+ (mTagID, geneID, pos) = entry
+ (genome, locus) = geneID
+ (loc, sense) = pos
+ batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense))
+
+ res = self.batchsql(stmt, batchInserts)
+
+
+ def geneToMotifKeys(self, thekey="geneID"):
+ """ return the keys to the geneToMotif table. The default key is geneID, otherwise returns mTagID.
+ """
+ results = []
+ if thekey == "geneID":
+ stmt = "SELECT distinct genome, locus from geneToMotif where expID = '%s' and analysisID = '%s' order by locus" % (self.experimentID, self.analysisID)
+ else:
+ stmt = "SELECT distinct mTagID from geneToMotif where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
+
+ res = self.sql(stmt)
+ for entry in res:
+ if thekey == "geneID":
+ (genome, locus) = entry
+ results.append((str(genome), str(locus)))
+ else:
+ mTagID = entry[0]
+ results.append(str(mTagID))
+
+ return results
+
+
+ def appendMotifNeighbors(self,mTagID, match, condition="", geneEntry=""):
+ """ make an entry in the motifNeighbors table.
+ """
+ (chromo, pos) = match
+ (genome, chromNum) = chromo
+ (loc, mSense) = pos
+ if geneEntry != "":
+ (start, stop, sense, geneID) = geneEntry
+ (genome2, locus) = geneID
+ else:
+ start = "-"
+ stop = "-"
+ sense = "-"
+ locus = "NO GENE IN RADIUS"
+
+ stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, '%s','%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition)
+ res = self.sql(stmt, "commit")
+
+
+ def appendMotifNeighborsBatch(self, batch):
+ """ make a batch of entries in the motifNeighbors table.
+ """
+ batchInserts = []
+ stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
+ for entry in batch:
+ (mTagID, match, condition, geneEntry) = entry
+ (chromo, pos) = match
+ (genome, chromNum) = chromo
+ (loc, mSense) = pos
+ if geneEntry != "":
+ (start, stop, sense, geneID) = geneEntry
+ (genome2, locus) = geneID
+ else:
+ start = "-"
+ stop = "-"
+ sense = "-"
+ locus = "NO GENE IN RADIUS"
+
+ batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition))
+
+ res = self.batchsql(stmt, batchInserts)
+
+
+ def motifNeighborsKeys(self, condition=""):
+ """ returns a [list of motif ID's] in the motifNeighbor table. Can restrict to a particular condition.
+ """
+ results = []
+ if condition != "":
+ stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' and condition ='%s' order by mTagID" % (self.experimentID, self.analysisID, condition)
+ else:
+ stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
+
+ res = self.sql(stmt)
+ for entry in res:
+ mTagID = entry[0]
+ results.append(mTagID)
+
+ return results
+
+
+ def motifNeighbors(self,mTagID, condition="", discardNullMatches=False):
+ """ get entries in the geneToMotif table that match a particular Motif & condition.
+ """
+ results = []
+ genome = ""
+ chromNum = ""
+ loc = ""
+ mSense = ""
+ start = ""
+ stop = ""
+ sense = ""
+ locus = ""
+ stmt = "select distinct genome, chromNum, location, motifSense, start, stop, sense, locus, condition from motifNeighbors where expID='%s' and analysisID = '%s' and mTagID='%s' " % (self.experimentID, self.analysisID, mTagID)
+ if discardNullMatches:
+ stmt += " and condition <> 'NONE' "
+
+ if condition != '':
+ stmt += " and condition = '%s' " % (condition)
+
+ stmt += " order by ID desc "
+ res = self.sql(stmt)
+
+ for entry in res:
+ (genome, chromNum, loc, mSense, start, stop, sense, locus, condition) = entry
+ match = ((genome, chromNum),(int(loc), mSense))
+ geneEntry = (start, stop, sense, (genome, locus))
+ results.append((match, geneEntry, condition))
+
+ return results
+
+
+ def motifToGene(self, mTagID):
+ """ returns a list of matching geneIDs with location and sense found for a given motif.
+ """
+ results = []
+ stmt = "SELECT distinct genome, locus, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and mTagID = '%s' order by locus" % (self.experimentID, self.analysisID, mTagID)
+ res = self.sql(stmt)
+ for entry in res:
+ (genome, locus, location, sense) = entry
+ results.append(((genome, locus), (int(location), sense)))
+
+ return results
+
+
+ def geneToMotif(self, geneID):
+ """ returns a list of matching motifs with location and sense found for a given geneID
+ """
+ results = []
+ (genome, locus) = geneID
+ stmt = "SELECT distinct mTagID, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and genome = '%s' and locus = '%s' order by location" % (self.experimentID, self.analysisID, genome, locus)
+ res = self.sql(stmt)
+ for entry in res:
+ (mTagID, location, sense) = entry
+ results.append((mTagID, (location, sense)))
+
+ return results
+
+
+ def mapMotifs(self, Threshold=90.0, numberN=0, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, verbose=True):
+ """ find occurences of result motifs in the current genepool using PWMs. Slow.
+ """
+ currentGenome = ""
+ currentDataset = -1
+ genepoolKeys = self.genepool.keys()
+ posResults = []
+ for mot in self.getResults():
+ mTagID = mot.tagID
+ runID = mTagID.split("-")[0]
+ (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
+ if enforceSanity and not mot.isSane():
+ self.mlog("mapMotifs: not mapping %s - failed sanity check" % (mTagID))
+ continue
+
+ if mot.tagID in ignoreList:
+ self.mlog("mapMotifs: not mapping %s" % mot.tagID)
+ continue
+
+ if len(runList) > 0:
+ if int(runID) not in runList:
+ self.mlog("mapMotifs: not mapping run %s" % runID)
+ continue
+
+ if mapAllSeqs:
+ dataID = 0
+
+ self.mlog("mapMotifs: mapping %s with threshold %f and at most %d N" % (mTagID, Threshold, numberN))
+ if dataID <> currentDataset:
+ currentDataset = dataID
+ for geneID in self.getSetting(dataID):
+ if geneID[0] <> currentGenome:
+ currentGenome = geneID[0]
+
+ try:
+ if geneID not in genepoolKeys:
+ self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstreamself.geneDB, False, self.boundToNextGene)
+ genepoolKeys.append[geneID]
+ except:
+ self.mlog("mapMotifs: could not load %s" % (str(geneID)))
+ break
+
+ for geneID in genepoolKeys:
+ if verbose:
+ print "mapMotifs: %s\n" % str(geneID)
+
+ motifPos = mot.locateMotif(self.genepool[geneID], Threshold, numberN)
+
+ if len(motifPos) > 0:
+ for pos in motifPos:
+ posResults.append((mTagID, geneID, pos))
+
+ self.appendGeneToMotifBatch(posResults)
+
+
+ def mapConsensus(self, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, numMismatches=0):
+ """ find occurences of result motifs in the current genepool using regular expressions, allowing
+ for a number of mismatches.
+ """
+ currentGenome = ""
+ currentDataset = -1
+ genepoolKeys = self.genepool.keys()
+ posResults = []
+ for mot in self.getResults():
+ mTagID = mot.tagID
+ runID = mTagID.split("-")[0]
+ (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
+ if mot.tagID in ignoreList:
+ self.mlog("mapConsensus: not mapping %s" % mot.tagID)
+ continue
+
+ if len(runList) > 0:
+ if int(runID) not in runList:
+ self.mlog("mapConsensus: not mapping run %s" % runID)
+ continue
+
+ if enforceSanity and not mot.isSane():
+ self.mlog("mapConsensus: not mapping %s - failed sanity check" % (mTagID))
+ continue
+
+ if mapAllSeqs:
+ dataID = 0
+
+ if numMismatches > 0:
+ mot.initializeMismatchRE(numMismatches)
+ else:
+ mot.initializeRE()
+ self.mlog("mapConsensus: mapping %s with %s mismatches" % (mTagID, numMismatches))
+ if dataID <> currentDataset:
+ currentDataset = dataID
+ for geneID in self.getSetting(dataID):
+ if geneID[0] <> currentGenome:
+ currentGenome = geneID[0]
+
+ try:
+ if geneID not in genepoolKeys:
+ self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstream)
+ genepoolKeys.append[geneID]
+ except:
+ self.mlog("mapConsensus: could not load %s" % (str(geneID)))
+ break
+
+ for geneID in genepoolKeys:
+ motifPos = mot.locateConsensusRE(self.genepool[geneID])
+
+ if len(motifPos) > 0:
+ for pos in motifPos:
+ posResults.append((mTagID, geneID, pos))
+
+ self.appendGeneToMotifBatch(posResults)
+
+
+ def mapFeatures(self, radius):
+ """ returns features within a certain radius in bp of all matches.
+ """
+ FeatureList = []
+ for mot in self.getResults():
+ mTagID = mot.tagID
+ self.mlog("mapFeatures: mapping %s using a radius of %d bp" % (mTagID, radius))
+ for match in self.motifToGene(mTagID):
+ matchFeatures = cistematic.core.retrieveFeatures(match, radius)
+ for entry in matchFeatures:
+ FeatureList.append((mTagID, match, entry))
+
+ return FeatureList
+
+
+ def limitGeneEntries(self, geneList, lowerBound, upperBound):
+ results = []
+ for entry in geneList:
+ if entry[1] < lowerBound or entry[0] > upperBound:
+ continue
+
+ results.append(entry)
+
+ return results
+
+
+ def mapNeighbors(self, radius, annotate=True):
+ """ returns genes on a chromosome within a certain radius in bp.
+ """
+ localGeneList = {}
+ prevChromo = ""
+ neighborResults = []
+ for mot in self.getResults():
+ mTagID = mot.tagID
+ motLen = len(mot)
+ motRadius = motLen / 2
+ mtgList = self.motifToGene(mTagID)
+ self.mlog("mapNeighbors: mapping %s using a radius of %d bp (%d instances)" % (mTagID, radius, len(mtgList)))
+ index = 0
+ for match in mtgList:
+ (chromo, hit) = match
+ if annotate and chromo != prevChromo:
+ prevChromo = chromo
+ localGeneList = cistematic.core.getChromoGeneEntries(chromo)
+
+ index += 1
+ if (index % 1000) == 0:
+ print "."
+
+ matchCounter = 0
+ matchCDS = []
+ if annotate:
+ (chromo, hit) = match
+ matchpos = int(hit[0])
+ lowerBound = matchpos - radius
+ upperBound = matchpos + radius
+ match2 = (chromo, (matchpos + motRadius, hit[1]))
+ matchFeatures = cistematic.core.retrieveFeatures(match2, motRadius, "CDS")
+ for entry in matchFeatures:
+ matchCDS.append((chromo[0], entry[0]))
+
+ geneEntriesList = self.limitGeneEntries(localGeneList, lowerBound, upperBound)
+ for geneEntry in geneEntriesList:
+ beg = int(geneEntry[0])
+ end = int(geneEntry[1])
+ sense = geneEntry[2]
+ gID = geneEntry[3]
+ if gID in matchCDS: # matching within the coding sequence
+ neighborResults.append((mTagID, match, "CDS", geneEntry))
+ matchCounter += 1
+ elif matchpos >= beg and matchpos <= end: # not in CDS, but in gene
+ neighborResults.append((mTagID, match, "GENE", geneEntry))
+ matchCounter += 1
+ elif matchpos < beg:
+ if sense == "F":
+ neighborResults.append((mTagID, match, "UPSTREAM", geneEntry))
+ else:
+ neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
+
+ matchCounter += 1
+ else:
+ if sense == "F":
+ neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
+ else:
+ neighborResults.append((mTagID,match, "UPSTREAM", geneEntry))
+
+ matchCounter += 1
+
+ if matchCounter < 1:
+ neighborResults.append((mTagID, match, "NONE", ""))
+
+ self.appendMotifNeighborsBatch(neighborResults)
+
+
+ def printMotifToGene(self, motifList=[], runList=[]):
+ motKeys = self.geneToMotifKeys(thekey="mTagID")
+ if len(motifList) == 0 and len(runList) == 0:
+ for tag in motKeys:
+ print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
+ else:
+ for tag in motKeys:
+ runID = tag.split("-")[0]
+ if len(runList) > 0:
+ if runID not in runList:
+ continue
+
+ if tag in motifList:
+ print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
+
+
+ def motifToGeneStat(self):
+ tags = self.geneToMotifKeys(thekey="mTagID")
+ counter = 0
+ min = len(self.motifToGene(tags[0]))
+ max = 0
+ for tag in tags:
+ numGenes = len(self.motifToGene(tag))
+ print "%s: %d" % (tag, numGenes)
+ if numGenes > max:
+ max = numGenes
+ elif numGenes < min:
+ min = numGenes
+
+ counter += numGenes
+
+ print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
+
+
+ def printGeneToMotif(self, geneList = []):
+ if len(geneList) == 0:
+ for geneID in self.geneToMotifKeys():
+ print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
+ else:
+ for geneID in self.geneToMotifKeys():
+ if geneID in geneList:
+ print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
+
+
+ def geneToMotifStat(self):
+ genes = self.geneToMotifKeys()
+ counter = 0
+ min = len(self.geneToMotif(genes[0]))
+ max = 0
+ for gene in genes:
+ numMotifs = len(self.geneToMotif(gene))
+ print "%s - %s: %d" % (gene[0], gene[1], numMotifs)
+ if numMotifs > max:
+ max = numMotifs
+ elif numMotifs < min:
+ min = numMotifs
+
+ counter += numMotifs
+
+ print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
+
+
+ def printGeneProfile(self, geneID):
+ print "\nMotif matches for %s " % (str(geneID))
+ geneProfile = self.makeGeneProfile(geneID)
+ positions = geneProfile.keys()
+ if len(positions) > 0:
+ positions.sort()
+ for pos in positions:
+ print "%s %s" % (str(pos), str(geneProfile[pos]))
+ else:
+ print "Gene had no matching motifs"
+
+
+ def makeGeneProfile(self, geneID):
+ geneBucket = {}
+ geneMotifs = self.geneToMotif(geneID)
+ if len(geneMotifs) > 0:
+ for mot in geneMotifs:
+ (motID, pos) = mot
+ (loc, sense) = pos
+ if int(loc) not in geneBucket.keys():
+ geneBucket[int(loc)] = []
+
+ geneBucket[int(loc)].append(mot)
+
+ return geneBucket
+
+
+ def getMotifMatches(self, motifIDs=[]):
+ results = {}
+ if len(motifIDs) < 1:
+ motifIDs = self.geneToMotifKeys(thekey="mTagID")
+
+ for motID in motifIDs:
+ results[motID] = []
+ mot = self.findMotif(motID)
+ motLength = len(mot)
+ for match in self.motifToGene(motID):
+ (chrom, hit) = match
+ (pos, sense) = hit
+ if sense == "F":
+ results[motID].append(self.genepool[chrom][pos:pos + motLength])
+ else:
+ results[motID].append(cistematic.core.complement(self.genepool[chrom][pos:pos + motLength], motLength))
+
+ return results
+
+
+ def summarizeMotifs(self, fileName):
+ """ saves the number of occurences and actual occurence PWMs of motifs to fileName.
+ """
+ motDict = {}
+ motText = []
+ try:
+ if len(fileName) < 1:
+ raise IOError
+
+ outFile = open(fileName, "a")
+ self.mlog("Saving motif summary")
+ for motID in self.geneToMotifKeys(thekey="mTagID"):
+ matchNum = 0
+ mot = self.findMotif(motID)
+ motLength = len(mot)
+ motDict[motID] = []
+ for index in range(motLength):
+ motDict[motID].append([0.0, 0.0, 0.0, 0.0])
+
+ for match in self.motifToGene(motID):
+ matchNum += 1
+ (chromo, hit) = match
+ (pos, sense) = hit
+ if sense == "F":
+ matchSeq = self.genepool[chromo][pos:pos + motLength]
+ else:
+ matchSeq = cistematic.core.complement(self.genepool[chromo][pos: pos + motLength], motLength)
+
+ for index in range(motLength):
+ NT = matchSeq[index]
+ NT = NT.upper()
+ if NT in ["A", "C", "G", "T"]:
+ motDict[motID][index][matrixRow[NT]] += 1
+
+ motLine = "motif %s\t %s matches\t'%s'\n %s\n" % (motID, str(matchNum), mot.buildConsensus(), self.formatPWM(motDict[motID]))
+ motText.append(motLine)
+
+ outFile.writelines(motText)
+ outFile.close()
+ except:
+ self.mlog("Could not save motif summary to file %s\n" % fileName)
+
+
+ def printMotifNeighbors(self):
+ for motID in self.motifNeighborsKeys():
+ print "Matches for motif %s" % motID
+ currentHit = ()
+ for entry in self.motifNeighbors(motID):
+ if entry[0] != currentHit:
+ print "====================="
+ print "Match %s" % str(entry[0])
+ currentHit = entry[0]
+
+ print "\tGene %s:" % str(entry[1])
+ try:
+ goInfo = cistematic.core.getGOInfo(entry[1][3])
+ for entry in goInfo:
+ print "\t\t%s" % str(entry)
+ except:
+ pass
+
+ print "----------------"
+
+
+ def summarizeMotifNeighbors(self, fileName):
+ """ saves the number of occurences and PWMs of motifs within the gene
+ neighborhood radius as mapped using mapNeighbors() to fileName.
+ """
+ motDict = {}
+ motText = []
+ try:
+ if len(fileName) < 1:
+ raise IOError
+ outFile = open(fileName, "a")
+ self.mlog("Saving neighbor summary")
+ for motID in self.motifNeighborsKeys():
+ matchNum = 0
+ mot = self.findMotif(motID)
+ motLength = len(mot)
+ motDict[motID] = []
+ for index in range(motLength):
+ motDict[motID].append([0.0, 0.0, 0.0, 0.0])
+
+ currentMatch = ""
+ for entry in self.motifNeighbors(motID, discardNullMatches = True):
+ if currentMatch != entry[0]:
+ currentMatch = entry[0]
+ (genechrom, loc) = entry[0]
+ (pos, sense) = loc
+ geneID = entry[1][3]
+ (geno, gene) = geneID
+ if gene != "NO GENE IN RADIUS":
+ matchNum += 1
+ if sense == "F":
+ matchSeq = self.genepool[genechrom][pos:pos + motLength]
+ else:
+ matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
+
+ for index in range(motLength):
+ NT = matchSeq[index]
+ NT = NT.upper()
+ if NT in ["A", "C", "G", "T"]:
+ motDict[motID][index][matrixRow[NT]] += 1
+
+ motLine = "motif %s\t %s matches\n %s\n" % (motID, str(matchNum), self.formatPWM(motDict[motID]))
+ motText.append(motLine)
+
+ outFile.writelines(motText)
+ outFile.close()
+ except:
+ self.mlog("Could not save motif neighbors to file %s\n" % fileName)
+
+
+ def saveMotifNeighbors(self, fileName, fullAnnotations=True):
+ """ save every occurence of the motifs with any adjoining gene(s) to fileName.
+ Records annotations and GO terms if available when fullAnnotations=True.
+ """
+ goDict = {}
+ annotDict = {}
+ currentGenome = ""
+ neighborList = []
+ self.mlog("Saving motif neighbors to file %s\n" % fileName)
+ if True:
+ if len(fileName) < 1:
+ raise IOError
+ outFile = open(fileName, "a")
+ for motID in self.motifNeighborsKeys():
+ matchNum = 0
+ mot = self.findMotif(motID)
+ motLength = len(mot)
+ currentMatch = ""
+ for entry in self.motifNeighbors(motID):
+ neighborList = []
+ if currentMatch != entry[0]:
+ matchNum += 1
+ currentMatch = entry[0]
+ (genechrom, loc) = entry[0]
+ (genome, chromo) = genechrom
+ (pos, sense) = loc
+ if fullAnnotations and genome != currentGenome:
+ currentGenome = genome
+ goDict = cistematic.core.getAllGOInfo(genome)
+ annotDict = cistematic.core.getAllAnnotInfo(genome)
+
+ start = entry[1][0]
+ stop = entry[1][1]
+ geneSense = entry[1][2]
+ geneID = entry[1][3]
+ (geno, gene) = geneID
+ condition = entry[2]
+ if sense == "F":
+ matchSeq = self.genepool[genechrom][pos:pos + motLength]
+ else:
+ matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
+
+ currentEntry = "%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (motID, matchNum, genome, chromo, pos, sense, condition, start, stop, geneSense, geno, gene, matchSeq)
+ if fullAnnotations:
+ try:
+ goInfo = goDict.get(geneID, [])
+ annotInfo = annotDict.get(geneID,[])
+ annotDescription = ""
+ prevGoInfo = ""
+ if self.debugSQL:
+ print "%s\t%s\t%s" % (str(geneID), str(goInfo), str(annotInfo))
+ for annot in annotInfo:
+ annotDescription += annot + " "
+ if len(goInfo) > 0:
+ for entry in goInfo:
+ if entry == prevGoInfo:
+ continue
+ neighborList.append("%s\t%s\t%s\n" % (currentEntry, annotDescription, entry))
+ prevGoInfo = entry
+ else:
+ neighborList.append("%s\t%s\n" % (currentEntry, annotDescription))
+ except:
+ neighborList.append("%s\n" % currentEntry)
+ else:
+ neighborList.append("%s\n" % currentEntry)
+
+ outFile.writelines(neighborList)
+
+ outFile.close()
+ else:
+ self.mlog("Could not save motif neighbors to file %s\n" % fileName)
+
+ return neighborList
+
+
+ def buildMotifSize(self):
+ mots = self.getResults()
+ index = 0
+ for mot in mots:
+ motLength = len(mot)
+ if motLength not in self.motifSize.keys():
+ self.motifSize[motLength] =[]
+
+ self.motifSize[motLength].append((index, mot.buildConsensus()))
+ index += 1
+
+
+ def findIdenticalMotifs(self):
+ # FIXME!!!!
+ identicalMotifs = []
+
+ for motLength in self.motifSize.keys():
+ motList = self.motifSize[motLength]
+ motListLen = len(motList)
+ print "doing motif length %d" % motLength
+ for pos in range(motListLen):
+ index = pos + 1
+ while index < motListLen:
+ if motList[pos][1] == motList[index][1]:
+ identicalMotifs.append((self.results[pos].tagID, self.results[index].tagID))
+
+ index += 1
+
+ return identicalMotifs
+
+
+ def createAnalysis(self, dbName=""):
+ """ creates the analysis SQL tables. Should only be run once per database file.
+ """
+ if len(dbName) == 0:
+ dbName = self.expFile
+
+ db = sqlite.connect(dbName, timeout=60)
+ try:
+ sql = db.cursor()
+ sql.execute("CREATE table geneToMotif(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar)")
+ sql.execute("CREATE table genomeMatches(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar, threshold varchar)")
+ sql.execute("CREATE table motifNeighbors(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, chromNum varchar, location varchar, motifSense varchar, start varchar, stop varchar, sense varchar, locus varchar, condition varchar)")
+ sql.execute("CREATE index mot1 on geneToMotif(expID, analysisID, mTagID)")
+ sql.execute("CREATE index mot2 on genomeMatches(expID, analysisID, mTagID)")
+ sql.execute("CREATE index mot3 on motifNeighbors(expID, analysisID, mTagID)")
+ sql.execute("CREATE index locus1 on geneToMotif(locus)")
+ sql.execute("CREATE index locus2 on motifNeighbors(locus)")
+ sql.execute("CREATE index condition1 on motifNeighbors(condition)")
+ db.commit()
+ sql.close()
+ db.close()
+ self.mlog("Created analysis tables in database %s" % dbName)
+ except:
+ db.close()
+ self.mlog("Could not create tables in database %s" % dbName)
+ self.mlog("WARNING: perhaps you have called createAnalysis twice on the same database?")
+
+
+ def loadAnalysis(self, analysisID="default", dbName=""):
+ """ changes the analysisID to use the specified one, or use default. Must be used before reading or
+ writing any data to analysis tables.
+ """
+ self.analysisID = analysisID
+ if len (dbName) > 0:
+ self.dbName = dbName
+ else:
+ self.dbName = self.expFile
+
+
+ def resetAnalysis(self):
+ """ currently zeros out some of the analysis structures. obsolete ?
+ """
+ self.mlog("resetting analysis")
+ self.motifSize = {}
+ self.motifToAnnotations = {}
+ self.coappear = {}
+
+
+ def saveAnalysis(self):
+ """ currently obsolete - kept for backward compability.
+ """
+ try:
+ self.mlog("saved analysis %s" % self.analysisID)
+ except:
+ self.mlog("could not save %s" % self.analysisID)
+
+
+ def deleteAnalysis(self, analysisID="default"):
+ """ delete all of the analysis either named aName, or matching
+ an experiment (the current one by default.) Currently a NO-OP. Obsolete.
+ """
+ pass
+
+
+ def buildCoappear(self, motifList=[], distance="100000", strict = False):
+ """ Builds coappear dictionary of geneIDs where motifs occur with others. Can limit to
+ motifs in motifList (default is all motifs), and by distance (default is 100,000 base pairs.)
+ Results are accessed through printCoappear() and saveCoappear().
+ """
+ occurenceList = {}
+ self.coappear = {}
+ processedMotifs = []
+ motifListLen = len(motifList)
+ if motifListLen == 0:
+ #use all motifs
+ motifList = self.geneToMotifKeys(thekey="mTagID")
+ motifListLen = len(motifList)
+
+ for motif in motifList:
+ if motif not in processedMotifs:
+ matchList = self.motifToGene(motif)
+ for match in matchList:
+ (geneID, loc) = match
+ if geneID not in occurenceList:
+ occurenceList[geneID] = []
+
+ (location, sense) = loc
+ occurenceList[geneID].append((location, sense, motif))
+
+ processedMotifs.append(motif)
+
+ for geneID in occurenceList:
+ occurenceList[geneID].sort()
+ if geneID not in self.coappear:
+ self.coappear[geneID] = []
+
+ coappearing = False
+ differentMotifs = False
+ coappearList = []
+ prevOccurence = occurenceList[geneID][0]
+ del occurenceList[geneID][0]
+ for occurence in occurenceList[geneID]:
+ if occurence[0] < prevOccurence[0] + distance:
+ coappearing = True
+ if occurence[2] != prevOccurence[2]:
+ differentMotifs = True
+
+ coappearList.append(prevOccurence)
+ elif coappearing:
+ if strict:
+ if differentMotifs:
+ coappearList.append(prevOccurence)
+ self.coappear[geneID].append(coappearList)
+ else:
+ coappearList.append(prevOccurence)
+ self.coappear[geneID].append(coappearList)
+
+ coappearing = False
+ differentMotifs = False
+ coappearList = []
+
+ prevOccurence = occurence
+
+ if coappearing:
+ if strict:
+ if differentMotifs:
+ coappearList.append(prevOccurence)
+ self.coappear[geneID].append(coappearList)
+ else:
+ coappearList.append(prevOccurence)
+ self.coappear[geneID].append(coappearList)
+
+
+ def printCoappear(self):
+ """ prints a formatted version of the coappear dictionary built with buildCoappear()
+ """
+ for geneID in self.coappear:
+ print " ===== %s =====" % str(geneID)
+ for occurence in self.coappear[geneID]:
+ print str(occurence)
+ print " ============="
+
+
+ def saveCoappear(self, fileName):
+ """ save coappear dictionary in tabular format. Returns:
+ index, genome, locus, pos, sense, tag in tab-separated format.
+ """
+ index = 0
+ outLines = []
+ if 1:
+ if len(fileName) < 1:
+ raise IOError
+
+ outFile = open(fileName, "a")
+ for geneID in self.coappear:
+ (genome, locus) = geneID
+ for occurence in self.coappear[geneID]:
+ index += 1
+ coappearLine = "%d\t%s\t%s" % (index, genome, locus)
+ for match in occurence:
+ (pos, sense, tag) = match
+ coappearLine += "\t%s\t%s\t%s" % (pos, sense, tag)
+
+ outLines.append(coappearLine + "\n")
+
+ outFile.writelines(outLines)
+ outFile.close()
+ else:
+ self.mlog("Could not save coappear to file %s\n" % fileName)
+
+
+ def sql(self, stmt, commit=""):
+ """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
+ """
+ db = sqlite.connect(self.dbName, timeout=60)
+ sqlc = db.cursor()
+ if self.debugSQL:
+ print "sql: %s" % stmt
+
+ sqlc.execute(stmt)
+ res = sqlc.fetchall()
+ if commit != "":
+ db.commit()
+
+ sqlc.close()
+ db.close()
+
+ return res
+
+
+ def batchsql(self, stmt, batch):
+ """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
+ """
+ res = []
+ db = sqlite.connect(self.dbName, timeout=60)
+ sqlc = db.cursor()
+ if self.debugSQL:
+ print "batchsql: %s" % stmt
+ print "batchsql: %s" % str(batch)
+
+ sqlc.executemany(stmt, batch)
+ db.commit()
+ sqlc.close()
+ db.close()
+
+ return res
\ No newline at end of file