X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fexperiments%2FanalyzeMotifs.py;fp=cistematic%2Fexperiments%2FanalyzeMotifs.py;h=b1791b0df3e28edd9e0a4ebe36a86d1e5df58756;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/experiments/analyzeMotifs.py b/cistematic/experiments/analyzeMotifs.py new file mode 100644 index 0000000..b1791b0 --- /dev/null +++ b/cistematic/experiments/analyzeMotifs.py @@ -0,0 +1,1097 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# basic analysis code +from cistematic.core.motif import matrixRow +import cistematic.core +import string + +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +knownMotifsDB = "%s/db/known_motifs.db" % cisRoot + + +class AnalyzeMotifs: + motifSize = {} + coappear = {} + motifToAnnotations = {} + annotationEntries = [] + genomeMatches = {} + loadedGenomes = [] + dbName = "" + analysisID = "default" + + + def initializeKnownMotifs(self, source=""): + """ loads known motifs from motif database, optionally limiting by source. + """ + self.annotationEntries = [] + dbkm = sqlite.connect(knownMotifsDB, timeout=60) + sqlkm = dbkm.cursor() + if len(source) > 0: + sqlkm.execute("select * from motifs where source = '%s' " % source) + else: + sqlkm.execute("select * from motifs") + + dbEntries = sqlkm.fetchall() + dbkm.close() + + for entry in dbEntries: + (index, source, name, theseq, reference, other_info) = entry + self.annotationEntries.append((index, source, name, theseq, reference, other_info)) + + + def findAnnotations(self, motID, thresh=-1, numberN=0): + mot = self.findMotif(motID) + if thresh < 0: + thresh = self.threshold + + for entry in self.annotationEntries: + (id, source, name, theseq, reference, other) = entry + loc = mot.locateMotif(theseq, thresh, numberN) + if len(loc) > 0 : + self.motifToAnnotations[motID].append((source, name, theseq, reference, other, loc)) + + if len(self.motifToAnnotations[motID]) >= 500: + break + + + def findAnnotationsRE(self, motID, numMismatches=0): + """ Calls the motif's consensus locator on each entry in annotationEntries. + """ + mot = self.findMotif(motID) + if numMismatches > 0: + mot.initializeMismatchRE(numMismatches) + else: + mot.initializeRE() + + for entry in self.annotationEntries: + (id, source, name, annotSeq, reference, other) = entry + # Must use non-RE optimized version for short annotations + if len(annotSeq) < len(mot): + pad = "N" * (len(mot) - len(annotSeq)) + annotSeq = pad + annotSeq + pad + loc = mot.locateConsensus(annotSeq) + else: + loc = mot.locateConsensusRE(annotSeq) + + if len(loc) > 0 : + self.motifToAnnotations[motID].append((source, name, annotSeq, reference, other, loc)) + + if len(self.motifToAnnotations[motID]) >= 500: + break + + + def printAnnotations(self): + """ prints the motif annotations for every motif result. + """ + for mot in self.getResults(): + annotations = self.motifToAnnotations[mot.tagID] + print "motif %s\t%s" % (mot.tagID, mot.buildConsensus()) + for annotation in annotations: + (source, name, annotSeq, reference, other, loc) = annotation + print "%s:%s\t%s\t%s\t%s" % (source, name, reference, annotSeq, str(loc)) + print other + print + + + def printAnnotationsShort(self): + """ prints a compressed version of the annotations. + """ + for motID in self.motifToAnnotations.keys(): + for annotation in self.motifToAnnotations[motID]: + print "%s: %s (%s)\t%s - %s" % (annotation[0], annotation[1], annotation[4], annotation[2], annotation[5]) + + + def returnAnnotations(self, motID): + """ returns the [annotations] for a particular motID + """ + try: + return self.motifToAnnotations[motID] + except KeyError: + return [] + + + def annotateMotifs(self, thresh=0.0, numberN=0): + self.mlog( "Annotating Motifs with threshold %d and %d extra Ns" % (1.0 + thresh, numberN)) + if len(self.annotationEntries) == 0: + self.initializeKnownMotifs() + + for mot in self.getResults(): + mot.setThreshold(thresh) + self.motifToAnnotations[mot.tagID] = [] + self.findAnnotations(mot.tagID, thresh, numberN) + + + def annotateConsensus(self, numMismatches=0, source=""): + self.mlog( "Annotating Consensus") + if len(self.annotationEntries) == 0: + self.initializeKnownMotifs(source) + + for mot in self.getResults(): + self.motifToAnnotations[mot.tagID] = [] + self.findAnnotationsRE(mot.tagID, numMismatches) + + + def printConsensus(self): + """ print the consensus for every motif result. + """ + for mot in self.getResults(): + print "motif %s\t%s" % (mot.tagID, mot.buildConsensus()) + + + def formatPWM(self, aPWM): + """ format a PWM into a printable string. + """ + aRow = "" + cRow = "" + gRow = "" + tRow = "" + for col in aPWM: + aRow = string.join([aRow, str(round(col[matrixRow["A"]],4))], "\t") + cRow = string.join([cRow, str(round(col[matrixRow["C"]],4))], "\t") + gRow = string.join([gRow, str(round(col[matrixRow["G"]],4))], "\t") + tRow = string.join([tRow, str(round(col[matrixRow["T"]],4))], "\t") + + formattedPWM = "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow) + + return formattedPWM + + + def appendGeneToMotif(self,mTagID, geneID, pos): + """ make an entry in the geneToMotif table. + """ + (genome, locus) = geneID + (loc, sense) = pos + stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense) + res = self.sql(stmt, "commit") + + + def appendGeneToMotifBatch(self, batch): + """ make a batch of entries in the geneToMotif table. + """ + batchInserts = [] + stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, ?, ?, ?, ?, ?, ?, ?)" + for entry in batch: + (mTagID, geneID, pos) = entry + (genome, locus) = geneID + (loc, sense) = pos + batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense)) + + res = self.batchsql(stmt, batchInserts) + + + def geneToMotifKeys(self, thekey="geneID"): + """ return the keys to the geneToMotif table. The default key is geneID, otherwise returns mTagID. + """ + results = [] + if thekey == "geneID": + stmt = "SELECT distinct genome, locus from geneToMotif where expID = '%s' and analysisID = '%s' order by locus" % (self.experimentID, self.analysisID) + else: + stmt = "SELECT distinct mTagID from geneToMotif where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID) + + res = self.sql(stmt) + for entry in res: + if thekey == "geneID": + (genome, locus) = entry + results.append((str(genome), str(locus))) + else: + mTagID = entry[0] + results.append(str(mTagID)) + + return results + + + def appendMotifNeighbors(self,mTagID, match, condition="", geneEntry=""): + """ make an entry in the motifNeighbors table. + """ + (chromo, pos) = match + (genome, chromNum) = chromo + (loc, mSense) = pos + if geneEntry != "": + (start, stop, sense, geneID) = geneEntry + (genome2, locus) = geneID + else: + start = "-" + stop = "-" + sense = "-" + locus = "NO GENE IN RADIUS" + + stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, '%s','%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition) + res = self.sql(stmt, "commit") + + + def appendMotifNeighborsBatch(self, batch): + """ make a batch of entries in the motifNeighbors table. + """ + batchInserts = [] + stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + for entry in batch: + (mTagID, match, condition, geneEntry) = entry + (chromo, pos) = match + (genome, chromNum) = chromo + (loc, mSense) = pos + if geneEntry != "": + (start, stop, sense, geneID) = geneEntry + (genome2, locus) = geneID + else: + start = "-" + stop = "-" + sense = "-" + locus = "NO GENE IN RADIUS" + + batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition)) + + res = self.batchsql(stmt, batchInserts) + + + def motifNeighborsKeys(self, condition=""): + """ returns a [list of motif ID's] in the motifNeighbor table. Can restrict to a particular condition. + """ + results = [] + if condition != "": + stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' and condition ='%s' order by mTagID" % (self.experimentID, self.analysisID, condition) + else: + stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID) + + res = self.sql(stmt) + for entry in res: + mTagID = entry[0] + results.append(mTagID) + + return results + + + def motifNeighbors(self,mTagID, condition="", discardNullMatches=False): + """ get entries in the geneToMotif table that match a particular Motif & condition. + """ + results = [] + genome = "" + chromNum = "" + loc = "" + mSense = "" + start = "" + stop = "" + sense = "" + locus = "" + stmt = "select distinct genome, chromNum, location, motifSense, start, stop, sense, locus, condition from motifNeighbors where expID='%s' and analysisID = '%s' and mTagID='%s' " % (self.experimentID, self.analysisID, mTagID) + if discardNullMatches: + stmt += " and condition <> 'NONE' " + + if condition != '': + stmt += " and condition = '%s' " % (condition) + + stmt += " order by ID desc " + res = self.sql(stmt) + + for entry in res: + (genome, chromNum, loc, mSense, start, stop, sense, locus, condition) = entry + match = ((genome, chromNum),(int(loc), mSense)) + geneEntry = (start, stop, sense, (genome, locus)) + results.append((match, geneEntry, condition)) + + return results + + + def motifToGene(self, mTagID): + """ returns a list of matching geneIDs with location and sense found for a given motif. + """ + results = [] + stmt = "SELECT distinct genome, locus, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and mTagID = '%s' order by locus" % (self.experimentID, self.analysisID, mTagID) + res = self.sql(stmt) + for entry in res: + (genome, locus, location, sense) = entry + results.append(((genome, locus), (int(location), sense))) + + return results + + + def geneToMotif(self, geneID): + """ returns a list of matching motifs with location and sense found for a given geneID + """ + results = [] + (genome, locus) = geneID + stmt = "SELECT distinct mTagID, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and genome = '%s' and locus = '%s' order by location" % (self.experimentID, self.analysisID, genome, locus) + res = self.sql(stmt) + for entry in res: + (mTagID, location, sense) = entry + results.append((mTagID, (location, sense))) + + return results + + + def mapMotifs(self, Threshold=90.0, numberN=0, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, verbose=True): + """ find occurences of result motifs in the current genepool using PWMs. Slow. + """ + currentGenome = "" + currentDataset = -1 + genepoolKeys = self.genepool.keys() + posResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + runID = mTagID.split("-")[0] + (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID)) + if enforceSanity and not mot.isSane(): + self.mlog("mapMotifs: not mapping %s - failed sanity check" % (mTagID)) + continue + + if mot.tagID in ignoreList: + self.mlog("mapMotifs: not mapping %s" % mot.tagID) + continue + + if len(runList) > 0: + if int(runID) not in runList: + self.mlog("mapMotifs: not mapping run %s" % runID) + continue + + if mapAllSeqs: + dataID = 0 + + self.mlog("mapMotifs: mapping %s with threshold %f and at most %d N" % (mTagID, Threshold, numberN)) + if dataID <> currentDataset: + currentDataset = dataID + for geneID in self.getSetting(dataID): + if geneID[0] <> currentGenome: + currentGenome = geneID[0] + + try: + if geneID not in genepoolKeys: + self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstreamself.geneDB, False, self.boundToNextGene) + genepoolKeys.append[geneID] + except: + self.mlog("mapMotifs: could not load %s" % (str(geneID))) + break + + for geneID in genepoolKeys: + if verbose: + print "mapMotifs: %s\n" % str(geneID) + + motifPos = mot.locateMotif(self.genepool[geneID], Threshold, numberN) + + if len(motifPos) > 0: + for pos in motifPos: + posResults.append((mTagID, geneID, pos)) + + self.appendGeneToMotifBatch(posResults) + + + def mapConsensus(self, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, numMismatches=0): + """ find occurences of result motifs in the current genepool using regular expressions, allowing + for a number of mismatches. + """ + currentGenome = "" + currentDataset = -1 + genepoolKeys = self.genepool.keys() + posResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + runID = mTagID.split("-")[0] + (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID)) + if mot.tagID in ignoreList: + self.mlog("mapConsensus: not mapping %s" % mot.tagID) + continue + + if len(runList) > 0: + if int(runID) not in runList: + self.mlog("mapConsensus: not mapping run %s" % runID) + continue + + if enforceSanity and not mot.isSane(): + self.mlog("mapConsensus: not mapping %s - failed sanity check" % (mTagID)) + continue + + if mapAllSeqs: + dataID = 0 + + if numMismatches > 0: + mot.initializeMismatchRE(numMismatches) + else: + mot.initializeRE() + self.mlog("mapConsensus: mapping %s with %s mismatches" % (mTagID, numMismatches)) + if dataID <> currentDataset: + currentDataset = dataID + for geneID in self.getSetting(dataID): + if geneID[0] <> currentGenome: + currentGenome = geneID[0] + + try: + if geneID not in genepoolKeys: + self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstream) + genepoolKeys.append[geneID] + except: + self.mlog("mapConsensus: could not load %s" % (str(geneID))) + break + + for geneID in genepoolKeys: + motifPos = mot.locateConsensusRE(self.genepool[geneID]) + + if len(motifPos) > 0: + for pos in motifPos: + posResults.append((mTagID, geneID, pos)) + + self.appendGeneToMotifBatch(posResults) + + + def mapFeatures(self, radius): + """ returns features within a certain radius in bp of all matches. + """ + FeatureList = [] + for mot in self.getResults(): + mTagID = mot.tagID + self.mlog("mapFeatures: mapping %s using a radius of %d bp" % (mTagID, radius)) + for match in self.motifToGene(mTagID): + matchFeatures = cistematic.core.retrieveFeatures(match, radius) + for entry in matchFeatures: + FeatureList.append((mTagID, match, entry)) + + return FeatureList + + + def limitGeneEntries(self, geneList, lowerBound, upperBound): + results = [] + for entry in geneList: + if entry[1] < lowerBound or entry[0] > upperBound: + continue + + results.append(entry) + + return results + + + def mapNeighbors(self, radius, annotate=True): + """ returns genes on a chromosome within a certain radius in bp. + """ + localGeneList = {} + prevChromo = "" + neighborResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + motLen = len(mot) + motRadius = motLen / 2 + mtgList = self.motifToGene(mTagID) + self.mlog("mapNeighbors: mapping %s using a radius of %d bp (%d instances)" % (mTagID, radius, len(mtgList))) + index = 0 + for match in mtgList: + (chromo, hit) = match + if annotate and chromo != prevChromo: + prevChromo = chromo + localGeneList = cistematic.core.getChromoGeneEntries(chromo) + + index += 1 + if (index % 1000) == 0: + print "." + + matchCounter = 0 + matchCDS = [] + if annotate: + (chromo, hit) = match + matchpos = int(hit[0]) + lowerBound = matchpos - radius + upperBound = matchpos + radius + match2 = (chromo, (matchpos + motRadius, hit[1])) + matchFeatures = cistematic.core.retrieveFeatures(match2, motRadius, "CDS") + for entry in matchFeatures: + matchCDS.append((chromo[0], entry[0])) + + geneEntriesList = self.limitGeneEntries(localGeneList, lowerBound, upperBound) + for geneEntry in geneEntriesList: + beg = int(geneEntry[0]) + end = int(geneEntry[1]) + sense = geneEntry[2] + gID = geneEntry[3] + if gID in matchCDS: # matching within the coding sequence + neighborResults.append((mTagID, match, "CDS", geneEntry)) + matchCounter += 1 + elif matchpos >= beg and matchpos <= end: # not in CDS, but in gene + neighborResults.append((mTagID, match, "GENE", geneEntry)) + matchCounter += 1 + elif matchpos < beg: + if sense == "F": + neighborResults.append((mTagID, match, "UPSTREAM", geneEntry)) + else: + neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry)) + + matchCounter += 1 + else: + if sense == "F": + neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry)) + else: + neighborResults.append((mTagID,match, "UPSTREAM", geneEntry)) + + matchCounter += 1 + + if matchCounter < 1: + neighborResults.append((mTagID, match, "NONE", "")) + + self.appendMotifNeighborsBatch(neighborResults) + + + def printMotifToGene(self, motifList=[], runList=[]): + motKeys = self.geneToMotifKeys(thekey="mTagID") + if len(motifList) == 0 and len(runList) == 0: + for tag in motKeys: + print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag))) + else: + for tag in motKeys: + runID = tag.split("-")[0] + if len(runList) > 0: + if runID not in runList: + continue + + if tag in motifList: + print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag))) + + + def motifToGeneStat(self): + tags = self.geneToMotifKeys(thekey="mTagID") + counter = 0 + min = len(self.motifToGene(tags[0])) + max = 0 + for tag in tags: + numGenes = len(self.motifToGene(tag)) + print "%s: %d" % (tag, numGenes) + if numGenes > max: + max = numGenes + elif numGenes < min: + min = numGenes + + counter += numGenes + + print "for a total of %d matches - min: %d max: %d" % (counter, min, max) + + + def printGeneToMotif(self, geneList = []): + if len(geneList) == 0: + for geneID in self.geneToMotifKeys(): + print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID))) + else: + for geneID in self.geneToMotifKeys(): + if geneID in geneList: + print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID))) + + + def geneToMotifStat(self): + genes = self.geneToMotifKeys() + counter = 0 + min = len(self.geneToMotif(genes[0])) + max = 0 + for gene in genes: + numMotifs = len(self.geneToMotif(gene)) + print "%s - %s: %d" % (gene[0], gene[1], numMotifs) + if numMotifs > max: + max = numMotifs + elif numMotifs < min: + min = numMotifs + + counter += numMotifs + + print "for a total of %d matches - min: %d max: %d" % (counter, min, max) + + + def printGeneProfile(self, geneID): + print "\nMotif matches for %s " % (str(geneID)) + geneProfile = self.makeGeneProfile(geneID) + positions = geneProfile.keys() + if len(positions) > 0: + positions.sort() + for pos in positions: + print "%s %s" % (str(pos), str(geneProfile[pos])) + else: + print "Gene had no matching motifs" + + + def makeGeneProfile(self, geneID): + geneBucket = {} + geneMotifs = self.geneToMotif(geneID) + if len(geneMotifs) > 0: + for mot in geneMotifs: + (motID, pos) = mot + (loc, sense) = pos + if int(loc) not in geneBucket.keys(): + geneBucket[int(loc)] = [] + + geneBucket[int(loc)].append(mot) + + return geneBucket + + + def getMotifMatches(self, motifIDs=[]): + results = {} + if len(motifIDs) < 1: + motifIDs = self.geneToMotifKeys(thekey="mTagID") + + for motID in motifIDs: + results[motID] = [] + mot = self.findMotif(motID) + motLength = len(mot) + for match in self.motifToGene(motID): + (chrom, hit) = match + (pos, sense) = hit + if sense == "F": + results[motID].append(self.genepool[chrom][pos:pos + motLength]) + else: + results[motID].append(cistematic.core.complement(self.genepool[chrom][pos:pos + motLength], motLength)) + + return results + + + def summarizeMotifs(self, fileName): + """ saves the number of occurences and actual occurence PWMs of motifs to fileName. + """ + motDict = {} + motText = [] + try: + if len(fileName) < 1: + raise IOError + + outFile = open(fileName, "a") + self.mlog("Saving motif summary") + for motID in self.geneToMotifKeys(thekey="mTagID"): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + motDict[motID] = [] + for index in range(motLength): + motDict[motID].append([0.0, 0.0, 0.0, 0.0]) + + for match in self.motifToGene(motID): + matchNum += 1 + (chromo, hit) = match + (pos, sense) = hit + if sense == "F": + matchSeq = self.genepool[chromo][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[chromo][pos: pos + motLength], motLength) + + for index in range(motLength): + NT = matchSeq[index] + NT = NT.upper() + if NT in ["A", "C", "G", "T"]: + motDict[motID][index][matrixRow[NT]] += 1 + + motLine = "motif %s\t %s matches\t'%s'\n %s\n" % (motID, str(matchNum), mot.buildConsensus(), self.formatPWM(motDict[motID])) + motText.append(motLine) + + outFile.writelines(motText) + outFile.close() + except: + self.mlog("Could not save motif summary to file %s\n" % fileName) + + + def printMotifNeighbors(self): + for motID in self.motifNeighborsKeys(): + print "Matches for motif %s" % motID + currentHit = () + for entry in self.motifNeighbors(motID): + if entry[0] != currentHit: + print "=====================" + print "Match %s" % str(entry[0]) + currentHit = entry[0] + + print "\tGene %s:" % str(entry[1]) + try: + goInfo = cistematic.core.getGOInfo(entry[1][3]) + for entry in goInfo: + print "\t\t%s" % str(entry) + except: + pass + + print "----------------" + + + def summarizeMotifNeighbors(self, fileName): + """ saves the number of occurences and PWMs of motifs within the gene + neighborhood radius as mapped using mapNeighbors() to fileName. + """ + motDict = {} + motText = [] + try: + if len(fileName) < 1: + raise IOError + outFile = open(fileName, "a") + self.mlog("Saving neighbor summary") + for motID in self.motifNeighborsKeys(): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + motDict[motID] = [] + for index in range(motLength): + motDict[motID].append([0.0, 0.0, 0.0, 0.0]) + + currentMatch = "" + for entry in self.motifNeighbors(motID, discardNullMatches = True): + if currentMatch != entry[0]: + currentMatch = entry[0] + (genechrom, loc) = entry[0] + (pos, sense) = loc + geneID = entry[1][3] + (geno, gene) = geneID + if gene != "NO GENE IN RADIUS": + matchNum += 1 + if sense == "F": + matchSeq = self.genepool[genechrom][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength) + + for index in range(motLength): + NT = matchSeq[index] + NT = NT.upper() + if NT in ["A", "C", "G", "T"]: + motDict[motID][index][matrixRow[NT]] += 1 + + motLine = "motif %s\t %s matches\n %s\n" % (motID, str(matchNum), self.formatPWM(motDict[motID])) + motText.append(motLine) + + outFile.writelines(motText) + outFile.close() + except: + self.mlog("Could not save motif neighbors to file %s\n" % fileName) + + + def saveMotifNeighbors(self, fileName, fullAnnotations=True): + """ save every occurence of the motifs with any adjoining gene(s) to fileName. + Records annotations and GO terms if available when fullAnnotations=True. + """ + goDict = {} + annotDict = {} + currentGenome = "" + neighborList = [] + self.mlog("Saving motif neighbors to file %s\n" % fileName) + if True: + if len(fileName) < 1: + raise IOError + outFile = open(fileName, "a") + for motID in self.motifNeighborsKeys(): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + currentMatch = "" + for entry in self.motifNeighbors(motID): + neighborList = [] + if currentMatch != entry[0]: + matchNum += 1 + currentMatch = entry[0] + (genechrom, loc) = entry[0] + (genome, chromo) = genechrom + (pos, sense) = loc + if fullAnnotations and genome != currentGenome: + currentGenome = genome + goDict = cistematic.core.getAllGOInfo(genome) + annotDict = cistematic.core.getAllAnnotInfo(genome) + + start = entry[1][0] + stop = entry[1][1] + geneSense = entry[1][2] + geneID = entry[1][3] + (geno, gene) = geneID + condition = entry[2] + if sense == "F": + matchSeq = self.genepool[genechrom][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength) + + currentEntry = "%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (motID, matchNum, genome, chromo, pos, sense, condition, start, stop, geneSense, geno, gene, matchSeq) + if fullAnnotations: + try: + goInfo = goDict.get(geneID, []) + annotInfo = annotDict.get(geneID,[]) + annotDescription = "" + prevGoInfo = "" + if self.debugSQL: + print "%s\t%s\t%s" % (str(geneID), str(goInfo), str(annotInfo)) + for annot in annotInfo: + annotDescription += annot + " " + if len(goInfo) > 0: + for entry in goInfo: + if entry == prevGoInfo: + continue + neighborList.append("%s\t%s\t%s\n" % (currentEntry, annotDescription, entry)) + prevGoInfo = entry + else: + neighborList.append("%s\t%s\n" % (currentEntry, annotDescription)) + except: + neighborList.append("%s\n" % currentEntry) + else: + neighborList.append("%s\n" % currentEntry) + + outFile.writelines(neighborList) + + outFile.close() + else: + self.mlog("Could not save motif neighbors to file %s\n" % fileName) + + return neighborList + + + def buildMotifSize(self): + mots = self.getResults() + index = 0 + for mot in mots: + motLength = len(mot) + if motLength not in self.motifSize.keys(): + self.motifSize[motLength] =[] + + self.motifSize[motLength].append((index, mot.buildConsensus())) + index += 1 + + + def findIdenticalMotifs(self): + # FIXME!!!! + identicalMotifs = [] + + for motLength in self.motifSize.keys(): + motList = self.motifSize[motLength] + motListLen = len(motList) + print "doing motif length %d" % motLength + for pos in range(motListLen): + index = pos + 1 + while index < motListLen: + if motList[pos][1] == motList[index][1]: + identicalMotifs.append((self.results[pos].tagID, self.results[index].tagID)) + + index += 1 + + return identicalMotifs + + + def createAnalysis(self, dbName=""): + """ creates the analysis SQL tables. Should only be run once per database file. + """ + if len(dbName) == 0: + dbName = self.expFile + + db = sqlite.connect(dbName, timeout=60) + try: + sql = db.cursor() + sql.execute("CREATE table geneToMotif(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar)") + sql.execute("CREATE table genomeMatches(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar, threshold varchar)") + sql.execute("CREATE table motifNeighbors(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, chromNum varchar, location varchar, motifSense varchar, start varchar, stop varchar, sense varchar, locus varchar, condition varchar)") + sql.execute("CREATE index mot1 on geneToMotif(expID, analysisID, mTagID)") + sql.execute("CREATE index mot2 on genomeMatches(expID, analysisID, mTagID)") + sql.execute("CREATE index mot3 on motifNeighbors(expID, analysisID, mTagID)") + sql.execute("CREATE index locus1 on geneToMotif(locus)") + sql.execute("CREATE index locus2 on motifNeighbors(locus)") + sql.execute("CREATE index condition1 on motifNeighbors(condition)") + db.commit() + sql.close() + db.close() + self.mlog("Created analysis tables in database %s" % dbName) + except: + db.close() + self.mlog("Could not create tables in database %s" % dbName) + self.mlog("WARNING: perhaps you have called createAnalysis twice on the same database?") + + + def loadAnalysis(self, analysisID="default", dbName=""): + """ changes the analysisID to use the specified one, or use default. Must be used before reading or + writing any data to analysis tables. + """ + self.analysisID = analysisID + if len (dbName) > 0: + self.dbName = dbName + else: + self.dbName = self.expFile + + + def resetAnalysis(self): + """ currently zeros out some of the analysis structures. obsolete ? + """ + self.mlog("resetting analysis") + self.motifSize = {} + self.motifToAnnotations = {} + self.coappear = {} + + + def saveAnalysis(self): + """ currently obsolete - kept for backward compability. + """ + try: + self.mlog("saved analysis %s" % self.analysisID) + except: + self.mlog("could not save %s" % self.analysisID) + + + def deleteAnalysis(self, analysisID="default"): + """ delete all of the analysis either named aName, or matching + an experiment (the current one by default.) Currently a NO-OP. Obsolete. + """ + pass + + + def buildCoappear(self, motifList=[], distance="100000", strict = False): + """ Builds coappear dictionary of geneIDs where motifs occur with others. Can limit to + motifs in motifList (default is all motifs), and by distance (default is 100,000 base pairs.) + Results are accessed through printCoappear() and saveCoappear(). + """ + occurenceList = {} + self.coappear = {} + processedMotifs = [] + motifListLen = len(motifList) + if motifListLen == 0: + #use all motifs + motifList = self.geneToMotifKeys(thekey="mTagID") + motifListLen = len(motifList) + + for motif in motifList: + if motif not in processedMotifs: + matchList = self.motifToGene(motif) + for match in matchList: + (geneID, loc) = match + if geneID not in occurenceList: + occurenceList[geneID] = [] + + (location, sense) = loc + occurenceList[geneID].append((location, sense, motif)) + + processedMotifs.append(motif) + + for geneID in occurenceList: + occurenceList[geneID].sort() + if geneID not in self.coappear: + self.coappear[geneID] = [] + + coappearing = False + differentMotifs = False + coappearList = [] + prevOccurence = occurenceList[geneID][0] + del occurenceList[geneID][0] + for occurence in occurenceList[geneID]: + if occurence[0] < prevOccurence[0] + distance: + coappearing = True + if occurence[2] != prevOccurence[2]: + differentMotifs = True + + coappearList.append(prevOccurence) + elif coappearing: + if strict: + if differentMotifs: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + else: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + + coappearing = False + differentMotifs = False + coappearList = [] + + prevOccurence = occurence + + if coappearing: + if strict: + if differentMotifs: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + else: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + + + def printCoappear(self): + """ prints a formatted version of the coappear dictionary built with buildCoappear() + """ + for geneID in self.coappear: + print " ===== %s =====" % str(geneID) + for occurence in self.coappear[geneID]: + print str(occurence) + print " =============" + + + def saveCoappear(self, fileName): + """ save coappear dictionary in tabular format. Returns: + index, genome, locus, pos, sense, tag in tab-separated format. + """ + index = 0 + outLines = [] + if 1: + if len(fileName) < 1: + raise IOError + + outFile = open(fileName, "a") + for geneID in self.coappear: + (genome, locus) = geneID + for occurence in self.coappear[geneID]: + index += 1 + coappearLine = "%d\t%s\t%s" % (index, genome, locus) + for match in occurence: + (pos, sense, tag) = match + coappearLine += "\t%s\t%s\t%s" % (pos, sense, tag) + + outLines.append(coappearLine + "\n") + + outFile.writelines(outLines) + outFile.close() + else: + self.mlog("Could not save coappear to file %s\n" % fileName) + + + def sql(self, stmt, commit=""): + """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list. + """ + db = sqlite.connect(self.dbName, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "sql: %s" % stmt + + sqlc.execute(stmt) + res = sqlc.fetchall() + if commit != "": + db.commit() + + sqlc.close() + db.close() + + return res + + + def batchsql(self, stmt, batch): + """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit. + """ + res = [] + db = sqlite.connect(self.dbName, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "batchsql: %s" % stmt + print "batchsql: %s" % str(batch) + + sqlc.executemany(stmt, batch) + db.commit() + sqlc.close() + db.close() + + return res \ No newline at end of file