1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
31 from cistematic.core.motif import matrixRow
32 import cistematic.core
36 from pysqlite2 import dbapi2 as sqlite
38 from sqlite3 import dbapi2 as sqlite
39 from os import environ
41 if environ.get("CISTEMATIC_ROOT"):
42 cisRoot = environ.get("CISTEMATIC_ROOT")
44 cisRoot = "/proj/genome"
46 knownMotifsDB = "%s/db/known_motifs.db" % cisRoot
52 motifToAnnotations = {}
53 annotationEntries = []
57 analysisID = "default"
60 def initializeKnownMotifs(self, source=""):
61 """ loads known motifs from motif database, optionally limiting by source.
63 self.annotationEntries = []
64 dbkm = sqlite.connect(knownMotifsDB, timeout=60)
67 sqlkm.execute("select * from motifs where source = '%s' " % source)
69 sqlkm.execute("select * from motifs")
71 dbEntries = sqlkm.fetchall()
74 for entry in dbEntries:
75 (index, source, name, theseq, reference, other_info) = entry
76 self.annotationEntries.append((index, source, name, theseq, reference, other_info))
79 def findAnnotations(self, motID, thresh=-1, numberN=0):
80 mot = self.findMotif(motID)
82 thresh = self.threshold
84 for entry in self.annotationEntries:
85 (id, source, name, theseq, reference, other) = entry
86 loc = mot.locateMotif(theseq, thresh, numberN)
88 self.motifToAnnotations[motID].append((source, name, theseq, reference, other, loc))
90 if len(self.motifToAnnotations[motID]) >= 500:
94 def findAnnotationsRE(self, motID, numMismatches=0):
95 """ Calls the motif's consensus locator on each entry in annotationEntries.
97 mot = self.findMotif(motID)
99 mot.initializeMismatchRE(numMismatches)
103 for entry in self.annotationEntries:
104 (id, source, name, annotSeq, reference, other) = entry
105 # Must use non-RE optimized version for short annotations
106 if len(annotSeq) < len(mot):
107 pad = "N" * (len(mot) - len(annotSeq))
108 annotSeq = pad + annotSeq + pad
109 loc = mot.locateConsensus(annotSeq)
111 loc = mot.locateConsensusRE(annotSeq)
114 self.motifToAnnotations[motID].append((source, name, annotSeq, reference, other, loc))
116 if len(self.motifToAnnotations[motID]) >= 500:
120 def printAnnotations(self):
121 """ prints the motif annotations for every motif result.
123 for mot in self.getResults():
124 annotations = self.motifToAnnotations[mot.tagID]
125 print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
126 for annotation in annotations:
127 (source, name, annotSeq, reference, other, loc) = annotation
128 print "%s:%s\t%s\t%s\t%s" % (source, name, reference, annotSeq, str(loc))
133 def printAnnotationsShort(self):
134 """ prints a compressed version of the annotations.
136 for motID in self.motifToAnnotations.keys():
137 for annotation in self.motifToAnnotations[motID]:
138 print "%s: %s (%s)\t%s - %s" % (annotation[0], annotation[1], annotation[4], annotation[2], annotation[5])
141 def returnAnnotations(self, motID):
142 """ returns the [annotations] for a particular motID
145 return self.motifToAnnotations[motID]
150 def annotateMotifs(self, thresh=0.0, numberN=0):
151 self.mlog( "Annotating Motifs with threshold %d and %d extra Ns" % (1.0 + thresh, numberN))
152 if len(self.annotationEntries) == 0:
153 self.initializeKnownMotifs()
155 for mot in self.getResults():
156 mot.setThreshold(thresh)
157 self.motifToAnnotations[mot.tagID] = []
158 self.findAnnotations(mot.tagID, thresh, numberN)
161 def annotateConsensus(self, numMismatches=0, source=""):
162 self.mlog( "Annotating Consensus")
163 if len(self.annotationEntries) == 0:
164 self.initializeKnownMotifs(source)
166 for mot in self.getResults():
167 self.motifToAnnotations[mot.tagID] = []
168 self.findAnnotationsRE(mot.tagID, numMismatches)
171 def printConsensus(self):
172 """ print the consensus for every motif result.
174 for mot in self.getResults():
175 print "motif %s\t%s" % (mot.tagID, mot.buildConsensus())
178 def formatPWM(self, aPWM):
179 """ format a PWM into a printable string.
186 aRow = string.join([aRow, str(round(col[matrixRow["A"]],4))], "\t")
187 cRow = string.join([cRow, str(round(col[matrixRow["C"]],4))], "\t")
188 gRow = string.join([gRow, str(round(col[matrixRow["G"]],4))], "\t")
189 tRow = string.join([tRow, str(round(col[matrixRow["T"]],4))], "\t")
191 formattedPWM = "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
196 def appendGeneToMotif(self,mTagID, geneID, pos):
197 """ make an entry in the geneToMotif table.
199 (genome, locus) = geneID
201 stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense)
202 res = self.sql(stmt, "commit")
205 def appendGeneToMotifBatch(self, batch):
206 """ make a batch of entries in the geneToMotif table.
209 stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, ?, ?, ?, ?, ?, ?, ?)"
211 (mTagID, geneID, pos) = entry
212 (genome, locus) = geneID
214 batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense))
216 res = self.batchsql(stmt, batchInserts)
219 def geneToMotifKeys(self, thekey="geneID"):
220 """ return the keys to the geneToMotif table. The default key is geneID, otherwise returns mTagID.
223 if thekey == "geneID":
224 stmt = "SELECT distinct genome, locus from geneToMotif where expID = '%s' and analysisID = '%s' order by locus" % (self.experimentID, self.analysisID)
226 stmt = "SELECT distinct mTagID from geneToMotif where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
230 if thekey == "geneID":
231 (genome, locus) = entry
232 results.append((str(genome), str(locus)))
235 results.append(str(mTagID))
240 def appendMotifNeighbors(self,mTagID, match, condition="", geneEntry=""):
241 """ make an entry in the motifNeighbors table.
243 (chromo, pos) = match
244 (genome, chromNum) = chromo
247 (start, stop, sense, geneID) = geneEntry
248 (genome2, locus) = geneID
253 locus = "NO GENE IN RADIUS"
255 stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, '%s','%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition)
256 res = self.sql(stmt, "commit")
259 def appendMotifNeighborsBatch(self, batch):
260 """ make a batch of entries in the motifNeighbors table.
263 stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
265 (mTagID, match, condition, geneEntry) = entry
266 (chromo, pos) = match
267 (genome, chromNum) = chromo
270 (start, stop, sense, geneID) = geneEntry
271 (genome2, locus) = geneID
276 locus = "NO GENE IN RADIUS"
278 batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition))
280 res = self.batchsql(stmt, batchInserts)
283 def motifNeighborsKeys(self, condition=""):
284 """ returns a [list of motif ID's] in the motifNeighbor table. Can restrict to a particular condition.
288 stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' and condition ='%s' order by mTagID" % (self.experimentID, self.analysisID, condition)
290 stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID)
295 results.append(mTagID)
300 def motifNeighbors(self,mTagID, condition="", discardNullMatches=False):
301 """ get entries in the geneToMotif table that match a particular Motif & condition.
312 stmt = "select distinct genome, chromNum, location, motifSense, start, stop, sense, locus, condition from motifNeighbors where expID='%s' and analysisID = '%s' and mTagID='%s' " % (self.experimentID, self.analysisID, mTagID)
313 if discardNullMatches:
314 stmt += " and condition <> 'NONE' "
317 stmt += " and condition = '%s' " % (condition)
319 stmt += " order by ID desc "
323 (genome, chromNum, loc, mSense, start, stop, sense, locus, condition) = entry
324 match = ((genome, chromNum),(int(loc), mSense))
325 geneEntry = (start, stop, sense, (genome, locus))
326 results.append((match, geneEntry, condition))
331 def motifToGene(self, mTagID):
332 """ returns a list of matching geneIDs with location and sense found for a given motif.
335 stmt = "SELECT distinct genome, locus, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and mTagID = '%s' order by locus" % (self.experimentID, self.analysisID, mTagID)
338 (genome, locus, location, sense) = entry
339 results.append(((genome, locus), (int(location), sense)))
344 def geneToMotif(self, geneID):
345 """ returns a list of matching motifs with location and sense found for a given geneID
348 (genome, locus) = geneID
349 stmt = "SELECT distinct mTagID, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and genome = '%s' and locus = '%s' order by location" % (self.experimentID, self.analysisID, genome, locus)
352 (mTagID, location, sense) = entry
353 results.append((mTagID, (location, sense)))
358 def mapMotifs(self, Threshold=90.0, numberN=0, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, verbose=True):
359 """ find occurences of result motifs in the current genepool using PWMs. Slow.
363 genepoolKeys = self.genepool.keys()
365 for mot in self.getResults():
367 runID = mTagID.split("-")[0]
368 (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
369 if enforceSanity and not mot.isSane():
370 self.mlog("mapMotifs: not mapping %s - failed sanity check" % (mTagID))
373 if mot.tagID in ignoreList:
374 self.mlog("mapMotifs: not mapping %s" % mot.tagID)
378 if int(runID) not in runList:
379 self.mlog("mapMotifs: not mapping run %s" % runID)
385 self.mlog("mapMotifs: mapping %s with threshold %f and at most %d N" % (mTagID, Threshold, numberN))
386 if dataID <> currentDataset:
387 currentDataset = dataID
388 for geneID in self.getSetting(dataID):
389 if geneID[0] <> currentGenome:
390 currentGenome = geneID[0]
393 if geneID not in genepoolKeys:
394 self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstreamself.geneDB, False, self.boundToNextGene)
395 genepoolKeys.append[geneID]
397 self.mlog("mapMotifs: could not load %s" % (str(geneID)))
400 for geneID in genepoolKeys:
402 print "mapMotifs: %s\n" % str(geneID)
404 motifPos = mot.locateMotif(self.genepool[geneID], Threshold, numberN)
406 if len(motifPos) > 0:
408 posResults.append((mTagID, geneID, pos))
410 self.appendGeneToMotifBatch(posResults)
413 def mapConsensus(self, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, numMismatches=0):
414 """ find occurences of result motifs in the current genepool using regular expressions, allowing
415 for a number of mismatches.
419 genepoolKeys = self.genepool.keys()
421 for mot in self.getResults():
423 runID = mTagID.split("-")[0]
424 (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID))
425 if mot.tagID in ignoreList:
426 self.mlog("mapConsensus: not mapping %s" % mot.tagID)
430 if int(runID) not in runList:
431 self.mlog("mapConsensus: not mapping run %s" % runID)
434 if enforceSanity and not mot.isSane():
435 self.mlog("mapConsensus: not mapping %s - failed sanity check" % (mTagID))
441 if numMismatches > 0:
442 mot.initializeMismatchRE(numMismatches)
445 self.mlog("mapConsensus: mapping %s with %s mismatches" % (mTagID, numMismatches))
446 if dataID <> currentDataset:
447 currentDataset = dataID
448 for geneID in self.getSetting(dataID):
449 if geneID[0] <> currentGenome:
450 currentGenome = geneID[0]
453 if geneID not in genepoolKeys:
454 self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstream)
455 genepoolKeys.append[geneID]
457 self.mlog("mapConsensus: could not load %s" % (str(geneID)))
460 for geneID in genepoolKeys:
461 motifPos = mot.locateConsensusRE(self.genepool[geneID])
463 if len(motifPos) > 0:
465 posResults.append((mTagID, geneID, pos))
467 self.appendGeneToMotifBatch(posResults)
470 def mapFeatures(self, radius):
471 """ returns features within a certain radius in bp of all matches.
474 for mot in self.getResults():
476 self.mlog("mapFeatures: mapping %s using a radius of %d bp" % (mTagID, radius))
477 for match in self.motifToGene(mTagID):
478 matchFeatures = cistematic.core.retrieveFeatures(match, radius)
479 for entry in matchFeatures:
480 FeatureList.append((mTagID, match, entry))
485 def limitGeneEntries(self, geneList, lowerBound, upperBound):
487 for entry in geneList:
488 if entry[1] < lowerBound or entry[0] > upperBound:
491 results.append(entry)
496 def mapNeighbors(self, radius, annotate=True):
497 """ returns genes on a chromosome within a certain radius in bp.
502 for mot in self.getResults():
505 motRadius = motLen / 2
506 mtgList = self.motifToGene(mTagID)
507 self.mlog("mapNeighbors: mapping %s using a radius of %d bp (%d instances)" % (mTagID, radius, len(mtgList)))
509 for match in mtgList:
510 (chromo, hit) = match
511 if annotate and chromo != prevChromo:
513 localGeneList = cistematic.core.getChromoGeneEntries(chromo)
516 if (index % 1000) == 0:
522 (chromo, hit) = match
523 matchpos = int(hit[0])
524 lowerBound = matchpos - radius
525 upperBound = matchpos + radius
526 match2 = (chromo, (matchpos + motRadius, hit[1]))
527 matchFeatures = cistematic.core.retrieveFeatures(match2, motRadius, "CDS")
528 for entry in matchFeatures:
529 matchCDS.append((chromo[0], entry[0]))
531 geneEntriesList = self.limitGeneEntries(localGeneList, lowerBound, upperBound)
532 for geneEntry in geneEntriesList:
533 beg = int(geneEntry[0])
534 end = int(geneEntry[1])
537 if gID in matchCDS: # matching within the coding sequence
538 neighborResults.append((mTagID, match, "CDS", geneEntry))
540 elif matchpos >= beg and matchpos <= end: # not in CDS, but in gene
541 neighborResults.append((mTagID, match, "GENE", geneEntry))
545 neighborResults.append((mTagID, match, "UPSTREAM", geneEntry))
547 neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
552 neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry))
554 neighborResults.append((mTagID,match, "UPSTREAM", geneEntry))
559 neighborResults.append((mTagID, match, "NONE", ""))
561 self.appendMotifNeighborsBatch(neighborResults)
564 def printMotifToGene(self, motifList=[], runList=[]):
565 motKeys = self.geneToMotifKeys(thekey="mTagID")
566 if len(motifList) == 0 and len(runList) == 0:
568 print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
571 runID = tag.split("-")[0]
573 if runID not in runList:
577 print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag)))
580 def motifToGeneStat(self):
581 tags = self.geneToMotifKeys(thekey="mTagID")
583 min = len(self.motifToGene(tags[0]))
586 numGenes = len(self.motifToGene(tag))
587 print "%s: %d" % (tag, numGenes)
595 print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
598 def printGeneToMotif(self, geneList = []):
599 if len(geneList) == 0:
600 for geneID in self.geneToMotifKeys():
601 print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
603 for geneID in self.geneToMotifKeys():
604 if geneID in geneList:
605 print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID)))
608 def geneToMotifStat(self):
609 genes = self.geneToMotifKeys()
611 min = len(self.geneToMotif(genes[0]))
614 numMotifs = len(self.geneToMotif(gene))
615 print "%s - %s: %d" % (gene[0], gene[1], numMotifs)
618 elif numMotifs < min:
623 print "for a total of %d matches - min: %d max: %d" % (counter, min, max)
626 def printGeneProfile(self, geneID):
627 print "\nMotif matches for %s " % (str(geneID))
628 geneProfile = self.makeGeneProfile(geneID)
629 positions = geneProfile.keys()
630 if len(positions) > 0:
632 for pos in positions:
633 print "%s %s" % (str(pos), str(geneProfile[pos]))
635 print "Gene had no matching motifs"
638 def makeGeneProfile(self, geneID):
640 geneMotifs = self.geneToMotif(geneID)
641 if len(geneMotifs) > 0:
642 for mot in geneMotifs:
645 if int(loc) not in geneBucket.keys():
646 geneBucket[int(loc)] = []
648 geneBucket[int(loc)].append(mot)
653 def getMotifMatches(self, motifIDs=[]):
655 if len(motifIDs) < 1:
656 motifIDs = self.geneToMotifKeys(thekey="mTagID")
658 for motID in motifIDs:
660 mot = self.findMotif(motID)
662 for match in self.motifToGene(motID):
666 results[motID].append(self.genepool[chrom][pos:pos + motLength])
668 results[motID].append(cistematic.core.complement(self.genepool[chrom][pos:pos + motLength], motLength))
673 def summarizeMotifs(self, fileName):
674 """ saves the number of occurences and actual occurence PWMs of motifs to fileName.
679 if len(fileName) < 1:
682 outFile = open(fileName, "a")
683 self.mlog("Saving motif summary")
684 for motID in self.geneToMotifKeys(thekey="mTagID"):
686 mot = self.findMotif(motID)
689 for index in range(motLength):
690 motDict[motID].append([0.0, 0.0, 0.0, 0.0])
692 for match in self.motifToGene(motID):
694 (chromo, hit) = match
697 matchSeq = self.genepool[chromo][pos:pos + motLength]
699 matchSeq = cistematic.core.complement(self.genepool[chromo][pos: pos + motLength], motLength)
701 for index in range(motLength):
704 if NT in ["A", "C", "G", "T"]:
705 motDict[motID][index][matrixRow[NT]] += 1
707 motLine = "motif %s\t %s matches\t'%s'\n %s\n" % (motID, str(matchNum), mot.buildConsensus(), self.formatPWM(motDict[motID]))
708 motText.append(motLine)
710 outFile.writelines(motText)
713 self.mlog("Could not save motif summary to file %s\n" % fileName)
716 def printMotifNeighbors(self):
717 for motID in self.motifNeighborsKeys():
718 print "Matches for motif %s" % motID
720 for entry in self.motifNeighbors(motID):
721 if entry[0] != currentHit:
722 print "====================="
723 print "Match %s" % str(entry[0])
724 currentHit = entry[0]
726 print "\tGene %s:" % str(entry[1])
728 goInfo = cistematic.core.getGOInfo(entry[1][3])
730 print "\t\t%s" % str(entry)
734 print "----------------"
737 def summarizeMotifNeighbors(self, fileName):
738 """ saves the number of occurences and PWMs of motifs within the gene
739 neighborhood radius as mapped using mapNeighbors() to fileName.
744 if len(fileName) < 1:
746 outFile = open(fileName, "a")
747 self.mlog("Saving neighbor summary")
748 for motID in self.motifNeighborsKeys():
750 mot = self.findMotif(motID)
753 for index in range(motLength):
754 motDict[motID].append([0.0, 0.0, 0.0, 0.0])
757 for entry in self.motifNeighbors(motID, discardNullMatches = True):
758 if currentMatch != entry[0]:
759 currentMatch = entry[0]
760 (genechrom, loc) = entry[0]
763 (geno, gene) = geneID
764 if gene != "NO GENE IN RADIUS":
767 matchSeq = self.genepool[genechrom][pos:pos + motLength]
769 matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
771 for index in range(motLength):
774 if NT in ["A", "C", "G", "T"]:
775 motDict[motID][index][matrixRow[NT]] += 1
777 motLine = "motif %s\t %s matches\n %s\n" % (motID, str(matchNum), self.formatPWM(motDict[motID]))
778 motText.append(motLine)
780 outFile.writelines(motText)
783 self.mlog("Could not save motif neighbors to file %s\n" % fileName)
786 def saveMotifNeighbors(self, fileName, fullAnnotations=True):
787 """ save every occurence of the motifs with any adjoining gene(s) to fileName.
788 Records annotations and GO terms if available when fullAnnotations=True.
794 self.mlog("Saving motif neighbors to file %s\n" % fileName)
796 if len(fileName) < 1:
798 outFile = open(fileName, "a")
799 for motID in self.motifNeighborsKeys():
801 mot = self.findMotif(motID)
804 for entry in self.motifNeighbors(motID):
806 if currentMatch != entry[0]:
808 currentMatch = entry[0]
809 (genechrom, loc) = entry[0]
810 (genome, chromo) = genechrom
812 if fullAnnotations and genome != currentGenome:
813 currentGenome = genome
814 goDict = cistematic.core.getAllGOInfo(genome)
815 annotDict = cistematic.core.getAllAnnotInfo(genome)
819 geneSense = entry[1][2]
821 (geno, gene) = geneID
824 matchSeq = self.genepool[genechrom][pos:pos + motLength]
826 matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength)
828 currentEntry = "%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (motID, matchNum, genome, chromo, pos, sense, condition, start, stop, geneSense, geno, gene, matchSeq)
831 goInfo = goDict.get(geneID, [])
832 annotInfo = annotDict.get(geneID,[])
833 annotDescription = ""
836 print "%s\t%s\t%s" % (str(geneID), str(goInfo), str(annotInfo))
837 for annot in annotInfo:
838 annotDescription += annot + " "
841 if entry == prevGoInfo:
843 neighborList.append("%s\t%s\t%s\n" % (currentEntry, annotDescription, entry))
846 neighborList.append("%s\t%s\n" % (currentEntry, annotDescription))
848 neighborList.append("%s\n" % currentEntry)
850 neighborList.append("%s\n" % currentEntry)
852 outFile.writelines(neighborList)
856 self.mlog("Could not save motif neighbors to file %s\n" % fileName)
861 def buildMotifSize(self):
862 mots = self.getResults()
866 if motLength not in self.motifSize.keys():
867 self.motifSize[motLength] =[]
869 self.motifSize[motLength].append((index, mot.buildConsensus()))
873 def findIdenticalMotifs(self):
877 for motLength in self.motifSize.keys():
878 motList = self.motifSize[motLength]
879 motListLen = len(motList)
880 print "doing motif length %d" % motLength
881 for pos in range(motListLen):
883 while index < motListLen:
884 if motList[pos][1] == motList[index][1]:
885 identicalMotifs.append((self.results[pos].tagID, self.results[index].tagID))
889 return identicalMotifs
892 def createAnalysis(self, dbName=""):
893 """ creates the analysis SQL tables. Should only be run once per database file.
896 dbName = self.expFile
898 db = sqlite.connect(dbName, timeout=60)
901 sql.execute("CREATE table geneToMotif(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar)")
902 sql.execute("CREATE table genomeMatches(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar, threshold varchar)")
903 sql.execute("CREATE table motifNeighbors(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, chromNum varchar, location varchar, motifSense varchar, start varchar, stop varchar, sense varchar, locus varchar, condition varchar)")
904 sql.execute("CREATE index mot1 on geneToMotif(expID, analysisID, mTagID)")
905 sql.execute("CREATE index mot2 on genomeMatches(expID, analysisID, mTagID)")
906 sql.execute("CREATE index mot3 on motifNeighbors(expID, analysisID, mTagID)")
907 sql.execute("CREATE index locus1 on geneToMotif(locus)")
908 sql.execute("CREATE index locus2 on motifNeighbors(locus)")
909 sql.execute("CREATE index condition1 on motifNeighbors(condition)")
913 self.mlog("Created analysis tables in database %s" % dbName)
916 self.mlog("Could not create tables in database %s" % dbName)
917 self.mlog("WARNING: perhaps you have called createAnalysis twice on the same database?")
920 def loadAnalysis(self, analysisID="default", dbName=""):
921 """ changes the analysisID to use the specified one, or use default. Must be used before reading or
922 writing any data to analysis tables.
924 self.analysisID = analysisID
928 self.dbName = self.expFile
931 def resetAnalysis(self):
932 """ currently zeros out some of the analysis structures. obsolete ?
934 self.mlog("resetting analysis")
936 self.motifToAnnotations = {}
940 def saveAnalysis(self):
941 """ currently obsolete - kept for backward compability.
944 self.mlog("saved analysis %s" % self.analysisID)
946 self.mlog("could not save %s" % self.analysisID)
949 def deleteAnalysis(self, analysisID="default"):
950 """ delete all of the analysis either named aName, or matching
951 an experiment (the current one by default.) Currently a NO-OP. Obsolete.
956 def buildCoappear(self, motifList=[], distance="100000", strict = False):
957 """ Builds coappear dictionary of geneIDs where motifs occur with others. Can limit to
958 motifs in motifList (default is all motifs), and by distance (default is 100,000 base pairs.)
959 Results are accessed through printCoappear() and saveCoappear().
964 motifListLen = len(motifList)
965 if motifListLen == 0:
967 motifList = self.geneToMotifKeys(thekey="mTagID")
968 motifListLen = len(motifList)
970 for motif in motifList:
971 if motif not in processedMotifs:
972 matchList = self.motifToGene(motif)
973 for match in matchList:
974 (geneID, loc) = match
975 if geneID not in occurenceList:
976 occurenceList[geneID] = []
978 (location, sense) = loc
979 occurenceList[geneID].append((location, sense, motif))
981 processedMotifs.append(motif)
983 for geneID in occurenceList:
984 occurenceList[geneID].sort()
985 if geneID not in self.coappear:
986 self.coappear[geneID] = []
989 differentMotifs = False
991 prevOccurence = occurenceList[geneID][0]
992 del occurenceList[geneID][0]
993 for occurence in occurenceList[geneID]:
994 if occurence[0] < prevOccurence[0] + distance:
996 if occurence[2] != prevOccurence[2]:
997 differentMotifs = True
999 coappearList.append(prevOccurence)
1003 coappearList.append(prevOccurence)
1004 self.coappear[geneID].append(coappearList)
1006 coappearList.append(prevOccurence)
1007 self.coappear[geneID].append(coappearList)
1010 differentMotifs = False
1013 prevOccurence = occurence
1018 coappearList.append(prevOccurence)
1019 self.coappear[geneID].append(coappearList)
1021 coappearList.append(prevOccurence)
1022 self.coappear[geneID].append(coappearList)
1025 def printCoappear(self):
1026 """ prints a formatted version of the coappear dictionary built with buildCoappear()
1028 for geneID in self.coappear:
1029 print " ===== %s =====" % str(geneID)
1030 for occurence in self.coappear[geneID]:
1031 print str(occurence)
1032 print " ============="
1035 def saveCoappear(self, fileName):
1036 """ save coappear dictionary in tabular format. Returns:
1037 index, genome, locus, pos, sense, tag in tab-separated format.
1042 if len(fileName) < 1:
1045 outFile = open(fileName, "a")
1046 for geneID in self.coappear:
1047 (genome, locus) = geneID
1048 for occurence in self.coappear[geneID]:
1050 coappearLine = "%d\t%s\t%s" % (index, genome, locus)
1051 for match in occurence:
1052 (pos, sense, tag) = match
1053 coappearLine += "\t%s\t%s\t%s" % (pos, sense, tag)
1055 outLines.append(coappearLine + "\n")
1057 outFile.writelines(outLines)
1060 self.mlog("Could not save coappear to file %s\n" % fileName)
1063 def sql(self, stmt, commit=""):
1064 """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
1066 db = sqlite.connect(self.dbName, timeout=60)
1069 print "sql: %s" % stmt
1072 res = sqlc.fetchall()
1082 def batchsql(self, stmt, batch):
1083 """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
1086 db = sqlite.connect(self.dbName, timeout=60)
1089 print "batchsql: %s" % stmt
1090 print "batchsql: %s" % str(batch)
1092 sqlc.executemany(stmt, batch)