1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # This class contains the core code for using orthology and sequence-level conservation.
32 from pysqlite2 import dbapi2 as sqlite
34 from sqlite3 import dbapi2 as sqlite
36 import string, cistematic.core
37 from cistematic.core.homology import homologyDB, homologeneGenomes
38 from cistematic.programs.mafft import Mafft
39 from cistematic.programs.paircomp import Paircomp
40 from cistematic.programs.fastcomp import Fastcomp
44 """ The conservation class holds the conservation specific code for
45 specifying orthology/paralogy, calling conservation identification code,
46 and manipulating/storing conserved sequences. It is meant to be used
47 in conjuction with the Experiment and AnalyzeMotifs classes.
49 conservationID = "default"
56 def setTargetGenomes(self, tGenomes):
57 """ restricts homologs to the genomes specified in the tGenomes list.
59 self.targetGenomes = tGenomes
62 def setRefGenes(self, rGenes):
63 """ sets the list of loci (not geneIDs) that will be compared to homologs.
65 self.refGenes = rGenes
68 def setStartingGenome(self, sGenome):
69 """ sets the genome associated with the refGenes.
71 self.startingGenome = sGenome
74 def importHomology(self, inputFile, source="generic"):
75 """ loads homology relationships from a tab-delimited file of the form:
76 homologyGroup orthologyGroup genome locus
77 where orthologyGroup can be left blank.
80 importFile = open(inputFile, "r")
81 stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
82 for line in importFile:
83 (homGroup, orthGroup, genome, locus) = line.split("\t")
84 stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))
87 self.batchsqlcons(stmt, stmtList)
90 def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
91 """ define the geneIDs in geneIDList as being homologous.
96 for tempID in geneIDList:
97 tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
98 if not self.hasHomGroup(tempHomGroup):
99 homGroup = tempHomGroup
103 self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
106 stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
107 for geneID in geneIDList:
108 stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))
110 self.batchsqlCons(stmt, stmtList)
114 def deleteHomolog(self, geneID, homGroup=""):
115 """ delete all entries from the homology table for a given geneID and homGroup.
117 stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
118 self.sqlcons(stmt, "commit")
121 def deleteOrtholog(self, geneID, orthGroup):
122 """ delete all entries from the homology table for a given geneID and orthGroup.
124 stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
125 self.sqlcons(stmt, "commit")
128 def deleteHomologyGroup(self, homGroup):
129 """ delete all entries from the homology table matching the given homology group.
131 stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' " % (self.conservationID, homGroup)
132 self.sqlcons(stmt, "commit")
135 def deleteOrthologyGroup(self, orthGroup):
136 """ delete all entries from the homology table matching the given orthology group.
138 stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
139 self.sqlcons(stmt, "commit")
142 def hasHomGroup(self, testGroup):
143 """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table.
145 stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
146 res = self.sqlcons(stmt)
153 def returnHomologs(self, geneID):
154 """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in
155 self.targetGenomes, based on entries in the homology table. This function will try to load entries from
156 homologene for supported genomes, if necessary.
159 usedHomologene = False
160 stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
161 groups = self.sqlcons(stmt)
163 if len(groups) < 1 and geneID[0] in homologeneGenomes:
164 self.loadFromHomologene(geneID)
165 groups = self.sqlcons(stmt)
166 usedHomologene = True
168 for (recID, hIDentry) in groups:
169 stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s' " % (str(hIDentry), str(recID))
170 genes = self.sqlcons(stmt)
172 strgene = (str(gene[0]), str(gene[1]))
173 if usedHomologene and strgene[0] not in self.targetGenomes:
176 if strgene not in homologList:
177 homologList.append(strgene)
182 def returnOrthologs(self, geneID):
183 """ returns a list of genes that are orthologous to a geneID and whose genome are in
184 self.targetGenomes, based on explicit entries in the homology table.
187 stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
188 groups = self.sqlcons(stmt)
190 for (recID, oIDentry) in groups:
193 stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID))
194 genes = self.sqlcons(stmt)
196 strgene = (str(gene[0]), str(gene[1]))
197 if strgene[0] in self.targetGenomes and strgene not in orthologList:
198 orthologList.append(strgene)
203 def areOrthologs(self, geneID1, geneID2):
204 """ returns True if genes geneID1 and geneID2 are orthologs.
206 stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1
207 groups1 = self.sqlcons(stmt)
209 stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2
210 groups2 = self.sqlcons(stmt)
214 for (recID, oIDentry) in groups1:
215 oEntries1.append(oIDentry)
217 for (recID, oIDentry) in groups2:
218 oEntries2.append(oIDentry)
220 for entry in oEntries1:
221 if entry in oEntries2:
227 def loadFromHomologene(self, geneID):
228 """ load the homologous genes to geneID from homologene.
231 hdb = homologyDB(self.targetGenomes)
232 hGenes = hdb.getHomologousGenes(geneID)
234 hdb = homologyDB(self.targetGenomes, cache=True)
235 hGenes = hdb.getHomologousGenes(geneID)
237 hGenes.append(geneID)
238 homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
239 self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene")
242 def computeAlignments(self, geneIDList=[]):
243 """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of
244 all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form
245 genome-locus' from the geneIDs in the starting genome.
247 if len(geneIDList) < 1:
248 geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
251 for geneID in geneIDList:
252 homGroup = str(geneID[0]) + '-' + str(geneID[1])
253 hGenes = self.returnHomologs(geneID)
254 hGenes.append(geneID)
255 fastaFile = self.createDataFile(geneIDList = hGenes)
256 prog.inputFile(fastaFile)
258 alignedDict = prog.getAlignment()
259 for dictKey in alignedDict:
260 aGeneID = dictKey.split('-')
261 self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey])
264 def insertAlignedSequence(self, datasetID, geneID, seq):
265 """ save an aligned sequence into alignedSequence.
267 values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq)
268 stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values
269 self.sqlcons(stmt, "commit")
272 def getAlignedSequence(self, geneID, datasetID=""):
273 """ retrieve an aligned sequence from alignedSequence using datasetID and geneID.
275 stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
276 if len(datasetID) > 0:
277 stmt += "and datasetID = '%s' " % (datasetID)
279 res = self.sqlcons(stmt)
281 return str(res[0][0])
284 def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
285 """ map regions of sequences where multiple alignments show at least one other (default)
286 or all genes (stritct=True) as lining up.
288 if len(geneIDList) < 1:
289 geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
296 for geneID in geneIDList:
297 homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
298 hGenes = self.returnHomologs(geneID)
299 hGenes.append(geneID)
303 maskedSequences = self.maskUsingConservation(hGenes, strict)
307 for pos in range(len(maskedSequences[gID])):
308 if maskedSequences[gID][pos] == "N":
309 if start != 0 and consLength >= minConsLength:
310 self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
321 if start != 0 and consLength >= minConsLength:
322 self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
325 def maskUsingConservation(self, geneIDList, strict=False):
326 """ mask every gene in geneIDList using conservation amongst themselves, which have
327 already been computed using computeAlignments()
332 for gID in geneIDList:
335 alignmentDict[gID] = self.getAlignedSequence(gID)
337 seqLen = len(alignmentDict[gID])
342 criteria = len(geneIDList)
346 for pos in range(seqLen):
349 for geneID in geneIDList:
350 posDict[geneID] = alignmentDict[geneID][pos]
351 if posDict[geneID] != "-" and posDict[geneID] != "N":
354 if conserved >= criteria:
355 for geneID in geneIDList:
356 if posDict[geneID] != "-":
357 maskedDict[geneID] += posDict[geneID]
359 for geneID in geneIDList:
360 if posDict[geneID] != "-":
361 maskedDict[geneID] += "N"
366 def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False):
367 """ map regions of sequences with seqcomp windows in all genes that have
368 more than threshold (<= 1) conservation in more than minSequences sequences.
369 Can optionally use a higher threshold for orothlogs if specified using
373 if orthologyThreshold < threshold:
374 orthologyThreshold = threshold
376 if len(geneIDList) < 1:
377 geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
384 prog.setWindowSize(window)
385 # if we have a window on a sequence, then we have at least one match!
386 minSeqNum = minSequences - 1
387 for geneID in geneIDList:
389 homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
390 hGenes = self.returnHomologs(geneID)
391 hGenes.append(geneID)
393 for second in hGenes:
394 if first != second and [first, second] not in genePairs and [second, first] not in genePairs:
395 genePairs.append([first, second])
398 print "genepairs = %s" % str(genePairs)
399 for pair in genePairs:
400 fastaFile = self.createDataFile(geneIDList = pair)
401 if self.areOrthologs(pair[0], pair[1]):
402 prog.setThreshold(orthologyThreshold)
404 prog.setThreshold(threshold)
406 prog.inputFile(fastaFile)
408 seqcompWindows[(pair[0], pair[1])] = prog.getWindows()
412 resultWindows[gene] = {}
414 for pair in seqcompWindows:
415 (geneID1, geneID2) = pair
416 for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]:
417 if seq1pos not in resultWindows[geneID1]:
418 resultWindows[geneID1][seq1pos] = []
420 if seq2pos not in resultWindows[geneID2]:
421 resultWindows[geneID2][seq2pos] = []
423 resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches))
424 resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches))
426 for geneID in hGenes:
427 for position in resultWindows[geneID]:
429 for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
430 if geneID2 not in otherGeneIDs:
431 otherGeneIDs.append(geneID2)
433 if len(otherGeneIDs) >= minSeqNum:
434 criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window)
435 for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
436 criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
438 consList.append((homGroup, geneID, position, window, "seqcompCons", criteria))
440 self.insertConservedSequenceBatch(consList)
443 def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False):
444 """ map regions of sequences with seqcomp windows in all genes that have
445 more than threshold (<= 1) conservation. Uses transivity.
447 self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp)
450 def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False):
451 """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of
452 sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1)
453 conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity.
456 if len(geneIDList) < 1:
457 geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
464 prog.setWindowSize(window)
465 for geneID in geneIDList:
467 homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
468 oGenes = self.returnOrthologs(geneID)
469 # homologs should contain orthologs!
470 hGenes = self.returnHomologs(geneID)
475 genePairs = [[geneID, gene] for gene in hGenes]
476 for pair in genePairs:
477 if pair[1] in oGenes:
478 prog.setThreshold(orthologyThreshold)
480 prog.setThreshold(paralogyThreshold)
482 fastaFile = self.createDataFile(geneIDList = pair)
483 prog.inputFile(fastaFile)
485 seqcompWindows[pair[1]] = prog.getWindows()
488 print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows))
489 for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]:
490 resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)]
492 for gene in hGenes[1:]:
494 for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]:
495 if seq1pos not in resultWindows:
498 newseqPositions.append(seq1pos)
499 resultWindows[seq1pos].append((gene, seq2pos, sense, matches))
501 for pos in resultWindows.keys():
502 if pos not in newseqPositions:
503 del resultWindows[pos]
505 for windowPos in resultWindows.keys():
506 windowList = resultWindows[windowPos]
507 criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window)
508 for (geneID2, seq2pos, sense, matches) in windowList:
509 criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
511 consList.append((homGroup, geneID, windowPos, window, tag, criteria))
513 for windowEntry in windowList:
514 (gID, seq2pos, sense, matches) = windowEntry
515 consList.append((homGroup, gID, seq2pos, window, tag, criteria))
517 self.insertConservedSequenceBatch(consList)
520 def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria):
521 """ insert an entry in conservedSequence.
523 values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria)
524 stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values
525 self.sqlcons(stmt, 'commit')
528 def insertConservedSequenceBatch(self, batchlist):
529 """ insert a list of entries in conservedSequence.
532 stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)"
533 for (datasetID, geneID, pos, length, method, criteria) in batchlist:
534 sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria)))
536 self.batchsqlCons(stmt, sqlList)
539 def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
540 """ retrieve a list of conservation windows of the form (start, length) from
541 conservedSequence using datasetID and geneID, which match a particular method and criteria.
544 stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
546 stmt += "and datasetID = '%d' " % (datasetID)
549 stmt += " and method = '%s' " % (method)
551 if len(criteria) > 0:
552 stmt += "and score = '%s' " % (criteria)
554 res = self.sqlcons(stmt)
555 for (location, length, criteria) in res:
556 results.append((int(location), int(length), str(criteria)))
561 def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""):
562 """ returns True if a particular sequence of position and length falls within a conservation
565 if len(position) == 2:
566 position = position[0]
567 stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
569 stmt += "and datasetID = '%d' " % (datasetID)
572 stmt += " and method = '%s' " % (method)
574 if len(criteria) > 0:
575 stmt += "and score = '%s' " % (criteria)
577 res = self.sqlcons(stmt)
578 for (location, wlen) in res:
579 if int(location) <= position and (int(location) + int(wlen)) >= (position + length):
585 def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
586 """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns
587 a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.
591 datasetID = self.genepoolID
593 settingsList = self.getSettingsID(datasetID)
594 geneIDList = eval(settingsList[1])
595 for geneID in geneIDList:
596 theseq = self.genepool[geneID]
598 maskedseq = ["N"] * seqlen
599 conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
600 for (start, length, crit) in conservedWindows:
604 if start + length >= seqlen:
605 length = seqlen - start - 1
607 for index in range(start, start + length):
608 maskedseq[index] = theseq[index]
612 for letter in maskedseq:
613 if letter.upper() == "N":
618 if numN > stripNLonger:
621 tempseq.append(letter)
623 maskedSeqDict[geneID] = string.join(tempseq, "")
628 def conservationStat(self, datasetID=-1, method="", criteria=""):
629 """ report conservation level in each of the genes in the dataset.
631 nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"]
634 consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria)
635 for geneID in consGeneDict:
637 for NT in nucleotides:
638 ntstat += consGeneDict[geneID].count(NT)
640 blocks = (len(consGeneDict[geneID]) - ntstat) / 21
641 origSize = len(self.genepool[geneID])
642 consPercentage = 100. * ntstat / float(origSize)
643 print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage)
645 totalsize += origSize
647 consPercentage = 100. * totalcons / float(totalsize)
648 print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage)
651 def motifConservationStat(self, motifList=[]):
652 """ check how many motifs instances for the motifs in the mapped motifs
653 (or only in motifList) are conserved.
655 if len(motifList) == 0:
656 motifList = self.geneToMotifKeys(thekey="motif")
658 for motID in motifList:
660 matches = self.motifToGene(motID)
661 for (loc, pos) in matches:
662 mot = self.findMotif(motID)
663 if self.isConserved(loc, pos, len(mot)):
666 print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches)))
669 def checkForConservedSequence(self, datasetID=-1, method="", criteria=""):
670 """ checks that at least two or more sequences in the dataset have conservation.
672 someHaveConservation = False
674 checkDict = self.maskNonConservedSequence(datasetID, method, criteria)
675 for entry in checkDict:
676 theseq = checkDict[entry].upper()
677 if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq:
681 someHaveConservation = True
683 return someHaveConservation
686 def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
687 """ exports conserved sequences to a fasta file.
689 (up, cds, down) = self.getSeqParameters()
690 consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
692 consDictEntries = consDict.keys()
693 for entry in consDictEntries:
694 if entry[0] not in genomes:
697 outfilename = "%s/%s.fsa" % (directory, prefix)
698 consOutfile = open(outfilename, "w")
699 for entry in consDict:
700 entryCoordinates = cistematic.core.geneEntry(entry)
701 entrySense = entryCoordinates[4]
702 theseq = consDict[entry].upper()
704 (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
705 if entrySense == "R":
706 theseq = cistematic.core.complement(theseq)
708 conservedBlockStart = -1
709 for pos in range(seqlen):
710 if theseq[pos] == "N":
711 if conservedBlockStart >= 0:
712 consStart = start + conservedBlockStart
713 consSeq = theseq[conservedBlockStart:pos]
714 consStop = consStart + len(consSeq) - 1
715 consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
716 conservedBlockStart = -1
720 if conservedBlockStart < 0:
721 conservedBlockStart = pos
723 if conservedBlockStart >= 0:
724 consStart = start + conservedBlockStart
725 consSeq = theseq[conservedBlockStart:pos]
726 consStop = consStart + len(consSeq) - 1
727 consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
732 def createConservation(self, conservationID="default", dbName=""):
733 """ creates the conservation SQL tables. Should only be run once per database file.
736 self.loadConservation(conservationID, dbName)
738 stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)")
739 stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)")
740 stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)")
741 stmtList.append("CREATE index cons1 on homology(conservationID)")
742 stmtList.append("CREATE index cons2 on alignedSequence(conservationID)")
743 stmtList.append("CREATE index cons3 on conservedSequence(conservationID)")
744 stmtList.append("CREATE index hom1 on homology(homologyGroup)")
745 stmtList.append("CREATE index orth1 on homology(orthologyGroup)")
746 stmtList.append("CREATE index homlocus1 on homology(genome, locus)")
747 stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)")
748 stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)")
749 for stmt in stmtList:
750 self.sqlcons(stmt, commit=True)
752 self.mlog("Created conservation tables in database %s" % dbName)
754 self.mlog("Could not create conservation tables in database %s" % dbName)
755 self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")
758 def loadConservation(self, conservationID="default", dbName=""):
759 """ changes the conservationID to use the specified one, or use default. Must be used before reading or
760 writing any data to analysis tables.
762 self.conservationID = conservationID
764 self.consDBName = dbName
766 self.consDBName = self.expFile
769 def sqlcons(self, stmt, commit=""):
770 """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
773 db = sqlite.connect(self.consDBName)
777 print "Conservation->sqlcons: %s" % stmt
780 res = sqlc.fetchall()
785 self.mlog("Conservation->sqlcons (commit exception)")
787 self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)
795 def batchsqlCons(self, stmt, batch):
796 """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
799 db = sqlite.connect(self.consDBName)
803 print "batchsqlCons: %s" % stmt
804 print "batchsqlCons: %s" % (str(batch))
806 sqlc.executemany(stmt, batch)
810 self.mlog("Conservation->batchsqlCons (commit exception)")
812 self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)