cistematic/experiments/conservation.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # This class contains the core code for using orthology and sequence-level conservation.
  31 try:
  32     from pysqlite2 import dbapi2 as sqlite
  33 except:
  34     from sqlite3 import dbapi2 as sqlite
  35
  36 import string, cistematic.core
  37 from cistematic.core.homology import homologyDB, homologeneGenomes
  38 from cistematic.programs.mafft import Mafft
  39 from cistematic.programs.paircomp import Paircomp
  40 from cistematic.programs.fastcomp import Fastcomp
  41
  42
  43 class Conservation:
  44     """ The conservation class holds the conservation specific code for
  45         specifying orthology/paralogy, calling conservation identification code,
  46         and manipulating/storing conserved sequences. It is meant to be used
  47         in conjuction with the Experiment and AnalyzeMotifs classes.
  48     """
  49     conservationID = "default"
  50     consDBName = ""
  51     startingGenome = ""
  52     targetGenomes = []
  53     refGenes = []
  54
  55
  56     def setTargetGenomes(self, tGenomes):
  57         """ restricts homologs to the genomes specified in the tGenomes list.
  58         """
  59         self.targetGenomes = tGenomes
  60
  61
  62     def setRefGenes(self, rGenes):
  63         """ sets the list of loci (not geneIDs) that will be compared to homologs.
  64         """
  65         self.refGenes = rGenes
  66
  67
  68     def setStartingGenome(self, sGenome):
  69         """ sets the genome associated with the refGenes.
  70         """
  71         self.startingGenome = sGenome
  72
  73
  74     def importHomology(self, inputFile, source="generic"):
  75         """ loads homology relationships from a tab-delimited file of the form:
  76             homologyGroup orthologyGroup genome locus
  77             where orthologyGroup can be left blank.
  78         """
  79         stmtList = []
  80         importFile = open(inputFile, "r")
  81         stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
  82         for line in importFile:
  83             (homGroup, orthGroup, genome, locus) = line.split("\t")
  84             stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))
  85
  86         importFile.close()
  87         self.batchsqlcons(stmt, stmtList)
  88
  89
  90     def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
  91         """ define the geneIDs in geneIDList as being homologous.
  92         """
  93         stmtList = []
  94
  95         if homGroup == "":
  96             for tempID in geneIDList:
  97                 tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
  98                 if not self.hasHomGroup(tempHomGroup):
  99                     homGroup = tempHomGroup
 100                     break
 101
 102             if homGroup == "":
 103                 self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
 104                 return ""
 105
 106         stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
 107         for geneID in geneIDList:
 108             stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))
 109
 110         self.batchsqlCons(stmt, stmtList)
 111         return homGroup
 112
 113
 114     def deleteHomolog(self, geneID, homGroup=""):
 115         """ delete all entries from the homology table for a given geneID and homGroup.
 116         """
 117         stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
 118         self.sqlcons(stmt, "commit")
 119
 120
 121     def deleteOrtholog(self, geneID, orthGroup):
 122         """ delete all entries from the homology table for a given geneID and orthGroup.
 123         """
 124         stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
 125         self.sqlcons(stmt, "commit")
 126
 127
 128     def deleteHomologyGroup(self, homGroup):
 129         """ delete all entries from the homology table matching the given homology group.
 130         """
 131         stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s'  " % (self.conservationID, homGroup)
 132         self.sqlcons(stmt, "commit")
 133
 134
 135     def deleteOrthologyGroup(self, orthGroup):
 136         """ delete all entries from the homology table matching the given orthology group.
 137         """
 138         stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
 139         self.sqlcons(stmt, "commit")
 140
 141
 142     def hasHomGroup(self, testGroup):
 143         """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table.
 144         """
 145         stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
 146         res = self.sqlcons(stmt)
 147         if int(res[0]) > 0:
 148             return True
 149
 150         return False
 151
 152
 153     def returnHomologs(self, geneID):
 154         """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in
 155             self.targetGenomes, based on entries in the homology table. This function will try to load entries from
 156             homologene for supported genomes, if necessary.
 157         """
 158         homologList = []
 159         usedHomologene = False
 160         stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
 161         groups = self.sqlcons(stmt)
 162
 163         if len(groups) < 1 and geneID[0] in homologeneGenomes:
 164             self.loadFromHomologene(geneID)
 165             groups = self.sqlcons(stmt)
 166             usedHomologene = True
 167
 168         for (recID, hIDentry) in groups:
 169             stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s'  " % (str(hIDentry), str(recID))
 170             genes = self.sqlcons(stmt)
 171             for gene in genes:
 172                 strgene = (str(gene[0]), str(gene[1]))
 173                 if usedHomologene and strgene[0] not in self.targetGenomes:
 174                     continue
 175
 176                 if strgene not in homologList:
 177                     homologList.append(strgene)
 178
 179         return homologList
 180
 181
 182     def returnOrthologs(self, geneID):
 183         """ returns a list of genes that are orthologous to a geneID and whose genome are in
 184             self.targetGenomes, based on explicit entries in the homology table.
 185         """
 186         orthologList = []
 187         stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
 188         groups = self.sqlcons(stmt)
 189
 190         for (recID, oIDentry) in groups:
 191             if oIDentry == "":
 192                 continue
 193             stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID))
 194             genes = self.sqlcons(stmt)
 195             for gene in genes:
 196                 strgene = (str(gene[0]), str(gene[1]))
 197                 if strgene[0] in self.targetGenomes and strgene not in orthologList:
 198                     orthologList.append(strgene)
 199
 200         return orthologList
 201
 202
 203     def areOrthologs(self, geneID1, geneID2):
 204         """ returns True if genes geneID1 and geneID2 are orthologs.
 205         """
 206         stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1
 207         groups1 = self.sqlcons(stmt)
 208
 209         stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2
 210         groups2 = self.sqlcons(stmt)
 211
 212         oEntries1 = []
 213         oEntries2 = []
 214         for (recID, oIDentry) in groups1:
 215             oEntries1.append(oIDentry)
 216
 217         for (recID, oIDentry) in groups2:
 218             oEntries2.append(oIDentry)
 219
 220         for entry in oEntries1:
 221             if entry in oEntries2:
 222                 return True
 223
 224         return False
 225
 226
 227     def loadFromHomologene(self, geneID):
 228         """ load the homologous genes to geneID from homologene.
 229         """
 230         try:
 231             hdb = homologyDB(self.targetGenomes)
 232             hGenes = hdb.getHomologousGenes(geneID)
 233         except:
 234             hdb = homologyDB(self.targetGenomes, cache=True)
 235             hGenes = hdb.getHomologousGenes(geneID)
 236
 237         hGenes.append(geneID)
 238         homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
 239         self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene")
 240
 241
 242     def computeAlignments(self, geneIDList=[]):
 243         """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of
 244             all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form
 245             genome-locus' from the geneIDs in the starting genome.
 246         """
 247         if len(geneIDList) < 1:
 248             geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
 249
 250         prog = Mafft()
 251         for geneID in geneIDList:
 252             homGroup =  str(geneID[0]) + '-' + str(geneID[1])
 253             hGenes = self.returnHomologs(geneID)
 254             hGenes.append(geneID)
 255             fastaFile = self.createDataFile(geneIDList = hGenes)
 256             prog.inputFile(fastaFile)
 257             prog.run()
 258             alignedDict = prog.getAlignment()
 259             for dictKey in alignedDict:
 260                 aGeneID = dictKey.split('-')
 261                 self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey])
 262
 263
 264     def insertAlignedSequence(self, datasetID, geneID, seq):
 265         """ save an aligned sequence into alignedSequence.
 266         """
 267         values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq)
 268         stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values
 269         self.sqlcons(stmt, "commit")
 270
 271
 272     def getAlignedSequence(self, geneID, datasetID=""):
 273         """ retrieve an aligned sequence from alignedSequence using datasetID and geneID.
 274         """
 275         stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
 276         if len(datasetID) > 0:
 277             stmt += "and datasetID = '%s' " % (datasetID)
 278
 279         res = self.sqlcons(stmt)
 280
 281         return str(res[0][0])
 282
 283
 284     def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
 285         """ map regions of sequences where multiple alignments show at least one other (default)
 286             or all genes (stritct=True) as lining up.
 287         """
 288         if len(geneIDList) < 1:
 289             geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
 290
 291         if strict:
 292             criteria = "strict"
 293         else:
 294             criteria = "partial"
 295
 296         for geneID in geneIDList:
 297             homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
 298             hGenes = self.returnHomologs(geneID)
 299             hGenes.append(geneID)
 300             if len(hGenes) < 2:
 301                 continue
 302
 303             maskedSequences = self.maskUsingConservation(hGenes, strict)
 304             for gID in hGenes:
 305                 start = 0
 306                 consLength = 0
 307                 for pos in range(len(maskedSequences[gID])):
 308                     if maskedSequences[gID][pos] == "N":
 309                         if start != 0 and consLength >= minConsLength:
 310                             self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
 311
 312                         start = 0
 313                         consLength = 0
 314                         continue
 315
 316                     if start == 0:
 317                         start = pos
 318
 319                     consLength += 1
 320
 321                 if start != 0 and consLength >= minConsLength:
 322                     self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)
 323
 324
 325     def maskUsingConservation(self, geneIDList, strict=False):
 326         """ mask every gene in geneIDList using conservation amongst themselves, which have
 327             already been computed using computeAlignments()
 328         """
 329         alignmentDict = {}
 330         maskedDict = {}
 331
 332         for gID in geneIDList:
 333             seqLen = 0
 334             try:
 335                 alignmentDict[gID] = self.getAlignedSequence(gID)
 336                 maskedDict[gID] = ""
 337                 seqLen = len(alignmentDict[gID])
 338             except:
 339                 return maskedDict
 340
 341         if strict:
 342             criteria = len(geneIDList)
 343         else:
 344             criteria = 2
 345
 346         for pos in range(seqLen):
 347             posDict = {}
 348             conserved = 0
 349             for geneID in geneIDList:
 350                 posDict[geneID] = alignmentDict[geneID][pos]
 351                 if posDict[geneID] != "-" and posDict[geneID] != "N":
 352                     conserved += 1
 353
 354             if conserved >= criteria:
 355                 for geneID in geneIDList:
 356                     if posDict[geneID] != "-":
 357                         maskedDict[geneID] += posDict[geneID]
 358             else:
 359                 for geneID in geneIDList:
 360                     if posDict[geneID] != "-":
 361                         maskedDict[geneID] += "N"
 362
 363         return maskedDict
 364
 365
 366     def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False):
 367         """ map regions of sequences with seqcomp windows in all genes that have
 368             more than threshold (<= 1) conservation in more than minSequences sequences.
 369             Can optionally use a higher threshold for orothlogs if specified using
 370             orthologyThreshold.
 371         """
 372         consList = []
 373         if orthologyThreshold < threshold:
 374             orthologyThreshold = threshold
 375
 376         if len(geneIDList) < 1:
 377             geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
 378
 379         if useFastcomp:
 380             prog = Fastcomp()
 381         else:
 382             prog = Paircomp()
 383
 384         prog.setWindowSize(window)
 385         # if we have a window on a sequence, then we have at least one match!
 386         minSeqNum = minSequences - 1
 387         for geneID in geneIDList:
 388             genePairs = []
 389             homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
 390             hGenes = self.returnHomologs(geneID)
 391             hGenes.append(geneID)
 392             for first in hGenes:
 393                 for second in hGenes:
 394                     if first != second and [first, second] not in genePairs and [second, first] not in genePairs:
 395                         genePairs.append([first, second])
 396
 397             seqcompWindows = {}
 398             print "genepairs = %s" % str(genePairs)
 399             for pair in genePairs:
 400                 fastaFile = self.createDataFile(geneIDList = pair)
 401                 if self.areOrthologs(pair[0], pair[1]):
 402                     prog.setThreshold(orthologyThreshold)
 403                 else:
 404                     prog.setThreshold(threshold)
 405
 406                 prog.inputFile(fastaFile)
 407                 prog.run()
 408                 seqcompWindows[(pair[0], pair[1])] = prog.getWindows()
 409
 410             resultWindows = {}
 411             for gene in hGenes:
 412                 resultWindows[gene] = {}
 413
 414             for pair in seqcompWindows:
 415                 (geneID1, geneID2) = pair
 416                 for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]:
 417                     if seq1pos not in resultWindows[geneID1]:
 418                         resultWindows[geneID1][seq1pos] = []
 419
 420                     if seq2pos not in resultWindows[geneID2]:
 421                         resultWindows[geneID2][seq2pos] = []
 422
 423                     resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches))
 424                     resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches))
 425
 426             for geneID in hGenes:
 427                 for position in resultWindows[geneID]:
 428                     otherGeneIDs = []
 429                     for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
 430                         if geneID2 not in otherGeneIDs:
 431                             otherGeneIDs.append(geneID2)
 432
 433                     if len(otherGeneIDs) >= minSeqNum:
 434                         criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window)
 435                         for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
 436                             criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
 437
 438                         consList.append((homGroup, geneID, position, window, "seqcompCons", criteria))
 439
 440         self.insertConservedSequenceBatch(consList)
 441
 442
 443     def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False):
 444         """ map regions of sequences with seqcomp windows in all genes that have
 445             more than threshold (<= 1) conservation. Uses transivity.
 446         """
 447         self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp)
 448
 449
 450     def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False):
 451         """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of
 452             sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1)
 453             conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity.
 454         """
 455         consList = []
 456         if len(geneIDList) < 1:
 457             geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]
 458
 459         if useFastcomp:
 460             prog = Fastcomp()
 461         else:
 462             prog = Paircomp()
 463
 464         prog.setWindowSize(window)
 465         for geneID in geneIDList:
 466             genePairs = []
 467             homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
 468             oGenes = self.returnOrthologs(geneID)
 469             # homologs should contain orthologs!
 470             hGenes = self.returnHomologs(geneID)
 471             if len(hGenes) < 1:
 472                 continue
 473
 474             seqcompWindows = {}
 475             genePairs = [[geneID, gene] for gene in hGenes]
 476             for pair in genePairs:
 477                 if pair[1] in oGenes:
 478                     prog.setThreshold(orthologyThreshold)
 479                 else:
 480                     prog.setThreshold(paralogyThreshold)
 481
 482                 fastaFile = self.createDataFile(geneIDList = pair)
 483                 prog.inputFile(fastaFile)
 484                 prog.run()
 485                 seqcompWindows[pair[1]] = prog.getWindows()
 486
 487             resultWindows = {}
 488             print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows))
 489             for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]:
 490                 resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)]
 491
 492             for gene in hGenes[1:]:
 493                 newseqPositions = []
 494                 for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]:
 495                     if seq1pos not in resultWindows:
 496                         continue
 497
 498                     newseqPositions.append(seq1pos)
 499                     resultWindows[seq1pos].append((gene, seq2pos, sense, matches))
 500
 501                 for pos in resultWindows.keys():
 502                     if pos not in newseqPositions:
 503                         del resultWindows[pos]
 504
 505             for windowPos in resultWindows.keys():
 506                 windowList = resultWindows[windowPos]
 507                 criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window)
 508                 for (geneID2, seq2pos, sense, matches) in windowList:
 509                     criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)
 510
 511                 consList.append((homGroup, geneID, windowPos, window, tag, criteria))
 512
 513                 for windowEntry in windowList:
 514                     (gID, seq2pos, sense, matches) = windowEntry
 515                     consList.append((homGroup, gID, seq2pos, window, tag, criteria))
 516
 517         self.insertConservedSequenceBatch(consList)
 518
 519
 520     def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria):
 521         """ insert an entry in conservedSequence.
 522         """
 523         values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria)
 524         stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values
 525         self.sqlcons(stmt, 'commit')
 526
 527
 528     def insertConservedSequenceBatch(self, batchlist):
 529         """ insert a list of entries in conservedSequence.
 530         """
 531         sqlList = []
 532         stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)"
 533         for (datasetID, geneID, pos, length, method, criteria) in batchlist:
 534             sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria)))
 535
 536         self.batchsqlCons(stmt, sqlList)
 537
 538
 539     def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
 540         """ retrieve a list of conservation windows of the form (start, length) from
 541             conservedSequence using datasetID and geneID, which match a particular method and criteria.
 542         """
 543         results = []
 544         stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
 545         if datasetID > 0:
 546             stmt += "and datasetID = '%d' " % (datasetID)
 547
 548         if len(method) > 0:
 549             stmt += " and method = '%s' " % (method)
 550
 551         if len(criteria) > 0:
 552             stmt += "and score = '%s' " % (criteria)
 553
 554         res = self.sqlcons(stmt)
 555         for (location, length, criteria) in res:
 556             results.append((int(location), int(length), str(criteria)))
 557
 558         return results
 559
 560
 561     def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""):
 562         """ returns True if a particular sequence of position and length falls within a conservation
 563             window.
 564         """
 565         if len(position) == 2:
 566             position = position[0]
 567         stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
 568         if datasetID > 0:
 569             stmt += "and datasetID = '%d' " % (datasetID)
 570
 571         if len(method) > 0:
 572             stmt += " and method = '%s' " % (method)
 573
 574         if len(criteria) > 0:
 575             stmt += "and score = '%s' " % (criteria)
 576
 577         res = self.sqlcons(stmt)
 578         for (location, wlen) in res:
 579             if int(location) <= position and (int(location) + int(wlen)) >= (position + length):
 580                 return True
 581
 582         return False
 583
 584
 585     def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
 586         """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns
 587             a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.
 588         """
 589         maskedSeqDict = {}
 590         if datasetID < 0:
 591             datasetID = self.genepoolID
 592
 593         settingsList =  self.getSettingsID(datasetID)
 594         geneIDList = eval(settingsList[1])
 595         for geneID in geneIDList:
 596             theseq = self.genepool[geneID]
 597             seqlen = len(theseq)
 598             maskedseq = ["N"] * seqlen
 599             conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
 600             for (start, length, crit) in conservedWindows:
 601                 if start < 0:
 602                     start = 0
 603
 604                 if start + length >= seqlen:
 605                     length = seqlen - start - 1
 606
 607                 for index in range(start, start + length):
 608                     maskedseq[index] = theseq[index]
 609
 610             tempseq = []
 611             numN = 0
 612             for letter in maskedseq:
 613                 if letter.upper() == "N":
 614                     numN += 1
 615                 else:
 616                     numN = 0
 617
 618                 if numN > stripNLonger:
 619                     continue
 620
 621                 tempseq.append(letter)
 622
 623             maskedSeqDict[geneID] = string.join(tempseq, "")
 624
 625         return maskedSeqDict
 626
 627
 628     def conservationStat(self, datasetID=-1, method="", criteria=""):
 629         """ report conservation level in each of the genes in the dataset.
 630         """
 631         nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"]
 632         totalcons = 0
 633         totalsize = 0
 634         consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria)
 635         for geneID in consGeneDict:
 636             ntstat = 0
 637             for NT in nucleotides:
 638                 ntstat += consGeneDict[geneID].count(NT)
 639
 640             blocks = (len(consGeneDict[geneID]) - ntstat) / 21
 641             origSize = len(self.genepool[geneID])
 642             consPercentage = 100. * ntstat / float(origSize)
 643             print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage)
 644             totalcons += ntstat
 645             totalsize += origSize
 646
 647         consPercentage = 100. * totalcons / float(totalsize)
 648         print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage)
 649
 650
 651     def motifConservationStat(self, motifList=[]):
 652         """ check how many motifs instances for the motifs in the mapped motifs
 653             (or only in motifList) are conserved.
 654         """
 655         if len(motifList) == 0:
 656             motifList = self.geneToMotifKeys(thekey="motif")
 657
 658         for motID in motifList:
 659             index = 0
 660             matches = self.motifToGene(motID)
 661             for (loc, pos) in matches:
 662                 mot = self.findMotif(motID)
 663                 if self.isConserved(loc, pos, len(mot)):
 664                     index += 1
 665
 666             print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches)))
 667
 668
 669     def checkForConservedSequence(self, datasetID=-1, method="", criteria=""):
 670         """ checks that at least two or more sequences in the dataset have conservation.
 671         """
 672         someHaveConservation = False
 673         numCons = 0
 674         checkDict = self.maskNonConservedSequence(datasetID, method, criteria)
 675         for entry in checkDict:
 676             theseq = checkDict[entry].upper()
 677             if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq:
 678                 numCons += 1
 679
 680         if numCons > 1:
 681             someHaveConservation = True
 682
 683         return someHaveConservation
 684
 685
 686     def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
 687         """ exports conserved sequences to a fasta file.
 688         """
 689         (up, cds, down) = self.getSeqParameters()
 690         consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
 691         if len(genomes) > 0:
 692             consDictEntries = consDict.keys()
 693             for entry in consDictEntries:
 694                 if entry[0] not in genomes:
 695                     del consDict[entry]
 696
 697         outfilename = "%s/%s.fsa" % (directory, prefix)
 698         consOutfile = open(outfilename, "w")
 699         for entry in consDict:
 700             entryCoordinates = cistematic.core.geneEntry(entry)
 701             entrySense = entryCoordinates[4]
 702             theseq = consDict[entry].upper()
 703             seqlen = len(theseq)
 704             (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
 705             if entrySense == "R":
 706                 theseq = cistematic.core.complement(theseq)
 707
 708             conservedBlockStart = -1
 709             for pos in range(seqlen):
 710                 if theseq[pos] == "N":
 711                     if conservedBlockStart >= 0:
 712                         consStart = start + conservedBlockStart
 713                         consSeq = theseq[conservedBlockStart:pos]
 714                         consStop = consStart + len(consSeq) - 1
 715                         consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
 716                         conservedBlockStart = -1
 717
 718                     continue
 719
 720                 if conservedBlockStart < 0:
 721                     conservedBlockStart = pos
 722
 723             if conservedBlockStart >= 0:
 724                 consStart = start + conservedBlockStart
 725                 consSeq = theseq[conservedBlockStart:pos]
 726                 consStop = consStart + len(consSeq) - 1
 727                 consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
 728
 729         consOutfile.close()
 730
 731
 732     def createConservation(self, conservationID="default", dbName=""):
 733         """ creates the conservation SQL tables. Should only be run once per database file.
 734         """
 735         stmtList = []
 736         self.loadConservation(conservationID, dbName)
 737         try:
 738             stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)")
 739             stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)")
 740             stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)")
 741             stmtList.append("CREATE index cons1 on homology(conservationID)")
 742             stmtList.append("CREATE index cons2 on alignedSequence(conservationID)")
 743             stmtList.append("CREATE index cons3 on conservedSequence(conservationID)")
 744             stmtList.append("CREATE index hom1 on homology(homologyGroup)")
 745             stmtList.append("CREATE index orth1 on homology(orthologyGroup)")
 746             stmtList.append("CREATE index homlocus1 on homology(genome, locus)")
 747             stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)")
 748             stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)")
 749             for stmt in stmtList:
 750                 self.sqlcons(stmt, commit=True)
 751
 752             self.mlog("Created conservation tables in database %s" % dbName)
 753         except:
 754             self.mlog("Could not create conservation tables in database %s" % dbName)
 755             self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")
 756
 757
 758     def loadConservation(self, conservationID="default", dbName=""):
 759         """ changes the conservationID to use the specified one, or use default. Must be used before reading or
 760             writing any data to analysis tables.
 761         """
 762         self.conservationID = conservationID
 763         if len (dbName) > 0:
 764             self.consDBName = dbName
 765         else:
 766             self.consDBName = self.expFile
 767
 768
 769     def sqlcons(self, stmt, commit=""):
 770         """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
 771         """
 772         res = []
 773         db = sqlite.connect(self.consDBName)
 774         sqlc = db.cursor()
 775         try:
 776             if self.debugSQL:
 777                 print "Conservation->sqlcons: %s" % stmt
 778
 779             sqlc.execute(stmt)
 780             res = sqlc.fetchall()
 781             try:
 782                 if commit != "":
 783                     db.commit()
 784             except:
 785                 self.mlog("Conservation->sqlcons (commit exception)")
 786         except:
 787             self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)
 788
 789         sqlc.close()
 790         db.close()
 791
 792         return res
 793
 794
 795     def batchsqlCons(self, stmt, batch):
 796         """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
 797         """
 798         res = []
 799         db = sqlite.connect(self.consDBName)
 800         sqlc = db.cursor()
 801         try:
 802             if self.debugSQL:
 803                 print "batchsqlCons: %s" % stmt
 804                 print "batchsqlCons: %s" % (str(batch))
 805
 806             sqlc.executemany(stmt, batch)
 807             try:
 808                 db.commit()
 809             except:
 810                 self.mlog("Conservation->batchsqlCons (commit exception)")
 811         except:
 812             self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)
 813
 814         sqlc.close()
 815         db.close()
 816
 817         return res