cistematic/core/cismatcher.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 import string
  31 import cistematic
  32 from cistematic.core.geneinfo import geneinfoDB
  33 import cistematic.core
  34 from cistematic.core.motif import Motif
  35 try:
  36     from pysqlite2 import dbapi2 as sqlite
  37 except:
  38     from sqlite3 import dbapi2 as sqlite
  39
  40 import os, tempfile, shutil
  41 from os import environ
  42
  43 if environ.get("CISTEMATIC_ROOT"):
  44     cisRoot = environ.get("CISTEMATIC_ROOT")
  45 else:
  46     cisRoot = "/proj/genome"
  47
  48 if environ.get("CISTEMATIC_TEMP"):
  49     cisTemp = environ.get("CISTEMATIC_TEMP")
  50 else:
  51     cisTemp = "/tmp"
  52
  53 tempfile.tempdir = cisTemp
  54
  55 def getModules(motifID, motLen, totalLen=60, mainGenome={}, queryGenomes={}, directory=cisTemp, filterMasked=True):
  56     """ build three fasta files containing sequences from (a) one main genome, (b) multiple query genomes,
  57         and (c) motif matches in all genomes.
  58     """
  59     doMainGenome = True
  60     upLen = (totalLen - motLen) / 2
  61     downLen = totalLen - upLen - motLen
  62     for genomeDict in [mainGenome, queryGenomes]:
  63         if doMainGenome:
  64             outfilename = "%s/%s-main.fsa" % (directory, motifID)
  65             outfile = open(outfilename, "w")
  66             matchfilename = "%s/%s-matches.fsa" % (directory, motifID)
  67             matchfile = open(matchfilename, "w")
  68
  69         for genome in genomeDict:
  70             infile = open(genomeDict[genome])
  71             print "doing %s" % genome
  72             cistematic.core.cacheGeneDB(genome)
  73             current = "-1"
  74             prevpos = -1 * totalLen
  75             prevchrom = "nochrom"
  76             lineArray = []
  77             misc = ""
  78             for line in infile:
  79                 fields = line.split("\t")
  80                 chrom = fields[3]
  81                 pos = int(fields[4])
  82                 lineArray.append((chrom, pos, line))
  83
  84             lineArray.sort()
  85             infile.close()
  86             for (chrom, pos, line) in lineArray:
  87                 fields = line.split("\t")
  88                 if fields[1] == current:
  89                     continue
  90
  91                 current = fields[1]
  92                 misc = fields[0][-1]
  93                 if chrom == prevchrom and pos < (prevpos + 2 * (upLen + downLen)):
  94                     print "skipping %s-%s-%s%s\ttoo close to previous pos" % (genome, chrom, misc, current)
  95                     continue
  96
  97                 try:
  98                     motseq = cistematic.core.retrieveSequence(genome, chrom, pos + 1, pos + motLen)
  99                     start = pos - upLen
 100                     if start < 0:
 101                         start = 0
 102
 103                     seq = cistematic.core.retrieveSequence(genome, chrom, start, start + totalLen)
 104                 except:
 105                     print "skipping %s-%s-%s%s\terror retrieving sequence" % (genome, chrom, misc, current)
 106                     continue
 107
 108                 sense = fields[5]
 109                 if sense == "R":
 110                     motseq = cistematic.core.complement(motseq)
 111                     seq = cistematic.core.complement(seq, len(seq))
 112
 113                 if filterMasked and motseq != motseq.upper():
 114                     continue
 115
 116                 stop = start + len(seq)
 117                 matchID = "%s-%s-%s-%s%s" % (motifID.replace("-", "_"), genome, chrom.replace("-", "_"), misc, current)
 118                 outfile.write("> %s %d [%d to %d, %s]\n%s\n" % (matchID, pos, start, stop, sense, seq))
 119                 matchfile.write("> %s\n%s\n" % (matchID, motseq))
 120                 prevchrom = chrom
 121                 prevpos = pos
 122
 123         if doMainGenome:
 124             outfile.close()
 125             print "doing database genomes"
 126             outfilename = "%s/%s-db.fsa" % (directory, motifID)
 127             outfile = open(outfilename, "w")
 128             doMainGenome = False
 129
 130     cistematic.core.uncacheGeneDB()
 131     outfile.close()
 132     matchfile.close()
 133
 134
 135 def doBlast(motifID, matchLength, similarity, directory=cisTemp, maxInputSize=3000, firstMatchOnly=False):
 136     """ Blast sequences generated by getModules() to identify regions in the main genome sequences that are
 137         conserved in the query genomes. Results are saved in a motifIDBlast.db sqlite database.
 138     """
 139     # check that we don't exceed maxSetSize
 140     fsafilename = "%s/%s-main.fsa" % (directory, motifID)
 141     fsafile = open(fsafilename, "r")
 142     index = 0
 143     for line in fsafile:
 144         if line[0] != ">":
 145             continue
 146
 147         index += 1
 148
 149     if index > maxInputSize:
 150         print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.doBlast()" % maxInputSize
 151         return
 152
 153     #build blast DB
 154     #run blast
 155     blastCommands = ["cd %s" % directory,
 156                      "nohup %s/programs/blast/bin/formatdb -t %smodDB -n %smodDB -p F -i %s-db.fsa" % (cisRoot, motifID, motifID, motifID),
 157                      "%s/programs/blast/bin/blastall -p blastn -d %smodDB -i %s-main.fsa -o %s.blastres -m 8 -v 15 -b 15" % (cisRoot, motifID, motifID, motifID)]
 158     blastCommandLine = string.join(blastCommands, "; ")
 159     os.system(blastCommandLine)
 160     #filter blast
 161     fsafilename = "%s/%s-main.fsa" % (directory, motifID)
 162     fsafile = open(fsafilename, "r")
 163     infilename = "%s/%s.blastres" % (directory, motifID)
 164     infile = open(infilename, "r")
 165     outfilename = "%s/%s.cisblast" % (directory, motifID)
 166     outfile = open(outfilename, "w")
 167     tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID)
 168     db = sqlite.connect(tempdb)
 169     sql = db.cursor()
 170
 171     stmt = "create table blast_entries(ID INTEGER PRIMARY KEY, GENOME1 varchar, GENEID1 varchar, GENOME2 varchar, GENEID2 varchar, SIMILARITY float, LENGTH int, MISMATCHES int, INDEL int, start1 int, stop1 int, start2 int, stop2 int, evalue float, score float)"
 172     sql.execute(stmt)
 173     stmt = "create table blast_segments(ID INTEGER PRIMARY KEY, GENOME varchar, MATCHID varchar, chrom varchar, loc int, start int, stop int, sense varchar)"
 174     sql.execute(stmt)
 175     db.commit()
 176
 177     stmt = "INSERT into blast_segments VALUES(NULL,  ?, ?, ?, ?, ?, ?, ?) "
 178     batch = []
 179     for line in fsafile:
 180         if line[0] != ">":
 181             continue
 182
 183         (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split()
 184         start = int(segstart[1:])
 185         stop = int(segstop[:-1])
 186         sense = sense[0]
 187         (motTag, genome, chromid, mid) = matchid.split("-")
 188         batch.append((genome, matchid, chromid, int(loc), start, stop, sense))
 189
 190     sql.executemany(stmt, batch)
 191
 192     counter = 0
 193     previousGene1 = ""
 194     previousGene2 = ""
 195     matchDict = {}
 196
 197     print "Building matchDict"
 198     for line in infile:
 199         fields = line.split("\t")
 200         geneid1 = str(fields[0])
 201         (motTAG, genome1, mchrom1, match1) = geneid1.split("-")
 202         geneid2 = str(fields[1])
 203         (motTAG, genome2, mchrom2, match2) = geneid2.split("-")
 204         lineMatchLength = int(fields[3])
 205         lineSimilarity = float(fields[2])
 206         evalue = float(fields[10])
 207         score  = float(fields[11].strip())
 208         if evalue < 0.01 and lineMatchLength > matchLength and lineSimilarity > similarity:
 209             if geneid1 == geneid2:
 210                 continue
 211
 212             if geneid1 == previousGene1 and geneid2 == previousGene2:
 213                 continue
 214
 215             previousGene1 = geneid1
 216             previousGene2 = geneid2
 217             if geneid1 not in matchDict:
 218                 matchDict[geneid1] = {}
 219
 220             elif firstMatchOnly:
 221                 continue
 222
 223             if geneid2 not in matchDict[geneid1]:
 224                 matchDict[geneid1][geneid2] = []
 225
 226             matchDict[geneid1][geneid2].append((score, line))
 227
 228     os.remove(infilename)
 229     print "Processing matchDict"
 230     matchKeys = matchDict.keys()
 231     matchKeys.sort()
 232
 233     stmt = "INSERT into blast_entries VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "
 234     batch = []
 235     for geneid1 in matchKeys:
 236         geneKeys = matchDict[geneid1].keys()
 237         geneKeys.sort()
 238         for geneid2 in geneKeys:
 239             counter += 1
 240             array = matchDict[geneid1][geneid2]
 241             array.sort()
 242             (score, line) = array[-1]
 243             outfile.write(line)
 244             fields = line.split("\t")
 245             geneid1 = str(fields[0])
 246             (motTAG, genome1, mchrom1, gid1) = geneid1.split('-')
 247             geneid2 = str(fields[1])
 248             (motTAG, genome2, mchrom2, gid2) = geneid2.split('-')
 249             similarity = float(fields[2])
 250             matchLength = int(fields[3])
 251             mismatches = int(fields[4])
 252             indels = int(fields[5])
 253             start1 = int(fields[6])
 254             stop1  = int(fields[7])
 255             start2 = int(fields[8])
 256             stop2  = int(fields[9])
 257             evalue = float(fields[10])
 258             score  = float(fields[11].strip())
 259             batch.append((genome1, geneid1, genome2, geneid2, similarity, matchLength, mismatches, indels, start1, stop1, start2, stop2, evalue, score))
 260
 261     sql.executemany(stmt, batch)
 262     stmt = "CREATE index blastE on blast_entries(genome1, geneid1)"
 263     sql.execute(stmt)
 264     stmt = "CREATE index blastS on blast_segments(GENOME, MATCHID)"
 265     sql.execute(stmt)
 266     db.commit()
 267     sql.close()
 268     db.close()
 269     shutil.copyfile(tempdb, "%s/%sBlast.db" % (directory, motifID))
 270     os.remove(tempdb)
 271     infile.close()
 272     outfile.close()
 273     print "inserted %d lines" % counter
 274
 275
 276 def getCandidates(motifID, motLen, mainGenome={}, radiusStep=1000, maxRadius=200000, maxInputSize=3000, refineMotifs=False, directory=cisTemp, ucscOrg="Human", ucscDB="hg17"):
 277     """ Core of cismatcher. Extracts conserved regions from data in the Blast.db database generated by doBlast().
 278     """
 279     genome = mainGenome.keys()[0]
 280     cistematic.core.cacheGeneDB(genome)
 281     motfile = open(mainGenome[genome], "r")
 282     modfilename = "%s/%s-main.fsa" % (directory, motifID)
 283     modfile = open(modfilename, "r")
 284     outfilename = "%s/cismatcher.%s.out" % (directory, motifID)
 285     outfile = open(outfilename, "w")
 286     annotfilename = "%s/cismatcher.%s.annot.txt" % (directory, motifID)
 287     annotfile = open(annotfilename, "w")
 288     motseqs = {}
 289     goodseqs = []
 290     if refineMotifs:
 291         mfilename = "%s/%s-matches.fsa" % (directory, motifID)
 292         mfile = open(mfilename, "r")
 293         mid = ""
 294         for line in mfile:
 295             if line[0] != ">":
 296                 motseqs[mid] = line.strip()
 297             else:
 298                 mid = line.strip()[2:]
 299
 300         mfile.close()
 301
 302     current = "-1"
 303     prevchrom = "nochrom"
 304     lines = []
 305     numEntries = 0
 306     for line in motfile:
 307         fields = line.split("\t")
 308         chrom = fields[3]
 309         if fields[1] == current and chrom == prevchrom:
 310             continue
 311
 312         if chrom != prevchrom:
 313             refchrom = chrom
 314             if "rand" in refchrom:
 315                 refchrom = "%s_random" % refchrom[:-4]
 316
 317             prevchrom = chrom
 318
 319         current = fields[1]
 320         motstart = int(fields[4]) + 1
 321         motstop = motstart + motLen
 322         sense = fields[5]
 323         if sense == "F":
 324             sense = "+"
 325         else:
 326             sense = "-"
 327
 328         lines.append("chr%s\tcistematic\tmotif\t%d\t%d\t.\t%s\t.\t%s-%s-%s\n" % (refchrom, motstart, motstop, sense, motifID, chrom, current))
 329         numEntries += 1
 330
 331     if numEntries > maxInputSize:
 332         print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.getCandidates()" % maxInputSize
 333         return
 334
 335     if lines:
 336         outheader = 'track name=%strack description="Cistematic %s hits"\n' % (motifID, motifID)
 337         annotfile.write(outheader)
 338         for outline in lines:
 339             annotfile.write(outline)
 340
 341     motfile.close()
 342     try:
 343         dbname = "%s/%sBlast.db" % (directory, motifID)
 344         tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID)
 345         shutil.copyfile(dbname, tempdb)
 346     except:
 347         print "no blast database - aborting cisMatcher->getCandidates()"
 348         return
 349
 350     index = 0
 351     prevchrom = "nochrom"
 352     for line in modfile:
 353         if line[0] != ">":
 354             continue
 355
 356         (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split()
 357         start = int(segstart[1:]) + 1
 358         stop = int(segstop[:-1]) + 1
 359         (motifID, genome, chrom, match) = matchid.split('-')
 360         if sense == "F":
 361             sense = "="
 362         else:
 363             sense = "-"
 364
 365         if genome not in matchid:
 366             continue
 367
 368         if chrom != prevchrom:
 369             refchrom = chrom
 370             if "rand" in refchrom:
 371                 refchrom = "%s_random" % refchrom[:-4]
 372
 373             prevchrom = chrom
 374
 375         line = "chr%s\tcistematic\tblast_region\t%d\t%d\t.\t%s\t.\t%sb%d\n" % (refchrom, start, stop, sense, matchid, index)
 376         annotfile.write(line)
 377         index += 1
 378
 379     modfile.close()
 380     db = sqlite.connect(tempdb)
 381     sql = db.cursor()
 382     stmt = ' select * from blast_entries where GENOME1="%s" ORDER BY GENEID1 ' % genome
 383     sql.execute(stmt)
 384     res = sql.fetchall()
 385     idb = geneinfoDB(cache=True)
 386     prevchrom="nochrom"
 387     outlines = []
 388     index = 0
 389     for entry in res:
 390         (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry
 391         line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score)
 392         stmt2 = 'select chrom, loc, start, stop, sense from blast_segments where GENOME = "%s" and MATCHID = "%s" ' % (gen1, mid1)
 393         sql.execute(stmt2)
 394         result = sql.fetchone()
 395         (chrom, loc, start, stop, sense)  = result
 396         if chrom != prevchrom:
 397             refchrom = chrom
 398             if "rand" in refchrom:
 399                 refchrom = "%s_random" % refchrom[:-4]
 400
 401             prevchrom = chrom
 402
 403         if sense == "F":
 404             matchstart = start +  + 1 + int(start1)
 405             matchstop = start +  + 1 + int(stop1)
 406         else:
 407             matchstart = stop + 1 - int(start1)
 408             matchstop = stop + 1 - int(stop1)
 409
 410         if sense == "F":
 411             sense = "+"
 412         else:
 413             sense = "-"
 414
 415         if matchstart > matchstop:
 416             temp = matchstop
 417             matchstop = matchstart
 418             matchstart = temp
 419
 420         line = "chr%s\tcistematic\tmatch\t%d\t%d\t%d\t%s\t.\t%s.%d\n" % (refchrom, matchstart, matchstop, (float(sim) - 90) * 100, sense, mid2, index)
 421         index += 1
 422         annotfile.write(line)
 423         feature = "NONE"
 424         symbol = ""
 425         desc = ""
 426         relativeLoc = ""
 427         radius = 0
 428         featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start , motLen, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR")
 429         if len(featureList) > 0:
 430             relativeLoc = featureList[0][6]
 431             feature = featureList[0][0] # always pick first feature
 432             try:
 433                 desc = cistematic.core.getAnnotInfo((genome, feature))[0]
 434                 symbol = idb.getGeneInfo((genome, feature))[0]
 435             except:
 436                 pass
 437
 438         while feature == "NONE" and radius < maxRadius:
 439             radius += radiusStep
 440             featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR")
 441             if genome == "human":
 442                 featureList += cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="WGRNA")
 443
 444             if len(featureList) > 0:
 445                 feature = featureList[0][0] # always pick first feature
 446                 try:
 447                     geneID = (genome, feature)
 448                     (gchrom, gstart, gstop, glength, gsense) = cistematic.core.geneEntry(geneID)
 449                     if gstart <= start and start <= gstop:
 450                         relativeLoc = "GENE"
 451                     elif start < gstart:
 452                         if gsense == "F":
 453                             relativeLoc = "UP"
 454                         else:
 455                             relativeLoc = "DOWN"
 456                     else:
 457                         if gsense == "F":
 458                             relativeLoc = "DOWN"
 459                         else:
 460                             relativeLoc = "UP"
 461
 462                     desc = cistematic.core.getAnnotInfo(geneID)[0]
 463                     symbol = idb.getGeneInfo(geneID)[0]
 464                 except:
 465                     pass
 466
 467         line = '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="http://genome.ucsc.edu/cgi-bin/hgTracks?org=%s&db=%s&position=chr%s:%d-%d" target="new">chr%s:%d-%d</a></td><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>\n' % (relativeLoc, mid1, mid2, sim, length, ucscOrg, ucscDB, refchrom, start - radius, stop + radius, refchrom, matchstart , matchstop, feature, symbol, radius, desc)
 468         outlines.append((float(sim), int(length), mid1, line))
 469         if index % 100 == 0:
 470             print "."
 471
 472     cistematic.core.uncacheGeneDB()
 473     annotfile.close()
 474     sql.close()
 475     db.close()
 476     os.remove(tempdb)
 477     outlines.sort()
 478     outlines.reverse()
 479     alreadySeen = []
 480     print "writing outfile"
 481     for line in outlines:
 482         if line[2] in alreadySeen:
 483             continue
 484
 485         alreadySeen.append(line[2])
 486         outfile.write(line[3])
 487
 488     outfile.close()
 489     if refineMotifs:
 490         print "Refining motifs"
 491         goodIDs = []
 492         for entry in res:
 493             (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry
 494             if mid1 not in goodIDs:
 495                 goodIDs.append(mid1)
 496                 goodseqs.append(motseqs[mid1].upper())
 497
 498             if mid2 not in goodIDs:
 499                 goodIDs.append(mid2)
 500                 goodseqs.append(motseqs[mid2].upper())
 501
 502         mot = Motif("%s+R" % motifID, "", "", goodseqs)
 503         motfilename = "%s/%s+R.mot" % (directory, motifID)
 504         mot.saveMotif(motfilename)