1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
32 from cistematic.core.geneinfo import geneinfoDB
33 import cistematic.core
34 from cistematic.core.motif import Motif
36 from pysqlite2 import dbapi2 as sqlite
38 from sqlite3 import dbapi2 as sqlite
40 import os, tempfile, shutil
41 from os import environ
43 if environ.get("CISTEMATIC_ROOT"):
44 cisRoot = environ.get("CISTEMATIC_ROOT")
46 cisRoot = "/proj/genome"
48 if environ.get("CISTEMATIC_TEMP"):
49 cisTemp = environ.get("CISTEMATIC_TEMP")
53 tempfile.tempdir = cisTemp
55 def getModules(motifID, motLen, totalLen=60, mainGenome={}, queryGenomes={}, directory=cisTemp, filterMasked=True):
56 """ build three fasta files containing sequences from (a) one main genome, (b) multiple query genomes,
57 and (c) motif matches in all genomes.
60 upLen = (totalLen - motLen) / 2
61 downLen = totalLen - upLen - motLen
62 for genomeDict in [mainGenome, queryGenomes]:
64 outfilename = "%s/%s-main.fsa" % (directory, motifID)
65 outfile = open(outfilename, "w")
66 matchfilename = "%s/%s-matches.fsa" % (directory, motifID)
67 matchfile = open(matchfilename, "w")
69 for genome in genomeDict:
70 infile = open(genomeDict[genome])
71 print "doing %s" % genome
72 cistematic.core.cacheGeneDB(genome)
74 prevpos = -1 * totalLen
79 fields = line.split("\t")
82 lineArray.append((chrom, pos, line))
86 for (chrom, pos, line) in lineArray:
87 fields = line.split("\t")
88 if fields[1] == current:
93 if chrom == prevchrom and pos < (prevpos + 2 * (upLen + downLen)):
94 print "skipping %s-%s-%s%s\ttoo close to previous pos" % (genome, chrom, misc, current)
98 motseq = cistematic.core.retrieveSequence(genome, chrom, pos + 1, pos + motLen)
103 seq = cistematic.core.retrieveSequence(genome, chrom, start, start + totalLen)
105 print "skipping %s-%s-%s%s\terror retrieving sequence" % (genome, chrom, misc, current)
110 motseq = cistematic.core.complement(motseq)
111 seq = cistematic.core.complement(seq, len(seq))
113 if filterMasked and motseq != motseq.upper():
116 stop = start + len(seq)
117 matchID = "%s-%s-%s-%s%s" % (motifID.replace("-", "_"), genome, chrom.replace("-", "_"), misc, current)
118 outfile.write("> %s %d [%d to %d, %s]\n%s\n" % (matchID, pos, start, stop, sense, seq))
119 matchfile.write("> %s\n%s\n" % (matchID, motseq))
125 print "doing database genomes"
126 outfilename = "%s/%s-db.fsa" % (directory, motifID)
127 outfile = open(outfilename, "w")
130 cistematic.core.uncacheGeneDB()
135 def doBlast(motifID, matchLength, similarity, directory=cisTemp, maxInputSize=3000, firstMatchOnly=False):
136 """ Blast sequences generated by getModules() to identify regions in the main genome sequences that are
137 conserved in the query genomes. Results are saved in a motifIDBlast.db sqlite database.
139 # check that we don't exceed maxSetSize
140 fsafilename = "%s/%s-main.fsa" % (directory, motifID)
141 fsafile = open(fsafilename, "r")
149 if index > maxInputSize:
150 print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.doBlast()" % maxInputSize
155 blastCommands = ["cd %s" % directory,
156 "nohup %s/programs/blast/bin/formatdb -t %smodDB -n %smodDB -p F -i %s-db.fsa" % (cisRoot, motifID, motifID, motifID),
157 "%s/programs/blast/bin/blastall -p blastn -d %smodDB -i %s-main.fsa -o %s.blastres -m 8 -v 15 -b 15" % (cisRoot, motifID, motifID, motifID)]
158 blastCommandLine = string.join(blastCommands, "; ")
159 os.system(blastCommandLine)
161 fsafilename = "%s/%s-main.fsa" % (directory, motifID)
162 fsafile = open(fsafilename, "r")
163 infilename = "%s/%s.blastres" % (directory, motifID)
164 infile = open(infilename, "r")
165 outfilename = "%s/%s.cisblast" % (directory, motifID)
166 outfile = open(outfilename, "w")
167 tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID)
168 db = sqlite.connect(tempdb)
171 stmt = "create table blast_entries(ID INTEGER PRIMARY KEY, GENOME1 varchar, GENEID1 varchar, GENOME2 varchar, GENEID2 varchar, SIMILARITY float, LENGTH int, MISMATCHES int, INDEL int, start1 int, stop1 int, start2 int, stop2 int, evalue float, score float)"
173 stmt = "create table blast_segments(ID INTEGER PRIMARY KEY, GENOME varchar, MATCHID varchar, chrom varchar, loc int, start int, stop int, sense varchar)"
177 stmt = "INSERT into blast_segments VALUES(NULL, ?, ?, ?, ?, ?, ?, ?) "
183 (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split()
184 start = int(segstart[1:])
185 stop = int(segstop[:-1])
187 (motTag, genome, chromid, mid) = matchid.split("-")
188 batch.append((genome, matchid, chromid, int(loc), start, stop, sense))
190 sql.executemany(stmt, batch)
197 print "Building matchDict"
199 fields = line.split("\t")
200 geneid1 = str(fields[0])
201 (motTAG, genome1, mchrom1, match1) = geneid1.split("-")
202 geneid2 = str(fields[1])
203 (motTAG, genome2, mchrom2, match2) = geneid2.split("-")
204 lineMatchLength = int(fields[3])
205 lineSimilarity = float(fields[2])
206 evalue = float(fields[10])
207 score = float(fields[11].strip())
208 if evalue < 0.01 and lineMatchLength > matchLength and lineSimilarity > similarity:
209 if geneid1 == geneid2:
212 if geneid1 == previousGene1 and geneid2 == previousGene2:
215 previousGene1 = geneid1
216 previousGene2 = geneid2
217 if geneid1 not in matchDict:
218 matchDict[geneid1] = {}
223 if geneid2 not in matchDict[geneid1]:
224 matchDict[geneid1][geneid2] = []
226 matchDict[geneid1][geneid2].append((score, line))
228 os.remove(infilename)
229 print "Processing matchDict"
230 matchKeys = matchDict.keys()
233 stmt = "INSERT into blast_entries VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "
235 for geneid1 in matchKeys:
236 geneKeys = matchDict[geneid1].keys()
238 for geneid2 in geneKeys:
240 array = matchDict[geneid1][geneid2]
242 (score, line) = array[-1]
244 fields = line.split("\t")
245 geneid1 = str(fields[0])
246 (motTAG, genome1, mchrom1, gid1) = geneid1.split('-')
247 geneid2 = str(fields[1])
248 (motTAG, genome2, mchrom2, gid2) = geneid2.split('-')
249 similarity = float(fields[2])
250 matchLength = int(fields[3])
251 mismatches = int(fields[4])
252 indels = int(fields[5])
253 start1 = int(fields[6])
254 stop1 = int(fields[7])
255 start2 = int(fields[8])
256 stop2 = int(fields[9])
257 evalue = float(fields[10])
258 score = float(fields[11].strip())
259 batch.append((genome1, geneid1, genome2, geneid2, similarity, matchLength, mismatches, indels, start1, stop1, start2, stop2, evalue, score))
261 sql.executemany(stmt, batch)
262 stmt = "CREATE index blastE on blast_entries(genome1, geneid1)"
264 stmt = "CREATE index blastS on blast_segments(GENOME, MATCHID)"
269 shutil.copyfile(tempdb, "%s/%sBlast.db" % (directory, motifID))
273 print "inserted %d lines" % counter
276 def getCandidates(motifID, motLen, mainGenome={}, radiusStep=1000, maxRadius=200000, maxInputSize=3000, refineMotifs=False, directory=cisTemp, ucscOrg="Human", ucscDB="hg17"):
277 """ Core of cismatcher. Extracts conserved regions from data in the Blast.db database generated by doBlast().
279 genome = mainGenome.keys()[0]
280 cistematic.core.cacheGeneDB(genome)
281 motfile = open(mainGenome[genome], "r")
282 modfilename = "%s/%s-main.fsa" % (directory, motifID)
283 modfile = open(modfilename, "r")
284 outfilename = "%s/cismatcher.%s.out" % (directory, motifID)
285 outfile = open(outfilename, "w")
286 annotfilename = "%s/cismatcher.%s.annot.txt" % (directory, motifID)
287 annotfile = open(annotfilename, "w")
291 mfilename = "%s/%s-matches.fsa" % (directory, motifID)
292 mfile = open(mfilename, "r")
296 motseqs[mid] = line.strip()
298 mid = line.strip()[2:]
303 prevchrom = "nochrom"
307 fields = line.split("\t")
309 if fields[1] == current and chrom == prevchrom:
312 if chrom != prevchrom:
314 if "rand" in refchrom:
315 refchrom = "%s_random" % refchrom[:-4]
320 motstart = int(fields[4]) + 1
321 motstop = motstart + motLen
328 lines.append("chr%s\tcistematic\tmotif\t%d\t%d\t.\t%s\t.\t%s-%s-%s\n" % (refchrom, motstart, motstop, sense, motifID, chrom, current))
331 if numEntries > maxInputSize:
332 print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.getCandidates()" % maxInputSize
336 outheader = 'track name=%strack description="Cistematic %s hits"\n' % (motifID, motifID)
337 annotfile.write(outheader)
338 for outline in lines:
339 annotfile.write(outline)
343 dbname = "%s/%sBlast.db" % (directory, motifID)
344 tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID)
345 shutil.copyfile(dbname, tempdb)
347 print "no blast database - aborting cisMatcher->getCandidates()"
351 prevchrom = "nochrom"
356 (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split()
357 start = int(segstart[1:]) + 1
358 stop = int(segstop[:-1]) + 1
359 (motifID, genome, chrom, match) = matchid.split('-')
365 if genome not in matchid:
368 if chrom != prevchrom:
370 if "rand" in refchrom:
371 refchrom = "%s_random" % refchrom[:-4]
375 line = "chr%s\tcistematic\tblast_region\t%d\t%d\t.\t%s\t.\t%sb%d\n" % (refchrom, start, stop, sense, matchid, index)
376 annotfile.write(line)
380 db = sqlite.connect(tempdb)
382 stmt = ' select * from blast_entries where GENOME1="%s" ORDER BY GENEID1 ' % genome
385 idb = geneinfoDB(cache=True)
390 (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry
391 line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score)
392 stmt2 = 'select chrom, loc, start, stop, sense from blast_segments where GENOME = "%s" and MATCHID = "%s" ' % (gen1, mid1)
394 result = sql.fetchone()
395 (chrom, loc, start, stop, sense) = result
396 if chrom != prevchrom:
398 if "rand" in refchrom:
399 refchrom = "%s_random" % refchrom[:-4]
404 matchstart = start + + 1 + int(start1)
405 matchstop = start + + 1 + int(stop1)
407 matchstart = stop + 1 - int(start1)
408 matchstop = stop + 1 - int(stop1)
415 if matchstart > matchstop:
417 matchstop = matchstart
420 line = "chr%s\tcistematic\tmatch\t%d\t%d\t%d\t%s\t.\t%s.%d\n" % (refchrom, matchstart, matchstop, (float(sim) - 90) * 100, sense, mid2, index)
422 annotfile.write(line)
428 featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start , motLen, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR")
429 if len(featureList) > 0:
430 relativeLoc = featureList[0][6]
431 feature = featureList[0][0] # always pick first feature
433 desc = cistematic.core.getAnnotInfo((genome, feature))[0]
434 symbol = idb.getGeneInfo((genome, feature))[0]
438 while feature == "NONE" and radius < maxRadius:
440 featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR")
441 if genome == "human":
442 featureList += cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="WGRNA")
444 if len(featureList) > 0:
445 feature = featureList[0][0] # always pick first feature
447 geneID = (genome, feature)
448 (gchrom, gstart, gstop, glength, gsense) = cistematic.core.geneEntry(geneID)
449 if gstart <= start and start <= gstop:
462 desc = cistematic.core.getAnnotInfo(geneID)[0]
463 symbol = idb.getGeneInfo(geneID)[0]
467 line = '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="http://genome.ucsc.edu/cgi-bin/hgTracks?org=%s&db=%s&position=chr%s:%d-%d" target="new">chr%s:%d-%d</a></td><td>%s</td><td>%s</td><td>%d</td><td>%s</td></tr>\n' % (relativeLoc, mid1, mid2, sim, length, ucscOrg, ucscDB, refchrom, start - radius, stop + radius, refchrom, matchstart , matchstop, feature, symbol, radius, desc)
468 outlines.append((float(sim), int(length), mid1, line))
472 cistematic.core.uncacheGeneDB()
480 print "writing outfile"
481 for line in outlines:
482 if line[2] in alreadySeen:
485 alreadySeen.append(line[2])
486 outfile.write(line[3])
490 print "Refining motifs"
493 (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry
494 if mid1 not in goodIDs:
496 goodseqs.append(motseqs[mid1].upper())
498 if mid2 not in goodIDs:
500 goodseqs.append(motseqs[mid2].upper())
502 mot = Motif("%s+R" % motifID, "", "", goodseqs)
503 motfilename = "%s/%s+R.mot" % (directory, motifID)
504 mot.saveMotif(motfilename)