--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# python parent class for experiments
+try:
+ from pysqlite2 import dbapi2 as sqlite
+except:
+ from sqlite3 import dbapi2 as sqlite
+
+import os, time, tempfile, string
+
+from cistematic.core import retrieveSeq, retrieveSeqFeatures, fasta
+import cistematic.core
+from cistematic.core.motif import matrixRow, Motif
+from cistematic.genomes import Genome
+from cistematic import version
+from os import environ
+
+if environ.get("CISTEMATIC_TEMP"):
+ cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+ cisTemp = "/tmp"
+
+tempfile.tempdir = cisTemp
+
+
+class Experiment:
+ """ genepool = {(genomeA, geneA1): seq1, (genomeA, geneA2): seq2, (genomeB, geneB1): seq3,...}
+ geneFeatures = ((genomeA, geneA1):[feature0, feature1, .....], (genomeB, geneB1):[feature0, feature1,...],....}
+ programs = [(program_object1, setting1), (program_object2, setting2), ....]
+ """
+ genepool = {}
+ geneFeatures = {}
+ programs = []
+ genepoolID = 0
+ workdir = ""
+ experimentID = ""
+ experimentType = "generic"
+ expFile = ""
+ geneDB = ""
+ debugSQL = False
+
+
+ def __init__(self, expID, expDBFile, geneDBFile=""):
+ self.experimentID = expID
+ self.expFile = expDBFile
+ self.geneDB = geneDBFile
+ self.maskLowerCase = False
+ self.boundToNextGene = False
+ print "cistematic version %s" % version
+ self.startDB()
+
+
+ def __del__(self):
+ self.savePrograms()
+ self.saveGeneDB()
+ self.uncacheGeneDB()
+ self.removeWorkdir()
+
+
+ def setExperimentID(self, expID):
+ self.mlog("changing experiment ID from %s to %s" % (self.experimentID, expID))
+ self.experimentID = expID
+
+
+ def setGeneDB(self, geneDBFile=""):
+ self.mlog("using %s as the gene database" % geneDBFile)
+ self.geneDB = geneDBFile
+
+
+ def cacheGeneDB(self, genome):
+ try:
+ cistematic.core.cacheGeneDB(genome)
+ self.mlog("cached genome %s" % genome)
+ except:
+ self.mlog("could not cache genome %s" % genome)
+
+
+ def uncacheGeneDB(self):
+ try:
+ cistematic.core.uncacheGeneDB()
+ except:
+ self.mlog("could not uncache genomes")
+
+
+ def setMaskLowerCase(self, maskValue):
+ if maskValue == True or maskValue == 1 or maskValue == "1":
+ self.maskLowerCase = True
+ self.setSettings("maskLowerCase", ["1"])
+ else:
+ self.maskLowerCase = False
+ self.setSettings("maskLowerCase", ["0"])
+
+
+ def setBoundToNextGene(self, boundValue):
+ if boundValue == True or boundValue == 1 or boundValue == "1":
+ self.boundToNextGene = True
+ self.setSettings("boundToNextGene", ["1"])
+ else:
+ self.boundToNextGene = False
+ self.setSettings("boundToNextGene", ["0"])
+
+
+ def setSeqParameters(self, up=0, cds=1, down=0):
+ self.upstream = up
+ self.cds = cds
+ self.downstream = down
+ if cds == 0:
+ cdsStatus = "NO "
+ else:
+ cdsStatus = ""
+
+ self.setSettings("seq_parameters", ["%d\t%d\t%d" % (up, cds, down)])
+ self.mlog("setting sequence retrieval parameters to %d bp upstream, %d bp downstream, and %scds" % (up, down, cdsStatus))
+
+
+ def getSeqParameters(self):
+ return (self.upstream, self.cds, self.downstream)
+
+
+ def dsetLength(self):
+ stmt="SELECT count(*) from dataset where expID = '%s' " % self.experimentID
+ res = self.sqlexp(stmt)
+ try:
+ answer = int(res[0][0])
+ except IndexError:
+ answer = 0
+
+ return answer
+
+
+ def resultsLength(self):
+ stmt="SELECT count(*) from results where expID = '%s' " % self.experimentID
+ res = self.sqlexp(stmt)
+ try:
+ answer = int(res[0][0])
+ except IndexError:
+ answer = 0
+
+ return answer
+
+
+ def checkMotID(self, motID):
+ answer = False
+ stmt = "SELECT ID from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
+ res = self.sqlexp(stmt)
+ if len(res) > 0:
+ answer = True
+
+ return answer
+
+
+ def findMotif(self,mTagID):
+ if self.checkMotID(mTagID):
+ return self.makeMotif(mTagID)
+
+ self.mlog("could not find %s" % mTagID)
+
+ return ""
+
+
+ def makeMotif(self, motID):
+ mPWM = []
+ mseqs = []
+ stmt = "SELECT motifSeq, threshold, info from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
+ res = self.sqlexp(stmt)
+ entry = res[0]
+ (seq, threshold, info) = entry
+ stmt = "SELECT aFreq, cFreq, gFreq, tFreq from motifPWMs where expID = '%s' and mTagID = '%s' order by position" % (self.experimentID, motID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ col = [0.0, 0.0, 0.0, 0.0]
+ (aFreq, cFreq, gFreq, tFreq) = entry
+ col[matrixRow["A"]] = aFreq
+ col[matrixRow["C"]] = cFreq
+ col[matrixRow["G"]] = gFreq
+ col[matrixRow["T"]] = tFreq
+ mPWM.append(col)
+
+ stmt = "SELECT sequence from motifSequences where expID = '%s' and mTagID = '%s' and type = 'instance' " % (self.experimentID, motID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ seq = entry[0]
+ mseqs.append(seq)
+
+ return Motif(motID, seq, mPWM, mseqs, threshold, info)
+
+
+ def saveMotif(self, mot):
+ motID = mot.tagID
+ motifSeq = mot.motifSeq
+ motifInfo = mot.info
+ motifThreshold = mot.threshold
+ if self.checkMotID(motID):
+ stmt = "DELETE from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
+ self.sqlexp(stmt, commit=True)
+ stmt = "DELETE from motifPWMs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
+ self.sqlexp(stmt, commit=True)
+ stmt = "DELETE from motifSequences where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
+ self.sqlexp(stmt, commit=True)
+
+ stmtList = []
+ values = "(NULL, '%s', '%s', '%s', %f, '%s')" % (self.experimentID, motID, motifSeq, motifThreshold, motifInfo)
+ stmt = "INSERT into motifs(ID, expID, mTagID, motifSeq, threshold, info) values %s " % values
+ self.sqlexp(stmt, commit=True)
+ pos = 0
+ stmt = "INSERT into motifPWMs(ID, expID, mTagID, position, aFreq, cFreq, gFreq, tFreq) values (NULL, ?, ?, ?, ?, ?, ?, ?)"
+ for col in mot.motifPWM:
+ aFreq = round(col[matrixRow["A"]],4)
+ cFreq = round(col[matrixRow["C"]],4)
+ gFreq = round(col[matrixRow["G"]],4)
+ tFreq = abs(1.0 - aFreq - cFreq - gFreq)
+ stmtList.append((self.experimentID, motID, pos, aFreq, cFreq, gFreq, tFreq))
+ pos += 1
+
+ self.batchsqlexp(stmt, stmtList)
+ if len(mot.sequences) > 0:
+ stmt = "INSERT into motifSequences(ID, expID, mTagID, sequence, type, location) values (NULL, ?, ?, ?, ?, ?) "
+ stmtList = []
+ for seq in mot.sequences:
+ stmtList.append((self.experimentID, motID, seq, "instance", "-"))
+
+ self.batchsqlexp(stmt, stmtList)
+
+
+ def exportMotifs(self, directory=".", prefix="", suffix="mot"):
+ stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ if prefix == "":
+ prefix = self.experimentID
+
+ for entry in res:
+ motID = entry[0]
+ mot = self.makeMotif(motID)
+ fileName = "%s/%s-%s.%s" % (directory, prefix, motID, suffix)
+ self.mlog("exporting %s as %s" % (motID, fileName))
+ mot.saveMotif(fileName)
+
+
+ def exportLogos(self, directory=".", prefix=""):
+ stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ if prefix == "":
+ prefix = self.experimentID
+ for entry in res:
+ motID = entry[0]
+ mot = self.makeMotif(motID)
+ fileName = "%s/%s-%s" % (directory, prefix, motID)
+ self.mlog("saving logo for %s as %s.png" % (motID, fileName))
+ mot.saveLogo(fileName)
+
+
+ def appendResults(self, mot, resultsGroup="-"):
+ motID = mot.tagID
+ self.saveMotif(mot)
+ stmt = "INSERT into results(ID, expID, resultsGroup, mTagID) values (NULL, '%s', '%s', '%s') " % (self.experimentID, resultsGroup, motID)
+ self.sqlexp(stmt, "commit")
+
+
+ def getGeneDB(self, geneDBFile=""):
+ return self.geneDB
+
+
+ def getLog(self):
+ answer = []
+ stmt = "SELECT timestamp, message from expLog where expID = '%s' order by timestamp" % self.experimentID
+ res = self.sqlexp(stmt)
+ for entry in res:
+ (timestamp, message) = entry
+ answer.append((eval(timestamp), message))
+
+ return answer
+
+
+ def getResults(self):
+ answer = []
+ stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ motID = entry[0]
+ answer.append(self.makeMotif(motID))
+
+ return answer
+
+
+ def getSettings(self):
+ answer = {}
+ stmt = "SELECT settingName, data from settings where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ (settingName, data) = entry
+ if settingName not in answer.keys():
+ answer[settingName] = []
+
+ answer[settingName].append(data)
+
+ return answer
+
+
+ def getSetting(self, settingName):
+ answer = []
+ stmt = "SELECT data from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ answer.append(entry[0])
+
+ return answer
+
+
+ def settingsHasKey(self, thekey):
+ answer = False
+ stmt = "SELECT distinct ID from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, thekey)
+ res = self.sqlexp(stmt)
+ if len(res) > 0:
+ answer = True
+
+ return answer
+
+
+ def setSettings(self, settingName, settingList):
+ """ insert or replace (i.e. delete previous entry) a setting with one or more setting data.
+ """
+ # delete existing, insert new
+ try:
+ stmt = "DELETE from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName)
+ res = self.sqlexp(stmt, "commit")
+ except:
+ pass
+
+ stmtList = []
+ stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, ?, ?, ?)"
+ for entry in settingList:
+ stmtList.append((self.experimentID, settingName, entry))
+
+ res = self.batchsqlexp(stmt, stmtList)
+
+ return settingName
+
+
+ def setSettingsID(self, settingName, data):
+ """ return the settingsID for the inserted settingName:data pair in the settings table.
+ """
+ stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, '%s', '%s', \"%s\")" % (self.experimentID, settingName, data)
+ res = self.sqlexp(stmt, "commit")
+
+ return res
+
+
+ def getSettingsID(self, settingID):
+ """ get a setting by settingsID in the settings table.
+ """
+ answer = ""
+ stmt = "SELECT settingName, data from settings where expID = '%s' and ID = %d" % (self.experimentID, int(settingID))
+ res = self.sqlexp(stmt)
+ try:
+ (name, data) = res[0]
+ answer = (name, data)
+ except:
+ pass
+
+ return answer
+
+
+ def setRun(self, progName, datasetID, settingsID):
+ values = "(NULL, '%s', '%s', '%s', %d, '%s', '%s')" % (self.experimentID, progName, datasetID, settingsID, time.localtime(), "-")
+ stmt = "INSERT into runs (ID, expID, progName, datasetGroup, settingsID, timestamp, resultsGroup) values %s" % values
+ runID = self.sqlexp(stmt, "commit")
+ self.mlog("run %s: program %s with settings %d and dataset %s" % (runID, progName, settingsID, datasetID))
+
+ return int(runID)
+
+
+ def getRun(self, rID):
+ res = ""
+ stmt = "SELECT progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' and ID = %d" % (self.experimentID, rID)
+ res = self.sqlexp(stmt)
+ (progName, datasetID, settingsID, timestamp, resultsID) = res[0]
+ if resultsID == "-":
+ resultsID = []
+ if datasetID != "chromolist":
+ datasetID = int(datasetID)
+
+ return (progName, datasetID, int(settingsID), timestamp, resultsID)
+
+
+ def getRunsByProg(self, prog):
+ runs = self.getRuns()
+ matchingRuns = []
+ for entry in runs:
+ if (runs[entry][0] == prog):
+ matchingRuns.append(runs[entry])
+
+ return matchingRuns
+
+
+ def getRuns(self):
+ runs = {}
+ res = ''
+ stmt = "SELECT ID, progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ (ID, progName, datasetID, settingsID, timestamp, resultsID) = entry
+ runs[int(ID)] = (progName, datasetID, settingsID, eval(timestamp), resultsID)
+
+ return runs
+
+
+ def appendDataset(self, dset):
+ stmtList = []
+ stmt = "INSERT into dataset(ID, genome, locus, sequence, expID) values (NULL, ?, ?, ?, ?) "
+ for (genome, entries) in dset:
+ for entry in entries:
+ if len(entry) == 2:
+ (locus, sequence) = entry
+ else:
+ locus = entry
+ sequence = "-"
+
+ stmtList.append((genome, locus, sequence, self.experimentID))
+
+ res = self.batchsqlexp(stmt, stmtList)
+
+
+ def getDataset(self):
+ dset = []
+ stmt = "SELECT genome, locus, sequence from dataset where expID = '%s' " % (self.experimentID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ (genome, locus, sequence) = entry
+ if sequence == "-":
+ dset.append((genome, [locus]))
+ else:
+ dset.append((genome, [locus, sequence]))
+
+ return dset
+
+
+ def getDatasetNames(self):
+ dset = []
+ stmt = "select settingName from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' "
+ res = self.sqlexp(stmt)
+ for entry in res:
+ dset.append(entry)
+
+ return dset
+
+
+ def getDatasetIDs(self):
+ dset = []
+ stmt = "select ID from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' "
+ res = self.sqlexp(stmt)
+ for entry in res:
+ dset.append(entry)
+
+ return dset
+
+
+ def getFeatures(self, geneID):
+ results = []
+ try:
+ res = []
+ stmt = "select featureType, start, stop, orientation from sequenceFeatures where seqGenome = '%s' and seqID = '%s' and expID = '%s' " % (geneID[0], geneID[1], self.experimentID)
+ res = self.sqlexp(stmt)
+ for entry in res:
+ (ftype, start, stop, orientation) = entry
+ results.append((ftype, start, stop, orientation))
+ except:
+ pass
+
+ results.sort()
+
+ return results
+
+
+ def absoluteLocation(self, match, featureLength, seqparams="", gidCoordinates="", customDB=""):
+ """ Returns the absolute location of the start of a match = ((genome, gID), (pos, sense)), given a relative
+ location with respect to the gene and a feature length.
+ Can be passed cached sequence parameters (up, cds, down) and geneEntry tuple to avoid hitting database.
+ """
+ result = ["", 0, "F"]
+ (geneID, loc) = match
+ (pos, sense) = loc
+ pos = int(pos)
+ if seqparams == "":
+ seqparams = self.getSeqParameters()
+
+ (up, cds, down) = seqparams
+ if gidCoordinates == "":
+ gidCoordinates = cistematic.core.geneEntry(geneID)
+
+ (gidChrom, gidStart, gidStop, gidLength, gidSense) = gidCoordinates
+ result[0] = gidChrom
+ if self.boundToNextGene:
+ up = cistematic.core.upstreamToNextGene(geneID, up, db=customDB)
+ down = cistematic.core.downstreamToNextGene(geneID, down, db=customDB)
+
+ if gidSense == "F":
+ result[2] = sense
+ if pos < up or cds > 0:
+ result[1] = gidStart - up + pos
+ else:
+ result[1] = gidStop - up + pos
+ else:
+ if gidStart > gidStop:
+ temp = gidStart
+ gidStart = gidStop
+ gidStop = temp
+
+ if sense == "F":
+ result[2] = "R"
+
+ if pos < up or cds > 0:
+ result[1] = gidStop + up - pos - featureLength
+ else:
+ result[1] = gidStart + up - pos - featureLength
+
+ return result
+
+
+ def setWorkdir(self, wdir=""):
+ self.workdir = wdir
+ self.mlog("changed workdir to %s" % (wdir))
+
+
+ def resetDataset(self):
+ stmt = "DELETE from dataset where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ self.mlog("log reset")
+
+
+ def resetLog(self):
+ stmt = "DELETE from expLog where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ self.mlog("log reset")
+
+
+ def resetResults(self):
+ stmt = "DELETE from results where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ self.mlog("results reset")
+
+
+ def resetRuns(self):
+ stmt = "DELETE from expLog where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ self.mlog("runs reset")
+
+
+ def resetPrograms(self):
+ self.programs = []
+ self.mlog("programs reset")
+
+
+ def resetSettings(self):
+ stmt = "DELETE from expLog where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ self.log("settings reset")
+
+
+ def appendProgram(self, program):
+ self.programs.append((program, self.setSettingsID(program.name(), program.getSettings())))
+ self.mlog("adding program %s" % program.name())
+
+
+ def removeProgram(self, progName):
+ for index in range(len(self.programs)):
+ if self.programs[index][0].name() == progName:
+ del self.programs[index]
+ self.mlog("removed program %s" % progName)
+
+
+ def createWorkdir(self):
+ if self.workdir == "":
+ self.workdir = tempfile.mktemp()
+
+ try:
+ os.mkdir(self.workdir)
+ self.mlog("created workdir: %s" % (self.workdir))
+ except:
+ pass
+
+
+ def removeWorkdir(self):
+ try:
+ filenames = os.listdir(self.workdir)
+ for entry in filenames:
+ os.remove("%s/%s" % (self.workdir, entry))
+ os.rmdir(self.workdir)
+ except:
+ self.mlog("could not delete workdir: %s" % (self.workdir))
+
+
+ def mlog(self,msg):
+ print msg
+ stmt = "INSERT into expLog(ID, expID, timestamp, message) values (NULL, '%s', '%s', '%s') " % (self.experimentID, time.localtime(),string.replace(msg, "'", '"'))
+ self.sqlexp(stmt, "commit")
+
+
+ def logToString(self):
+ response = ""
+ for line in self.getLog():
+ response += time.asctime(line[0]) + ": " + line[1] + "\n"
+
+ return response
+
+
+ def printLog(self):
+ for line in self.getLog():
+ print "%s: %s" % (time.asctime(line[0]), line[1])
+
+
+ def tailLog(self):
+ theLog = self.getLog()
+ for line in theLog[-10:]:
+ print "%s: %s" % (time.asctime(line[0]), line[1])
+
+
+ def loadPrograms(self):
+ self.programs = []
+ for progs in self.getSetting("loaded_programs"):
+ (progs0, progs1, progs2) = progs.split("\t")
+ execString = "from %s import %s" % (progs0, progs1)
+ exec execString
+ execString = 'self.programs.append((apply(%s), %s))' % (progs1, progs2)
+ exec execString
+
+
+ def savePrograms(self):
+ progs = []
+ for (program, settingID) in self.programs:
+ progs.append("%s\t%s\t%s" % (program.__class__.__module__, program.__class__.__name__, settingID))
+
+ if len(progs) > 0:
+ self.setSettings("loaded_programs", progs)
+
+ def saveGeneDB(self):
+ if self.geneDB != "":
+ self.setSettings("geneDB", [self.geneDB])
+
+
+ def loadFasta(self, fastaFile, genomeName):
+ """ load fasta file into the dataset.
+ """
+ genIDList = self.loadFastaFromFile(fastaFile, genomeName)
+
+ return [(genomeName, genIDList)]
+
+
+ def loadFastaFromFile(self, fastaFile, genomeName):
+ seqArray = []
+ seqLen = 0
+ seqName = []
+ geneDBPath = "%s.genedb" % genomeName
+ self.mlog("Loading fasta file %s into database %s" % (genomeName, geneDBPath))
+ aGenome = Genome(genomeName, dbFile=geneDBPath)
+ aGenome.createGeneDB()
+ inFile = open(fastaFile, "r")
+ header = inFile.readline()
+ while header != "":
+ seqArray = []
+ seqLen = 0
+ chromID = header.strip()[1:]
+ currentLine = inFile.readline()
+ while currentLine != "" and currentLine[0] != ">":
+ lineSeq = currentLine.strip()
+ seqLen += len(lineSeq)
+ seqArray.append(lineSeq)
+ currentLine = inFile.readline()
+
+ seq = string.join(seqArray, "")
+ print "Added sequence %s to database" % chromID
+ aGenome.addSequence((genomeName, chromID), seq, "chromosome", str(seqLen))
+ aGenome.addChromosomeEntry(chromID, chromID, "db")
+ aGenome.addGeneEntry((genomeName, chromID), chromID, 0, seqLen - 1, "F", "IMPORT", "1")
+ header = currentLine
+ seqName.append(chromID)
+
+ inFile.close()
+ aGenome.createIndices()
+ self.setGeneDB(geneDBPath)
+
+ return seqName
+
+
+ def loadGenepool(self):
+ self.genepool = {}
+ for geneTuple in self.getDataset():
+ genome = geneTuple[0]
+ geneList = geneTuple[1]
+ for gene in geneList:
+ try:
+ if len(gene) == 2:
+ tag = gene[0]
+ geneID = (genome, tag)
+ seq = gene[1]
+ # need to deal with masking lowercase
+ self.genepool[geneID] = seq.upper()
+ else:
+ geneID = (genome, gene)
+ # note that we will only keep one copy of a geneid in the genepool, even if we
+ # retrieve it multiple times.
+ self.genepool[geneID] = retrieveSeq(geneID, self.upstream, self.cds, self.downstream, self.geneDB, self.maskLowerCase, self.boundToNextGene)
+ except:
+ self.mlog("could not load %s" % (str(geneID)))
+
+ self.genepoolID = self.setSettingsID("genepool", self.genepool.keys())
+
+
+ def loadGeneFeatures(self):
+ stmt = "DELETE from sequenceFeatures where expID = '%s'" % self.experimentID
+ res = self.sqlexp(stmt, "commit")
+ stmtList = []
+ stmt = "INSERT into sequenceFeatures (ID, expID, seqGenome, seqID, featureType, start, stop, orientation) values (NULL, ?, ?, ?, ?, ?, ?, ?)"
+ for geneTuple in self.getDataset():
+ genome = geneTuple[0]
+ geneList = geneTuple[1]
+ for gene in geneList:
+ try:
+ geneID = (genome, gene)
+ results = retrieveSeqFeatures(geneID, self.upstream, self.cds, self.downstream, self.boundToNextGene, self.geneDB)
+ for entry in results:
+ (ftype, start, stop, orientation) = entry
+ stmtList.append((self.experimentID, genome, gene, ftype, start, stop, orientation))
+ except:
+ self.mlog("could not find features for %s" % (gene))
+
+ if len(stmtList) > 0:
+ self.batchsqlexp(stmt, stmtList)
+
+
+ def reload(self):
+ pass
+
+
+ def toFile(self, geneIDList, filename, geneDict=[]):
+ outFile = open(filename, "w")
+ for geneID in geneIDList:
+ if geneID in geneDict:
+ outFile.write(fasta(geneID, geneDict[geneID]))
+ elif geneID in self.genepool:
+ outFile.write(fasta(geneID, self.genepool[geneID]))
+ else:
+ self.mlog("could not write %s to file" % str(geneID))
+
+ outFile.close()
+
+
+ def createDataFile(self, datasetID=-1, geneIDList=[], geneDict=[]):
+ oldtempdir = tempfile.tempdir
+ tempfile.tempdir = self.workdir
+ dataFile = tempfile.mktemp()
+ tempfile.tempdir = oldtempdir
+ if datasetID < 0:
+ datasetID = self.genepoolID
+
+ if len(geneIDList) < 1:
+ settingsList = self.getSettingsID(datasetID)
+ geneIDList = eval(settingsList[1])
+
+ try:
+ self.toFile(geneIDList, dataFile, geneDict)
+ except:
+ self.mlog("could not create dataFile %s" % (dataFile))
+
+ return dataFile
+
+
+ def initialize(self, dataset=[], workdir=""):
+ if workdir != "":
+ self.setWorkdir(workdir)
+
+ self.createWorkdir()
+ if len(dataset) > 0:
+ self.appendDataset(dataset)
+ self.loadGenepool()
+ self.loadGeneFeatures()
+
+
+ def run(self):
+ if len(self.programs) == 0:
+ self.mlog("Must instantiate one or more programs first")
+ elif len(self.genepool) == 0:
+ self.mlog("Must have one or more valid sequences in the dataset")
+
+
+ def sqlexp(self, stmt, commit=""):
+ db = sqlite.connect(self.expFile, timeout=60)
+ sqlc = db.cursor()
+ if self.debugSQL:
+ print "sqlexp: %s" % stmt
+
+ sqlc.execute(stmt)
+ res = sqlc.fetchall()
+ if commit != "":
+ db.commit()
+
+ if stmt[0:6] == "INSERT":
+ res = sqlc.lastrowid
+
+ sqlc.close()
+ db.close()
+
+ return res
+
+
+ def batchsqlexp(self, stmt, batch):
+ """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
+ """
+ res = []
+ db = sqlite.connect(self.expFile, timeout=60)
+ sqlc = db.cursor()
+ if self.debugSQL:
+ print "batchsql: %s" % stmt
+ print "batchsql: %s" % str(batch)
+
+ sqlc.executemany(stmt, batch)
+ db.commit()
+ sqlc.close()
+ db.close()
+
+ return res
+
+
+ def setExternalStatus(self, status):
+ self.mlog("Setting status to %s" % status)
+ statfile = open("%s.status" % self.expFile, "w")
+ statfile.write(status)
+ statfile.close()
+
+
+ def resetExternalStatus(self):
+ try:
+ os.remove("%s.status" % self.expFile)
+ except:
+ pass
+
+
+ def startDB(self):
+ if not os.path.exists(self.expFile):
+ try:
+ db = sqlite.connect(self.expFile, timeout=60)
+ sql = db.cursor()
+ sql.execute("CREATE table experiment(ID INTEGER PRIMARY KEY, expID varchar, expType varchar, expStatus varchar, timestamp varchar)")
+ sql.execute("CREATE table dataset(ID INTEGER PRIMARY KEY, expID varchar, datasetGroup varchar, genome varchar, locus varchar, sequence varchar)")
+ sql.execute("CREATE table results(ID INTEGER PRIMARY KEY, expID varchar, resultsGroup varchar, mTagID varchar)")
+ sql.execute("CREATE table motifs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, motifSeq varchar, threshold varchar, info varchar)")
+ sql.execute("CREATE table motifPWMs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, position int, aFreq float, cFreq float, gFreq float, tFreq float)")
+ sql.execute("CREATE table motifSequences(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, sequence varchar, type varchar, location varchar)")
+ sql.execute("CREATE table settings(ID INTEGER PRIMARY KEY, expID varchar, settingName varchar, data varchar)")
+ sql.execute("CREATE table runs(ID INTEGER PRIMARY KEY, expID varchar, progName varchar, datasetGroup varchar, settingsID int, timestamp varchar, resultsGroup varchar)")
+ sql.execute("CREATE table expLog(ID INTEGER PRIMARY KEY, expID varchar, timestamp varchar, message varchar)")
+ sql.execute("CREATE table sequenceFeatures(ID INTEGER PRIMARY KEY, expID varchar, seqGenome varchar, seqID varchar, featureType varchar, start int, stop int, orientation varchar)")
+
+ sql.execute("CREATE INDEX settingIndex1 on settings(expID, settingName)")
+ sql.execute("CREATE INDEX datasetIndex1 on dataset(expID, datasetGroup)")
+ sql.execute("CREATE INDEX motifsIndex1 on motifs(expID, mTagID)")
+ sql.execute("CREATE INDEX motifPWMsIndex1 on motifPWMs(expID, mTagID)")
+ sql.execute("CREATE INDEX motifSequencesIndex1 on motifSequences(expID, mTagID)")
+ sql.execute("CREATE INDEX featuresIndex1 on sequenceFeatures(expID, seqGenome, seqID)")
+
+ sql.execute("INSERT INTO settings(ID, expID, settingName, data) values (NULL, '%s', 'experimentType', '%s')" % (self.experimentID, self.experimentType))
+
+ db.commit()
+ sql.close()
+ db.close()
+ self.mlog("Created experiment database %s" % self.expFile)
+ except:
+ self.mlog("Could not create experiment database %s" % self.expFile)
+ else:
+ self.mlog("Using existing experiment database %s" % self.expFile)
+
+ if self.settingsHasKey("loaded_programs"):
+ self.loadPrograms()
+
+ if self.settingsHasKey("seq_parameters"):
+ res = self.getSetting("seq_parameters")
+ (up, cds, down) = res[0].split("\t")
+ self.setSeqParameters(int(up), int(cds), int(down))
+ else:
+ self.setSeqParameters()
+
+ if self.settingsHasKey("maskLowerCase"):
+ res = self.getSetting("maskLowerCase")
+ self.setMaskLowerCase(res[0])
+ else:
+ self.setMaskLowerCase(False)
+
+ if self.settingsHasKey("boundToNextGene"):
+ res = self.getSetting("boundToNextGene")
+ self.setBoundToNextGene(res[0])
+ else:
+ self.setBoundToNextGene(False)
+
+ if self.settingsHasKey("experimentType"):
+ res = self.getSetting("experimentType")
+ self.experimentType = res[0]
+
+ if self.settingsHasKey("geneDB"):
+ res = self.getSetting("geneDB")
+ self.geneDB = res[0]
+
+ if self.dsetLength() > 0:
+ self.loadGenepool()
+
+ self.createWorkdir()
\ No newline at end of file