X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fexperiments%2Fexperiment.py;fp=cistematic%2Fexperiments%2Fexperiment.py;h=fbe728ac9e14cb916f5eeb3a9272c21d708c11e0;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/experiments/experiment.py b/cistematic/experiments/experiment.py new file mode 100644 index 0000000..fbe728a --- /dev/null +++ b/cistematic/experiments/experiment.py @@ -0,0 +1,931 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# python parent class for experiments +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import os, time, tempfile, string + +from cistematic.core import retrieveSeq, retrieveSeqFeatures, fasta +import cistematic.core +from cistematic.core.motif import matrixRow, Motif +from cistematic.genomes import Genome +from cistematic import version +from os import environ + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp + + +class Experiment: + """ genepool = {(genomeA, geneA1): seq1, (genomeA, geneA2): seq2, (genomeB, geneB1): seq3,...} + geneFeatures = ((genomeA, geneA1):[feature0, feature1, .....], (genomeB, geneB1):[feature0, feature1,...],....} + programs = [(program_object1, setting1), (program_object2, setting2), ....] + """ + genepool = {} + geneFeatures = {} + programs = [] + genepoolID = 0 + workdir = "" + experimentID = "" + experimentType = "generic" + expFile = "" + geneDB = "" + debugSQL = False + + + def __init__(self, expID, expDBFile, geneDBFile=""): + self.experimentID = expID + self.expFile = expDBFile + self.geneDB = geneDBFile + self.maskLowerCase = False + self.boundToNextGene = False + print "cistematic version %s" % version + self.startDB() + + + def __del__(self): + self.savePrograms() + self.saveGeneDB() + self.uncacheGeneDB() + self.removeWorkdir() + + + def setExperimentID(self, expID): + self.mlog("changing experiment ID from %s to %s" % (self.experimentID, expID)) + self.experimentID = expID + + + def setGeneDB(self, geneDBFile=""): + self.mlog("using %s as the gene database" % geneDBFile) + self.geneDB = geneDBFile + + + def cacheGeneDB(self, genome): + try: + cistematic.core.cacheGeneDB(genome) + self.mlog("cached genome %s" % genome) + except: + self.mlog("could not cache genome %s" % genome) + + + def uncacheGeneDB(self): + try: + cistematic.core.uncacheGeneDB() + except: + self.mlog("could not uncache genomes") + + + def setMaskLowerCase(self, maskValue): + if maskValue == True or maskValue == 1 or maskValue == "1": + self.maskLowerCase = True + self.setSettings("maskLowerCase", ["1"]) + else: + self.maskLowerCase = False + self.setSettings("maskLowerCase", ["0"]) + + + def setBoundToNextGene(self, boundValue): + if boundValue == True or boundValue == 1 or boundValue == "1": + self.boundToNextGene = True + self.setSettings("boundToNextGene", ["1"]) + else: + self.boundToNextGene = False + self.setSettings("boundToNextGene", ["0"]) + + + def setSeqParameters(self, up=0, cds=1, down=0): + self.upstream = up + self.cds = cds + self.downstream = down + if cds == 0: + cdsStatus = "NO " + else: + cdsStatus = "" + + self.setSettings("seq_parameters", ["%d\t%d\t%d" % (up, cds, down)]) + self.mlog("setting sequence retrieval parameters to %d bp upstream, %d bp downstream, and %scds" % (up, down, cdsStatus)) + + + def getSeqParameters(self): + return (self.upstream, self.cds, self.downstream) + + + def dsetLength(self): + stmt="SELECT count(*) from dataset where expID = '%s' " % self.experimentID + res = self.sqlexp(stmt) + try: + answer = int(res[0][0]) + except IndexError: + answer = 0 + + return answer + + + def resultsLength(self): + stmt="SELECT count(*) from results where expID = '%s' " % self.experimentID + res = self.sqlexp(stmt) + try: + answer = int(res[0][0]) + except IndexError: + answer = 0 + + return answer + + + def checkMotID(self, motID): + answer = False + stmt = "SELECT ID from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID) + res = self.sqlexp(stmt) + if len(res) > 0: + answer = True + + return answer + + + def findMotif(self,mTagID): + if self.checkMotID(mTagID): + return self.makeMotif(mTagID) + + self.mlog("could not find %s" % mTagID) + + return "" + + + def makeMotif(self, motID): + mPWM = [] + mseqs = [] + stmt = "SELECT motifSeq, threshold, info from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID) + res = self.sqlexp(stmt) + entry = res[0] + (seq, threshold, info) = entry + stmt = "SELECT aFreq, cFreq, gFreq, tFreq from motifPWMs where expID = '%s' and mTagID = '%s' order by position" % (self.experimentID, motID) + res = self.sqlexp(stmt) + for entry in res: + col = [0.0, 0.0, 0.0, 0.0] + (aFreq, cFreq, gFreq, tFreq) = entry + col[matrixRow["A"]] = aFreq + col[matrixRow["C"]] = cFreq + col[matrixRow["G"]] = gFreq + col[matrixRow["T"]] = tFreq + mPWM.append(col) + + stmt = "SELECT sequence from motifSequences where expID = '%s' and mTagID = '%s' and type = 'instance' " % (self.experimentID, motID) + res = self.sqlexp(stmt) + for entry in res: + seq = entry[0] + mseqs.append(seq) + + return Motif(motID, seq, mPWM, mseqs, threshold, info) + + + def saveMotif(self, mot): + motID = mot.tagID + motifSeq = mot.motifSeq + motifInfo = mot.info + motifThreshold = mot.threshold + if self.checkMotID(motID): + stmt = "DELETE from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID) + self.sqlexp(stmt, commit=True) + stmt = "DELETE from motifPWMs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID) + self.sqlexp(stmt, commit=True) + stmt = "DELETE from motifSequences where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID) + self.sqlexp(stmt, commit=True) + + stmtList = [] + values = "(NULL, '%s', '%s', '%s', %f, '%s')" % (self.experimentID, motID, motifSeq, motifThreshold, motifInfo) + stmt = "INSERT into motifs(ID, expID, mTagID, motifSeq, threshold, info) values %s " % values + self.sqlexp(stmt, commit=True) + pos = 0 + stmt = "INSERT into motifPWMs(ID, expID, mTagID, position, aFreq, cFreq, gFreq, tFreq) values (NULL, ?, ?, ?, ?, ?, ?, ?)" + for col in mot.motifPWM: + aFreq = round(col[matrixRow["A"]],4) + cFreq = round(col[matrixRow["C"]],4) + gFreq = round(col[matrixRow["G"]],4) + tFreq = abs(1.0 - aFreq - cFreq - gFreq) + stmtList.append((self.experimentID, motID, pos, aFreq, cFreq, gFreq, tFreq)) + pos += 1 + + self.batchsqlexp(stmt, stmtList) + if len(mot.sequences) > 0: + stmt = "INSERT into motifSequences(ID, expID, mTagID, sequence, type, location) values (NULL, ?, ?, ?, ?, ?) " + stmtList = [] + for seq in mot.sequences: + stmtList.append((self.experimentID, motID, seq, "instance", "-")) + + self.batchsqlexp(stmt, stmtList) + + + def exportMotifs(self, directory=".", prefix="", suffix="mot"): + stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + if prefix == "": + prefix = self.experimentID + + for entry in res: + motID = entry[0] + mot = self.makeMotif(motID) + fileName = "%s/%s-%s.%s" % (directory, prefix, motID, suffix) + self.mlog("exporting %s as %s" % (motID, fileName)) + mot.saveMotif(fileName) + + + def exportLogos(self, directory=".", prefix=""): + stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + if prefix == "": + prefix = self.experimentID + for entry in res: + motID = entry[0] + mot = self.makeMotif(motID) + fileName = "%s/%s-%s" % (directory, prefix, motID) + self.mlog("saving logo for %s as %s.png" % (motID, fileName)) + mot.saveLogo(fileName) + + + def appendResults(self, mot, resultsGroup="-"): + motID = mot.tagID + self.saveMotif(mot) + stmt = "INSERT into results(ID, expID, resultsGroup, mTagID) values (NULL, '%s', '%s', '%s') " % (self.experimentID, resultsGroup, motID) + self.sqlexp(stmt, "commit") + + + def getGeneDB(self, geneDBFile=""): + return self.geneDB + + + def getLog(self): + answer = [] + stmt = "SELECT timestamp, message from expLog where expID = '%s' order by timestamp" % self.experimentID + res = self.sqlexp(stmt) + for entry in res: + (timestamp, message) = entry + answer.append((eval(timestamp), message)) + + return answer + + + def getResults(self): + answer = [] + stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + motID = entry[0] + answer.append(self.makeMotif(motID)) + + return answer + + + def getSettings(self): + answer = {} + stmt = "SELECT settingName, data from settings where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (settingName, data) = entry + if settingName not in answer.keys(): + answer[settingName] = [] + + answer[settingName].append(data) + + return answer + + + def getSetting(self, settingName): + answer = [] + stmt = "SELECT data from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName) + res = self.sqlexp(stmt) + for entry in res: + answer.append(entry[0]) + + return answer + + + def settingsHasKey(self, thekey): + answer = False + stmt = "SELECT distinct ID from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, thekey) + res = self.sqlexp(stmt) + if len(res) > 0: + answer = True + + return answer + + + def setSettings(self, settingName, settingList): + """ insert or replace (i.e. delete previous entry) a setting with one or more setting data. + """ + # delete existing, insert new + try: + stmt = "DELETE from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName) + res = self.sqlexp(stmt, "commit") + except: + pass + + stmtList = [] + stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, ?, ?, ?)" + for entry in settingList: + stmtList.append((self.experimentID, settingName, entry)) + + res = self.batchsqlexp(stmt, stmtList) + + return settingName + + + def setSettingsID(self, settingName, data): + """ return the settingsID for the inserted settingName:data pair in the settings table. + """ + stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, '%s', '%s', \"%s\")" % (self.experimentID, settingName, data) + res = self.sqlexp(stmt, "commit") + + return res + + + def getSettingsID(self, settingID): + """ get a setting by settingsID in the settings table. + """ + answer = "" + stmt = "SELECT settingName, data from settings where expID = '%s' and ID = %d" % (self.experimentID, int(settingID)) + res = self.sqlexp(stmt) + try: + (name, data) = res[0] + answer = (name, data) + except: + pass + + return answer + + + def setRun(self, progName, datasetID, settingsID): + values = "(NULL, '%s', '%s', '%s', %d, '%s', '%s')" % (self.experimentID, progName, datasetID, settingsID, time.localtime(), "-") + stmt = "INSERT into runs (ID, expID, progName, datasetGroup, settingsID, timestamp, resultsGroup) values %s" % values + runID = self.sqlexp(stmt, "commit") + self.mlog("run %s: program %s with settings %d and dataset %s" % (runID, progName, settingsID, datasetID)) + + return int(runID) + + + def getRun(self, rID): + res = "" + stmt = "SELECT progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' and ID = %d" % (self.experimentID, rID) + res = self.sqlexp(stmt) + (progName, datasetID, settingsID, timestamp, resultsID) = res[0] + if resultsID == "-": + resultsID = [] + if datasetID != "chromolist": + datasetID = int(datasetID) + + return (progName, datasetID, int(settingsID), timestamp, resultsID) + + + def getRunsByProg(self, prog): + runs = self.getRuns() + matchingRuns = [] + for entry in runs: + if (runs[entry][0] == prog): + matchingRuns.append(runs[entry]) + + return matchingRuns + + + def getRuns(self): + runs = {} + res = '' + stmt = "SELECT ID, progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (ID, progName, datasetID, settingsID, timestamp, resultsID) = entry + runs[int(ID)] = (progName, datasetID, settingsID, eval(timestamp), resultsID) + + return runs + + + def appendDataset(self, dset): + stmtList = [] + stmt = "INSERT into dataset(ID, genome, locus, sequence, expID) values (NULL, ?, ?, ?, ?) " + for (genome, entries) in dset: + for entry in entries: + if len(entry) == 2: + (locus, sequence) = entry + else: + locus = entry + sequence = "-" + + stmtList.append((genome, locus, sequence, self.experimentID)) + + res = self.batchsqlexp(stmt, stmtList) + + + def getDataset(self): + dset = [] + stmt = "SELECT genome, locus, sequence from dataset where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (genome, locus, sequence) = entry + if sequence == "-": + dset.append((genome, [locus])) + else: + dset.append((genome, [locus, sequence])) + + return dset + + + def getDatasetNames(self): + dset = [] + stmt = "select settingName from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' " + res = self.sqlexp(stmt) + for entry in res: + dset.append(entry) + + return dset + + + def getDatasetIDs(self): + dset = [] + stmt = "select ID from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' " + res = self.sqlexp(stmt) + for entry in res: + dset.append(entry) + + return dset + + + def getFeatures(self, geneID): + results = [] + try: + res = [] + stmt = "select featureType, start, stop, orientation from sequenceFeatures where seqGenome = '%s' and seqID = '%s' and expID = '%s' " % (geneID[0], geneID[1], self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (ftype, start, stop, orientation) = entry + results.append((ftype, start, stop, orientation)) + except: + pass + + results.sort() + + return results + + + def absoluteLocation(self, match, featureLength, seqparams="", gidCoordinates="", customDB=""): + """ Returns the absolute location of the start of a match = ((genome, gID), (pos, sense)), given a relative + location with respect to the gene and a feature length. + Can be passed cached sequence parameters (up, cds, down) and geneEntry tuple to avoid hitting database. + """ + result = ["", 0, "F"] + (geneID, loc) = match + (pos, sense) = loc + pos = int(pos) + if seqparams == "": + seqparams = self.getSeqParameters() + + (up, cds, down) = seqparams + if gidCoordinates == "": + gidCoordinates = cistematic.core.geneEntry(geneID) + + (gidChrom, gidStart, gidStop, gidLength, gidSense) = gidCoordinates + result[0] = gidChrom + if self.boundToNextGene: + up = cistematic.core.upstreamToNextGene(geneID, up, db=customDB) + down = cistematic.core.downstreamToNextGene(geneID, down, db=customDB) + + if gidSense == "F": + result[2] = sense + if pos < up or cds > 0: + result[1] = gidStart - up + pos + else: + result[1] = gidStop - up + pos + else: + if gidStart > gidStop: + temp = gidStart + gidStart = gidStop + gidStop = temp + + if sense == "F": + result[2] = "R" + + if pos < up or cds > 0: + result[1] = gidStop + up - pos - featureLength + else: + result[1] = gidStart + up - pos - featureLength + + return result + + + def setWorkdir(self, wdir=""): + self.workdir = wdir + self.mlog("changed workdir to %s" % (wdir)) + + + def resetDataset(self): + stmt = "DELETE from dataset where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + self.mlog("log reset") + + + def resetLog(self): + stmt = "DELETE from expLog where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + self.mlog("log reset") + + + def resetResults(self): + stmt = "DELETE from results where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + self.mlog("results reset") + + + def resetRuns(self): + stmt = "DELETE from expLog where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + self.mlog("runs reset") + + + def resetPrograms(self): + self.programs = [] + self.mlog("programs reset") + + + def resetSettings(self): + stmt = "DELETE from expLog where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + self.log("settings reset") + + + def appendProgram(self, program): + self.programs.append((program, self.setSettingsID(program.name(), program.getSettings()))) + self.mlog("adding program %s" % program.name()) + + + def removeProgram(self, progName): + for index in range(len(self.programs)): + if self.programs[index][0].name() == progName: + del self.programs[index] + self.mlog("removed program %s" % progName) + + + def createWorkdir(self): + if self.workdir == "": + self.workdir = tempfile.mktemp() + + try: + os.mkdir(self.workdir) + self.mlog("created workdir: %s" % (self.workdir)) + except: + pass + + + def removeWorkdir(self): + try: + filenames = os.listdir(self.workdir) + for entry in filenames: + os.remove("%s/%s" % (self.workdir, entry)) + os.rmdir(self.workdir) + except: + self.mlog("could not delete workdir: %s" % (self.workdir)) + + + def mlog(self,msg): + print msg + stmt = "INSERT into expLog(ID, expID, timestamp, message) values (NULL, '%s', '%s', '%s') " % (self.experimentID, time.localtime(),string.replace(msg, "'", '"')) + self.sqlexp(stmt, "commit") + + + def logToString(self): + response = "" + for line in self.getLog(): + response += time.asctime(line[0]) + ": " + line[1] + "\n" + + return response + + + def printLog(self): + for line in self.getLog(): + print "%s: %s" % (time.asctime(line[0]), line[1]) + + + def tailLog(self): + theLog = self.getLog() + for line in theLog[-10:]: + print "%s: %s" % (time.asctime(line[0]), line[1]) + + + def loadPrograms(self): + self.programs = [] + for progs in self.getSetting("loaded_programs"): + (progs0, progs1, progs2) = progs.split("\t") + execString = "from %s import %s" % (progs0, progs1) + exec execString + execString = 'self.programs.append((apply(%s), %s))' % (progs1, progs2) + exec execString + + + def savePrograms(self): + progs = [] + for (program, settingID) in self.programs: + progs.append("%s\t%s\t%s" % (program.__class__.__module__, program.__class__.__name__, settingID)) + + if len(progs) > 0: + self.setSettings("loaded_programs", progs) + + def saveGeneDB(self): + if self.geneDB != "": + self.setSettings("geneDB", [self.geneDB]) + + + def loadFasta(self, fastaFile, genomeName): + """ load fasta file into the dataset. + """ + genIDList = self.loadFastaFromFile(fastaFile, genomeName) + + return [(genomeName, genIDList)] + + + def loadFastaFromFile(self, fastaFile, genomeName): + seqArray = [] + seqLen = 0 + seqName = [] + geneDBPath = "%s.genedb" % genomeName + self.mlog("Loading fasta file %s into database %s" % (genomeName, geneDBPath)) + aGenome = Genome(genomeName, dbFile=geneDBPath) + aGenome.createGeneDB() + inFile = open(fastaFile, "r") + header = inFile.readline() + while header != "": + seqArray = [] + seqLen = 0 + chromID = header.strip()[1:] + currentLine = inFile.readline() + while currentLine != "" and currentLine[0] != ">": + lineSeq = currentLine.strip() + seqLen += len(lineSeq) + seqArray.append(lineSeq) + currentLine = inFile.readline() + + seq = string.join(seqArray, "") + print "Added sequence %s to database" % chromID + aGenome.addSequence((genomeName, chromID), seq, "chromosome", str(seqLen)) + aGenome.addChromosomeEntry(chromID, chromID, "db") + aGenome.addGeneEntry((genomeName, chromID), chromID, 0, seqLen - 1, "F", "IMPORT", "1") + header = currentLine + seqName.append(chromID) + + inFile.close() + aGenome.createIndices() + self.setGeneDB(geneDBPath) + + return seqName + + + def loadGenepool(self): + self.genepool = {} + for geneTuple in self.getDataset(): + genome = geneTuple[0] + geneList = geneTuple[1] + for gene in geneList: + try: + if len(gene) == 2: + tag = gene[0] + geneID = (genome, tag) + seq = gene[1] + # need to deal with masking lowercase + self.genepool[geneID] = seq.upper() + else: + geneID = (genome, gene) + # note that we will only keep one copy of a geneid in the genepool, even if we + # retrieve it multiple times. + self.genepool[geneID] = retrieveSeq(geneID, self.upstream, self.cds, self.downstream, self.geneDB, self.maskLowerCase, self.boundToNextGene) + except: + self.mlog("could not load %s" % (str(geneID))) + + self.genepoolID = self.setSettingsID("genepool", self.genepool.keys()) + + + def loadGeneFeatures(self): + stmt = "DELETE from sequenceFeatures where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + stmtList = [] + stmt = "INSERT into sequenceFeatures (ID, expID, seqGenome, seqID, featureType, start, stop, orientation) values (NULL, ?, ?, ?, ?, ?, ?, ?)" + for geneTuple in self.getDataset(): + genome = geneTuple[0] + geneList = geneTuple[1] + for gene in geneList: + try: + geneID = (genome, gene) + results = retrieveSeqFeatures(geneID, self.upstream, self.cds, self.downstream, self.boundToNextGene, self.geneDB) + for entry in results: + (ftype, start, stop, orientation) = entry + stmtList.append((self.experimentID, genome, gene, ftype, start, stop, orientation)) + except: + self.mlog("could not find features for %s" % (gene)) + + if len(stmtList) > 0: + self.batchsqlexp(stmt, stmtList) + + + def reload(self): + pass + + + def toFile(self, geneIDList, filename, geneDict=[]): + outFile = open(filename, "w") + for geneID in geneIDList: + if geneID in geneDict: + outFile.write(fasta(geneID, geneDict[geneID])) + elif geneID in self.genepool: + outFile.write(fasta(geneID, self.genepool[geneID])) + else: + self.mlog("could not write %s to file" % str(geneID)) + + outFile.close() + + + def createDataFile(self, datasetID=-1, geneIDList=[], geneDict=[]): + oldtempdir = tempfile.tempdir + tempfile.tempdir = self.workdir + dataFile = tempfile.mktemp() + tempfile.tempdir = oldtempdir + if datasetID < 0: + datasetID = self.genepoolID + + if len(geneIDList) < 1: + settingsList = self.getSettingsID(datasetID) + geneIDList = eval(settingsList[1]) + + try: + self.toFile(geneIDList, dataFile, geneDict) + except: + self.mlog("could not create dataFile %s" % (dataFile)) + + return dataFile + + + def initialize(self, dataset=[], workdir=""): + if workdir != "": + self.setWorkdir(workdir) + + self.createWorkdir() + if len(dataset) > 0: + self.appendDataset(dataset) + self.loadGenepool() + self.loadGeneFeatures() + + + def run(self): + if len(self.programs) == 0: + self.mlog("Must instantiate one or more programs first") + elif len(self.genepool) == 0: + self.mlog("Must have one or more valid sequences in the dataset") + + + def sqlexp(self, stmt, commit=""): + db = sqlite.connect(self.expFile, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "sqlexp: %s" % stmt + + sqlc.execute(stmt) + res = sqlc.fetchall() + if commit != "": + db.commit() + + if stmt[0:6] == "INSERT": + res = sqlc.lastrowid + + sqlc.close() + db.close() + + return res + + + def batchsqlexp(self, stmt, batch): + """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit. + """ + res = [] + db = sqlite.connect(self.expFile, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "batchsql: %s" % stmt + print "batchsql: %s" % str(batch) + + sqlc.executemany(stmt, batch) + db.commit() + sqlc.close() + db.close() + + return res + + + def setExternalStatus(self, status): + self.mlog("Setting status to %s" % status) + statfile = open("%s.status" % self.expFile, "w") + statfile.write(status) + statfile.close() + + + def resetExternalStatus(self): + try: + os.remove("%s.status" % self.expFile) + except: + pass + + + def startDB(self): + if not os.path.exists(self.expFile): + try: + db = sqlite.connect(self.expFile, timeout=60) + sql = db.cursor() + sql.execute("CREATE table experiment(ID INTEGER PRIMARY KEY, expID varchar, expType varchar, expStatus varchar, timestamp varchar)") + sql.execute("CREATE table dataset(ID INTEGER PRIMARY KEY, expID varchar, datasetGroup varchar, genome varchar, locus varchar, sequence varchar)") + sql.execute("CREATE table results(ID INTEGER PRIMARY KEY, expID varchar, resultsGroup varchar, mTagID varchar)") + sql.execute("CREATE table motifs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, motifSeq varchar, threshold varchar, info varchar)") + sql.execute("CREATE table motifPWMs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, position int, aFreq float, cFreq float, gFreq float, tFreq float)") + sql.execute("CREATE table motifSequences(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, sequence varchar, type varchar, location varchar)") + sql.execute("CREATE table settings(ID INTEGER PRIMARY KEY, expID varchar, settingName varchar, data varchar)") + sql.execute("CREATE table runs(ID INTEGER PRIMARY KEY, expID varchar, progName varchar, datasetGroup varchar, settingsID int, timestamp varchar, resultsGroup varchar)") + sql.execute("CREATE table expLog(ID INTEGER PRIMARY KEY, expID varchar, timestamp varchar, message varchar)") + sql.execute("CREATE table sequenceFeatures(ID INTEGER PRIMARY KEY, expID varchar, seqGenome varchar, seqID varchar, featureType varchar, start int, stop int, orientation varchar)") + + sql.execute("CREATE INDEX settingIndex1 on settings(expID, settingName)") + sql.execute("CREATE INDEX datasetIndex1 on dataset(expID, datasetGroup)") + sql.execute("CREATE INDEX motifsIndex1 on motifs(expID, mTagID)") + sql.execute("CREATE INDEX motifPWMsIndex1 on motifPWMs(expID, mTagID)") + sql.execute("CREATE INDEX motifSequencesIndex1 on motifSequences(expID, mTagID)") + sql.execute("CREATE INDEX featuresIndex1 on sequenceFeatures(expID, seqGenome, seqID)") + + sql.execute("INSERT INTO settings(ID, expID, settingName, data) values (NULL, '%s', 'experimentType', '%s')" % (self.experimentID, self.experimentType)) + + db.commit() + sql.close() + db.close() + self.mlog("Created experiment database %s" % self.expFile) + except: + self.mlog("Could not create experiment database %s" % self.expFile) + else: + self.mlog("Using existing experiment database %s" % self.expFile) + + if self.settingsHasKey("loaded_programs"): + self.loadPrograms() + + if self.settingsHasKey("seq_parameters"): + res = self.getSetting("seq_parameters") + (up, cds, down) = res[0].split("\t") + self.setSeqParameters(int(up), int(cds), int(down)) + else: + self.setSeqParameters() + + if self.settingsHasKey("maskLowerCase"): + res = self.getSetting("maskLowerCase") + self.setMaskLowerCase(res[0]) + else: + self.setMaskLowerCase(False) + + if self.settingsHasKey("boundToNextGene"): + res = self.getSetting("boundToNextGene") + self.setBoundToNextGene(res[0]) + else: + self.setBoundToNextGene(False) + + if self.settingsHasKey("experimentType"): + res = self.getSetting("experimentType") + self.experimentType = res[0] + + if self.settingsHasKey("geneDB"): + res = self.getSetting("geneDB") + self.geneDB = res[0] + + if self.dsetLength() > 0: + self.loadGenepool() + + self.createWorkdir() \ No newline at end of file