From: Sean Upchurch Date: Tue, 30 Nov 2010 22:55:15 +0000 (-0800) Subject: erange 4.0a dev release with integrated cistematic X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=commitdiff_plain;h=bc30aca13e5ec397c92e67002fbf7a103130b828 erange 4.0a dev release with integrated cistematic --- diff --git a/cistematic/__init__.py b/cistematic/__init__.py new file mode 100644 index 0000000..72ba39b --- /dev/null +++ b/cistematic/__init__.py @@ -0,0 +1,30 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +version = '3.0' \ No newline at end of file diff --git a/cistematic/cisstat/__init__.py b/cistematic/cisstat/__init__.py new file mode 100644 index 0000000..a968dd5 --- /dev/null +++ b/cistematic/cisstat/__init__.py @@ -0,0 +1,29 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# diff --git a/cistematic/cisstat/analyzego.py b/cistematic/cisstat/analyzego.py new file mode 100644 index 0000000..10ebe31 --- /dev/null +++ b/cistematic/cisstat/analyzego.py @@ -0,0 +1,303 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# get Gene Ontology Information and distribution

import cistematic.core
from cistematic.core.geneinfo import geneinfoDB
from cistematic.cisstat.score import pvalue
from cistematic.cisstat.helper import hyperGeometric
from math import sqrt
from random import shuffle
import string


def _lookupGeneAnnotation(idb, gID):
    """ returns a (geneSymbol, synonyms, description) tuple of strings for gID.

        The gene is first looked up directly through idb. If any step of that
        lookup fails, the symbol falls back to str(gID) and, for two-element
        IDs, a second lookup is attempted through idb.getGeneID(gID[0], gID[1]).
        Failures of the second lookup are ignored on purpose: annotation is
        best-effort and must never abort the report.
    """
    geneSymbol = ""
    outsyn = ""
    outdesc = ""
    geneInfo = ""
    try:
        geneInfo = idb.getGeneInfo(gID)
        geneSymbol = geneInfo[0]
        synonyms = idb.geneIDSynonyms(gID)
        description = idb.getDescription(gID)
        outsyn = " ".join(synonyms)
        outdesc = ";".join(description)
    except Exception:
        geneSymbol = str(gID)
        try:
            if len(gID) == 2:
                newGeneID = idb.getGeneID(gID[0], gID[1])
                if len(newGeneID) > 1:
                    geneInfo = idb.getGeneInfo(newGeneID)

                if len(geneInfo) > 0:
                    geneSymbol = gID[1]
                    synonyms = idb.geneIDSynonyms(newGeneID)
                    description = idb.getDescription(newGeneID)
                    outsyn = " ".join(synonyms)
                    outdesc = ";".join(description)
        except Exception:
            pass

    return (geneSymbol, outsyn, outdesc)


def _hypergeometricMoments(numGenes, possible, N):
    """ returns (mean, standard deviation) of the number of successes when
        drawing numGenes genes without replacement from a population of N
        genes of which possible are successes. N is expected to be a float.
        Degenerate populations (N <= 1) yield a standard deviation of 0.0
        instead of a division by zero.
    """
    if N <= 0:
        return (0.0, 0.0)

    expectedCount = (numGenes * possible) / N
    if N <= 1:
        return (expectedCount, 0.0)

    variance = numGenes * (possible / N) * ((N - possible) / N) * (N - numGenes) / (N - 1.0)
    # rounding (or a sample larger than the population) must not reach sqrt() as a negative
    return (expectedCount, sqrt(max(variance, 0.0)))


def calculateGOStats(geneIDList, fileprefix, excludeIDList=None, roundsRandomization=0, sigLevel=0.01):
    """ calculates GO enrichment (and depletion) statistics for a given list of
        geneIDs (assumed to be all from the same genome) using either the
        hypergeometric distribution or random sampling for roundsRandomization
        rounds if greater than 0. Specific geneID's can be excluded from the
        calculations if included in excludeIDList.

        Results are saved in files with the given fileprefix. GO Terms that are
        larger than 15 genes and that have pvalues < sigLevel / (#num GO categories)
        are reported in fileprefix.gosig, whereas genes that matches the GO term are
        listed in fileprefix.ZZZZZZZ where the Z's are the GOID.

        Other files written: fileprefix.gostat (one line per GO term),
        fileprefix.gozscore (the same lines sorted by descending score) and,
        when roundsRandomization > 0, fileprefix.goexp (sampled mean and
        standard deviation per GO term).

        In hypergeometric mode the score column holds the raw difference
        (count - expected count) and the p-value is whatever
        hyperGeometric(populationSize, possible, numGenes, count) returns.

        The genome is taken from the first entry of geneIDList. Nothing is
        returned. If geneIDList is empty the (empty) output files are closed
        and the function returns immediately.
    """
    if excludeIDList is None:
        excludeIDList = []

    print("calculateGOStats: %s" % fileprefix)
    goexpfile = None
    if roundsRandomization > 0:
        goexpfile = open("%s.goexp" % fileprefix, "w")

    gostatfile = open("%s.gostat" % fileprefix, "w")
    gozscorefile = open("%s.gozscore" % fileprefix, "w")
    gosigfile = open("%s.gosig" % fileprefix, "w")

    if len(geneIDList) < 1:
        print("Need at the very least one gene!")
        # the files have already been created: release the handles before leaving
        for outfile in (goexpfile, gostatfile, gozscorefile, gosigfile):
            if outfile is not None:
                outfile.close()

        return

    firstgid = geneIDList[0]
    genome = firstgid[0]
    cistematic.core.cacheGeneDB(genome)
    idb = geneinfoDB(cache=True)
    goBin = {}
    goPossible = {}
    found = {}
    locusList = []
    excludeLocusList = []
    excludeGOBin = {}
    zList = []
    sigList = []
    print("Getting GO list")
    goDesc = cistematic.core.allGOTerms(genome)
    goInfo = cistematic.core.getAllGOInfo(genome)
    print("len(goDesc) = %d" % len(goDesc))
    for GOID in goDesc:
        goBin[GOID] = 0
        goPossible[GOID] = cistematic.core.getGOIDCount(genome, GOID)

    # unique, non-excluded input genes in their original order
    for entry in geneIDList:
        if entry not in excludeIDList and entry not in locusList:
            locusList.append(entry)

    # count, per GO term, how many excluded genes carry it.
    # The loop variable is deliberately not called "genome" so that the genome
    # of the analysis (taken from the first gene) is not overwritten.
    for (excludeGenome, gID) in excludeIDList:
        if gID not in excludeLocusList:
            excludeLocusList.append(gID)
            try:
                excludeGOTerms = goInfo[(excludeGenome, gID)]
            except Exception:
                continue

            for entry in excludeGOTerms:
                GOID = entry.split("\t")[0]
                excludeGOBin[GOID] = excludeGOBin.get(GOID, 0) + 1

    print("Getting GO list for locusList")
    for locus in locusList:
        try:
            locusGOTerms = goInfo[locus]
        except Exception:
            continue

        for entry in locusGOTerms:
            GOID = entry.split("\t")[0]
            if GOID not in found:
                found[GOID] = []

            if locus not in found[GOID]:
                found[GOID].append(locus)
                goBin[GOID] += 1

    numGenes = len(locusList)
    goSize = {}
    print("Arranging by size")
    print("numGenes = %s" % str(numGenes))
    for GOID in goBin:
        goLen = goBin[GOID]
        if goLen not in goSize:
            goSize[goLen] = []

        goSize[goLen].append(GOID)

    # GO terms ordered from most to fewest matching input genes
    goList = []
    for goLen in sorted(goSize, reverse=True):
        goList.extend(goSize[goLen])

    try:
        theGIDs = cistematic.core.getGenomeGeneIDs(genome)
    except Exception:
        print("could not get gene entries for %s" % genome)
        # carry on with an empty population instead of failing on an unbound name
        theGIDs = []

    allGIDs = [aGID for aGID in theGIDs if aGID not in excludeLocusList]

    gokeys = list(goBin)
    mean = {}
    standardDev = {}
    if roundsRandomization > 0:
        print("Get Random sampling")
        sample = {}
        for sampleNum in range(roundsRandomization):
            print("Round %d" % (sampleNum + 1))
            roundCounts = {}
            for GOID in gokeys:
                roundCounts[GOID] = 0

            shuffle(allGIDs)
            for gid in allGIDs[:numGenes]:
                try:
                    goarray = goInfo[(genome, gid)]
                except Exception:
                    continue

                for entry in goarray:
                    GOID = entry.split("\t")[0]
                    # dictionary lookup instead of scanning the key list
                    if GOID in roundCounts:
                        roundCounts[GOID] += 1

            sample[sampleNum] = roundCounts

        print("Calculating stats")
        for GOID in gokeys:
            total = 0
            for sampleNum in range(roundsRandomization):
                total += sample[sampleNum][GOID]

            mean[GOID] = float(total) / float(roundsRandomization)
            sumofsquares = 0
            for sampleNum in range(roundsRandomization):
                deviation = sample[sampleNum][GOID] - mean[GOID]
                sumofsquares += deviation * deviation

            # a single round has no sample variance; report 0 rather than divide by zero
            if roundsRandomization > 1:
                standardDev[GOID] = sqrt(sumofsquares / float(roundsRandomization - 1))
            else:
                standardDev[GOID] = 0.0

            goexpfile.write("%s\t%f\t%f\n" % (GOID, mean[GOID], standardDev[GOID]))

        goexpfile.close()
    else:
        # Use hypergeometric
        N = float(len(allGIDs))
        for GOID in gokeys:
            possible = goPossible[GOID] - excludeGOBin.get(GOID, 0)
            (mean[GOID], standardDev[GOID]) = _hypergeometricMoments(numGenes, possible, N)

    print("Writing out gostat")
    for GOID in goList:
        count = goBin[GOID]
        possible = goPossible[GOID] - excludeGOBin.get(GOID, 0)

        try:
            divisor = float(standardDev[GOID])
            if divisor > 0:
                zscore = float(count - mean[GOID]) / divisor
            else:
                zscore = 0.0

            if possible > 0:
                percentage = float(count) * 100.0 / float(possible)
            else:
                percentage = 0

            pval = 1.0
            if roundsRandomization > 0:
                if zscore < 0.0:
                    pval = pvalue(-1.0 * zscore)
                else:
                    pval = pvalue(zscore)
            else:
                # hypergeometric mode reports the raw difference, not a scaled z-score
                zscore = count - mean[GOID]
                pval = hyperGeometric(len(allGIDs), possible, numGenes, count)

            status = "enriched"
            if (count - mean[GOID]) < 0:
                status = "depleted"

            gostatfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % (GOID, count, possible, percentage, zscore, pval, goDesc[GOID]))
            zList.append((zscore, (GOID, count, possible, percentage, zscore, pval, goDesc[GOID])))
            # Bonferroni-style threshold: sigLevel divided by the number of GO terms
            if len(goList) > 0 and pval < (sigLevel / float(len(goList))) and possible > 15 and count > 1:
                sigList.append((pval, (GOID, count, possible, percentage, pval, status, goDesc[GOID])))
        except Exception:
            # statistics could not be computed for this term: still record its counts
            gostatfile.write("%s\t%d out of %d\t \t%s\n" % (GOID, count, possible, goDesc[GOID]))

    gostatfile.close()
    print("writing gozscore")
    zList.sort()
    zList.reverse()
    for (zscore, entry) in zList:
        gozscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\t%s\n" % entry)

    gozscorefile.close()
    print("writing gosig")
    sigList.sort()
    annotCache = {}
    for (pval, entry) in sigList:
        GOID = entry[0]
        gosigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\t%s\n" % entry)
        # GOID[3:] drops the first three characters of the ID for the file suffix
        goidsigfile = open("%s.%s" % (fileprefix, GOID[3:]), "w")
        if GOID not in found:
            print("could not find %s" % GOID)
            goidsigfile.close()
            continue

        print("%s\t%d genes" % (GOID, len(found[GOID])))
        goidsigfileList = []
        for gID in found[GOID]:
            if gID not in annotCache:
                annotCache[gID] = _lookupGeneAnnotation(idb, gID)

            goidsigfileList.append("%s\t%s\t%s\n" % annotCache[gID])

        goidsigfileList.sort()
        for line in goidsigfileList:
            goidsigfile.write(line)

        goidsigfile.close()

    gosigfile.close()
    cistematic.core.uncacheGeneDB(genome)
def chiSquare(A, B, C, D):
    """ Chi-square for a 2x2 table, as described in Gopal Kanji's 100 Statistical Tests
        note that we only have one degree of freedom, work best if no cell is less than 3.
        Table has form:
        A       |       B       | A + B
        -----------------------------
        C       |       D       | C + D
        -----------------------------
        A + C   |     B + D     | A + B + C + D

        The statistic is (N - 1) * (A*D - B*C)^2 divided by the product of
        the four marginal totals, computed in integer arithmetic and
        truncated to a whole number (as the original implementation did).

        Returns 0 when any row or column total is zero, since the statistic
        is undefined for such a table and the division would otherwise fail.
    """
    rowTop = int(A + B)
    rowBottom = int(C + D)
    colLeft = int(A + C)
    colRight = int(B + D)
    mainDiagonal = int(A * D)
    offDiagonal = int(B * C)
    total = rowTop + rowBottom

    denominator = rowTop * colLeft * colRight * rowBottom
    if denominator == 0:
        # an empty row or column: no association can be measured
        return 0

    numerator = (total - 1) * (mainDiagonal - offDiagonal) ** 2

    # floor division keeps the whole-number result on both Python 2 and Python 3
    return numerator // denominator
# get Gene Enrichment Information and distribution

import cistematic.core
from cistematic.core.geneinfo import geneinfoDB
from cistematic.cisstat.helper import hyperGeometric
from math import sqrt
import string


def _lookupGeneAnnotation(idb, gID):
    """ returns a (geneSymbol, synonyms, description) tuple of strings for gID.

        The gene is first looked up directly through idb. If any step of that
        lookup fails, the symbol falls back to str(gID) and, for two-element
        IDs, a second lookup is attempted through idb.getGeneID(gID[0], gID[1]).
        Failures of the second lookup are ignored on purpose: annotation is
        best-effort and must never abort the report.
    """
    geneSymbol = ""
    outsyn = ""
    outdesc = ""
    geneInfo = ""
    try:
        geneInfo = idb.getGeneInfo(gID)
        geneSymbol = geneInfo[0]
        synonyms = idb.geneIDSynonyms(gID)
        description = idb.getDescription(gID)
        outsyn = " ".join(synonyms)
        outdesc = ";".join(description)
    except Exception:
        geneSymbol = str(gID)
        try:
            if len(gID) == 2:
                newGeneID = idb.getGeneID(gID[0], gID[1])
                if len(newGeneID) > 1:
                    geneInfo = idb.getGeneInfo(newGeneID)

                if len(geneInfo) > 0:
                    geneSymbol = gID[1]
                    synonyms = idb.geneIDSynonyms(newGeneID)
                    description = idb.getDescription(newGeneID)
                    outsyn = " ".join(synonyms)
                    outdesc = ";".join(description)
        except Exception:
            pass

    return (geneSymbol, outsyn, outdesc)


def calculateEnrichmentStats(geneIDList, dataDict, fileprefix, excludeIDList=None, sigLevel=0.01, minPossible=10):
    """ calculates Data enrichment (and depletion) statistics for a given list of
        geneIDs (assumed to be all from the same genome) using the
        hypergeometric distribution. Specific geneID's can be excluded from the
        calculations if included in excludeIDList.

        dataDict is a dictionary of conditions to respective lists of (genome, gID)'s.

        Results are saved in files with the given fileprefix. Conditions that are larger
        than minPossible genes and that have pvalues < sigLevel / (# conditions)
        are reported in fileprefix.sig, whereas genes that matches condition are
        listed in fileprefix.ZZZZZZZ where the Z's are the condition
        (with spaces replaced by underscores).

        Other files written: fileprefix.stat (one line per condition) and
        fileprefix.zscore (the same lines sorted by descending score). The
        score column holds the raw difference (count - expected count) and
        the p-value is whatever
        hyperGeometric(populationSize, possible, numGenes, count) returns.

        The genome is taken from the first entry of geneIDList. Nothing is
        returned. If geneIDList has fewer than two entries the (empty) output
        files are closed and the function returns immediately.
    """
    if excludeIDList is None:
        excludeIDList = []

    print("calculateEnrichmentStats: %s" % fileprefix)
    statfile = open("%s.stat" % fileprefix, "w")
    zscorefile = open("%s.zscore" % fileprefix, "w")
    sigfile = open("%s.sig" % fileprefix, "w")
    if len(geneIDList) < 2:
        print("Need at the very least two genes!")
        # the files have already been created: release the handles before leaving
        for outfile in (statfile, zscorefile, sigfile):
            outfile.close()

        return

    firstgid = geneIDList[0]
    genome = firstgid[0]
    cistematic.core.cacheGeneDB(genome)
    idb = geneinfoDB(cache=True)
    dataBin = {}
    dataInfo = {}
    dataPossible = {}
    found = {}
    locusList = []
    excludeLocusList = []
    excludeDataBin = {}
    zList = []
    sigList = []
    print("len(dataDict) = %d" % len(dataDict))
    print("building dataInfo")
    # invert dataDict: geneID -> list of conditions it belongs to
    for condition in dataDict:
        dataBin[condition] = 0
        dataPossible[condition] = len(dataDict[condition])
        for geneID in dataDict[condition]:
            if geneID not in dataInfo:
                dataInfo[geneID] = []

            dataInfo[geneID].append(condition)

    # unique, non-excluded input genes in their original order
    for entry in geneIDList:
        if entry not in excludeIDList and entry not in locusList:
            locusList.append(entry)

    # count, per condition, how many excluded genes belong to it.
    # The loop variable is deliberately not called "genome" so that the genome
    # of the analysis (taken from the first gene) is not overwritten.
    for (excludeGenome, gID) in excludeIDList:
        if gID not in excludeLocusList:
            excludeLocusList.append(gID)
            try:
                excludeDataTerms = dataInfo[(excludeGenome, gID)]
            except Exception:
                continue

            for condition in excludeDataTerms:
                excludeDataBin[condition] = excludeDataBin.get(condition, 0) + 1

    print("Getting condition list for locusList")
    for locus in locusList:
        try:
            locusDataTerms = dataInfo[locus]
        except Exception:
            continue

        for condition in locusDataTerms:
            if condition not in found:
                found[condition] = []

            if locus not in found[condition]:
                found[condition].append(locus)
                dataBin[condition] += 1

    numGenes = len(locusList)
    dataSize = {}
    print("Arranging by size")
    print("numGenes = %s" % str(numGenes))
    for condition in dataBin:
        dataLen = dataBin[condition]
        if dataLen not in dataSize:
            dataSize[dataLen] = []

        dataSize[dataLen].append(condition)

    # conditions ordered from most to fewest matching input genes
    dataList = []
    for dataLen in sorted(dataSize, reverse=True):
        dataList.extend(dataSize[dataLen])

    try:
        theGIDs = cistematic.core.getGenomeGeneIDs(genome)
    except Exception:
        print("could not get gene entries for %s" % genome)
        # carry on with an empty population instead of failing on an unbound name
        theGIDs = []

    allGIDs = [aGID for aGID in theGIDs if aGID not in excludeLocusList]

    # expected number of input genes per condition under the hypergeometric model
    mean = {}
    N = float(len(allGIDs))
    for condition in dataBin:
        possible = dataPossible[condition] - excludeDataBin.get(condition, 0)
        if N > 0:
            mean[condition] = (numGenes * possible) / N
        else:
            mean[condition] = 0.0

    print("Writing out .stat")
    for condition in dataList:
        count = dataBin[condition]
        possible = dataPossible[condition] - excludeDataBin.get(condition, 0)

        try:
            if possible > 0:
                percentage = float(count) * 100.0 / float(possible)
            else:
                percentage = 0

            # the score column is the raw difference between observed and expected counts
            zscore = count - mean[condition]
            pval = hyperGeometric(len(allGIDs), possible, numGenes, count)
            status = "enriched"
            if zscore < 0:
                status = "depleted"

            statfile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % (condition, count, possible, percentage, zscore, pval))
            zList.append((zscore, (condition, count, possible, percentage, zscore, pval)))
            # Bonferroni-style threshold: sigLevel divided by the number of conditions
            if len(dataList) > 0 and pval < (sigLevel / float(len(dataList))) and possible > minPossible and count > 1:
                sigList.append((pval, (condition, count, possible, percentage, pval, status)))
        except Exception:
            # statistics could not be computed for this condition: still record its counts
            statfile.write("%s\t%d out of %d\t \n" % (condition, count, possible))

    statfile.close()
    print("writing zscore")
    zList.sort()
    zList.reverse()
    for (zscore, entry) in zList:
        zscorefile.write("%s\t%d out of %d\t%2.2f\t%f\t%3.3g\n" % entry)

    zscorefile.close()
    print("writing sig")
    sigList.sort()
    annotCache = {}
    for (pval, entry) in sigList:
        conditionName = entry[0]
        sigfile.write("%s\t%d out of %d\t%2.2f\t%3.3g\t%s\n" % entry)
        conditionsigfile = open("%s.%s" % (fileprefix, conditionName.replace(" ", "_")), "w")
        if conditionName not in found:
            print("could not find %s" % conditionName)
            conditionsigfile.close()
            continue

        for gID in found[conditionName]:
            if gID not in annotCache:
                annotCache[gID] = _lookupGeneAnnotation(idb, gID)

            conditionsigfile.write("%s\t%s\t%s\n" % annotCache[gID])

        conditionsigfile.close()

    sigfile.close()
    cistematic.core.uncacheGeneDB(genome)
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
# Fisher's exact test, as described in Gopal Kanji's 100 Statistical Tests
# Table has form:
# A       |       B       | A + B
# -----------------------------
# C       |       D       | C + D
# -----------------------------
# A + C   |     B + D     | A + B + C + D
#

from math import exp
from helper import factln


def fisher(a, b, c, d):
    """ returns the summed probability of every 2x2 table that has the same
        row and column totals as (a, b, c, d) and whose smallest cell takes a
        value from 0 up to its observed value. The result is capped at 1.0.

        When several cells tie for the smallest value, the later one in the
        order a, b, c, d is used.
    """
    if a < b and a < c and a < d:
        smallest, smallestCount = "a", a
    elif b < c and b < d:
        smallest, smallestCount = "b", b
    elif c < d:
        smallest, smallestCount = "c", c
    else:
        smallest, smallestCount = "d", d

    rowTop = a + b
    rowBottom = c + d
    colLeft = a + c
    colRight = b + d
    total = a + b + c + d

    # log of the part of the table probability that depends only on the margins
    logMargins = factln(rowTop) + factln(rowBottom) + factln(colLeft) + factln(colRight) - factln(total)

    prob = 0.0
    for x in range(smallestCount + 1):
        # rebuild the full table from the margins with the smallest cell set to x
        if smallest == "a":
            table = (x, rowTop - x, colLeft - x, rowBottom - (colLeft - x))
        elif smallest == "b":
            table = (rowTop - x, x, colLeft - (rowTop - x), colRight - x)
        elif smallest == "c":
            table = (colLeft - x, rowTop - (colLeft - x), x, rowBottom - x)
        else:
            table = (rowTop - (colRight - x), colRight - x, rowBottom - x, x)

        logCells = factln(table[0]) + factln(table[1]) + factln(table[2]) + factln(table[3])
        prob += exp(logMargins - logCells)

    return min(prob, 1.0)
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
# special functions that are helpful for our statistics
# gammaln, gamma, factorial, factln, bico, and beta are inspired by:
# Numerical Recipes in C
# by Press, Flannery, Teukolsky, Vetterling
#
from math import log, exp, floor


def gammaln(Z):
    """ returns an approximation of ln(gamma(Z)) from a six-coefficient series.
    """
    cof = [76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, -0.536382e-5]

    x = float(Z) - 1.0
    temp = x + 5.5
    ser = 1.0

    temp -= (x + 0.5) * log(temp)
    for coefficient in cof:
        x += 1.0
        ser += coefficient / x

    return -temp + log(2.50662827465 * ser)


def gammaln2(Z):
    """ returns a higher-precision approximation of ln(gamma(Z)) from an
        eight-coefficient series: ser = c0 + sum(cof[j] / (Z + j)).

        Every coefficient in cof contributes to the series; leaving out the
        last one shifts the result by roughly 1e-9.
    """
    cof = [676.5203681218835, -1259.139216722289, 771.3234287757674, -176.6150291498386,
           12.50734324009056, -0.1385710331296526, 0.9934937113930748e-5, 0.1659470187408462e-6]

    z = float(Z)
    ser = 0.9999999999995183
    x = z - 1.0

    for coefficient in cof:
        x += 1.0
        ser += coefficient / x

    # 5.58106146679532777 is 6.5 - ln(sqrt(2 * pi))
    return log(ser) - 5.58106146679532777 - z + (z - 0.5) * log(z + 6.5)


def gamma(Z):
    """ returns gamma(Z) from gammaln, rounded to 8 decimal places.
    """
    return round(exp(gammaln(Z)), 8)


def gamma2(Z):
    """ returns gamma(Z) from gammaln2, rounded to 8 decimal places.
    """
    return round(exp(gammaln2(Z)), 8)


def factorial(N):
    """ returns N! as a float. Values up to 32! are built by repeated
        multiplication; larger arguments are approximated with gamma(N + 1).
        Negative N prints an error and returns -1.
    """
    ntop = 4
    arr = [1.0, 1.0, 2.0, 6.0, 24.0]

    if N < 0:
        print("error: factorial only works for non-negative integers")
        return -1
    elif N > 32:
        return gamma(N + 1)
    else:
        # extend the table of known factorials until it reaches N
        while ntop < N:
            j = ntop
            ntop += 1
            arr.append(arr[j] * ntop)
        return arr[N]


def factln(N):
    """ returns ln(N!). Negative N prints an error and returns -1.
    """
    if N < 0:
        print("error: factorial only works for non-negative integers")
        return -1
    if N <= 1:
        return 0.0
    else:
        return gammaln2(N + 1.0)


def bicoln(n, k):
    """ returns the natural log of the binomial coefficient (n choose k).
    """
    return factln(n) - factln(k) - factln(n - k)


def bico(n, k):
    """ returns the binomial coefficient (n choose k), rounded to the nearest
        whole number.
    """
    return floor(0.5 + exp(bicoln(n, k)))


def beta(z, w):
    """ returns the beta function B(z, w) = gamma(z) * gamma(w) / gamma(z + w).
    """
    return exp(gammaln(z) + gammaln(w) - gammaln(z + w))


def hyperGeometric(N, r, n, x):
    """ hyperGeometric(N, r, n, x):
        N is population size
        r is the number in the population identified as a success
        n is the sample size
        x is the number in the sample identified as a success

        returns the probability of drawing exactly x successes.
    """
    N = float(N)
    r = float(r)
    n = float(n)
    x = float(x)
    return exp(bicoln(r, x) + bicoln(N - r, n - x) - bicoln(N, n))
#
# Score formulas
#
from math import log, sqrt, exp

try:
    import _stat
    # Only trust the import if it really is our C extension: recent CPython
    # versions ship an unrelated built-in module that is also named _stat and
    # does not provide pearsonCorrelation.
    hasStatExtension = hasattr(_stat, "pearsonCorrelation")
except ImportError:
    hasStatExtension = False


def llScore(observed, expected):
    """ returns log likelihood for one pair of observed vs. expected.
        An observed count of zero contributes 0.0 (the limit of x * log(x)
        as x goes to 0) instead of raising a math domain error.
    """
    if float(observed) == 0.0:
        return 0.0

    score = float(observed) * log(float(observed) / float(expected))
    return score


def zScore(observed, expected, prob):
    """ returns (observed - expected)/(standard deviation), where the standard deviation
        is calculated assuming normality as sqrt(obs * prob * (1 - prob)).
    """
    score = (float(observed) - float(expected)) / sqrt(float(observed) * prob * (1 - prob))
    return score


def expected(length, prob):
    """ returns the expected count for length trials with probability prob.
    """
    return length * prob


def pvalue(zvalue):
    """ approximation for converting from zvalue to P-value using Hamaker's
        formula as described in Zar (App18). This is dependable down to
        abs(z) ~ 0.2

        Negative z-values return 1 minus the value for abs(z).
    """
    z = abs(zvalue)
    # The polynomial below peaks at z = 1 / (2 * 0.018) and then shrinks again,
    # which made very large z-scores look insignificant (e.g. z ~ 55.6 gave 0.5).
    # Beyond the peak the approximation already evaluates to 0.0, so hold z there.
    zPeak = 1.0 / (2 * 0.018)
    if z > zPeak:
        z = zPeak

    c = 0.806 * z * (1 - 0.018 * z)
    pval = (1 - sqrt(1 - exp(-1 * (c * c)))) / 2

    if zvalue < 0:
        pval = 1 - pval

    return pval


def pearsonCorrelation(colA, colB):
    """ returns the Pearson correlation of colA and colB, using the compiled
        _stat extension when it is available and the pure Python
        implementation otherwise.
    """
    if hasStatExtension:
        return _stat.pearsonCorrelation(colA, colB)
    else:
        return localPearsonCorrelation(colA, colB)


def localPearsonCorrelation(colA, colB):
    """ pure Python Pearson correlation over the first min(len(colA), len(colB))
        entries. Returns 0.0 when there is nothing to compare or when either
        column has no variance.
    """
    length = len(colA)
    if length > len(colB):
        length = len(colB)

    if length == 0:
        # consistent with the zero-variance case below, and avoids dividing by zero
        return 0.0

    meanA = 0.0
    meanB = 0.0
    for index in range(length):
        meanA += colA[index]
        meanB += colB[index]

    meanA /= length
    meanB /= length

    numerator = 0.0
    denominatorA = 0.0
    denominatorB = 0.0

    for index in range(length):
        deltaA = colA[index] - meanA
        deltaB = colB[index] - meanB
        numerator += deltaA * deltaB
        denominatorA += deltaA * deltaA
        denominatorB += deltaB * deltaB

    if denominatorA == 0.0 or denominatorB == 0.0:
        return 0.0

    return (numerator / sqrt(denominatorA * denominatorB))
a/cistematic/cisstat/setup.py b/cistematic/cisstat/setup.py new file mode 100644 index 0000000..b5fdd4e --- /dev/null +++ b/cistematic/cisstat/setup.py @@ -0,0 +1,33 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# + +from distutils.core import setup, Extension + +setup(name="stat", version="1.0", ext_modules=[Extension("_stat", ["statextension.c"])]) diff --git a/cistematic/cisstat/statextension.c b/cistematic/cisstat/statextension.c new file mode 100644 index 0000000..a21703a --- /dev/null +++ b/cistematic/cisstat/statextension.c @@ -0,0 +1,112 @@ +/*##########################################################################*/ +/*# #*/ +/*# C O P Y R I G H T N O T I C E #*/ +/*# Copyright (c) 2003-10 by: #*/ +/*# * California Institute of Technology #*/ +/*# #*/ +/*# All Rights Reserved. #*/ +/*# #*/ +/*# Permission is hereby granted, free of charge, to any person #*/ +/*# obtaining a copy of this software and associated documentation files #*/ +/*# (the "Software"), to deal in the Software without restriction, #*/ +/*# including without limitation the rights to use, copy, modify, merge, #*/ +/*# publish, distribute, sublicense, and/or sell copies of the Software, #*/ +/*# and to permit persons to whom the Software is furnished to do so, #*/ +/*# subject to the following conditions: #*/ +/*# #*/ +/*# The above copyright notice and this permission notice shall be #*/ +/*# included in all copies or substantial portions of the Software. #*/ +/*# #*/ +/*# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #*/ +/*# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #*/ +/*# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #*/ +/*# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #*/ +/*# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #*/ +/*# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #*/ +/*# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #*/ +/*# SOFTWARE. 
#*/ +/*###########################################################################*/ +/*# C extension for commonly used stat methods. */ + +#include +#include +#include +#include + +static char module_doc[] = +"This module only implements the pearsonCorrelation in C for now."; + +static PyObject* +the_func(PyObject *self, PyObject *args) +{ + PyObject *a, *b; + double c, numerator; + double *pwmA, meanA, denominatorA; + double *pwmB, meanB, denominatorB; + long listLen, index; + + if (!PyArg_UnpackTuple(args, "func", 2, 2, &a, &b)) { + return NULL; + } + + listLen = PyList_Size(a); + pwmA = malloc(listLen * sizeof(double)); + for (index = 0; index < listLen; index++) { + pwmA[index] = PyFloat_AsDouble(PyList_GetItem(a, index)); + } + + pwmB = malloc(PyList_Size(b) * sizeof(double)); + for (index = 0; index < listLen; index++) { + pwmB[index] = PyFloat_AsDouble(PyList_GetItem(b, index)); + } + + if (listLen > PyList_Size(b)) { + listLen = PyList_Size(b); + } + + meanA = 0.0; + meanB = 0.0; + + for (index = 0; index < listLen; index++) { + meanA += pwmA[index]; + meanB += pwmB[index]; + } + + meanA /= listLen; + meanB /= listLen; + + denominatorA = 0.0; + denominatorB = 0.0; + numerator = 0.0; + + for (index = 0; index < listLen; index++) { + numerator += (pwmA[index] - meanA) * (pwmB[index] - meanB); + denominatorA += (pwmA[index] - meanA) * (pwmA[index] - meanA); + denominatorB += (pwmB[index] - meanB) * (pwmB[index] - meanB); + } + + free(pwmA); + free(pwmB); + + if (denominatorA == 0.0 || denominatorB == 0.0) { + c = 0.0; + } else { + c = numerator / sqrt(denominatorA * denominatorB); + } + + return PyFloat_FromDouble(c); +} + +static char the_func_doc[] = +"pearsonCorrelation(a,b)\n\nReturns the pearsonCorrelation of a and b."; + +static PyMethodDef module_methods[] = { + {"pearsonCorrelation", the_func, METH_VARARGS, the_func_doc}, + {NULL, NULL} +}; + +PyMODINIT_FUNC +init_stat(void) +{ + Py_InitModule3("_stat", module_methods, module_doc); +} diff --git 
a/cistematic/core/__init__.py b/cistematic/core/__init__.py new file mode 100644 index 0000000..e6b7f20 --- /dev/null +++ b/cistematic/core/__init__.py @@ -0,0 +1,688 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
###########################################################################
#
__all__ = ["motif", "homology", "geneinfo", "protein"]

import cistematic
from cistematic.genomes import Genome, geneDB
import shutil
import tempfile
import os
from os import environ

if environ.get("CISTEMATIC_TEMP"):
    cisTemp = environ.get("CISTEMATIC_TEMP")
else:
    cisTemp = "/tmp"

# every temporary file created through tempfile lands in cisTemp
tempfile.tempdir = cisTemp

goDict = {}
annotDict = {}
# genome name -> path of the locally cached copy of its gene database
cache = {}

# complement of each supported nucleotide code; anything else maps to "N"
_COMPLEMENT = {"A": "T", "T": "A",
               "G": "C", "C": "G",
               "S": "S",
               "W": "W",
               "R": "Y", "Y": "R",
               "M": "K", "K": "M",
               "H": "D", "D": "H",
               "B": "V", "V": "B",
               "N": "N",
               "a": "t", "t": "a",
               "g": "c", "c": "g",
               "n": "n",
               "z": "z"
}


def cacheGeneDB(genome):
    """ save a copy of a genome's gene database to a local cache.

        Returns the path of the cached copy, or "" (which chooseDB() treats
        as "no cached database") when the copy could not be made.
    """
    if genome in cache:
        return cache[genome]

    tempgen = ""
    try:
        # mkstemp creates the file atomically, unlike the race-prone mktemp
        (handle, tempgen) = tempfile.mkstemp(suffix=".db")
        os.close(handle)
        # assumes geneDB maps genome names to database paths - TODO confirm
        shutil.copyfile(geneDB[genome], tempgen)
    except (EnvironmentError, KeyError):
        print("could not cache genome %s" % genome)
        if tempgen:
            # do not leave a partial copy behind
            try:
                os.remove(tempgen)
            except OSError:
                pass

        return ""

    cache[genome] = tempgen

    return tempgen


def uncacheGeneDB(genome=""):
    """ remove the local copy of a genome's gene database.

        When genome is not in the cache (including the default ""), every
        cached database is removed.
    """
    if genome in cache:
        targets = [genome]
    else:
        targets = list(cache)

    for gen in targets:
        try:
            os.remove(cache[gen])
        except OSError:
            print("could not delete %s" % cache[gen])

        del cache[gen]


def cachedGenomes():
    """ return lists of genomes with a gene database in the local cache.
    """
    return list(cache)


def chooseDB(genome, dbfile=""):
    """ helper function to use genome's gene database from the local cache if present.
        An explicitly given dbfile always wins over the cache.
    """
    if dbfile == "" and genome in cache:
        dbfile = cache[genome]

    return dbfile


def readChromosome(genome, chrom, db=""):
    """ return sequence for entire chromosome
    """
    aGenome = Genome(genome, chrom, dbFile=chooseDB(genome, db))
    return aGenome.getChromosomeSequence()


def getGenomeEntries(genome, db=""):
    """ return (genome, list of gene IDs) for a given genome.
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    return (genome, aGenome.allGIDs())


def getGenomeGeneIDs(genome, db=""):
    """ return the gene IDs for a given genome.
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    return aGenome.allGIDs()


def getChromoGeneEntries(chromosome, lowerbound=-1, upperbound=-1, db=""):
    """ return the entries for a given chromosome, given as (genome, chromID).
    """
    (genome, chromID) = chromosome
    aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
    return aGenome.chromGeneEntries(chromID, lowerbound, upperbound)


def getChromosomeNames(genome, db="", partition=1, slice=0):
    """ return the chromosomes for a given genome.
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    return aGenome.allChromNames(partition, slice)


def geneEntry(geneID, db="", version="1"):
    """ returns (chrom, start, stop, length, sense) for a given geneID
    """
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    return aGenome.geneInfo(geneID, version)


def compNT(nt):
    """ returns the complementary basepair to base nt ("N" for unknown codes)
    """
    return _COMPLEMENT.get(nt, "N")


def complement(sequence, length=-1):
    """ returns the reverse complement of the sequence.

        With a non-negative length different from len(sequence), only the
        last length bases are used; positions that fall before the start of
        the sequence are returned as "N".
    """
    seqLength = len(sequence)
    if length == seqLength or length < 0:
        return "".join([compNT(nt) for nt in reversed(sequence)])

    newSeq = []
    for index in range(seqLength - 1, seqLength - length - 1, -1):
        if index < 0:
            # ran off the front: pad instead of letting a negative index
            # silently wrap around to the end of the sequence
            newSeq.append("N")
        else:
            newSeq.append(compNT(sequence[index]))

    return "".join(newSeq)


def upstreamToNextGene(geneID, radius, version="1", db=""):
    """ return distance to gene immediately upstream.
        Falls back to radius when the gene is unknown or the lookup fails.
    """
    upstream = radius
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        if aGenome.checkGene(geneID):
            (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
            if sense == "F":
                upstream = aGenome.leftGeneDistance(geneID, upstream, version)
            else:
                upstream = aGenome.rightGeneDistance(geneID, upstream, version)
    except Exception:
        # best effort by design: keep the caller-supplied radius
        pass

    return upstream


def downstreamToNextGene(geneID, radius, version="1", db=""):
    """ return distance to gene immediately downstream.
        Falls back to radius when the gene is unknown or the lookup fails.
    """
    downstream = radius
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        if aGenome.checkGene(geneID):
            (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
            if sense == "F":
                downstream = aGenome.rightGeneDistance(geneID, downstream, version)
            else:
                downstream = aGenome.leftGeneDistance(geneID, downstream, version)
    except Exception:
        # best effort by design: keep the caller-supplied radius
        pass

    return downstream


def retrieveFeatures(match, radius, featureType="", db=""):
    """ return the features around a given match, given as
        ((genome, chromID), hit) where hit[0] is the hit position.
    """
    (chromosome, hit) = match
    (genome, chromID) = chromosome
    lowerboundHit = max(int(hit[0]) - int(radius), 0)

    aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
    return aGenome.getFeaturesIntersecting(chromID, lowerboundHit, 2 * int(radius), featureType)


def retrieveSeqFeatures(geneID, upstream, cds, downstream, boundToNextGene = False, geneDB=""):
    """ retrieve CDS features upstream, all or none of the cds, and downstream of a geneID.
        Feature positions are normalized and truncated to local sequence coordinates.

        Returns a list of unique (ftype, start, stop, orientation) tuples;
        the list is empty when the gene is unknown.
    """
    results = []
    (genome, gID) = geneID
    aGenome = Genome(genome, dbFile=chooseDB(genome, geneDB))
    if not aGenome.checkGene(geneID):
        return results

    (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID)
    if stop < start:
        (start, stop) = (stop, start)

    forward = (sense == "F")
    seqstart = 0
    seqlen = 0

    # figure out normalized seqstart and seqlen
    if upstream > 0:
        if forward:
            if boundToNextGene:
                upstream = aGenome.leftGeneDistance(geneID, upstream)

            seqstart = start - upstream
            if seqstart < 0:
                seqstart = 0
                upstream = start
        else:
            if boundToNextGene:
                upstream = aGenome.rightGeneDistance(geneID, upstream)

            seqstart = stop + upstream

        seqlen = upstream

    if cds > 0:
        if seqlen == 0:
            if forward:
                seqstart = start
            else:
                seqstart = stop

        seqlen += length

    if downstream > 0:
        if forward:
            if boundToNextGene:
                downstream = aGenome.rightGeneDistance(geneID, downstream)

            if seqlen == 0:
                seqstart = stop
        else:
            if boundToNextGene:
                downstream = aGenome.leftGeneDistance(geneID, downstream)

            if seqlen == 0:
                seqstart = start

        seqlen += downstream

    # process features; on the reverse strand seqstart is the highest
    # coordinate, so the queried region extends seqlen below it
    if forward:
        regionStart = seqstart
    else:
        regionStart = seqstart - seqlen

    seen = set()
    for entry in aGenome.getFeaturesIntersecting(chrom, regionStart, seqlen, "CDS"):
        (fname, fversion, fchromosome, fstart, fstop, forientation, ftype) = entry
        if fstop < fstart:
            # a real swap (the previous code overwrote fstop before copying it)
            (fstart, fstop) = (fstop, fstart)

        if forward:
            localStart = fstart - seqstart    # normalize
            localStop = fstop - seqstart
        else:
            localStart = seqstart - fstop
            localStop = seqstart - fstart

        if localStart < 0:                    # truncate
            localStart = 0

        if localStop > seqlen:
            localStop = seqlen

        feature = (ftype, localStart, localStop, forientation)
        if feature not in seen:
            seen.add(feature)
            results.append(feature)

    return results


def getFeaturesIntersecting(genome, chrom, start, length, db="", ftype="CDS"):
    """ return features of type ftype that fall within the given region.
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    return aGenome.getFeaturesIntersecting(chrom, start, length, ftype)


def retrieveSequence(genome, chrom, start, stop, sense="F", db=""):
    """ retrieve a sequence given a genome, chromosome, start, stop, and sense.
        Returns "" (after printing a message) when the sequence cannot be read.
    """
    entrySeq = ""
    length = abs(stop - start) + 1
    try:
        aGenome = Genome(genome, dbFile=chooseDB(genome, db))
        if sense == "F":
            # start is 1-based; never ask for a negative offset
            seqStart = max(start - 1, 0)
        else:
            seqStart = stop - 1

        entrySeq = aGenome.sequence(chrom, seqStart, length)
    except IOError:
        print("Couldn't retrieve sequence %s %s %s %s %s" % (genome, chrom, start, stop, sense))

    return entrySeq


def retrieveCDS(geneID, maskCDS=False, maskLower=False, db="", version="1"):
    """ retrieveCDS() - retrieve a sequence given a gene identifier
    """
    entrySeq = ""
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        if aGenome.checkGene(geneID):
            entrySeq = aGenome.geneSeq(geneID, maskCDS, maskLower, version)
    except IOError:
        print("Could not find %s " % str(geneID))

    return entrySeq


def retrieveUpstream(geneID, upstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
    """ retrieve sequence 5' of cds of length upstream for a given a gene identifier.
        Reverse-strand genes are returned reverse complemented.
    """
    entrySeq = ""
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        if aGenome.checkGene(geneID):
            (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
            if sense == "F":
                if boundToNextGene:
                    upstream = aGenome.leftGeneDistance(geneID, upstream, version)

                seqStart = max(start - upstream - 1, 0)
            else:
                if boundToNextGene:
                    upstream = aGenome.rightGeneDistance(geneID, upstream, version)

                seqStart = stop

            sequence = aGenome.sequence(chrom, seqStart, upstream, maskCDS, maskLower)
            if sense == "F":
                entrySeq = sequence
            else:
                entrySeq = complement(sequence, upstream)
    except IOError:
        print("Couldn't find  %s" % (geneID,))

    return entrySeq


def retrieveDownstream(geneID, downstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
    """ retrieve sequence 3' of CDS of length downstream for a given a gene identifier.
        Reverse-strand genes are returned reverse complemented.
    """
    entrySeq = ""
    genome = geneID[0]
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    if not aGenome.checkGene(geneID):
        return entrySeq

    (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
    if sense == "F":
        if boundToNextGene:
            downstream = aGenome.rightGeneDistance(geneID, downstream, version)

        seqStart = stop - 1
        seqLength = downstream + 1
    else:
        if boundToNextGene:
            downstream = aGenome.leftGeneDistance(geneID, downstream, version)

        if (start - downstream) > 1:
            seqStart = start - downstream
            seqLength = downstream
        else:
            # NOTE(review): window length near the chromosome start is kept
            # exactly as in the original implementation - confirm intent
            seqStart = 0
            seqLength = stop

    sequence = aGenome.sequence(chrom, seqStart, seqLength, maskCDS, maskLower)
    if sense == "F":
        entrySeq = sequence
    else:
        entrySeq = complement(sequence, downstream)

    return entrySeq


def retrieveSeq(geneID, upstream, cds, downstream, geneDB="", maskLower = False, boundToNextGene = False, version="1"):
    """ retrieve upstream, all or none of the cds, and downstream of a geneID.
        cds == 2 additionally requests CDS masking.
    """
    maskCDS = (int(cds) == 2)
    parts = []
    if upstream > 0:
        parts.append(retrieveUpstream(geneID, upstream, maskCDS, maskLower, boundToNextGene, geneDB, version))

    if cds > 0:
        parts.append(retrieveCDS(geneID, maskCDS, maskLower, geneDB, version))

    if downstream > 0:
        parts.append(retrieveDownstream(geneID, downstream, maskCDS, maskLower, boundToNextGene, geneDB, version))

    geneSeq = "".join(parts)
    if len(geneSeq) == 0:
        print("retrieveSeq Warning: retrieved null sequence for %s: %s (splice form %s) from geneDB %s" % (geneID[0], geneID[1], version, geneDB))

    return geneSeq


def retrieveAll(genome, genes, upstream, downstream, outputFilePath):
    """ retrieve set of upstream and downstrean sequences for a list of genes in a genome and save them to a file.
    """
    outFile = open(outputFilePath, "w")
    try:
        for gene in genes:
            print("Processing  %s" % (gene,))
            outFile.write("> %s \n" % (gene))
            geneID = (genome, gene)
            outFile.write("%s\n" % retrieveSeq(geneID, upstream, 0, downstream))
    finally:
        # close the file even when a retrieval fails part-way through
        outFile.close()


def fasta(geneID, seq):
    """ write a fasta formated seq with geneID in the header.
    """
    return "> %s-%s\n%s\n" % (geneID[0], geneID[1], seq)


def loadGOInfo(genome, db=""):
    """ load GO for a given genome into goDict (done once per genome)
    """
    if genome not in goDict:
        aGenome = Genome(genome, dbFile=chooseDB(genome, db))
        goDict[genome] = aGenome.allGOInfo()


def getGOInfo(geneID, db=""):
    """ retrieve GO info for geneID; [] when the lookup fails
    """
    (genome, locus) = geneID
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        return aGenome.goInfo(geneID)
    except Exception:
        return []


def getGOIDCount(genome, GOID, db=""):
    """ retrieve count of genes with a particular GOID; [] when the lookup fails
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        return aGenome.getGOIDCount(GOID)
    except Exception:
        return []


def allGOTerms(genome, db=""):
    """ return all GO Terms; [] when the lookup fails
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        return aGenome.allGOterms()
    except Exception:
        return []


def getAllGOInfo(genome, db=""):
    """ return all GO Info; [] when the lookup fails
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        # NOTE(review): this used to call allGoInfo(); loadGOInfo() in this
        # module calls allGOInfo(), so the old spelling looks like a typo
        # hidden by the blanket except - confirm against the Genome class
        return aGenome.allGOInfo()
    except Exception:
        return []


def loadAnnotInfo(genome, db=""):
    """ load Annotations for a given genome into annotDict (done once per genome)
    """
    if genome not in annotDict:
        aGenome = Genome(genome, dbFile=chooseDB(genome, db))
        annotDict[genome] = aGenome.allAnnotInfo()


def getAnnotInfo(geneID, db=""):
    """ retrieve Annotations for a given geneID; [] when the lookup fails
    """
    (genome, locus) = geneID
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        return aGenome.annotInfo(geneID)
    except Exception:
        return []


def getAllAnnotInfo(genome, db=""):
    """ return all Annotation Info; [] when the lookup fails
    """
    aGenome = Genome(genome, dbFile=chooseDB(genome, db))
    try:
        return aGenome.allAnnotInfo()
    except Exception:
        return []


def _isSimpleRepeat(window, winmin2, winhalf):
    """ True when window is dominated by one nucleotide or one non-GC dinucleotide.
    """
    for nt in ("A", "C", "G", "T"):
        if window.count(nt) > winmin2:
            return True

    for dinuc in ("AC", "AG", "AT", "CT", "GT", "TA", "TC", "TG"):
        if window.count(dinuc) >= winhalf:
            return True

    # NOTE(review): GA and CA have always used a strict comparison, unlike
    # the other dinucleotides; kept as is - confirm whether that is intended
    for dinuc in ("GA", "CA"):
        if window.count(dinuc) > winhalf:
            return True

    return False


def sanitize(inSeq, windowSize=16):
    """ make sure that very simple repeats are out of the sequence.
        Soft-mask any window that has windowSize - 2 of mononucleotides
        and (windowSize / 2) - 1 non-GC dinucleotides.

        Returns the upper-cased sequence with masked windows in lower case.
    """
    upperSeq = inSeq.upper()
    outSeq = list(upperSeq)
    winmin2 = windowSize - 2
    winhalf = windowSize // 2 - 1
    # + 1 so that the window ending on the last base is examined as well
    for pos in range(len(upperSeq) - windowSize + 1):
        if _isSimpleRepeat(upperSeq[pos:pos + windowSize], winmin2, winhalf):
            for index in range(pos, pos + windowSize):
                outSeq[index] = outSeq[index].lower()

    return "".join(outSeq)


def featuresIntersecting(genome, posList, radius, ftype, name="", chrom="", version="", db="", extendGen="", replaceMod=False):
    """ returns a dictionary of matching features to positions of the double form (chromosome, position).
        Only positions with features within radius are returned.
    """
    resultDict = {}
    if extendGen != "":
        aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
        aGenome.extendFeatures(extendGen, replace = replaceMod)
    else:
        aGenome = Genome(genome, dbFile=chooseDB(genome, db))

    features = aGenome.getFeatures(ftype, name, chrom, version)
    if len(posList) < 1 or len(features) < 1:
        return resultDict

    for (posChrom, pos) in posList:
        if posChrom not in features:
            continue

        matches = []
        for (fname, fversion, chromosome, start, stop, orientation, atype) in features[posChrom]:
            if (pos + radius) < start or (pos - radius) > stop:
                continue

            matches.append((fname, fversion, chromosome, start, stop, orientation, atype))

        if matches:
            resultDict[(posChrom, pos)] = matches

    return resultDict


def genesIntersecting(genome, posList, name="", chrom="", version="", db="", flank=0, extendGen="", replaceMod=False):
    """ returns a dictionary of matching genes to positions of the double form (chromosome, position).
        Only positions that fall within a gene extended by flank on both sides are returned.
    """
    resultDict = {}
    if extendGen != "":
        aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
        aGenome.extendFeatures(extendGen, replace = replaceMod)
    else:
        aGenome = Genome(genome, dbFile=chooseDB(genome, db))

    genes = aGenome.getallGeneInfo(name, chrom, version)
    if len(posList) < 1 or len(genes) < 1:
        return resultDict

    for (posChrom, pos) in posList:
        if posChrom not in genes:
            continue

        matches = []
        for (gname, chromosome, start, stop, orientation) in genes[posChrom]:
            if start - flank <= pos <= stop + flank:
                matches.append((gname, "noversion", chromosome, start, stop, orientation))

        if matches:
            resultDict[(posChrom, pos)] = matches

    return resultDict

# diff --git a/cistematic/core/cismatcher.py b/cistematic/core/cismatcher.py
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software.
# +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +import string +import cistematic +from cistematic.core.geneinfo import geneinfoDB +import cistematic.core +from cistematic.core.motif import Motif +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import os, tempfile, shutil +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp + +def getModules(motifID, motLen, totalLen=60, mainGenome={}, queryGenomes={}, directory=cisTemp, filterMasked=True): + """ build three fasta files containing sequences from (a) one main genome, (b) multiple query genomes, + and (c) motif matches in all genomes. 
+ """ + doMainGenome = True + upLen = (totalLen - motLen) / 2 + downLen = totalLen - upLen - motLen + for genomeDict in [mainGenome, queryGenomes]: + if doMainGenome: + outfilename = "%s/%s-main.fsa" % (directory, motifID) + outfile = open(outfilename, "w") + matchfilename = "%s/%s-matches.fsa" % (directory, motifID) + matchfile = open(matchfilename, "w") + + for genome in genomeDict: + infile = open(genomeDict[genome]) + print "doing %s" % genome + cistematic.core.cacheGeneDB(genome) + current = "-1" + prevpos = -1 * totalLen + prevchrom = "nochrom" + lineArray = [] + misc = "" + for line in infile: + fields = line.split("\t") + chrom = fields[3] + pos = int(fields[4]) + lineArray.append((chrom, pos, line)) + + lineArray.sort() + infile.close() + for (chrom, pos, line) in lineArray: + fields = line.split("\t") + if fields[1] == current: + continue + + current = fields[1] + misc = fields[0][-1] + if chrom == prevchrom and pos < (prevpos + 2 * (upLen + downLen)): + print "skipping %s-%s-%s%s\ttoo close to previous pos" % (genome, chrom, misc, current) + continue + + try: + motseq = cistematic.core.retrieveSequence(genome, chrom, pos + 1, pos + motLen) + start = pos - upLen + if start < 0: + start = 0 + + seq = cistematic.core.retrieveSequence(genome, chrom, start, start + totalLen) + except: + print "skipping %s-%s-%s%s\terror retrieving sequence" % (genome, chrom, misc, current) + continue + + sense = fields[5] + if sense == "R": + motseq = cistematic.core.complement(motseq) + seq = cistematic.core.complement(seq, len(seq)) + + if filterMasked and motseq != motseq.upper(): + continue + + stop = start + len(seq) + matchID = "%s-%s-%s-%s%s" % (motifID.replace("-", "_"), genome, chrom.replace("-", "_"), misc, current) + outfile.write("> %s %d [%d to %d, %s]\n%s\n" % (matchID, pos, start, stop, sense, seq)) + matchfile.write("> %s\n%s\n" % (matchID, motseq)) + prevchrom = chrom + prevpos = pos + + if doMainGenome: + outfile.close() + print "doing database genomes" + 
outfilename = "%s/%s-db.fsa" % (directory, motifID) + outfile = open(outfilename, "w") + doMainGenome = False + + cistematic.core.uncacheGeneDB() + outfile.close() + matchfile.close() + + +def doBlast(motifID, matchLength, similarity, directory=cisTemp, maxInputSize=3000, firstMatchOnly=False): + """ Blast sequences generated by getModules() to identify regions in the main genome sequences that are + conserved in the query genomes. Results are saved in a motifIDBlast.db sqlite database. + """ + # check that we don't exceed maxSetSize + fsafilename = "%s/%s-main.fsa" % (directory, motifID) + fsafile = open(fsafilename, "r") + index = 0 + for line in fsafile: + if line[0] != ">": + continue + + index += 1 + + if index > maxInputSize: + print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.doBlast()" % maxInputSize + return + + #build blast DB + #run blast + blastCommands = ["cd %s" % directory, + "nohup %s/programs/blast/bin/formatdb -t %smodDB -n %smodDB -p F -i %s-db.fsa" % (cisRoot, motifID, motifID, motifID), + "%s/programs/blast/bin/blastall -p blastn -d %smodDB -i %s-main.fsa -o %s.blastres -m 8 -v 15 -b 15" % (cisRoot, motifID, motifID, motifID)] + blastCommandLine = string.join(blastCommands, "; ") + os.system(blastCommandLine) + #filter blast + fsafilename = "%s/%s-main.fsa" % (directory, motifID) + fsafile = open(fsafilename, "r") + infilename = "%s/%s.blastres" % (directory, motifID) + infile = open(infilename, "r") + outfilename = "%s/%s.cisblast" % (directory, motifID) + outfile = open(outfilename, "w") + tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID) + db = sqlite.connect(tempdb) + sql = db.cursor() + + stmt = "create table blast_entries(ID INTEGER PRIMARY KEY, GENOME1 varchar, GENEID1 varchar, GENOME2 varchar, GENEID2 varchar, SIMILARITY float, LENGTH int, MISMATCHES int, INDEL int, start1 int, stop1 int, start2 int, stop2 int, evalue float, score float)" + sql.execute(stmt) + stmt = "create table 
blast_segments(ID INTEGER PRIMARY KEY, GENOME varchar, MATCHID varchar, chrom varchar, loc int, start int, stop int, sense varchar)" + sql.execute(stmt) + db.commit() + + stmt = "INSERT into blast_segments VALUES(NULL, ?, ?, ?, ?, ?, ?, ?) " + batch = [] + for line in fsafile: + if line[0] != ">": + continue + + (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split() + start = int(segstart[1:]) + stop = int(segstop[:-1]) + sense = sense[0] + (motTag, genome, chromid, mid) = matchid.split("-") + batch.append((genome, matchid, chromid, int(loc), start, stop, sense)) + + sql.executemany(stmt, batch) + + counter = 0 + previousGene1 = "" + previousGene2 = "" + matchDict = {} + + print "Building matchDict" + for line in infile: + fields = line.split("\t") + geneid1 = str(fields[0]) + (motTAG, genome1, mchrom1, match1) = geneid1.split("-") + geneid2 = str(fields[1]) + (motTAG, genome2, mchrom2, match2) = geneid2.split("-") + lineMatchLength = int(fields[3]) + lineSimilarity = float(fields[2]) + evalue = float(fields[10]) + score = float(fields[11].strip()) + if evalue < 0.01 and lineMatchLength > matchLength and lineSimilarity > similarity: + if geneid1 == geneid2: + continue + + if geneid1 == previousGene1 and geneid2 == previousGene2: + continue + + previousGene1 = geneid1 + previousGene2 = geneid2 + if geneid1 not in matchDict: + matchDict[geneid1] = {} + + elif firstMatchOnly: + continue + + if geneid2 not in matchDict[geneid1]: + matchDict[geneid1][geneid2] = [] + + matchDict[geneid1][geneid2].append((score, line)) + + os.remove(infilename) + print "Processing matchDict" + matchKeys = matchDict.keys() + matchKeys.sort() + + stmt = "INSERT into blast_entries VALUES(NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
" + batch = [] + for geneid1 in matchKeys: + geneKeys = matchDict[geneid1].keys() + geneKeys.sort() + for geneid2 in geneKeys: + counter += 1 + array = matchDict[geneid1][geneid2] + array.sort() + (score, line) = array[-1] + outfile.write(line) + fields = line.split("\t") + geneid1 = str(fields[0]) + (motTAG, genome1, mchrom1, gid1) = geneid1.split('-') + geneid2 = str(fields[1]) + (motTAG, genome2, mchrom2, gid2) = geneid2.split('-') + similarity = float(fields[2]) + matchLength = int(fields[3]) + mismatches = int(fields[4]) + indels = int(fields[5]) + start1 = int(fields[6]) + stop1 = int(fields[7]) + start2 = int(fields[8]) + stop2 = int(fields[9]) + evalue = float(fields[10]) + score = float(fields[11].strip()) + batch.append((genome1, geneid1, genome2, geneid2, similarity, matchLength, mismatches, indels, start1, stop1, start2, stop2, evalue, score)) + + sql.executemany(stmt, batch) + stmt = "CREATE index blastE on blast_entries(genome1, geneid1)" + sql.execute(stmt) + stmt = "CREATE index blastS on blast_segments(GENOME, MATCHID)" + sql.execute(stmt) + db.commit() + sql.close() + db.close() + shutil.copyfile(tempdb, "%s/%sBlast.db" % (directory, motifID)) + os.remove(tempdb) + infile.close() + outfile.close() + print "inserted %d lines" % counter + + +def getCandidates(motifID, motLen, mainGenome={}, radiusStep=1000, maxRadius=200000, maxInputSize=3000, refineMotifs=False, directory=cisTemp, ucscOrg="Human", ucscDB="hg17"): + """ Core of cismatcher. Extracts conserved regions from data in the Blast.db database generated by doBlast(). 
+ """ + genome = mainGenome.keys()[0] + cistematic.core.cacheGeneDB(genome) + motfile = open(mainGenome[genome], "r") + modfilename = "%s/%s-main.fsa" % (directory, motifID) + modfile = open(modfilename, "r") + outfilename = "%s/cismatcher.%s.out" % (directory, motifID) + outfile = open(outfilename, "w") + annotfilename = "%s/cismatcher.%s.annot.txt" % (directory, motifID) + annotfile = open(annotfilename, "w") + motseqs = {} + goodseqs = [] + if refineMotifs: + mfilename = "%s/%s-matches.fsa" % (directory, motifID) + mfile = open(mfilename, "r") + mid = "" + for line in mfile: + if line[0] != ">": + motseqs[mid] = line.strip() + else: + mid = line.strip()[2:] + + mfile.close() + + current = "-1" + prevchrom = "nochrom" + lines = [] + numEntries = 0 + for line in motfile: + fields = line.split("\t") + chrom = fields[3] + if fields[1] == current and chrom == prevchrom: + continue + + if chrom != prevchrom: + refchrom = chrom + if "rand" in refchrom: + refchrom = "%s_random" % refchrom[:-4] + + prevchrom = chrom + + current = fields[1] + motstart = int(fields[4]) + 1 + motstop = motstart + motLen + sense = fields[5] + if sense == "F": + sense = "+" + else: + sense = "-" + + lines.append("chr%s\tcistematic\tmotif\t%d\t%d\t.\t%s\t.\t%s-%s-%s\n" % (refchrom, motstart, motstop, sense, motifID, chrom, current)) + numEntries += 1 + + if numEntries > maxInputSize: + print "number of matches in main genome exceeded maxInputSize %d - aborting cisMatcher.getCandidates()" % maxInputSize + return + + if lines: + outheader = 'track name=%strack description="Cistematic %s hits"\n' % (motifID, motifID) + annotfile.write(outheader) + for outline in lines: + annotfile.write(outline) + + motfile.close() + try: + dbname = "%s/%sBlast.db" % (directory, motifID) + tempdb = "%s%sBlast.db" % (tempfile.mktemp(), motifID) + shutil.copyfile(dbname, tempdb) + except: + print "no blast database - aborting cisMatcher->getCandidates()" + return + + index = 0 + prevchrom = "nochrom" + for line in 
modfile: + if line[0] != ">": + continue + + (junk, matchid, loc, segstart, junk2, segstop, sense) = line.split() + start = int(segstart[1:]) + 1 + stop = int(segstop[:-1]) + 1 + (motifID, genome, chrom, match) = matchid.split('-') + if sense == "F": + sense = "=" + else: + sense = "-" + + if genome not in matchid: + continue + + if chrom != prevchrom: + refchrom = chrom + if "rand" in refchrom: + refchrom = "%s_random" % refchrom[:-4] + + prevchrom = chrom + + line = "chr%s\tcistematic\tblast_region\t%d\t%d\t.\t%s\t.\t%sb%d\n" % (refchrom, start, stop, sense, matchid, index) + annotfile.write(line) + index += 1 + + modfile.close() + db = sqlite.connect(tempdb) + sql = db.cursor() + stmt = ' select * from blast_entries where GENOME1="%s" ORDER BY GENEID1 ' % genome + sql.execute(stmt) + res = sql.fetchall() + idb = geneinfoDB(cache=True) + prevchrom="nochrom" + outlines = [] + index = 0 + for entry in res: + (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry + line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) + stmt2 = 'select chrom, loc, start, stop, sense from blast_segments where GENOME = "%s" and MATCHID = "%s" ' % (gen1, mid1) + sql.execute(stmt2) + result = sql.fetchone() + (chrom, loc, start, stop, sense) = result + if chrom != prevchrom: + refchrom = chrom + if "rand" in refchrom: + refchrom = "%s_random" % refchrom[:-4] + + prevchrom = chrom + + if sense == "F": + matchstart = start + + 1 + int(start1) + matchstop = start + + 1 + int(stop1) + else: + matchstart = stop + 1 - int(start1) + matchstop = stop + 1 - int(stop1) + + if sense == "F": + sense = "+" + else: + sense = "-" + + if matchstart > matchstop: + temp = matchstop + matchstop = matchstart + matchstart = temp + + line = "chr%s\tcistematic\tmatch\t%d\t%d\t%d\t%s\t.\t%s.%d\n" % (refchrom, matchstart, matchstop, (float(sim) 
- 90) * 100, sense, mid2, index) + index += 1 + annotfile.write(line) + feature = "NONE" + symbol = "" + desc = "" + relativeLoc = "" + radius = 0 + featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start , motLen, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR") + if len(featureList) > 0: + relativeLoc = featureList[0][6] + feature = featureList[0][0] # always pick first feature + try: + desc = cistematic.core.getAnnotInfo((genome, feature))[0] + symbol = idb.getGeneInfo((genome, feature))[0] + except: + pass + + while feature == "NONE" and radius < maxRadius: + radius += radiusStep + featureList = cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="CDS") + cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="%UTR") + if genome == "human": + featureList += cistematic.core.getFeaturesIntersecting(genome, chrom, start - radius, 2 * radius, ftype="WGRNA") + + if len(featureList) > 0: + feature = featureList[0][0] # always pick first feature + try: + geneID = (genome, feature) + (gchrom, gstart, gstop, glength, gsense) = cistematic.core.geneEntry(geneID) + if gstart <= start and start <= gstop: + relativeLoc = "GENE" + elif start < gstart: + if gsense == "F": + relativeLoc = "UP" + else: + relativeLoc = "DOWN" + else: + if gsense == "F": + relativeLoc = "DOWN" + else: + relativeLoc = "UP" + + desc = cistematic.core.getAnnotInfo(geneID)[0] + symbol = idb.getGeneInfo(geneID)[0] + except: + pass + + line = '%s%s%s%s%schr%s:%d-%d%s%s%d%s\n' % (relativeLoc, mid1, mid2, sim, length, ucscOrg, ucscDB, refchrom, start - radius, stop + radius, refchrom, matchstart , matchstop, feature, symbol, radius, desc) + outlines.append((float(sim), int(length), mid1, line)) + if index % 100 == 0: + print "." 
+ + cistematic.core.uncacheGeneDB() + annotfile.close() + sql.close() + db.close() + os.remove(tempdb) + outlines.sort() + outlines.reverse() + alreadySeen = [] + print "writing outfile" + for line in outlines: + if line[2] in alreadySeen: + continue + + alreadySeen.append(line[2]) + outfile.write(line[3]) + + outfile.close() + if refineMotifs: + print "Refining motifs" + goodIDs = [] + for entry in res: + (ID, gen1, mid1, gen2, mid2, sim, length, mis, indel, start1, stop1, start2, stop2, evalue, score) = entry + if mid1 not in goodIDs: + goodIDs.append(mid1) + goodseqs.append(motseqs[mid1].upper()) + + if mid2 not in goodIDs: + goodIDs.append(mid2) + goodseqs.append(motseqs[mid2].upper()) + + mot = Motif("%s+R" % motifID, "", "", goodseqs) + motfilename = "%s/%s+R.mot" % (directory, motifID) + mot.saveMotif(motfilename) \ No newline at end of file diff --git a/cistematic/core/geneinfo.py b/cistematic/core/geneinfo.py new file mode 100644 index 0000000..01c0833 --- /dev/null +++ b/cistematic/core/geneinfo.py @@ -0,0 +1,280 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. 
#
#                                                                         #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
# SOFTWARE.                                                               #
###########################################################################
#
try:
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    from sqlite3 import dbapi2 as sqlite

import tempfile, shutil, os
from os import environ

# Installation root of cistematic; overridable through the environment.
cisRoot = environ.get("CISTEMATIC_ROOT") or "/proj/genome"

dbPath = "%s/db/gene_info.db" % cisRoot

# Scratch directory used for cached database copies.
cisTemp = environ.get("CISTEMATIC_TEMP") or "/tmp"

tempfile.tempdir = cisTemp

# First column of the gene data file (a numeric id, as a string) -> genome
# name stored in the database.
speciesMap = {"3702": "athaliana",
              "4932": "scerevisiae",
              "6239": "celegans",
              "7227": "dmelanogaster",
              "7668": "spurpuratus",
              "7955": "drerio",
              "8364": "xtropicalis",
              "9031": "ggallus",
              "9606": "hsapiens",
              "9615": "cfamiliaris",
              "9796": "ecaballus",
              "9913": "btaurus",
              "10090": "mmusculus",
              "10116": "rnorvegicus",
              "13616": "mdomestica"
}


class geneinfoDB:
    """ The geneinfoDB class allows for the querying of NCBI gene data.

        A geneID is a (genome, gID) pair. Queries run against dbPath, or
        against a private temporary copy once cacheDB() has been called.
    """
    startingGenome = ""
    targetGenomes = []
    cachedDB = ""


    def __init__(self, tGenomes=None, cache=False):
        """ initialize the geneinfoDB object with a target genome and cache database, if desired.
        """
        # A None default avoids sharing one list object between instances.
        if tGenomes is None:
            tGenomes = []

        self.targetGenomes = tGenomes
        if cache:
            self.cacheDB()


    def __del__(self):
        """ cleanup copy in local cache, if present.
        """
        if self.cachedDB != "":
            self.uncacheDB()


    def cacheDB(self):
        """ copy geneinfoDB to a local cache.

            Any previously cached copy is deleted first. self.cachedDB is
            only set once the copy succeeded, so a failed copy leaves no
            stale path (or stray temp file) behind.
        """
        self.uncacheDB()
        # mkstemp creates the file atomically, unlike the racy mktemp.
        (handle, cachePath) = tempfile.mkstemp(suffix=".db")
        os.close(handle)
        try:
            shutil.copyfile(dbPath, cachePath)
        except Exception:
            os.remove(cachePath)
            raise

        self.cachedDB = cachePath


    def uncacheDB(self):
        """ delete geneinfoDB from local cache.
        """
        if self.cachedDB != "":
            try:
                os.remove(self.cachedDB)
            except OSError:
                print("could not delete %s" % self.cachedDB)

            self.cachedDB = ""


    def connectDB(self):
        """ return a handle to the database.
        """
        path = dbPath
        if self.cachedDB != "":
            path = self.cachedDB

        return sqlite.connect(path, timeout=60)


    def getGeneInfo(self, geneID):
        """ returns a (symbol, locustag, dbxrefs, chromosome, map_location) tuple for a geneID,
            or an empty list when the geneID is not in the database.
        """
        (gen, gid) = geneID
        db = self.connectDB()
        try:
            cursor = db.cursor()
            cursor.execute("select symbol, locustag, dbxrefs, chromosome, map_location from gene_info where genome = :gen and gID = :gid ",
                           {"gen": gen, "gid": gid})
            entry = cursor.fetchone()
            cursor.close()
        finally:
            db.close()

        # fetchone() returns None on a miss; unpacking None would raise
        # TypeError, so the miss is handled explicitly.
        if entry is None:
            return []

        (symbol, locustag, dbxrefs, chromosome, map_location) = entry
        return (str(symbol), str(locustag), str(dbxrefs), str(chromosome), str(map_location))


    def getallGeneInfo(self, genome, infoKey="gid"):
        """ returns a dictionary of one or more (symbol, locustag, dbxrefs, chromosome, map_location) per gID.
            acceptable infoKey arguments are: 'locus', and 'gid'. With 'locus' the dictionary is keyed by
            locustag and the second tuple element is the gID instead.
        """
        db = self.connectDB()
        try:
            cursor = db.cursor()
            cursor.execute("select gid, symbol, locustag, dbxrefs, chromosome, map_location from gene_info where genome = :genome",
                           {"genome": genome})
            results = cursor.fetchall()
            cursor.close()
        finally:
            db.close()

        resDict = {}
        for (gid, symbol, locustag, dbxrefs, chromosome, map_location) in results:
            if infoKey == "locus":
                key = str(locustag)
                other = str(gid)
            else:
                key = str(gid)
                other = str(locustag)

            resDict.setdefault(key, []).append((str(symbol), other, str(dbxrefs), str(chromosome), str(map_location)))

        return resDict


    def getDescription(self, geneID):
        """ returns a list of one or more gene description for a geneID.
        """
        (gen, gid) = geneID
        db = self.connectDB()
        try:
            cursor = db.cursor()
            cursor.execute("select description from gene_description where genome = :gen and gID = :gid",
                           {"gen": gen, "gid": gid})
            entries = cursor.fetchall()
            cursor.close()
        finally:
            db.close()

        return [str(entry[0]) for entry in entries]


    def geneIDSynonyms(self, geneID):
        """ returns a list of synonyms for a geneID.
        """
        (gen, gid) = geneID
        db = self.connectDB()
        try:
            cursor = db.cursor()
            cursor.execute("select synonym from gene_synonyms where genome = :gen and gID = :gid",
                           {"gen": gen, "gid": gid})
            entries = cursor.fetchall()
            cursor.close()
        finally:
            db.close()

        return [str(entry[0]) for entry in entries]


    def getGeneID(self, genome, synonym):
        """ returns a geneID given a genome and a synonym, or an empty list if nothing matches.

            The name is tried as a symbol first, then as a registered
            synonym, then as a locus tag; the first hit wins.
        """
        queries = ("select gID from gene_info where genome= :genome and symbol= :synonym",
                   "select gID from gene_synonyms where genome = :genome and synonym = :synonym",
                   "select gID from gene_info where genome = :genome and locustag = :synonym")
        parameters = {"genome": genome, "synonym": synonym}
        db = self.connectDB()
        try:
            cursor = db.cursor()
            for query in queries:
                cursor.execute(query, parameters)
                entry = cursor.fetchone()
                if entry:
                    return (genome, str(entry[0]))
        finally:
            db.close()

        return []


def buildgeneinfoDB(datafile, path=dbPath):
    """ populate geneinfo database from NCBI gene information.

        datafile is a tab-separated file; the columns read here are
        0 (species id, see speciesMap), 1 (gID), 2 (symbol), 3 (locustag),
        4 ('|'-separated synonyms, '-' meaning none), 5 (dbxrefs),
        6 (chromosome), 7 (map_location) and 8 (description).
        path is the sqlite file to create; the tables must not exist yet.
    """
    insertInfo = "INSERT into gene_info(ID, genome, gID, symbol, locustag, dbxrefs, chromosome, map_location) values (NULL, ?, ?, ?, ?, ?, ?, ?)"
    insertDescription = "INSERT into gene_description(ID, genome, gID, description) values (NULL, ?, ?, ?)"
    insertSynonym = "INSERT into gene_synonyms(ID, genome, gID, synonym) values (NULL, ?, ?, ?)"

    inFile = open(datafile, "r")
    try:
        # Connect to the requested path, not unconditionally to dbPath.
        db = sqlite.connect(path, timeout=60)
        try:
            # Under Python 2, pysqlite only binds 8-bit byte strings as
            # parameters when text_factory is str; a no-op under Python 3.
            db.text_factory = str
            cursor = db.cursor()
            cursor.execute("create table gene_info(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, symbol varchar, locustag varchar, dbxrefs varchar, chromosome varchar, map_location varchar)")
            cursor.execute("create table gene_description(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, description varchar)")
            cursor.execute("create table gene_synonyms(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, synonym varchar)")

            for line in inFile:
                # Kept for compatibility with existing databases, which
                # store "prime" in place of apostrophes.
                line = line.replace("'", "prime")
                field = line.split("\t")
                if field[0] not in speciesMap:
                    continue

                if len(field) < 9:
                    # Truncated record: report it instead of aborting the
                    # whole build with an IndexError.
                    print("could not register %s" % (line))
                    continue

                genome = speciesMap[field[0]]
                gid = field[1]
                try:
                    cursor.execute(insertInfo, (genome, gid, field[2], field[3], field[5], field[6], field[7]))
                    descr = field[8].strip()
                    if len(descr) > 1:
                        cursor.execute(insertDescription, (genome, gid, descr))

                    # Every gene is registered as a synonym of itself.
                    cursor.execute(insertSynonym, (genome, gid, gid.strip()))
                    for entry in field[4].split("|"):
                        if entry != "-" and entry != gid.strip():
                            try:
                                cursor.execute(insertSynonym, (genome, gid, entry.strip()))
                            except sqlite.OperationalError:
                                pass
                except sqlite.OperationalError:
                    print("could not register %s" % (line))

            cursor.execute("create index genIdx1 on gene_info(genome)")
            cursor.execute("create index genIdx2 on gene_description(genome)")
            cursor.execute("create index genIdx3 on gene_synonyms(genome)")
            cursor.execute("create index gIDIdx1 on gene_info(gID)")
            cursor.execute("create index gIDIdx2 on gene_description(gID)")
            cursor.execute("create index gIDIdx3 on gene_synonyms(gID)")
            cursor.execute("create index synIdx on gene_synonyms(synonym)")
            db.commit()
            cursor.close()
        finally:
            db.close()
    finally:
        inFile.close()

# \ No newline at end of file
# diff --git a/cistematic/core/homology.py b/cistematic/core/homology.py
# new file mode 100644
# index 0000000..0798753
# --- /dev/null
# +++ b/cistematic/core/homology.py
# @@ -0,0 +1,222 @@
###########################################################################
#                                                                         #
#        C O P Y R I G H T   N O T I C E                                  #
#  Copyright (c) 2003-10 by:                                              #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
try:
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    from sqlite3 import dbapi2 as sqlite

import tempfile, shutil, os
from os import environ

# Installation root of cistematic; overridable through the environment.
cisRoot = environ.get("CISTEMATIC_ROOT") or "/proj/genome"

from cistematic.core.geneinfo import speciesMap

dbPath = "%s/db/homologene.db" % cisRoot
homologeneGenomes = ["hsapiens", "mmusculus", "rnorvegicus", "celegans",
                     "cbriggsae", "cremanei", "dmelanogaster", "athaliana",
                     "ggallus", "cfamiliaris", "drerio", "scerevisiae"]

# Scratch directory used for cached database copies.
cisTemp = environ.get("CISTEMATIC_TEMP") or "/tmp"
tempfile.tempdir = cisTemp


class homologyDB:
    """ The homologyDB class allows for the mapping and return of predefined homology relationships.

        A geneID is a (genome, gID) pair. Queries run against dbPath, or
        against a private temporary copy once cacheDB() has been called.
    """
    startingGenome = ""
    targetGenomes = []
    homologousGenes = {}
    cachedDB = ""


    def __init__(self, tGenomes=None, cache=False):
        """ initialize the homologyDB object with a target genome and cache database, if desired.
        """
        # A None default avoids sharing one list object between instances.
        if tGenomes is None:
            tGenomes = []

        self.targetGenomes = tGenomes
        if cache:
            self.cacheDB()


    def __del__(self):
        """ cleanup copy in local cache, if present.
        """
        if self.cachedDB != "":
            self.uncacheDB()


    def cacheDB(self):
        """ copy homologyDB to a local cache.

            Any previously cached copy is deleted first. self.cachedDB is
            only set once the copy succeeded, so a failed copy leaves no
            stale path (or stray temp file) behind.
        """
        self.uncacheDB()
        # mkstemp creates the file atomically, unlike the racy mktemp.
        (handle, cachePath) = tempfile.mkstemp(suffix=".db")
        os.close(handle)
        try:
            shutil.copyfile(dbPath, cachePath)
        except Exception:
            os.remove(cachePath)
            raise

        self.cachedDB = cachePath


    def uncacheDB(self):
        """ delete homologyDB from local cache.
        """
        if self.cachedDB != "":
            try:
                os.remove(self.cachedDB)
            except OSError:
                print("could not delete %s" % self.cachedDB)

            self.cachedDB = ""


    def connectDB(self):
        """ return a handle to the database.
        """
        path = dbPath
        if self.cachedDB != "":
            path = self.cachedDB

        return sqlite.connect(path, timeout=60)


    def loadDB(self):
        """ deprecated.
        """
        pass


    def getHomologousGenes(self, geneID):
        """ return list of geneIDs homologous to given geneID. Limit to target genomes if specified at initialization.
        """
        (gen, gid) = geneID
        results = []
        db = self.connectDB()
        try:
            cursor = db.cursor()
            cursor.execute("select homoloID from homolog where genome = :gen and gID = :gid",
                           {"gen": gen, "gid": gid})
            groups = cursor.fetchall()
            for hIDentry in groups:
                homoloID = str(hIDentry[0])
                cursor.execute("select genome, gID from homolog where homoloID = :homoloID ",
                               {"homoloID": homoloID})
                for (genome, gID) in cursor.fetchall():
                    # The query gene is a member of its own group; skip it.
                    if (genome, gID) == (gen, gid):
                        continue

                    if len(self.targetGenomes) > 0 and genome not in self.targetGenomes:
                        continue

                    results.append((str(genome), str(gID)))

            cursor.close()
        finally:
            db.close()

        return results


def buildHomologeneDB(hFile="homologene.data", hdb=dbPath):
    """ Populate a new homologyDB database with homology relationships from homologene.

        hFile is tab-separated; column 0 is the group number, column 1 the
        species id (see speciesMap) and column 2 the gene id. hdb is the
        sqlite file to create; the homolog table must not exist yet.
    """
    sqlstmt = "INSERT into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?)"
    inFile = open(hFile)
    try:
        db = sqlite.connect(hdb)
        try:
            cursor = db.cursor()
            cursor.execute("create table homolog(ID INTEGER PRIMARY KEY, homoloID varchar, genome varchar, gID varchar)")
            for line in inFile:
                field = line.split("\t")
                # Blank or truncated lines carry no usable record.
                if len(field) < 3 or field[1] not in speciesMap:
                    continue

                gid = field[2]
                # NOTE(review): speciesMap as imported maps "3702" to
                # "athaliana", so this test looks unreachable. Left
                # unchanged because enabling it would change the stored
                # gene IDs - confirm the intended identifier first.
                if speciesMap[field[1]] == "arabidopsis":
                    gid = field[3].upper()

                values = ("homologene-%s" % field[0], speciesMap[field[1]], gid.strip())
                cursor.execute(sqlstmt, values)

            cursor.execute("CREATE INDEX idx1 on homolog(genome, gID)")
            cursor.execute("CREATE INDEX idx2 on homolog(homoloID)")
            db.commit()
            cursor.close()
        finally:
            db.close()
    finally:
        inFile.close()


def addHomologs(genomeA, genomeB, entries, hdb=dbPath):
    """ Specify homology relationships between geneIDs to be inserted into homology database.
        The entries list contains doubles of the form (gIDa, gIDb) from genome A and genome B, respectively.

        A genome A gene that already belongs to a homology group gets its
        partner added to that group; the remaining pairs each receive a
        new group named "<genomeA>-<genomeB>-<n>".
    """
    mapping = {}
    for (geneID1, geneID2) in entries:
        mapping[geneID1] = geneID2

    if len(mapping) < 1:
        return

    stmt = "insert into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?) "
    prefix = "%s-%s-" % (genomeA, genomeB)
    db = sqlite.connect(hdb)
    try:
        sql = db.cursor()
        sql.execute("select * from homolog where genome = ? ", (genomeA,))
        results = sql.fetchall()

        stmtArray = []
        topHID = 0
        for (rowID, hID, genome, gID) in results:
            # Continue numbering after the highest group a previous call
            # created, so new groups never reuse an existing homoloID.
            if hID.startswith(prefix) and hID[len(prefix):].isdigit():
                topHID = max(topHID, int(hID[len(prefix):]))

            if gID in mapping:
                stmtArray.append((hID, genomeB, mapping[gID]))
                del mapping[gID]

        if len(stmtArray) > 0:
            print("Updating %d entries in homolog table" % len(stmtArray))
            sql.executemany(stmt, stmtArray)

        stmtArray = []
        for gID in mapping:
            topHID += 1
            homologID = "%s%s" % (prefix, str(topHID))
            stmtArray.append((homologID, genomeA, gID))
            stmtArray.append((homologID, genomeB, mapping[gID]))

        if len(mapping) > 0:
            print("Adding %d new homology entries" % len(mapping))
            sql.executemany(stmt, stmtArray)

        db.commit()
        sql.close()
    finally:
        db.close()

# \ No newline at end of file
# diff --git a/cistematic/core/motif.py b/cistematic/core/motif.py
# new file mode 100644
# index 0000000..9a40e67
# --- /dev/null
# +++ b/cistematic/core/motif.py
# @@ -0,0 +1,1373 @@
###########################################################################
#                                                                         #
#        C O P Y R I G H T   N O T I C E                                  #
#  Copyright (c) 2003-10 by:                                              #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# motif.py - defines the motif object and its methods in cistematic +from string import upper, lower +from math import log, exp +from copy import deepcopy +from cistematic.core import complement +from cistematic.cisstat.score import pearsonCorrelation +import re, os, tempfile + +if os.environ.get("CISTEMATIC_ROOT"): + cisRoot = os.environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +if os.environ.get("CISTEMATIC_TEMP"): + cisTemp = os.environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp + +try: + import cistematic.core._motif as _motif + hasMotifExtension = True +except: + hasMotifExtension = False + +matrixRow = {"A": 0, + "C": 1, + "G": 2, + "T": 3 +} + +symbolToMatrix = {"A": [1.0, 0.0, 0.0, 0.0], + "C": [0.0, 1.0, 0.0, 0.0], + "G": [0.0, 0.0, 1.0, 0.0], + "T": [0.0, 0.0, 0.0, 1.0], + "W": [0.5, 0.0, 0.0, 0.5], + "S": [0.0, 0.5, 0.5, 0.0], + "R": [0.5, 0.0, 0.5, 0.0], + "Y": [0.0, 0.5, 0.0, 0.5], + "M": [0.5, 0.5, 0.0, 0.0], + "K": [0.0, 0.0, 0.5, 0.5], + "H": [0.33, 0.33, 0.0, 0.34], + "B": [0.0, 0.33, 0.33, 0.34], + "V": [0.33, 0.33, 0.34, 0.0], + "D": [0.33, 0.0, 0.33, 0.34], + "N": [0.25, 0.25, 0.25, 0.25] +} + +reMAP = {"A": "A", + "C": "C", + "G": "G", + "T": "T", + "N": "[ACGT]", + "W": "[AT]", + "S": "[GC]", + "R": "[AG]", + "Y": "[CT]", + "M": "[AC]", + "K": "[GT]", + "H": "[ACT]", + "B": "[CGT]", + "V": "[ACG]", + "D": "[AGT]", + "|": "I" +} + +motifDict = {"A": ["W", "R", "M", "H", "V", "D"], + "T": ["W", "Y", "K", "H", "B", "D"], + "C": ["S", "Y", "M", "H", "B", "V"], + "G": ["S", "R", "K", "B", "V", "D"] +} + + +class Motif: + """ The Motif class is the heart of cistematic. It captures both the consensus, + PWM, and Markov(1) versions of the motif, as well as provides methods for scanning + sequences for matches. 
+ """ + motifSeq = "" + motifPWM = [] + reversePWM = [] + motifMarkov1 = [] + revMarkov1 = [] + tagID = "" + sequences = [] + annotations = [] + strictConsensus = "" + threshold = 1.0 + info = "" + + + def __init__(self, tagID="", motif="", PWM=[], seqs=[], thresh=0.0, info="", motifFile="", seqfile=""): + """ initialize a Motif object either with (A) a tagID identifier and either of + (i) a IUPAC consensus or (ii) a PWM or (iii) example sequences, or (B) from + a motif file in Cistematic motif format. + """ + fileTagID = "" + if motifFile != "": + (fileTagID, motif, PWM, seqs, thresh, info) = self.readMotif(motifFile) + + if len(tagID) > 0: + self.setTagID(tagID) + elif len(fileTagID) > 0: + self.setTagID(fileTagID) + else: + self.setTagID("default") + + if motif <> "": + self.setMotif(motif) + + # PWM overrules the motif + if len(PWM) > 1: + self.setPWM(PWM) + + # a seqfile can be used to create a list of sequences + if len(seqfile) > 0: + seqs = self.readSeqFile(seqfile) + + # Sequences overrules the PWM and motif + if len(seqs) > 1: + self.setSequences(seqs) + self.setThreshold(len(seqs[0])) + + self.setThreshold(float(thresh)) + if len(info) > 0: + self.setInfo(info) + + + def __len__(self): + """ returns the length of the motif + """ + return len(self.motifPWM) + + + def readMotif(self, motifFile): + """ read motif in cistmeatic motif format to initialize motif instance. 
+ """ + motif = "" + PWM = [] + seqs = [] + threshold = 0.0 + info = "" + infile = open(motifFile, "r") + for line in infile: + if len(line) < 4 or line[0] == "#": + continue + + fields = line.strip().split("\t") + fieldType = fields[0].lower() + if fieldType not in ["tagid", "motif", "acgt", "sequence", "threshold", "info"]: + print "could not process line %s" % line + continue + + if len(fields) < 2: + continue + + if fieldType == "motif": + motif = fields[1] + elif fieldType == "acgt": + PWM.append([float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])]) + elif fieldType == "sequence": + seqs.append(fields[1]) + elif fieldType == "threshold": + threshold = float(fields[1]) + elif fieldType == "info": + info = fields[1].strip() + elif fieldType == "tagid": + tagID = fields[1] + + infile.close() + + return (tagID, motif, PWM, seqs, threshold, info) + + + def readSeqFile(self, seqFile): + """ read sequences from a sequence file. + """ + seqs = [] + infile = open(seqFile, "r") + for line in infile: + seqs.append(line.strip().upper()) + + infile.close() + + return seqs + + + def saveMotif(self, motFile): + """ Save motif in cistematic motif format. + """ + outfile = open(motFile, "w") + outfile.write("tagid\t%s\n" % self.tagID) + outfile.write("info\t%s\n" % self.info.strip()) + outfile.write("threshold\t%s\n" % str(self.threshold)) + outfile.write("motif\t%s\n" % self.buildConsensus()) + for col in self.motifPWM: + outfile.write("acgt\t%f\t%f\t%f\t%f\n" % (col[0], col[1], col[2], col[3])) + + for seq in self.sequences: + outfile.write("sequence\t%s\n" % seq) + + outfile.close() + + + def setTagID(self, tag): + """ set motif identifier. + """ + self.tagID = tag + + + def setInfo(self, info): + """ set motif info string. + """ + self.info = info + + + def setThreshold(self, threshold): + """ set a pre-defined threshold or the highest-possible threshold, otherwise. 
+ """ + (sForward, sBackward) = self.scoreMotif(self.buildStrictConsensus()) + if sForward > sBackward: + self.threshold = sForward + else: + self.threshold = sBackward + + if threshold < self.threshold and threshold > 0: + self.threshold = threshold + + + def setMotif(self, motif): + """ set the motif PWM using a IUPAC consensus. + """ + self.motifSeq = motif + self.motifPWM = self.buildPWM(motif) + self.buildReversePWM() + self.strictConsensus = self.buildStrictConsensus() + + + def setSequences(self, seqs): + """ set the founder sequences for the motif and recalculate PWM with them. + """ + self.sequences = seqs + self.calculatePWMfromSeqs() + self.calculateMarkov1() + + + def setPWM(self, PWM): + """ set the PWM for the motif and calculate consensus. + """ + self.motifPWM = PWM + self.buildReversePWM() + self.motifSeq = self.buildConsensus() + self.strictConsensus = self.buildStrictConsensus() + + + def appendToPWM(self, col): + """ add a column to the PWM for the motif + """ + self.motifPWM.append(col) + + + def buildPWM(self, motif): + """ returns the PWM for the provided consensus motif. 
def saveLogo(self, outfilename, height=-1, width=-1):
    """ saves a logo version of the motif as a PNG built from the founder sequences.

        outfilename: destination name; a trailing ".png"/".PNG" is stripped before
                     the name is handed to seqlogo as its -o argument.
        height, width: passed to seqlogo as -h / -w when positive; a non-positive
                     width falls back to the number of columns in the PWM.

        Prints a message and returns when the motif has no founder sequences.
        Prints an installation hint when seqlogo cannot be started or exits with a
        non-zero status (the weblogo 2.8.2 seqlogo script is looked up under cisRoot).
        Returns None in every case.
    """
    import subprocess  # function-scope import: not known to be imported at module level

    logoPath = "%s/programs/weblogo/seqlogo" % cisRoot
    if outfilename[-4:] in (".png", ".PNG"):
        outfilename = outfilename[:-4]

    if len(self.sequences) < 1:
        print("cannot run logo representation without founder sequences")
        return

    if width <= 0:
        width = len(self.motifPWM)

    # mkstemp creates the file atomically; mktemp only returned a name that
    # another process could claim first
    (handle, seqfilename) = tempfile.mkstemp(suffix=".tmp")
    try:
        seqfile = os.fdopen(handle, "w")
        try:
            for sequence in self.sequences:
                seqfile.write("%s\n" % sequence)
        finally:
            # closed before seqlogo runs so that the whole file is on disk
            seqfile.close()

        cmd = [logoPath, "-f", seqfilename, "-F", "PNG", "-c"]
        if height > 0:
            cmd += ["-h", "%d" % height]
        cmd += ["-w", "%d" % width, "-a", "-Y", "-n", "-M", "-k", "1", "-o", outfilename]

        try:
            # argument list, no shell: names containing spaces or shell
            # metacharacters are passed through verbatim
            status = subprocess.call(cmd)
        except OSError:
            # seqlogo missing or not executable
            status = -1
    finally:
        # the temporary sequence file is removed even when something above fails
        os.remove(seqfilename)

    if status != 0:
        print("failed to make logo: expecting weblogo 2.8.2 package in %s" % logoPath)
        print("also check if ghostscript package is correctly installed.")
def buildStrictConsensus(self):
    """ returns the consensus spelled with plain nucleotides only.

        For every column of self.motifPWM the nucleotide with the largest entry
        is chosen; when entries tie, the alphabetically last nucleotide wins
        (the (value, letter) pairs are compared as tuples).
    """
    picks = []
    for column in self.motifPWM:
        (weight, letter) = max((column[matrixRow[base]], base) for base in "ACGT")
        picks.append(letter)

    return "".join(picks)
def numberOfN(self):
    """ returns how many columns of the PWM are called as "N" by getSymbol().
    """
    return sum(1 for column in self.motifPWM if self.getSymbol(column) == "N")
def probMatchPWM(self, PWM, NT, colIndex):
    """ returns the probability of seeing that particular nucleotide according to the PSFM.

        PWM:      list of columns, each indexed through matrixRow.
        NT:       "A", "C", "G", "T", "N" or a degenerate symbol.
        colIndex: column of PWM to read.

        "N" always scores 1.0. A degenerate symbol scores the sum of the entries
        of every nucleotide whose motifDict entry contains it; a symbol found in
        no motifDict entry scores 0.0.
        NOTE(review): assumes motifDict[nucleotide] holds the symbols that cover
        that nucleotide - confirm against its definition.
    """
    if NT in ("A", "T", "C", "G"):
        return PWM[colIndex][matrixRow[NT]]

    if NT == "N":
        return 1.0

    currentProb = 0.0
    for motifNucleotide in ("A", "T", "C", "G"):
        if NT in motifDict[motifNucleotide]:
            # index by the covered nucleotide, not by the degenerate symbol itself
            currentProb += PWM[colIndex][matrixRow[motifNucleotide]]

    return currentProb


def psfmOdds(self, PWM, NT, colIndex, background=[0.25, 0.25, 0.25, 0.25]):
    """ calculates the odds of nucleotide NT coming from position colIndex in the PWM
        as opposed to coming from the background.

        background is indexed through matrixRow and is only read, never modified.
        Returns probMatchPWM / getBackgroundProbability, or 1.0 when the
        background probability is zero.
    """
    currentProb = self.probMatchPWM(PWM, NT, colIndex)
    # bound-method call: self must not be passed a second time
    backgroundProb = self.getBackgroundProbability(NT, background)

    try:
        odds = currentProb / backgroundProb
    except ZeroDivisionError:
        odds = 1.0

    return odds


def getBackgroundProbability(self, NT, background=[0.25, 0.25, 0.25, 0.25]):
    """ returns the background probability of NT.

        Mirrors probMatchPWM: plain nucleotides are read directly from
        background, "N" scores 1.0, and a degenerate symbol scores the sum over
        the nucleotides whose motifDict entry contains it.
    """
    if NT in ("A", "T", "C", "G"):
        return background[matrixRow[NT]]

    if NT == "N":
        return 1.0

    backgroundProb = 0.0
    for motifNucleotide in ("A", "T", "C", "G"):
        if NT in motifDict[motifNucleotide]:
            backgroundProb += background[matrixRow[motifNucleotide]]

    return backgroundProb
def scoreMotifLogOdds(self, aSeq, background=[0.25, 0.25, 0.25, 0.25]):
    """ calculates the log-odds score using the PSFM given the markov(0) background.

        Only the first len(self.motifPWM) letters of aSeq are scored, after
        upper-casing. background is handed unchanged to psfmOdds().

        Returns (forwardScore, reverseScore): the sums of log2 odds against
        self.motifPWM and self.reversePWM. Returns (0.0, 0.0) when aSeq is
        shorter than the motif. A position whose odds cannot be turned into a
        logarithm (zero odds, or a symbol the lookup tables reject) contributes
        log2(0.01) instead of aborting the scan.
    """
    motLength = len(self.motifPWM)
    if len(aSeq) < motLength:
        return (0.0, 0.0)

    odds = self.psfmOdds
    theSeq = aSeq.upper()
    # stand-in for log2(0): heavily penalised, but the score stays finite
    floorScore = log(0.01, 2)
    scores = []
    for PWM in (self.motifPWM, self.reversePWM):
        total = 0.0
        for index in range(motLength):
            try:
                total += log(odds(PWM, theSeq[index], index, background), 2)
            except (ValueError, KeyError):
                # ValueError: log of zero odds; KeyError: symbol missing from a lookup table
                total += floorScore

        scores.append(total)

    return (scores[0], scores[1])
def locateConsensus(self, aSeq):
    """ returns a list of positions on aSeq that match the consensus exactly.

        aSeq is upper-cased and scanned left to right with a window as long as
        the consensus. Each hit is reported as (position, "F") when the window
        matches the consensus, or (position, "R") when it only matches the
        reverse complement; after a hit the scan jumps past the window, so
        reported hits never overlap.

        Returns [] when aSeq is shorter than the consensus or the consensus is
        empty. A window that cannot be compared (for instance because stripping
        whitespace left it too short) is reported on stdout and skipped.
    """
    cons = self.buildConsensus()
    revComp = self.revComp()
    motLength = len(cons)
    positions = []
    # an empty consensus would match everywhere without ever advancing
    if motLength < 1 or len(aSeq) < motLength:
        return positions

    theSeq = aSeq.upper()
    lastStart = len(theSeq) - motLength
    columns = range(motLength)
    pos = 0
    while pos <= lastStart:
        subSeq = theSeq[pos:pos + motLength].strip()
        try:
            forwardMot = all(self.ntMatch(cons[index], subSeq[index]) for index in columns)
            revCompMot = all(self.ntMatch(revComp[index], subSeq[index]) for index in columns)
        except (IndexError, KeyError):
            print("chocked at pos %d" % pos)
            # reset both flags so that no result of an earlier window leaks into this one
            forwardMot = False
            revCompMot = False

        if forwardMot:
            positions.append((pos, "F"))
            pos += motLength
        elif revCompMot:
            positions.append((pos, "R"))
            pos += motLength
        else:
            pos += 1

    return positions
def scoreDiffPWM(self, compMotif):
    """ returns a score scaled from 0 (no difference) to 2 (completely different) to
        quantify the difference between the motif and another motif.

        The score is the mean, over the columns returned by getDiffPWM(compMotif),
        of the summed absolute differences of the four entries.
    """
    columnTotals = [abs(adiff) + abs(cdiff) + abs(gdiff) + abs(tdiff)
                    for (adiff, cdiff, gdiff, tdiff) in self.getDiffPWM(compMotif)]

    return sum(columnTotals, 0.0) / float(len(columnTotals))
def locateConsensusRE(self, sequence):
    """ Returns a list of positions on sequence that match the consensus.
        Should be run after either initializeRE() or initializeMismatchRE(numMismatches),
        which set the module-level forwardRE and backwardRE used here.

        Hits are (start, "F") for forwardRE and (start, "R") for backwardRE,
        returned in increasing start order. A hit is dropped when it starts
        fewer than len(self.motifPWM) bases after the previously accepted hit,
        so the returned hits never overlap; at equal start "F" wins over "R".
        Returns [] when sequence is shorter than the motif.
    """
    motLength = len(self.motifPWM)
    if len(sequence) < motLength:
        return []

    position = [(match.start(), "F") for match in forwardRE.finditer(sequence)]
    position.extend((match.start(), "R") for match in backwardRE.finditer(sequence))
    position.sort()

    results = []
    prevPos = None
    for (pos, sense) in position:
        if prevPos is None or pos >= prevPos + motLength:
            results.append((pos, sense))
            # the overlap window moves on to the hit that was just accepted
            prevPos = pos

    return results
+ """ + motifLength = len(self.motifPWM) + sequenceLength = len(sequence) + threshold /= 100.0 + if threshold < 0.5: + print "Threshold less than 50% - will abort locateMotif()" + return [] + + maxScore = self.bestConsensusScore() + maxDiff = maxScore * (1 - threshold) + if sequenceLength < motifLength: + return [] + else: + sequence.strip() + + if hasMotifExtension: + return _motif.locateMotif(sequence, self.motifPWM, self.reversePWM, maxScore, maxDiff) + + sequence = upper(sequence) + positionList = [] + position = 0 + while position <= (sequenceLength - motifLength): + subSequence = sequence[position: position + motifLength] + if subSequence.count("N") > numberN: + position += 1 + continue + + (seqScore, revSeqScore) = self.scoreMotif(subSequence, maxDiff) + if seqScore >= revSeqScore and seqScore > 1.0: + positionList.append((position, "F")) + elif revSeqScore > 1.0: + positionList.append((position, "R")) + + position += 1 + + return positionList + + + def locateMarkov1(self, sequence, maxFold=5.0): + """ returns a list of positions on sequence that match the Markov1 within maxFold. 
def calculatePWMfromSeqs(self):
    """ calculate the PWM using a set of non-degenerate instances of the motif.

        Counts the nucleotides of self.sequences (upper-cased) column by column,
        divides the counts by the number of sequences, stores the result as
        self.motifPWM and refreshes the reverse PWM and both consensus strings.
        The length of the first sequence is used as the motif length.
        Does nothing when there are no sequences.
    """
    founders = self.sequences
    numSeqs = len(founders)
    if numSeqs < 1:
        return

    # using length of first sequence as the length of the motif
    length = len(founders[0])
    counts = [[0.0, 0.0, 0.0, 0.0] for column in range(length)]

    for founder in founders:
        for (index, NT) in enumerate(founder.upper()):
            counts[index][matrixRow[NT]] += 1.0

    rows = [matrixRow[NT] for NT in ("A", "C", "G", "T")]
    for column in counts:
        for row in rows:
            column[row] /= numSeqs

    self.motifPWM = counts
    self.buildReversePWM()
    self.motifSeq = self.buildConsensus()
    self.strictConsensus = self.buildStrictConsensus()
def worstMarkov1Score(self):
    """ returns the worst markov1 score possible.

        Computed as -log2(0.0001) multiplied by one less than the number of
        positions in self.motifMarkov1.
    """
    floorProb = 0.0001
    transitions = len(self.motifMarkov1) - 1

    return -log(floorProb, 2.0) * transitions
def buildReverseMarkov1(self, pseudoCount=1.0):
    """ calculate the Reverse Markov1 PSSM using a set of non-degenerate instances of the motif.

        Fills self.revMarkov1 with one 4x4 table per position, indexed
        [position][previous nucleotide][current nucleotide]. Every cell starts
        at pseudoCount. Each founder sequence is upper-cased and passed through
        complement(); at position 0 its nucleotide adds 0.25 under every
        possible previous nucleotide, at later positions it adds 1.0 under the
        nucleotide actually seen before it. All cells are finally divided by
        len(self.sequences) + pseudoCount.

        Returns [] and leaves self.revMarkov1 empty when that divisor is below
        2; returns None otherwise.
        NOTE(review): complement() is defined outside this block - presumably it
        returns the reverse complement; confirm.
    """
    self.revMarkov1 = []
    total = len(self.sequences) + pseudoCount
    if total < 2:
        return []

    # using length of first sequence as the length of the motif
    length = len(self.sequences[0])
    for position in range(length):
        self.revMarkov1.append([[pseudoCount] * 4 for priorNT in range(4)])

    for founder in self.sequences:
        prior = -1
        for (index, letter) in enumerate(complement(founder.upper(), length)):
            current = matrixRow[letter]
            if index == 0:
                # no predecessor: spread one count evenly over the four priors
                for priorNT in range(4):
                    self.revMarkov1[index][priorNT][current] += 0.25
            else:
                self.revMarkov1[index][prior][current] += 1.0

            prior = current

    for table in self.revMarkov1[:length]:
        for row in table[:4]:
            for current in range(4):
                row[current] /= total
def probMatchMarkov1(self, theMarkov1, previousNT, NT, colIndex):
    """ returns the likelihood of seeing NT given previousNT at this position of the motif.

        theMarkov1 is indexed [colIndex][previous nucleotide][current nucleotide]
        through matrixRow.

        - previousNT not in matrixRow (for instance "N"): the entries for NT are
          summed over all four possible previous nucleotides.
        - NT is A/C/G/T: the single table entry is returned.
        - NT is "N": 1.0.
        - NT is degenerate: the entries of every nucleotide whose motifDict
          entry contains NT are summed.
    """
    currentProb = 0.0
    if NT in ("A", "C", "G", "T"):
        currentNT = matrixRow[NT]
    else:
        # NOTE(review): with an unknown previousNT a non-ACGT NT is scored as if
        # it were row 0 - kept as in the original, confirm that this is intended
        currentNT = 0

    try:
        prevNT = matrixRow[previousNT]
    except KeyError:
        for index in range(4):
            currentProb += theMarkov1[colIndex][index][currentNT]

        return currentProb

    if NT in ("A", "C", "G", "T"):
        return theMarkov1[colIndex][prevNT][currentNT]

    if NT == "N":
        return 1.0

    for motifNucleotide in ("A", "T", "C", "G"):
        if NT in motifDict[motifNucleotide]:
            # index by the covered nucleotide, not by the degenerate symbol itself
            currentProb += theMarkov1[colIndex][prevNT][matrixRow[motifNucleotide]]

    return currentProb
def correlateMotifs(actualMotifA, actualMotifB, maxSlide=1):
    """ Compares two motifs using the "pearson correlation coefficient-like" MSS.
        Will slide a motif up to maxSlide bases compared to the other,
        and reports the best score.

        The longer motif is the reference; the shorter one is tried in both
        orientations (its PWM and its reverse PWM) at every offset. Both
        matrices are padded with the "N" column of symbolToMatrix to a common
        length, and the alignment score is the mean pearsonCorrelation() over
        the aligned columns. The best score starts at 0.0, so the result is
        never lower than that. When the C extension is available the whole
        computation is delegated to it.
    """
    if len(actualMotifA) < len(actualMotifB):
        (motA, motB) = (actualMotifB, actualMotifA)
    else:
        (motA, motB) = (actualMotifA, actualMotifB)

    motApwm = motA.getPWM()
    motBpwm = motB.getPWM()
    motCpwm = motB.buildReversePWM()
    if hasMotifExtension:
        return _motif.correlateMotifs(motApwm, motBpwm, motCpwm, maxSlide)

    bestScore = 0.0
    padLength = len(motA) - len(motB)
    Ncol = [symbolToMatrix["N"]]
    for slide in range(-1 * maxSlide, maxSlide + padLength + 1):
        # number of N columns put before / after the reference (A) and the
        # shorter motif (B); a non-positive count adds nothing
        (leadA, tailA, leadB, tailB) = (0, 0, 0, 0)
        if slide < 0:
            leadA = abs(slide)
            tailB = abs(slide) + padLength
        elif slide == 0:
            tailB = padLength
        elif slide <= maxSlide:
            leadB = slide
            if padLength <= 0:
                tailA = slide
            elif padLength >= slide:
                tailB = padLength - slide
            else:
                tailA = slide - padLength
        else:
            leadB = slide
            tailA = maxSlide
            tailB = padLength - (slide - maxSlide)

        pwmA = Ncol * leadA + deepcopy(motApwm) + Ncol * tailA
        pwmB = Ncol * leadB + deepcopy(motBpwm) + Ncol * tailB
        pwmC = Ncol * leadB + deepcopy(motCpwm) + Ncol * tailB

        score1 = 0.0
        score2 = 0.0
        thisLength = len(pwmA)
        for index in range(thisLength):
            score1 += pearsonCorrelation(pwmA[index], pwmB[index])
            score2 += pearsonCorrelation(pwmA[index], pwmC[index])

        score1 = score1 / float(thisLength)
        score2 = score2 / float(thisLength)

        # keep the better orientation, and only when it beats the best so far
        if score1 < score2:
            candidate = score2
        else:
            candidate = score1

        if candidate > bestScore:
            bestScore = candidate

    return bestScore
def printMSS(motifList, maxSlide=1):
    """ Prints a matrix of MSS comparisons between different motifs
        in motifList.

        One line is written per motif: its tagID followed by one "\t%1.2f"
        cell per motif in motifList (the score from correlateMotifs), the items
        separated by single spaces and the line ending in a space.
    """
    for mot1 in motifList:
        cells = [str(mot1.tagID)]
        for mot2 in motifList:
            cells.append("\t%1.2f" % correlateMotifs(mot1, mot2, maxSlide))

        # the empty last item reproduces the space written before the line break
        cells.append("")
        print(" ".join(cells))
/* Return the weight that column `index` of PWM gives to the symbol at
 * aSeq[seqPos].
 *
 * PWM is read as a flat array with four doubles per column, in A, C, G, T
 * order. A plain nucleotide returns its own entry, a degenerate IUPAC symbol
 * returns the sum of the entries it covers, 'N' returns 1.0 and any other
 * character returns 0.0. Upper and lower case are treated alike.
 */
double probMatchPWM_func(double *PWM, int index, char *aSeq, long seqPos) {
    const double *col = PWM + 4 * index;
    const double a = col[0];
    const double c = col[1];
    const double g = col[2];
    const double t = col[3];

    switch (aSeq[seqPos]) {
        case 'A': case 'a': return a;
        case 'C': case 'c': return c;
        case 'G': case 'g': return g;
        case 'T': case 't': return t;
        case 'N': case 'n': return 1.0;
        case 'S': case 's': return c + g;
        case 'W': case 'w': return a + t;
        case 'R': case 'r': return a + g;
        case 'Y': case 'y': return c + t;
        case 'M': case 'm': return a + c;
        case 'K': case 'k': return g + t;
        case 'B': case 'b': return c + g + t;
        case 'D': case 'd': return a + g + t;
        case 'H': case 'h': return a + c + t;
        case 'V': case 'v': return a + c + g;
        default: break;
    }

    /* not an IUPAC nucleotide code */
    return 0.0;
}
/* Score the window of motLen letters starting at theSeq[pos] against the
 * forward matrix mPWM and the reverse matrix rPWM.
 *
 * Each score is the sum of probMatchPWM_func() over the window. When both
 * scores fall more than *maxDiff short of *bestCons, *score is set to -1.0
 * and *sense is left at 'F'. Otherwise *score receives the larger of the two
 * scores and *sense is 'F' only if the forward score is strictly larger,
 * 'R' otherwise.
 */
void scoreMot_func(char *theSeq, double *mPWM, double *rPWM, long pos, long motLen, float *bestCons, float *maxDiff, float *score, char *sense)
{
    float forScore = 0.0;
    float revScore = 0.0;
    long index;

    for (index = 0; index < motLen; index++) {
        forScore += probMatchPWM_func(mPWM, index, theSeq, pos + index);
        revScore += probMatchPWM_func(rPWM, index, theSeq, pos + index);
    }

    *sense = 'F';
    if (((forScore + *maxDiff) < *bestCons) && ((revScore + *maxDiff) < *bestCons)) {
        /* neither strand comes close enough to the best possible score */
        *score = -1.0;
        return;
    }

    if (forScore > revScore) {
        *score = forScore;
        return;
    }

    *score = revScore;
    *sense = 'R';
}
log(probMatchDPWM_func(mDPWM, index, theSeq, currentPos)) / log(2.0); + revScore -= log(probMatchDPWM_func(rDPWM, index, theSeq, currentPos)) / log(2.0); + } + + if ((forScore > *bestCons) && (revScore > *bestCons)) { + *score = -1.0; + return; + } + + if (forScore < revScore) { + *score = forScore; + } else { + *score = revScore; + *sense = 'R'; + } +} + +static PyObject* +the_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, *motifPWM, *revPWM, *mScore, *mDiff, *results; + double *mPWM, *rPWM; + float maxScore, maxDiff, seqScore; + long pos, maxPos; + int ok, index, indexMax, ntIndex, motLen, seqLen, skipping; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 5, 5, &pySeq, &motifPWM, &revPWM, &mScore, &mDiff); + + seq = PyString_AsString(pySeq); + motLen = PyList_Size(motifPWM); + seqLen = PyString_Size(pySeq); + maxScore = PyFloat_AsDouble(mScore); + maxDiff = PyFloat_AsDouble(mDiff); + + indexMax = 4 * motLen; + mPWM = malloc(indexMax * sizeof(double)); + rPWM = malloc(indexMax * sizeof(double)); + + for (index = 0; index < motLen; index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + mPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(motifPWM, index), ntIndex)); + rPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(revPWM, index), ntIndex)); + } + } + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + scoreMot_func(seq, mPWM, rPWM, pos, motLen, &maxScore, &maxDiff, &seqScore, &sense); + + if (seqScore > 1.0) { + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + free(mPWM); + free(rPWM); + return results; +} + +static PyObject* +dpwm_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, 
*motifPWM, *revPWM, *mScore, *results; + double *mDPWM, *rDPWM; + float maxScore, seqScore; + long pos, maxPos; + int ok, index, indexMax, prevIndex, currentIndex, motLen, seqLen, skipping; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &motifPWM, &revPWM, &mScore); + + seq = PyString_AsString(pySeq); + motLen = PyList_Size(motifPWM); + seqLen = PyString_Size(pySeq); + maxScore = PyFloat_AsDouble(mScore); + + indexMax = 16 * motLen; + mDPWM = malloc(indexMax * sizeof(double)); + rDPWM = malloc(indexMax * sizeof(double)); + + for (index = 0; index < motLen; index++) { + for (prevIndex = 0; prevIndex < 4; prevIndex++) { + for (currentIndex = 0; currentIndex < 4; currentIndex++) { + mDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(motifPWM, index), prevIndex), currentIndex)); + rDPWM[16 * index + 4 * prevIndex + currentIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyList_GetItem(revPWM, index), prevIndex), currentIndex)); + } + } + } + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + scoreDPWM_func(seq, mDPWM, rDPWM, pos, motLen, &maxScore, &seqScore, &sense); + + if (seqScore > 1.0) { + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + free(mDPWM); + free(rDPWM); + return results; +} + +static PyObject* +mer_func(PyObject *self, PyObject *args) +{ + PyObject *pySeq, *forwardMer, *revcompMer, *maxMismatches, *results; + long pos, maxPos; + int ok, index, motLen, seqLen, skipping, mismatches, fmis, rmis; + char *fMer, *rMer; + char *seq, sense; + + results = Py_BuildValue("[]"); + + ok = PyArg_UnpackTuple(args, "ref", 4, 4, &pySeq, &forwardMer, &revcompMer, &maxMismatches); 
+ + seq = PyString_AsString(pySeq); + fMer = PyString_AsString(forwardMer); + rMer = PyString_AsString(revcompMer); + motLen = PyString_Size(forwardMer); + seqLen = PyString_Size(pySeq); + mismatches = PyInt_AsLong(maxMismatches); + + pos = 0; + maxPos = seqLen - motLen; + + while (pos <= maxPos) { + skipping = 0; + for (index = 0; index < motLen; index++) { + if ((seq[pos + index] == 'N') || (seq[pos + index] == 'n')) { + skipping = index + 1; + } + } + + if (skipping) { + pos += skipping; + continue; + } + + fmis = 0; + rmis = 0; + for (index = 0; index < motLen; index++) { + if (seq[pos + index] != fMer[index]) { + fmis++; + } + if (seq[pos + index] != rMer[index]) { + rmis++; + } + if ((fmis > mismatches) && (rmis > mismatches)) { + break; + } + } + + if ((fmis <= mismatches) || (rmis <= mismatches)) { + sense = 'F'; + if (rmis < fmis) { + sense = 'R'; + } + PyList_Append(results, Py_BuildValue("(i, c)", pos, sense)); + } + pos++; + } + + return results; +} + +double pearson_func(double *colA, double *colB, int pos) +{ + double c, numerator; + double meanA, denominatorA; + double meanB, denominatorB; + long index; + + meanA = 0.0; + meanB = 0.0; + + for (index = 0; index < 4; index++) { + meanA += colA[pos + index]; + meanB += colB[pos + index]; + } + + meanA /= 4; + meanB /= 4; + + denominatorA = 0.0; + denominatorB = 0.0; + numerator = 0.0; + + for (index = 0; index < 4; index++) { + numerator += (colA[pos + index] - meanA) * (colB[pos + index] - meanB); + denominatorA += (colA[pos + index] - meanA) * (colA[pos + index] - meanA); + denominatorB += (colB[pos + index] - meanB) * (colB[pos + index] - meanB); + } + + if (denominatorA == 0.0 || denominatorB == 0.0) { + c = 0.0; + } else { + c = numerator / sqrt(denominatorA * denominatorB); + } + + return c; +} + +static PyObject* +corr_func(PyObject *self, PyObject *args) +{ + PyObject *PyaPWM, *PybPWM, *PycPWM, *PyMaxSlide; + double *aPWM, *bPWM, *cPWM, *tempA, *tempB, *tempC; + float fscore, rscore, bestScore; 
+ long maxSlide; + int ok, index, indexMax, bIndexMax, ntIndex, motLen, padLen, slide, adjustedPadLen, adjustedSlide, tempMax, tempSize; + + bestScore = 0.0; + + ok = PyArg_UnpackTuple(args, "corr", 4, 4, &PyaPWM, &PybPWM, &PycPWM, &PyMaxSlide); + + motLen = PyList_Size(PyaPWM); + padLen = motLen - PyList_Size(PybPWM); + maxSlide = PyInt_AsLong(PyMaxSlide); + + if (maxSlide > motLen) { + maxSlide = motLen - 1; + } + + indexMax = 4 * motLen; + bIndexMax = 4 * (motLen - padLen); + aPWM = malloc(indexMax * sizeof(double)); + bPWM = malloc(bIndexMax * sizeof(double)); + cPWM = malloc(bIndexMax * sizeof(double)); + + for (index = 0; index < (motLen - padLen); index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex)); + bPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PybPWM, index), ntIndex)); + cPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PycPWM, index), ntIndex)); + } + } + + for (; index < motLen; index++) { + for (ntIndex = 0; ntIndex < 4; ntIndex++) { + aPWM[4 * index + ntIndex] = PyFloat_AsDouble(PyList_GetItem(PyList_GetItem(PyaPWM, index), ntIndex)); + } + } + + for (slide = -1 * maxSlide; slide < (maxSlide + padLen + 1); slide++ ) { + tempA = malloc(3 * indexMax * sizeof(double)); + tempB = malloc(3 * indexMax * sizeof(double)); + tempC = malloc(3 * indexMax * sizeof(double)); + if (slide < 0) { + tempSize = abs(slide) + motLen; + tempMax = 4 * tempSize; + for (index = 0; index < 4 * abs(slide); index++) { + tempA[index] = 0.25; + } + for (; index < tempMax; index++) { + tempA[index] = aPWM[index - 4 * slide]; + } + for (index = 0; index < bIndexMax; index++) { + tempB[index] = bPWM[index]; + tempC[index] = cPWM[index]; + } + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else if ((slide > 0) && (slide <= maxSlide)) { + if (padLen > 0) { + if (padLen >= slide) { + 
adjustedPadLen = padLen - slide; + adjustedSlide = 0; + } else { + adjustedPadLen = 0; + adjustedSlide = slide - padLen; + } + tempSize = motLen + adjustedSlide; + tempMax = indexMax + 4 * adjustedSlide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + tempMax = indexMax + 4 * adjustedSlide; + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else { + tempSize = motLen + slide; + tempMax = indexMax + 4 * slide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + } + } else if (slide > maxSlide) { + tempSize = motLen + maxSlide; + tempMax = indexMax + 4 * maxSlide; + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for (; index < tempMax; index++) { + tempA[index] = 0.25; + } + tempMax = 4 * slide; + for (index = 0; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + tempMax += bIndexMax; + for (; index < tempMax; index++) { + tempB[index] = bPWM[index - 4 * slide]; + tempC[index] = cPWM[index - 4 * slide]; + } + tempMax = indexMax + 4 * maxSlide; + for (; index < tempMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } else { + tempSize = motLen; + + for (index = 0; index < indexMax; index++) { + tempA[index] = aPWM[index]; + } + for 
(index = 0; index < bIndexMax; index++) { + tempB[index] = bPWM[index]; + tempC[index] = cPWM[index]; + } + for (; index < indexMax; index++) { + tempB[index] = 0.25; + tempC[index] = 0.25; + } + } + fscore = 0.0; + rscore = 0.0; + + for (index = 0; index bestScore)) { + bestScore = rscore; + } else if (fscore > bestScore) { + bestScore = fscore; + } + + free(tempA); + free(tempB); + free(tempC); + } + + free(aPWM); + free(bPWM); + free(cPWM); + + return PyFloat_FromDouble(bestScore); +} + +static char the_func_doc[] = +"returns a list of positions on aSeq that match the PWM within a Threshold, given as a percentage of the optimal consensus score."; + +static char dpwm_func_doc[] = +"returns a list of positions on aSeq that match the DPWM within a given Fold of the optimal consensus score."; + +static char mer_func_doc[] = +"returns a list of positions on aSeq that match an N-mer within M mismatches. Assumes Mers and Seq are in the same case."; + +static char corr_func_doc[] = +"returns a pearson-correlation coefficient based similarity value between -1 (anti-correlated) and +1 (identical) for two motifs."; + +static PyMethodDef module_methods[] = { + {"locateMotif", the_func, METH_VARARGS, the_func_doc}, + {"locateMarkov1", dpwm_func, METH_VARARGS, dpwm_func_doc}, + {"locateMer", mer_func, METH_VARARGS, mer_func_doc}, + {"correlateMotifs", corr_func, METH_VARARGS, corr_func_doc}, + {NULL, NULL} +}; + +PyMODINIT_FUNC +init_motif(void) +{ + Py_InitModule3("_motif", module_methods, module_doc); +} diff --git a/cistematic/core/orthomatcher.py b/cistematic/core/orthomatcher.py new file mode 100644 index 0000000..36d40da --- /dev/null +++ b/cistematic/core/orthomatcher.py @@ -0,0 +1,174 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. 
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +import string +from cistematic.core.homology import homologyDB +from os import environ + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + + +def crossMatchGenes(matchFiles={}, prefix="", matchdir=cisTemp, filterCDS=False, filterLowerCase=False): + """ save to one file per genome the genes that are considred homologs in match files from different species. 
+ """ + theGenomes = matchFiles.keys() + outFileNames = {} + genes = {} + entryDict = {} + ignoreList = {} + for aGenome in theGenomes: + outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome) + genes[aGenome] = [] + entryDict[aGenome] = {} + ignoreList[aGenome] = [] + + for aGenome in theGenomes: + goodLines = [] + ignoreLines = [] + theFile = open(matchFiles[aGenome], "r") + for entry in theFile: + fields = entry.split("\t") + gid = fields[11] + if filterCDS and fields[6] == "CDS": + ignoreLines.append(string.join(fields[:4], "-")) + continue + + if fields[6] == "NONE": + continue + + if filterLowerCase and fields[12] != fields[12].upper(): + continue + + goodLines.append(entry) + + theFile.close() + for entry in goodLines: + fields = entry.split() + gid = fields[11] + geneID = (aGenome, gid) + matchID = string.join(fields[:4], "-") + if matchID in ignoreLines: + continue + + if geneID not in genes[aGenome]: + genes[aGenome].append(geneID) + + if geneID not in entryDict[aGenome]: + entryDict[aGenome][geneID] = [] + entryDict[aGenome][geneID].append(entry) + + print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome) + doCrossMatch(outFileNames, genes, entryDict, ignoreList) + + +def orthoMatcher(matchFiles={}, prefix="", matchdir=".", gidField=1, fileList=False): + """ save to one file per genome the genes that are considred homologs in match files from different species. 
+ """ + theGenomes = matchFiles.keys() + outFileNames = {} + genes = {} + entryDict = {} + ignoreList = {} + for aGenome in theGenomes: + outFileNames[aGenome] = "%s/%s-%s.hom" % (matchdir, prefix, aGenome) + genes[aGenome] = [] + entryDict[aGenome] = {} + ignoreList[aGenome] = [] + + for aGenome in theGenomes: + theList = [] + if not fileList: + theList = [matchFiles[aGenome]] + else: + theList = matchFiles[aGenome] + + for aFile in theList: + theFile = open(aFile, "r") + for entry in theFile: + fields = entry.split() + gid = fields[gidField] + geneID = (aGenome, gid) + if geneID not in genes[aGenome]: + genes[aGenome].append(geneID) + + if geneID not in entryDict[aGenome]: + entryDict[aGenome][geneID] = [] + entryDict[aGenome][geneID].append(entry) + + theFile.close() + + print "Loaded %d %s entries" % (len(genes[aGenome]), aGenome) + doCrossMatch(outFileNames, genes, entryDict, ignoreList) + + +def doCrossMatch(outFileNames, genes, entryDict, ignoreList): + theGenomes = [] + outFile = {} + for genome in outFileNames: + theGenomes.append(genome) + + hdb = homologyDB(cache=True) + for aGenome in theGenomes: + outFile = open(outFileNames[aGenome], "w") + counter = 0 + for geneID in genes[aGenome]: + try: + if geneID in ignoreList[aGenome]: + continue + + res = hdb.getHomologousGenes(geneID) + newcounter = 0 + for entry in res: + if entry[0] in theGenomes: + secondGenome = entry[0] + + if entry in genes[secondGenome]: + print "%d: %s %s" % (counter, str(geneID), str(entry)) + if newcounter < 1: + for rec in entryDict[aGenome][geneID]: + outFile.write("%d\t%s\t%s" % (counter, str(geneID), rec)) + + for rec in entryDict[secondGenome][entry]: + outFile.write("%d\t%s\t%s" % (counter, str(entry), rec)) + + newcounter += 1 + + if secondGenome == aGenome: + ignoreList[aGenome].append(entry) + + if newcounter > 0: + counter += 1 + outFile.write("\n") + except: + print "Problem around %s" % (str(geneID)) + + outFile.close() \ No newline at end of file diff --git 
a/cistematic/core/protein.py b/cistematic/core/protein.py new file mode 100644 index 0000000..e1dc95c --- /dev/null +++ b/cistematic/core/protein.py @@ -0,0 +1,440 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +import string +import copy +from math import log + +AA = {"TTT": "F", + "TTC": "F", + "TTA": "L", + "TTG": "L", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "TAT": "Y", + "TAC": "Y", + "TAA": "*", + "TAG": "*", + "TGT": "C", + "TGC": "C", + "TGA": "*", + "TGG": "W", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CAT": "H", + "CAC": "H", + "CAA": "Q", + "CAG": "Q", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATG": "M", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "AAT": "N", + "AAC": "N", + "AAA": "K", + "AAG": "K", + "AGT": "S", + "AGC": "S", + "AGA": "R", + "AGG": "R", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GAT": "D", + "GAC": "D", + "GAA": "E", + "GAG": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", + "---": "-", + "NNN": "X" +} + + +def getAA(codon): + """ returns one-letter AA symbol corresponding to codon, if existing. + returns X for unknown codon, '-' for a complete gap, and '*' for a + stop codon. 
+ """ + codon = codon.upper() + codon = string.replace(codon, "U", "T") + try: + aa = AA[codon] + except KeyError: + aa = "X" + + return aa + + +def translate(mRNA, frame=1): + """ translate a sequence into protein based on frame (1, 2, 3 only) + """ + prot = "" + if frame < 1: + frame = 1 + elif frame > 3: + frame = 3 + + for nucPos in range(frame - 1, len(mRNA) - 2, 3): + theCodon = mRNA[nucPos:nucPos+3] + print theCodon + theAA = getAA(theCodon) + prot += theAA + + return prot + + +def iCodon(codon): + """ returns the number of possible synonymous changes at + each site of the codon, e.g (1, 0, 3) for CTG (Leucine) + """ + isynonyms = [0, 0, 0] + origAA = getAA(codon) + if origAA == "X" or origAA == "-": + return [-1, -1, -1] + + for pos in range(len(codon)): + isyn = 0 + bases = ["A", "C", "G", "T"] + site = codon[pos] + bases.remove(site) + for nt in bases: + newcodon = [] + for nuc in codon: + newcodon.append(nuc) + + newcodon[pos] = nt + newcodon = string.join(newcodon, "") + newAA = getAA(newcodon) + if newAA == origAA: + isyn += 1 + + isynonyms[pos] = isyn + + return isynonyms + + +def calcMutationTypes(codonList1, codonList2, verbose=False): + """ returns the number of (synonymous, nonsynonymous) + mutations in informative codons. 
+ """ + synonymous = 0.0 + nonsynonymous = 0.0 + for pos in range(len(codonList1)): + diffList = [] + codon1 = codonList1[pos] + codon2 = codonList2[pos] + for isite in range(len(codon1)): + isite1 = codon1[isite] + isite2 = codon2[isite] + if isite1 != isite2: + diffList.append(isite) + + aa1 = getAA(codon1) + aa2 = getAA(codon2) + if aa1 == aa2: + if aa1 == "S": + if codon1[0] != codon2[0]: + if verbose: + print "nonsynonymous Serines" + + nonsynonymous += 1 + + synonymous += 1 + else: + synonymous += len(diffList) + + else: + if len(diffList) == 1: + nonsynonymous += 1 + else: + if verbose: + print "parsimonious mutation path estimator - assuming 1 parameter" + print "diffList = %s\t%s (%s) \t%s (%s)" % (diffList, codon1, aa1, codon2, aa2) + + if 1 in diffList: + if verbose: + print "middle site is nonsynonymous" + + nonsynonymous += 1 + diffList.remove(1) + + if 2 in diffList: + codon3 = codon1[:-1] + codon1[2] + aa3 = getAA(codon3) + if aa3 == aa1: + if verbose: + print "last site is synonymous" + + synonymous += 1 + diffList.remove(2) + + if 0 in diffList: + codon3 = codon2[0] + codon1[1:] + aa3 = getAA(codon3) + if aa3 == aa1: + if verbose: + print "first site is synonymous" + + synonymous += 1 + diffList.remove(0) + + if len(diffList) > 0: + if verbose: + print "%s must be non-synonymous" % str(diffList) + + nonsynonymous += len(diffList) + + return (synonymous, nonsynonymous) + + +def calcSubstitutionSites(codonList): + """ returns the number of (synonymous, nonsynonymous) sites + in a list of codons, which should be filtered for gaps and X's. 
+ """ + synonymous = 0.0 + nonsynonymous = 0.0 + for codon in codonList: + (site0, site1, site2) = iCodon(codon) + if site0 < 0: + continue + + synonymous += (site0 + site1 + site2) / 3.0 + nonsynonymous += (9.0 - site0 - site1 - site2) / 3.0 + + return (synonymous, nonsynonymous) + + +def calcSubstitutionsPerSite(codonList1, codonList2): + """ returns the number of substitutions per site as + a triplet in comparable or informative sites. + """ + site0 = 0.0 + site1 = 0.0 + site2 = 0.0 + for pos in range(len(codonList1)): + codon1 = codonList1[pos] + codon2 = codonList2[pos] + if codon1[0] != codon2[0]: + site0 +=1 + + if codon1[1] != codon2[1]: + site1 += 1 + + if codon1[2] != codon2[2]: + site2 += 1 + + return (site0, site1, site2) + + +def calcKs(Ms, Ns): + """ returns a Ks calculated using Ms and Ns and adjusted using + Jukes and Cantor's formula. + """ + Ks = -0.75 * log(1 - ((4.0/3.0) * Ms / Ns)) + return Ks + + +def calcKa(Ma, Na): + """ returns a Ka calculated using Ma and Na and adjusted using + Jukes and Cantor's formula. + """ + Ka = -0.75 * log(1 - ((4.0/3.0) * Ma / Na)) + return Ka + + +def printCDSdict(cdsDict, printAA=True): + """ Prints every locus in a cdsDict. Optionally truns off AA translation. + """ + for locus in cdsDict: + printCDSlocus(cdsDict, locus, printAA) + + +def printCDSlocus(cdsDict, locus, printAA=True): + """ Prints a locus in a given cdsDict. Optionally turns off AA translation. 
+ """ + cdsOutLines = [] + aaOutLines = [] + cdsOutLine = locus + "\t" + aaOutLine = " " * len(locus) + "\t" + for pos in range(len(cdsDict[locus])): + if len(cdsOutLine) > 69: + cdsOutLines.append(cdsOutLine + cdsDict[locus][pos] + "\n") + cdsOutLine = locus + "\t" + aaOutLines.append(aaOutLine + " " + getAA(cdsDict[locus][pos]) + " \n") + aaOutLine = " " * len(locus) + "\t" + else: + cdsOutLine += cdsDict[locus][pos] + " " + aaOutLine += " " + getAA(cdsDict[locus][pos]) + " " + + cdsOutLines.append(cdsOutLine + "\n") + aaOutLines.append(aaOutLine + " \n") + for index in range(len(cdsOutLines)): + print cdsOutLines[index] + if printAA: + print aaOutLines[index] + + +def getComparableSites(cdsDict, loci=[]): + """ given a cdsDict and a list of loci, returns a new cdsDict with positions with + gaps or X codons in any one sequence deleted in all sequences. + """ + newDict = {} + seqArray = [] + locArray = [] + deleteList = [] + index = 0 + for locus in loci: + locArray.append(locus) + seqArray.append(copy.deepcopy(cdsDict[locus])) + + for index in range(len(locArray)): + for pos in range(len(seqArray[index])): + aa = getAA(seqArray[index][pos]) + if aa == "X" or aa == "-": + if pos not in deleteList: + deleteList.append(pos) + + deleteList.sort() + deleteList.reverse() + for pos in deleteList: + for index in range(len(locArray)): + del seqArray[index][pos] + + for index in range(len(locArray)): + newDict[locArray[index]] = seqArray[index] + + return newDict + + +def getInformativeSites(cdsDict, loci=[]): + """ given a cdsDict of comparable sites and a list of loci, returns a new + cdsDict with positions with only codons that differ in one or more + sequences and which are therefore (possibly) informative. 
+ """ + newDict = {} + seqArray = [] + locArray = [] + deleteList = [] + index = 0 + for locus in loci: + locArray.append(locus) + seqArray.append(copy.deepcopy(cdsDict[locus])) + + for pos in range(len(seqArray[0])): + deleteCodon = True + refcodon = seqArray[0][pos] + for index in range(1, len(locArray)): + seqcodon = seqArray[index][pos] + if seqcodon != refcodon: + deleteCodon = False + + if deleteCodon: + deleteList.append(pos) + + deleteList.sort() + deleteList.reverse() + for pos in deleteList: + for index in range(len(locArray)): + del seqArray[index][pos] + + for index in range(len(locArray)): + newDict[locArray[index]] = seqArray[index] + + return newDict + + +def buildCDSdict(cdsFileName): + """ imports a set of *ALIGNED* sequences in a fasta-format file and splits + each sequence into its individual codons. Returns a Dictionary of the + sequences with the fasta ID as the key and the codons in a list. + """ + cdsfile = open(cdsFileName, "r") + cdslines = cdsfile.readlines() + cdsfile.close() + cdsDict = {} + locus = "" + for line in cdslines: + partialCodon = "" + inFrame = True + line = line[:-1] + if line[0] == ">": + fields = line.split(" ") + if len(fields[0]) > 1 and fields[0][0] == ">": + locus = fields[0][1:] + else: + locus = fields[1] + + cdsDict[locus] = [] + else: + for pos in range(0, len(line), 3): + codon = line[pos:pos+3] + if codon == "---": + cdsDict[locus].append("---") + elif "-" in codon and inFrame: + inFrame = False + partialCodon = string.replace(codon, "-", "") + cdsDict[locus].append("---") + elif not inFrame: + partialCodon += string.replace(codon, "-", "") + if len(partialCodon) == 3: + cdsDict[locus].append(partialCodon) + inFrame = True + partialCodon = "" + + if len(partialCodon) > 3: + cdsDict[locus].append(partialCodon[:3]) + partialCodon = partialCodon[3:] + else: + cdsDict[locus].append(codon) + + return cdsDict \ No newline at end of file diff --git a/cistematic/core/setup.py b/cistematic/core/setup.py new file mode 
100644 index 0000000..4f50bac --- /dev/null +++ b/cistematic/core/setup.py @@ -0,0 +1,33 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### + + +from distutils.core import setup, Extension + +setup(name="motif", version="1.0", ext_modules=[Extension("_motif", ["motifextension.c"])]) diff --git a/cistematic/experiments/__init__.py b/cistematic/experiments/__init__.py new file mode 100644 index 0000000..cf421d9 --- /dev/null +++ b/cistematic/experiments/__init__.py @@ -0,0 +1,99 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
###########################################################################
#
try:
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    from sqlite3 import dbapi2 as sqlite

__all__ = ["analyzeMotifs", "experiment", "locate", "phyloFoot", "phyloTest",
           "randomset", "simple", "varyLength", "orthology"]

# (module name, class name) pairs: the experimentType stored in the settings
# table is matched against the first element, and the second element is the
# name of the class looked up in that module.
experimentTypes = [("simple", "Simple"), ("fasta", "Fasta"), ("locate", "Locate"),
                   ("varyLength", "VaryLength"), ("phyloFoot", "PhyloFoot"),
                   ("phyloTest", "PhyloTest"), ("orthology", "Orthology"),
                   ("experiment", "generic")]

webExperimentTypes = [("simple", "Simple"), ("orthology", "Orthology")]


def _findExperimentClass(expType):
    """ returns the class name registered for expType in experimentTypes,
        or "" when expType is not registered. The last matching entry wins.
    """
    expClass = ""
    for (moduleName, className) in experimentTypes:
        if moduleName == expType:
            expClass = className

    return expClass


def _importExperimentClass(expType, expClass):
    """ imports the module named expType and returns its attribute expClass.
        The module is first looked up inside this package and then as a
        top-level module, mirroring what "from <expType> import <expClass>"
        did when it was executed from this module under Python 2.
    """
    try:
        module = __import__("%s.%s" % (__name__, expType), globals(), locals(), [expClass])
    except ImportError:
        module = __import__(expType, globals(), locals(), [expClass])

    return getattr(module, expClass)


def loadExperiment(expID, expFilePath, analysisID="default", conservationID="default", consDB=""):
    """ loads experiment expID from the sqlite database expFilePath.

        The experiment type is read from the settings table, the matching
        class is instantiated with (expID, expFilePath), and then its
        loadAnalysis(analysisID) and loadConservation(conservationID, consDB)
        methods are called; consDB defaults to expFilePath.

        Returns the experiment object, or "" if it could not be loaded
        (a message is printed in that case).
    """
    newExp = ""
    try:
        db = sqlite.connect(expFilePath)
        try:
            sql = db.cursor()
            sql.execute("select data from settings where expID = :expID and settingName = 'experimentType'", {"expID": expID})
            # fetchone() returns None for an unknown expID; the resulting
            # TypeError is reported by the handler below.
            expType = str(sql.fetchone()[0])
            sql.close()
        finally:
            db.close()

        expClass = _findExperimentClass(expType)
        if expClass != "":
            # The previous implementation built source code from these values
            # and exec'ed it, which always passed them as strings.
            newExp = _importExperimentClass(expType, expClass)(str(expID), str(expFilePath))
        else:
            print("Could not find associated class for %s" % expType)
    except Exception:
        print("Could not load experiment %s from database %s " % (expID, expFilePath))

    if newExp == "":
        return newExp

    if consDB == "":
        consDB = expFilePath

    try:
        newExp.loadAnalysis(analysisID)
        print("loaded analysis %s" % analysisID)
        newExp.loadConservation(conservationID, consDB)
        print("loaded conservation %s" % conservationID)
    except Exception:
        # Deliberately best-effort: an experiment that cannot load its
        # analysis or conservation data is still returned to the caller.
        pass

    return newExp


def listExperiments(expFilePath):
    """ returns a list of (expID, experimentType) string tuples for every
        experiment recorded in the settings table of expFilePath. Prints a
        message and returns [] if the database cannot be read.
    """
    result = []
    try:
        db = sqlite.connect(expFilePath)
        try:
            sql = db.cursor()
            sql.execute("select distinct expID, data from settings where settingName = 'experimentType' ")
            data = sql.fetchall()
            sql.close()
        finally:
            db.close()

        for (expID, expType) in data:
            result.append((str(expID), str(expType)))
    except Exception:
        print("Could not list experiments in database %s " % (expFilePath))

    return result

# \ No newline at end of file
# diff --git a/cistematic/experiments/analyzeMotifs.py b/cistematic/experiments/analyzeMotifs.py
# new file mode 100644
# index 0000000..b1791b0
# --- /dev/null
# +++ b/cistematic/experiments/analyzeMotifs.py
# @@ -0,0 +1,1097 @@
###########################################################################
#                                                                         #
# C O P Y R I G H T N O T I C E                                           #
# Copyright (c) 2003-10 by:                                               #
# * California Institute of Technology                                    #
#                                                                         #
# All Rights Reserved.                                                    #
#                                                                         #
# Permission is hereby granted, free of charge, to any person             #
# obtaining a copy of this software and associated documentation files    #
# (the "Software"), to deal in the Software without restriction,          #
# including without limitation the rights to use, copy, modify, merge,    #
# publish, distribute, sublicense, and/or sell copies of the Software,    #
# and to permit persons to whom the Software is furnished to do so,       #
# subject to the following conditions:                                    #
#                                                                         #
# The above copyright notice and this permission notice shall be          #
# included in all copies or substantial portions of the Software.         #
#                                                                         #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
# SOFTWARE.
# +########################################################################### +# +# basic analysis code +from cistematic.core.motif import matrixRow +import cistematic.core +import string + +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +knownMotifsDB = "%s/db/known_motifs.db" % cisRoot + + +class AnalyzeMotifs: + motifSize = {} + coappear = {} + motifToAnnotations = {} + annotationEntries = [] + genomeMatches = {} + loadedGenomes = [] + dbName = "" + analysisID = "default" + + + def initializeKnownMotifs(self, source=""): + """ loads known motifs from motif database, optionally limiting by source. + """ + self.annotationEntries = [] + dbkm = sqlite.connect(knownMotifsDB, timeout=60) + sqlkm = dbkm.cursor() + if len(source) > 0: + sqlkm.execute("select * from motifs where source = '%s' " % source) + else: + sqlkm.execute("select * from motifs") + + dbEntries = sqlkm.fetchall() + dbkm.close() + + for entry in dbEntries: + (index, source, name, theseq, reference, other_info) = entry + self.annotationEntries.append((index, source, name, theseq, reference, other_info)) + + + def findAnnotations(self, motID, thresh=-1, numberN=0): + mot = self.findMotif(motID) + if thresh < 0: + thresh = self.threshold + + for entry in self.annotationEntries: + (id, source, name, theseq, reference, other) = entry + loc = mot.locateMotif(theseq, thresh, numberN) + if len(loc) > 0 : + self.motifToAnnotations[motID].append((source, name, theseq, reference, other, loc)) + + if len(self.motifToAnnotations[motID]) >= 500: + break + + + def findAnnotationsRE(self, motID, numMismatches=0): + """ Calls the motif's consensus locator on each entry in annotationEntries. 
+ """ + mot = self.findMotif(motID) + if numMismatches > 0: + mot.initializeMismatchRE(numMismatches) + else: + mot.initializeRE() + + for entry in self.annotationEntries: + (id, source, name, annotSeq, reference, other) = entry + # Must use non-RE optimized version for short annotations + if len(annotSeq) < len(mot): + pad = "N" * (len(mot) - len(annotSeq)) + annotSeq = pad + annotSeq + pad + loc = mot.locateConsensus(annotSeq) + else: + loc = mot.locateConsensusRE(annotSeq) + + if len(loc) > 0 : + self.motifToAnnotations[motID].append((source, name, annotSeq, reference, other, loc)) + + if len(self.motifToAnnotations[motID]) >= 500: + break + + + def printAnnotations(self): + """ prints the motif annotations for every motif result. + """ + for mot in self.getResults(): + annotations = self.motifToAnnotations[mot.tagID] + print "motif %s\t%s" % (mot.tagID, mot.buildConsensus()) + for annotation in annotations: + (source, name, annotSeq, reference, other, loc) = annotation + print "%s:%s\t%s\t%s\t%s" % (source, name, reference, annotSeq, str(loc)) + print other + print + + + def printAnnotationsShort(self): + """ prints a compressed version of the annotations. 
+ """ + for motID in self.motifToAnnotations.keys(): + for annotation in self.motifToAnnotations[motID]: + print "%s: %s (%s)\t%s - %s" % (annotation[0], annotation[1], annotation[4], annotation[2], annotation[5]) + + + def returnAnnotations(self, motID): + """ returns the [annotations] for a particular motID + """ + try: + return self.motifToAnnotations[motID] + except KeyError: + return [] + + + def annotateMotifs(self, thresh=0.0, numberN=0): + self.mlog( "Annotating Motifs with threshold %d and %d extra Ns" % (1.0 + thresh, numberN)) + if len(self.annotationEntries) == 0: + self.initializeKnownMotifs() + + for mot in self.getResults(): + mot.setThreshold(thresh) + self.motifToAnnotations[mot.tagID] = [] + self.findAnnotations(mot.tagID, thresh, numberN) + + + def annotateConsensus(self, numMismatches=0, source=""): + self.mlog( "Annotating Consensus") + if len(self.annotationEntries) == 0: + self.initializeKnownMotifs(source) + + for mot in self.getResults(): + self.motifToAnnotations[mot.tagID] = [] + self.findAnnotationsRE(mot.tagID, numMismatches) + + + def printConsensus(self): + """ print the consensus for every motif result. + """ + for mot in self.getResults(): + print "motif %s\t%s" % (mot.tagID, mot.buildConsensus()) + + + def formatPWM(self, aPWM): + """ format a PWM into a printable string. + """ + aRow = "" + cRow = "" + gRow = "" + tRow = "" + for col in aPWM: + aRow = string.join([aRow, str(round(col[matrixRow["A"]],4))], "\t") + cRow = string.join([cRow, str(round(col[matrixRow["C"]],4))], "\t") + gRow = string.join([gRow, str(round(col[matrixRow["G"]],4))], "\t") + tRow = string.join([tRow, str(round(col[matrixRow["T"]],4))], "\t") + + formattedPWM = "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow) + + return formattedPWM + + + def appendGeneToMotif(self,mTagID, geneID, pos): + """ make an entry in the geneToMotif table. 
+ """ + (genome, locus) = geneID + (loc, sense) = pos + stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense) + res = self.sql(stmt, "commit") + + + def appendGeneToMotifBatch(self, batch): + """ make a batch of entries in the geneToMotif table. + """ + batchInserts = [] + stmt = "INSERT into geneToMotif(ID, expID, analysisID, mTagID, genome, locus, location, sense) values (NULL, ?, ?, ?, ?, ?, ?, ?)" + for entry in batch: + (mTagID, geneID, pos) = entry + (genome, locus) = geneID + (loc, sense) = pos + batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, locus, loc, sense)) + + res = self.batchsql(stmt, batchInserts) + + + def geneToMotifKeys(self, thekey="geneID"): + """ return the keys to the geneToMotif table. The default key is geneID, otherwise returns mTagID. + """ + results = [] + if thekey == "geneID": + stmt = "SELECT distinct genome, locus from geneToMotif where expID = '%s' and analysisID = '%s' order by locus" % (self.experimentID, self.analysisID) + else: + stmt = "SELECT distinct mTagID from geneToMotif where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID) + + res = self.sql(stmt) + for entry in res: + if thekey == "geneID": + (genome, locus) = entry + results.append((str(genome), str(locus))) + else: + mTagID = entry[0] + results.append(str(mTagID)) + + return results + + + def appendMotifNeighbors(self,mTagID, match, condition="", geneEntry=""): + """ make an entry in the motifNeighbors table. 
+ """ + (chromo, pos) = match + (genome, chromNum) = chromo + (loc, mSense) = pos + if geneEntry != "": + (start, stop, sense, geneID) = geneEntry + (genome2, locus) = geneID + else: + start = "-" + stop = "-" + sense = "-" + locus = "NO GENE IN RADIUS" + + stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, '%s','%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition) + res = self.sql(stmt, "commit") + + + def appendMotifNeighborsBatch(self, batch): + """ make a batch of entries in the motifNeighbors table. + """ + batchInserts = [] + stmt = "INSERT into motifNeighbors(ID, expID, analysisID, mTagID, genome, chromNum, location, motifSense, start, stop, sense, locus, condition) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + for entry in batch: + (mTagID, match, condition, geneEntry) = entry + (chromo, pos) = match + (genome, chromNum) = chromo + (loc, mSense) = pos + if geneEntry != "": + (start, stop, sense, geneID) = geneEntry + (genome2, locus) = geneID + else: + start = "-" + stop = "-" + sense = "-" + locus = "NO GENE IN RADIUS" + + batchInserts.append((self.experimentID, self.analysisID, mTagID, genome, chromNum, loc, mSense, start, stop, sense, locus, condition)) + + res = self.batchsql(stmt, batchInserts) + + + def motifNeighborsKeys(self, condition=""): + """ returns a [list of motif ID's] in the motifNeighbor table. Can restrict to a particular condition. 
+ """ + results = [] + if condition != "": + stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' and condition ='%s' order by mTagID" % (self.experimentID, self.analysisID, condition) + else: + stmt = "SELECT distinct mTagID from motifNeighbors where expID = '%s' and analysisID = '%s' order by mTagID" % (self.experimentID, self.analysisID) + + res = self.sql(stmt) + for entry in res: + mTagID = entry[0] + results.append(mTagID) + + return results + + + def motifNeighbors(self,mTagID, condition="", discardNullMatches=False): + """ get entries in the geneToMotif table that match a particular Motif & condition. + """ + results = [] + genome = "" + chromNum = "" + loc = "" + mSense = "" + start = "" + stop = "" + sense = "" + locus = "" + stmt = "select distinct genome, chromNum, location, motifSense, start, stop, sense, locus, condition from motifNeighbors where expID='%s' and analysisID = '%s' and mTagID='%s' " % (self.experimentID, self.analysisID, mTagID) + if discardNullMatches: + stmt += " and condition <> 'NONE' " + + if condition != '': + stmt += " and condition = '%s' " % (condition) + + stmt += " order by ID desc " + res = self.sql(stmt) + + for entry in res: + (genome, chromNum, loc, mSense, start, stop, sense, locus, condition) = entry + match = ((genome, chromNum),(int(loc), mSense)) + geneEntry = (start, stop, sense, (genome, locus)) + results.append((match, geneEntry, condition)) + + return results + + + def motifToGene(self, mTagID): + """ returns a list of matching geneIDs with location and sense found for a given motif. 
+ """ + results = [] + stmt = "SELECT distinct genome, locus, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and mTagID = '%s' order by locus" % (self.experimentID, self.analysisID, mTagID) + res = self.sql(stmt) + for entry in res: + (genome, locus, location, sense) = entry + results.append(((genome, locus), (int(location), sense))) + + return results + + + def geneToMotif(self, geneID): + """ returns a list of matching motifs with location and sense found for a given geneID + """ + results = [] + (genome, locus) = geneID + stmt = "SELECT distinct mTagID, location, sense from geneToMotif where expID = '%s' and analysisID = '%s' and genome = '%s' and locus = '%s' order by location" % (self.experimentID, self.analysisID, genome, locus) + res = self.sql(stmt) + for entry in res: + (mTagID, location, sense) = entry + results.append((mTagID, (location, sense))) + + return results + + + def mapMotifs(self, Threshold=90.0, numberN=0, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, verbose=True): + """ find occurences of result motifs in the current genepool using PWMs. Slow. 
+ """ + currentGenome = "" + currentDataset = -1 + genepoolKeys = self.genepool.keys() + posResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + runID = mTagID.split("-")[0] + (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID)) + if enforceSanity and not mot.isSane(): + self.mlog("mapMotifs: not mapping %s - failed sanity check" % (mTagID)) + continue + + if mot.tagID in ignoreList: + self.mlog("mapMotifs: not mapping %s" % mot.tagID) + continue + + if len(runList) > 0: + if int(runID) not in runList: + self.mlog("mapMotifs: not mapping run %s" % runID) + continue + + if mapAllSeqs: + dataID = 0 + + self.mlog("mapMotifs: mapping %s with threshold %f and at most %d N" % (mTagID, Threshold, numberN)) + if dataID <> currentDataset: + currentDataset = dataID + for geneID in self.getSetting(dataID): + if geneID[0] <> currentGenome: + currentGenome = geneID[0] + + try: + if geneID not in genepoolKeys: + self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstreamself.geneDB, False, self.boundToNextGene) + genepoolKeys.append[geneID] + except: + self.mlog("mapMotifs: could not load %s" % (str(geneID))) + break + + for geneID in genepoolKeys: + if verbose: + print "mapMotifs: %s\n" % str(geneID) + + motifPos = mot.locateMotif(self.genepool[geneID], Threshold, numberN) + + if len(motifPos) > 0: + for pos in motifPos: + posResults.append((mTagID, geneID, pos)) + + self.appendGeneToMotifBatch(posResults) + + + def mapConsensus(self, runList=[], ignoreList=[], enforceSanity=True, mapAllSeqs=True, numMismatches=0): + """ find occurences of result motifs in the current genepool using regular expressions, allowing + for a number of mismatches. 
+ """ + currentGenome = "" + currentDataset = -1 + genepoolKeys = self.genepool.keys() + posResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + runID = mTagID.split("-")[0] + (pName, dataID, setID, tStamp, motArray) = self.getRun(int(runID)) + if mot.tagID in ignoreList: + self.mlog("mapConsensus: not mapping %s" % mot.tagID) + continue + + if len(runList) > 0: + if int(runID) not in runList: + self.mlog("mapConsensus: not mapping run %s" % runID) + continue + + if enforceSanity and not mot.isSane(): + self.mlog("mapConsensus: not mapping %s - failed sanity check" % (mTagID)) + continue + + if mapAllSeqs: + dataID = 0 + + if numMismatches > 0: + mot.initializeMismatchRE(numMismatches) + else: + mot.initializeRE() + self.mlog("mapConsensus: mapping %s with %s mismatches" % (mTagID, numMismatches)) + if dataID <> currentDataset: + currentDataset = dataID + for geneID in self.getSetting(dataID): + if geneID[0] <> currentGenome: + currentGenome = geneID[0] + + try: + if geneID not in genepoolKeys: + self.genepool[geneID] = cistematic.core.retrieveSeq(geneID, self.upstream, self.cds, self.downstream) + genepoolKeys.append[geneID] + except: + self.mlog("mapConsensus: could not load %s" % (str(geneID))) + break + + for geneID in genepoolKeys: + motifPos = mot.locateConsensusRE(self.genepool[geneID]) + + if len(motifPos) > 0: + for pos in motifPos: + posResults.append((mTagID, geneID, pos)) + + self.appendGeneToMotifBatch(posResults) + + + def mapFeatures(self, radius): + """ returns features within a certain radius in bp of all matches. 
+ """ + FeatureList = [] + for mot in self.getResults(): + mTagID = mot.tagID + self.mlog("mapFeatures: mapping %s using a radius of %d bp" % (mTagID, radius)) + for match in self.motifToGene(mTagID): + matchFeatures = cistematic.core.retrieveFeatures(match, radius) + for entry in matchFeatures: + FeatureList.append((mTagID, match, entry)) + + return FeatureList + + + def limitGeneEntries(self, geneList, lowerBound, upperBound): + results = [] + for entry in geneList: + if entry[1] < lowerBound or entry[0] > upperBound: + continue + + results.append(entry) + + return results + + + def mapNeighbors(self, radius, annotate=True): + """ returns genes on a chromosome within a certain radius in bp. + """ + localGeneList = {} + prevChromo = "" + neighborResults = [] + for mot in self.getResults(): + mTagID = mot.tagID + motLen = len(mot) + motRadius = motLen / 2 + mtgList = self.motifToGene(mTagID) + self.mlog("mapNeighbors: mapping %s using a radius of %d bp (%d instances)" % (mTagID, radius, len(mtgList))) + index = 0 + for match in mtgList: + (chromo, hit) = match + if annotate and chromo != prevChromo: + prevChromo = chromo + localGeneList = cistematic.core.getChromoGeneEntries(chromo) + + index += 1 + if (index % 1000) == 0: + print "." 
+ + matchCounter = 0 + matchCDS = [] + if annotate: + (chromo, hit) = match + matchpos = int(hit[0]) + lowerBound = matchpos - radius + upperBound = matchpos + radius + match2 = (chromo, (matchpos + motRadius, hit[1])) + matchFeatures = cistematic.core.retrieveFeatures(match2, motRadius, "CDS") + for entry in matchFeatures: + matchCDS.append((chromo[0], entry[0])) + + geneEntriesList = self.limitGeneEntries(localGeneList, lowerBound, upperBound) + for geneEntry in geneEntriesList: + beg = int(geneEntry[0]) + end = int(geneEntry[1]) + sense = geneEntry[2] + gID = geneEntry[3] + if gID in matchCDS: # matching within the coding sequence + neighborResults.append((mTagID, match, "CDS", geneEntry)) + matchCounter += 1 + elif matchpos >= beg and matchpos <= end: # not in CDS, but in gene + neighborResults.append((mTagID, match, "GENE", geneEntry)) + matchCounter += 1 + elif matchpos < beg: + if sense == "F": + neighborResults.append((mTagID, match, "UPSTREAM", geneEntry)) + else: + neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry)) + + matchCounter += 1 + else: + if sense == "F": + neighborResults.append((mTagID, match, "DOWNSTREAM", geneEntry)) + else: + neighborResults.append((mTagID,match, "UPSTREAM", geneEntry)) + + matchCounter += 1 + + if matchCounter < 1: + neighborResults.append((mTagID, match, "NONE", "")) + + self.appendMotifNeighborsBatch(neighborResults) + + + def printMotifToGene(self, motifList=[], runList=[]): + motKeys = self.geneToMotifKeys(thekey="mTagID") + if len(motifList) == 0 and len(runList) == 0: + for tag in motKeys: + print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag))) + else: + for tag in motKeys: + runID = tag.split("-")[0] + if len(runList) > 0: + if runID not in runList: + continue + + if tag in motifList: + print "Motif %s is found in: %s" % (tag, str(self.motifToGene(tag))) + + + def motifToGeneStat(self): + tags = self.geneToMotifKeys(thekey="mTagID") + counter = 0 + min = len(self.motifToGene(tags[0])) + 
max = 0 + for tag in tags: + numGenes = len(self.motifToGene(tag)) + print "%s: %d" % (tag, numGenes) + if numGenes > max: + max = numGenes + elif numGenes < min: + min = numGenes + + counter += numGenes + + print "for a total of %d matches - min: %d max: %d" % (counter, min, max) + + + def printGeneToMotif(self, geneList = []): + if len(geneList) == 0: + for geneID in self.geneToMotifKeys(): + print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID))) + else: + for geneID in self.geneToMotifKeys(): + if geneID in geneList: + print "Gene %s has the following motifs: %s" % (str(geneID), str(self.geneToMotif(geneID))) + + + def geneToMotifStat(self): + genes = self.geneToMotifKeys() + counter = 0 + min = len(self.geneToMotif(genes[0])) + max = 0 + for gene in genes: + numMotifs = len(self.geneToMotif(gene)) + print "%s - %s: %d" % (gene[0], gene[1], numMotifs) + if numMotifs > max: + max = numMotifs + elif numMotifs < min: + min = numMotifs + + counter += numMotifs + + print "for a total of %d matches - min: %d max: %d" % (counter, min, max) + + + def printGeneProfile(self, geneID): + print "\nMotif matches for %s " % (str(geneID)) + geneProfile = self.makeGeneProfile(geneID) + positions = geneProfile.keys() + if len(positions) > 0: + positions.sort() + for pos in positions: + print "%s %s" % (str(pos), str(geneProfile[pos])) + else: + print "Gene had no matching motifs" + + + def makeGeneProfile(self, geneID): + geneBucket = {} + geneMotifs = self.geneToMotif(geneID) + if len(geneMotifs) > 0: + for mot in geneMotifs: + (motID, pos) = mot + (loc, sense) = pos + if int(loc) not in geneBucket.keys(): + geneBucket[int(loc)] = [] + + geneBucket[int(loc)].append(mot) + + return geneBucket + + + def getMotifMatches(self, motifIDs=[]): + results = {} + if len(motifIDs) < 1: + motifIDs = self.geneToMotifKeys(thekey="mTagID") + + for motID in motifIDs: + results[motID] = [] + mot = self.findMotif(motID) + motLength = len(mot) + for match in 
self.motifToGene(motID): + (chrom, hit) = match + (pos, sense) = hit + if sense == "F": + results[motID].append(self.genepool[chrom][pos:pos + motLength]) + else: + results[motID].append(cistematic.core.complement(self.genepool[chrom][pos:pos + motLength], motLength)) + + return results + + + def summarizeMotifs(self, fileName): + """ saves the number of occurences and actual occurence PWMs of motifs to fileName. + """ + motDict = {} + motText = [] + try: + if len(fileName) < 1: + raise IOError + + outFile = open(fileName, "a") + self.mlog("Saving motif summary") + for motID in self.geneToMotifKeys(thekey="mTagID"): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + motDict[motID] = [] + for index in range(motLength): + motDict[motID].append([0.0, 0.0, 0.0, 0.0]) + + for match in self.motifToGene(motID): + matchNum += 1 + (chromo, hit) = match + (pos, sense) = hit + if sense == "F": + matchSeq = self.genepool[chromo][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[chromo][pos: pos + motLength], motLength) + + for index in range(motLength): + NT = matchSeq[index] + NT = NT.upper() + if NT in ["A", "C", "G", "T"]: + motDict[motID][index][matrixRow[NT]] += 1 + + motLine = "motif %s\t %s matches\t'%s'\n %s\n" % (motID, str(matchNum), mot.buildConsensus(), self.formatPWM(motDict[motID])) + motText.append(motLine) + + outFile.writelines(motText) + outFile.close() + except: + self.mlog("Could not save motif summary to file %s\n" % fileName) + + + def printMotifNeighbors(self): + for motID in self.motifNeighborsKeys(): + print "Matches for motif %s" % motID + currentHit = () + for entry in self.motifNeighbors(motID): + if entry[0] != currentHit: + print "=====================" + print "Match %s" % str(entry[0]) + currentHit = entry[0] + + print "\tGene %s:" % str(entry[1]) + try: + goInfo = cistematic.core.getGOInfo(entry[1][3]) + for entry in goInfo: + print "\t\t%s" % str(entry) + except: + pass + + print 
"----------------" + + + def summarizeMotifNeighbors(self, fileName): + """ saves the number of occurences and PWMs of motifs within the gene + neighborhood radius as mapped using mapNeighbors() to fileName. + """ + motDict = {} + motText = [] + try: + if len(fileName) < 1: + raise IOError + outFile = open(fileName, "a") + self.mlog("Saving neighbor summary") + for motID in self.motifNeighborsKeys(): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + motDict[motID] = [] + for index in range(motLength): + motDict[motID].append([0.0, 0.0, 0.0, 0.0]) + + currentMatch = "" + for entry in self.motifNeighbors(motID, discardNullMatches = True): + if currentMatch != entry[0]: + currentMatch = entry[0] + (genechrom, loc) = entry[0] + (pos, sense) = loc + geneID = entry[1][3] + (geno, gene) = geneID + if gene != "NO GENE IN RADIUS": + matchNum += 1 + if sense == "F": + matchSeq = self.genepool[genechrom][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength) + + for index in range(motLength): + NT = matchSeq[index] + NT = NT.upper() + if NT in ["A", "C", "G", "T"]: + motDict[motID][index][matrixRow[NT]] += 1 + + motLine = "motif %s\t %s matches\n %s\n" % (motID, str(matchNum), self.formatPWM(motDict[motID])) + motText.append(motLine) + + outFile.writelines(motText) + outFile.close() + except: + self.mlog("Could not save motif neighbors to file %s\n" % fileName) + + + def saveMotifNeighbors(self, fileName, fullAnnotations=True): + """ save every occurence of the motifs with any adjoining gene(s) to fileName. + Records annotations and GO terms if available when fullAnnotations=True. 
+ """ + goDict = {} + annotDict = {} + currentGenome = "" + neighborList = [] + self.mlog("Saving motif neighbors to file %s\n" % fileName) + if True: + if len(fileName) < 1: + raise IOError + outFile = open(fileName, "a") + for motID in self.motifNeighborsKeys(): + matchNum = 0 + mot = self.findMotif(motID) + motLength = len(mot) + currentMatch = "" + for entry in self.motifNeighbors(motID): + neighborList = [] + if currentMatch != entry[0]: + matchNum += 1 + currentMatch = entry[0] + (genechrom, loc) = entry[0] + (genome, chromo) = genechrom + (pos, sense) = loc + if fullAnnotations and genome != currentGenome: + currentGenome = genome + goDict = cistematic.core.getAllGOInfo(genome) + annotDict = cistematic.core.getAllAnnotInfo(genome) + + start = entry[1][0] + stop = entry[1][1] + geneSense = entry[1][2] + geneID = entry[1][3] + (geno, gene) = geneID + condition = entry[2] + if sense == "F": + matchSeq = self.genepool[genechrom][pos:pos + motLength] + else: + matchSeq = cistematic.core.complement(self.genepool[genechrom][pos: pos + motLength], motLength) + + currentEntry = "%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (motID, matchNum, genome, chromo, pos, sense, condition, start, stop, geneSense, geno, gene, matchSeq) + if fullAnnotations: + try: + goInfo = goDict.get(geneID, []) + annotInfo = annotDict.get(geneID,[]) + annotDescription = "" + prevGoInfo = "" + if self.debugSQL: + print "%s\t%s\t%s" % (str(geneID), str(goInfo), str(annotInfo)) + for annot in annotInfo: + annotDescription += annot + " " + if len(goInfo) > 0: + for entry in goInfo: + if entry == prevGoInfo: + continue + neighborList.append("%s\t%s\t%s\n" % (currentEntry, annotDescription, entry)) + prevGoInfo = entry + else: + neighborList.append("%s\t%s\n" % (currentEntry, annotDescription)) + except: + neighborList.append("%s\n" % currentEntry) + else: + neighborList.append("%s\n" % currentEntry) + + outFile.writelines(neighborList) + + outFile.close() + else: + self.mlog("Could not 
save motif neighbors to file %s\n" % fileName) + + return neighborList + + + def buildMotifSize(self): + mots = self.getResults() + index = 0 + for mot in mots: + motLength = len(mot) + if motLength not in self.motifSize.keys(): + self.motifSize[motLength] =[] + + self.motifSize[motLength].append((index, mot.buildConsensus())) + index += 1 + + + def findIdenticalMotifs(self): + # FIXME!!!! + identicalMotifs = [] + + for motLength in self.motifSize.keys(): + motList = self.motifSize[motLength] + motListLen = len(motList) + print "doing motif length %d" % motLength + for pos in range(motListLen): + index = pos + 1 + while index < motListLen: + if motList[pos][1] == motList[index][1]: + identicalMotifs.append((self.results[pos].tagID, self.results[index].tagID)) + + index += 1 + + return identicalMotifs + + + def createAnalysis(self, dbName=""): + """ creates the analysis SQL tables. Should only be run once per database file. + """ + if len(dbName) == 0: + dbName = self.expFile + + db = sqlite.connect(dbName, timeout=60) + try: + sql = db.cursor() + sql.execute("CREATE table geneToMotif(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar)") + sql.execute("CREATE table genomeMatches(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, locus varchar, location varchar, sense varchar, threshold varchar)") + sql.execute("CREATE table motifNeighbors(ID INTEGER PRIMARY KEY, expID varchar, analysisID varchar, mTagID varchar, genome varchar, chromNum varchar, location varchar, motifSense varchar, start varchar, stop varchar, sense varchar, locus varchar, condition varchar)") + sql.execute("CREATE index mot1 on geneToMotif(expID, analysisID, mTagID)") + sql.execute("CREATE index mot2 on genomeMatches(expID, analysisID, mTagID)") + sql.execute("CREATE index mot3 on motifNeighbors(expID, analysisID, mTagID)") + sql.execute("CREATE index locus1 on 
geneToMotif(locus)") + sql.execute("CREATE index locus2 on motifNeighbors(locus)") + sql.execute("CREATE index condition1 on motifNeighbors(condition)") + db.commit() + sql.close() + db.close() + self.mlog("Created analysis tables in database %s" % dbName) + except: + db.close() + self.mlog("Could not create tables in database %s" % dbName) + self.mlog("WARNING: perhaps you have called createAnalysis twice on the same database?") + + + def loadAnalysis(self, analysisID="default", dbName=""): + """ changes the analysisID to use the specified one, or use default. Must be used before reading or + writing any data to analysis tables. + """ + self.analysisID = analysisID + if len (dbName) > 0: + self.dbName = dbName + else: + self.dbName = self.expFile + + + def resetAnalysis(self): + """ currently zeros out some of the analysis structures. obsolete ? + """ + self.mlog("resetting analysis") + self.motifSize = {} + self.motifToAnnotations = {} + self.coappear = {} + + + def saveAnalysis(self): + """ currently obsolete - kept for backward compability. + """ + try: + self.mlog("saved analysis %s" % self.analysisID) + except: + self.mlog("could not save %s" % self.analysisID) + + + def deleteAnalysis(self, analysisID="default"): + """ delete all of the analysis either named aName, or matching + an experiment (the current one by default.) Currently a NO-OP. Obsolete. + """ + pass + + + def buildCoappear(self, motifList=[], distance="100000", strict = False): + """ Builds coappear dictionary of geneIDs where motifs occur with others. Can limit to + motifs in motifList (default is all motifs), and by distance (default is 100,000 base pairs.) + Results are accessed through printCoappear() and saveCoappear(). 
+ """ + occurenceList = {} + self.coappear = {} + processedMotifs = [] + motifListLen = len(motifList) + if motifListLen == 0: + #use all motifs + motifList = self.geneToMotifKeys(thekey="mTagID") + motifListLen = len(motifList) + + for motif in motifList: + if motif not in processedMotifs: + matchList = self.motifToGene(motif) + for match in matchList: + (geneID, loc) = match + if geneID not in occurenceList: + occurenceList[geneID] = [] + + (location, sense) = loc + occurenceList[geneID].append((location, sense, motif)) + + processedMotifs.append(motif) + + for geneID in occurenceList: + occurenceList[geneID].sort() + if geneID not in self.coappear: + self.coappear[geneID] = [] + + coappearing = False + differentMotifs = False + coappearList = [] + prevOccurence = occurenceList[geneID][0] + del occurenceList[geneID][0] + for occurence in occurenceList[geneID]: + if occurence[0] < prevOccurence[0] + distance: + coappearing = True + if occurence[2] != prevOccurence[2]: + differentMotifs = True + + coappearList.append(prevOccurence) + elif coappearing: + if strict: + if differentMotifs: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + else: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + + coappearing = False + differentMotifs = False + coappearList = [] + + prevOccurence = occurence + + if coappearing: + if strict: + if differentMotifs: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + else: + coappearList.append(prevOccurence) + self.coappear[geneID].append(coappearList) + + + def printCoappear(self): + """ prints a formatted version of the coappear dictionary built with buildCoappear() + """ + for geneID in self.coappear: + print " ===== %s =====" % str(geneID) + for occurence in self.coappear[geneID]: + print str(occurence) + print " =============" + + + def saveCoappear(self, fileName): + """ save coappear dictionary in tabular format. 
def sql(self, stmt, commit=""):
    """ executes a SQL statement against self.dbName and does a commit if commit
        is not-null. returns any results as a list of rows.

        Errors raised by the statement propagate to the caller; the cursor and
        the connection are closed in every case so a failing statement does not
        leave the database file open.
    """
    db = sqlite.connect(self.dbName, timeout=60)
    try:
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print("sql: %s" % stmt)

            sqlc.execute(stmt)
            res = sqlc.fetchall()
            if commit != "":
                db.commit()
        finally:
            sqlc.close()
    finally:
        db.close()

    return res


def batchsql(self, stmt, batch):
    """ executes stmt once for every parameter tuple in the list batch (usually
        inserts) with a single commit. Always returns an empty list.

        Errors propagate to the caller; the cursor and the connection are closed
        in every case.
    """
    res = []
    db = sqlite.connect(self.dbName, timeout=60)
    try:
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print("batchsql: %s" % stmt)
                print("batchsql: %s" % str(batch))

            sqlc.executemany(stmt, batch)
            db.commit()
        finally:
            sqlc.close()
    finally:
        db.close()

    return res
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# This class contains the core code for using orthology and sequence-level conservation. +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import string, cistematic.core +from cistematic.core.homology import homologyDB, homologeneGenomes +from cistematic.programs.mafft import Mafft +from cistematic.programs.paircomp import Paircomp +from cistematic.programs.fastcomp import Fastcomp + + +class Conservation: + """ The conservation class holds the conservation specific code for + specifying orthology/paralogy, calling conservation identification code, + and manipulating/storing conserved sequences. It is meant to be used + in conjuction with the Experiment and AnalyzeMotifs classes. 
def importHomology(self, inputFile, source="generic"):
    """ loads homology relationships from a tab-delimited file of the form:
        homologyGroup orthologyGroup genome locus
        where orthologyGroup can be left blank. Blank lines are skipped.
    """
    stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
    stmtList = []
    importFile = open(inputFile, "r")
    try:
        for line in importFile:
            # drop the line terminator so it does not end up inside locus
            line = line.rstrip("\r\n")
            if len(line) == 0:
                continue

            (homGroup, orthGroup, genome, locus) = line.split("\t")
            stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))
    finally:
        importFile.close()

    # the batch helper is spelt batchsqlCons (capital C)
    self.batchsqlCons(stmt, stmtList)


def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
    """ define the geneIDs in geneIDList as being homologous.

        When homGroup is empty, the first "genome-locus" name built from
        geneIDList that self.hasHomGroup() reports as unused becomes the
        group name. Returns the homology group used, or "" (after logging)
        when no free name could be found.
    """
    if homGroup == "":
        for tempID in geneIDList:
            tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
            if not self.hasHomGroup(tempHomGroup):
                homGroup = tempHomGroup
                break

    if homGroup == "":
        self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
        return ""

    stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
    stmtList = []
    for geneID in geneIDList:
        stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))

    self.batchsqlCons(stmt, stmtList)
    return homGroup


def deleteHomolog(self, geneID, homGroup=""):
    """ delete all entries from the homology table for a given geneID and homGroup
        within the current conservationID.
    """
    # the column is conservationID; the former spelling "conservartion ID" was
    # invalid SQL, so the statement failed inside sqlcons and deleted nothing
    stmt = "DELETE from homology where conservationID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
    self.sqlcons(stmt, "commit")


def deleteOrtholog(self, geneID, orthGroup):
    """ delete all entries from the homology table for a given geneID and orthGroup
        within the current conservationID.
    """
    stmt = "DELETE from homology where conservationID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
    self.sqlcons(stmt, "commit")


def deleteHomologyGroup(self, homGroup):
    """ delete all entries from the homology table matching the given homology group
        within the current conservationID.
    """
    stmt = "DELETE from homology where conservationID = '%s' and homologyGroup = '%s' " % (self.conservationID, homGroup)
    self.sqlcons(stmt, "commit")


def deleteOrthologyGroup(self, orthGroup):
    """ delete all entries from the homology table matching the given orthology group
        within the current conservationID.
    """
    stmt = "DELETE from homology where conservationID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
    self.sqlcons(stmt, "commit")


def hasHomGroup(self, testGroup):
    """ check for the existence of testGroup as an entry in the homologyGroup column
        in the homology table. Returns False when the query yields no row.
    """
    stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
    res = self.sqlcons(stmt)
    # sqlcons returns a list of row tuples: the count is the first column of
    # the first row, and the list is empty when the statement failed
    if len(res) < 1:
        return False

    return int(res[0][0]) > 0
def areOrthologs(self, geneID1, geneID2):
    """ returns True if genes geneID1 and geneID2 are orthologs, i.e. if they
        share at least one orthologyGroup entry in the homology table.

        Blank orthology groups are ignored, as in returnOrthologs(): rows
        inserted without an orthology group all carry the same empty value and
        would otherwise make every pair of such genes look orthologous.
    """
    stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'"
    groups1 = self.sqlcons(stmt % (geneID1[0], geneID1[1]))
    groups2 = self.sqlcons(stmt % (geneID2[0], geneID2[1]))

    oEntries2 = []
    for (recID, oIDentry) in groups2:
        if oIDentry not in ("", None):
            oEntries2.append(oIDentry)

    for (recID, oIDentry) in groups1:
        if oIDentry in ("", None):
            continue

        if oIDentry in oEntries2:
            return True

    return False
def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
    """ map regions of sequences where multiple alignments show at least one other (default)
        or all genes (strict=True) as lining up.

        For every gene in geneIDList (default: all self.refGenes in
        self.startingGenome) the gene and its homologs are masked with
        maskUsingConservation(); each run of at least minConsLength unmasked
        (non "N") positions is stored through insertConservedSequence() with
        method "mafft" and criteria "strict" or "partial". Positions are 0-based
        offsets into the masked sequence.
    """
    if len(geneIDList) < 1:
        geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]

    if strict:
        criteria = "strict"
    else:
        criteria = "partial"

    for geneID in geneIDList:
        homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
        hGenes = self.returnHomologs(geneID)
        hGenes.append(geneID)
        if len(hGenes) < 2:
            continue

        maskedSequences = self.maskUsingConservation(hGenes, strict)
        for gID in hGenes:
            # maskUsingConservation() stops early when an alignment is missing,
            # so some genes may have no masked sequence at all
            if gID not in maskedSequences:
                continue

            masked = maskedSequences[gID]
            seqLength = len(masked)
            # None marks "not inside a block"; position 0 is a valid block
            # start and therefore cannot double as that marker
            start = None
            # one extra iteration past the end closes a block that runs up to
            # the last position
            for pos in range(seqLength + 1):
                if pos < seqLength and masked[pos] != "N":
                    if start is None:
                        start = pos

                    continue

                if start is not None and pos - start >= minConsLength:
                    self.insertConservedSequence(homGroup, gID, start, pos - start, "mafft", criteria)

                start = None
alignmentDict = {} + maskedDict = {} + + for gID in geneIDList: + seqLen = 0 + try: + alignmentDict[gID] = self.getAlignedSequence(gID) + maskedDict[gID] = "" + seqLen = len(alignmentDict[gID]) + except: + return maskedDict + + if strict: + criteria = len(geneIDList) + else: + criteria = 2 + + for pos in range(seqLen): + posDict = {} + conserved = 0 + for geneID in geneIDList: + posDict[geneID] = alignmentDict[geneID][pos] + if posDict[geneID] != "-" and posDict[geneID] != "N": + conserved += 1 + + if conserved >= criteria: + for geneID in geneIDList: + if posDict[geneID] != "-": + maskedDict[geneID] += posDict[geneID] + else: + for geneID in geneIDList: + if posDict[geneID] != "-": + maskedDict[geneID] += "N" + + return maskedDict + + + def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False): + """ map regions of sequences with seqcomp windows in all genes that have + more than threshold (<= 1) conservation in more than minSequences sequences. + Can optionally use a higher threshold for orothlogs if specified using + orthologyThreshold. + """ + consList = [] + if orthologyThreshold < threshold: + orthologyThreshold = threshold + + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + if useFastcomp: + prog = Fastcomp() + else: + prog = Paircomp() + + prog.setWindowSize(window) + # if we have a window on a sequence, then we have at least one match! 
+ minSeqNum = minSequences - 1 + for geneID in geneIDList: + genePairs = [] + homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + hGenes = self.returnHomologs(geneID) + hGenes.append(geneID) + for first in hGenes: + for second in hGenes: + if first != second and [first, second] not in genePairs and [second, first] not in genePairs: + genePairs.append([first, second]) + + seqcompWindows = {} + print "genepairs = %s" % str(genePairs) + for pair in genePairs: + fastaFile = self.createDataFile(geneIDList = pair) + if self.areOrthologs(pair[0], pair[1]): + prog.setThreshold(orthologyThreshold) + else: + prog.setThreshold(threshold) + + prog.inputFile(fastaFile) + prog.run() + seqcompWindows[(pair[0], pair[1])] = prog.getWindows() + + resultWindows = {} + for gene in hGenes: + resultWindows[gene] = {} + + for pair in seqcompWindows: + (geneID1, geneID2) = pair + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]: + if seq1pos not in resultWindows[geneID1]: + resultWindows[geneID1][seq1pos] = [] + + if seq2pos not in resultWindows[geneID2]: + resultWindows[geneID2][seq2pos] = [] + + resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches)) + resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches)) + + for geneID in hGenes: + for position in resultWindows[geneID]: + otherGeneIDs = [] + for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]: + if geneID2 not in otherGeneIDs: + otherGeneIDs.append(geneID2) + + if len(otherGeneIDs) >= minSeqNum: + criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window) + for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]: + criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches) + + consList.append((homGroup, geneID, position, window, "seqcompCons", criteria)) + + self.insertConservedSequenceBatch(consList) + + + def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False): + 
""" map regions of sequences with seqcomp windows in all genes that have + more than threshold (<= 1) conservation. Uses transivity. + """ + self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp) + + + def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False): + """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of + sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1) + conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity. + """ + consList = [] + if len(geneIDList) < 1: + geneIDList = [(self.startingGenome, gene) for gene in self.refGenes] + + if useFastcomp: + prog = Fastcomp() + else: + prog = Paircomp() + + prog.setWindowSize(window) + for geneID in geneIDList: + genePairs = [] + homGroup = "%s-%s" % (str(geneID[0]), str(geneID[1])) + oGenes = self.returnOrthologs(geneID) + # homologs should contain orthologs! 
+ hGenes = self.returnHomologs(geneID) + if len(hGenes) < 1: + continue + + seqcompWindows = {} + genePairs = [[geneID, gene] for gene in hGenes] + for pair in genePairs: + if pair[1] in oGenes: + prog.setThreshold(orthologyThreshold) + else: + prog.setThreshold(paralogyThreshold) + + fastaFile = self.createDataFile(geneIDList = pair) + prog.inputFile(fastaFile) + prog.run() + seqcompWindows[pair[1]] = prog.getWindows() + + resultWindows = {} + print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows)) + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]: + resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)] + + for gene in hGenes[1:]: + newseqPositions = [] + for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]: + if seq1pos not in resultWindows: + continue + + newseqPositions.append(seq1pos) + resultWindows[seq1pos].append((gene, seq2pos, sense, matches)) + + for pos in resultWindows.keys(): + if pos not in newseqPositions: + del resultWindows[pos] + + for windowPos in resultWindows.keys(): + windowList = resultWindows[windowPos] + criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window) + for (geneID2, seq2pos, sense, matches) in windowList: + criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches) + + consList.append((homGroup, geneID, windowPos, window, tag, criteria)) + + for windowEntry in windowList: + (gID, seq2pos, sense, matches) = windowEntry + consList.append((homGroup, gID, seq2pos, window, tag, criteria)) + + self.insertConservedSequenceBatch(consList) + + + def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria): + """ insert an entry in conservedSequence. 
def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
    """ retrieve a list of conservation windows of the form (start, length, score) from
        conservedSequence for geneID, optionally restricted to a datasetID, a method
        and a criteria (matched against the score column).

        datasetID may be a positive number or a non-empty string such as the
        "genome-locus" handles stored by the map*Conservation() functions; the
        default (-1), zero, None and "" mean "any dataset".
    """
    results = []
    stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s' " % (geneID[0], geneID[1])
    if isinstance(datasetID, (int, float)):
        if datasetID > 0:
            stmt += "and datasetID = '%d' " % (datasetID)
    elif datasetID is not None and len(datasetID) > 0:
        # dataset handles are stored as text; '%d' cannot format them
        stmt += "and datasetID = '%s' " % (datasetID)

    if len(method) > 0:
        stmt += " and method = '%s' " % (method)

    if len(criteria) > 0:
        stmt += "and score = '%s' " % (criteria)

    for (location, length, score) in self.sqlcons(stmt):
        results.append((int(location), int(length), str(score)))

    return results
def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
    """ masks any sequence in a dataset that was not highlighted as conserved by one or more
        conservation criteria. returns a dictionary of geneID -> masked sequence that must be
        handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.

        datasetID defaults to self.genepoolID when negative. Conserved windows
        come from getConservedSequenceWindows() for the given method and
        criteria and are clipped to the bounds of the sequence.
    """
    maskedSeqDict = {}
    if datasetID < 0:
        datasetID = self.genepoolID

    settingsList = self.getSettingsID(datasetID)
    # NOTE(review): eval() executes whatever text is stored in the settings
    # entry; only safe while that entry is written by this package itself.
    geneIDList = eval(settingsList[1])
    for geneID in geneIDList:
        theseq = self.genepool[geneID]
        seqlen = len(theseq)
        maskedseq = ["N"] * seqlen
        conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
        for (start, length, crit) in conservedWindows:
            if start < 0:
                start = 0

            # clip at the end of the sequence; seqlen itself is a valid
            # (exclusive) stop, so the last base of the sequence is kept
            stop = min(start + length, seqlen)
            for index in range(start, stop):
                maskedseq[index] = theseq[index]

        tempseq = []
        numN = 0
        for letter in maskedseq:
            if letter.upper() == "N":
                numN += 1
            else:
                numN = 0

            # drop every N beyond the first stripNLonger of a run
            if numN > stripNLonger:
                continue

            tempseq.append(letter)

        maskedSeqDict[geneID] = "".join(tempseq)

    return maskedSeqDict
def motifConservationStat(self, motifList=[]):
    """ check how many motifs instances for the motifs in the mapped motifs
        (or only in motifList) are conserved.

        Prints one tab-separated line per motif with the number of conserved
        instances, the total number of instances and the percentage. A motif
        without any instance is reported as 0 out of 0 (0 pct).
    """
    if len(motifList) == 0:
        motifList = self.geneToMotifKeys(thekey="motif")

    for motID in motifList:
        index = 0
        matches = self.motifToGene(motID)
        if len(matches) > 0:
            # the motif length is the same for every instance: look it up once
            motLength = len(self.findMotif(motID))
            for (loc, pos) in matches:
                if self.isConserved(loc, pos, motLength):
                    index += 1

            pct = 100.0 * index / float(len(matches))
        else:
            # nothing mapped for this motif; avoid dividing by zero
            pct = 0.0

        print("%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), pct))
def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
    """ exports conserved sequences to a fasta file named <directory>/<prefix>.fsa.

        Every uninterrupted stretch of non-"N" letters left by
        maskNonConservedSequence() is written as its own record, with a header
        of the form ">genome_locus chrom:start-stop". When genomes is not
        empty, only genes from the listed genomes are exported.
    """
    (up, cds, down) = self.getSeqParameters()
    consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
    if len(genomes) > 0:
        # iterate over a copy of the keys: entries are deleted along the way
        for entry in list(consDict.keys()):
            if entry[0] not in genomes:
                del consDict[entry]

    outfilename = "%s/%s.fsa" % (directory, prefix)
    consOutfile = open(outfilename, "w")
    try:
        for entry in consDict:
            entryCoordinates = cistematic.core.geneEntry(entry)
            entrySense = entryCoordinates[4]
            theseq = consDict[entry].upper()
            seqlen = len(theseq)
            (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
            if entrySense == "R":
                theseq = cistematic.core.complement(theseq)

            conservedBlockStart = -1
            # pos == seqlen acts as a closing "N" so that a block reaching the
            # end of the sequence is written in full, including its last base
            for pos in range(seqlen + 1):
                if pos < seqlen and theseq[pos] != "N":
                    if conservedBlockStart < 0:
                        conservedBlockStart = pos

                    continue

                if conservedBlockStart >= 0:
                    consStart = start + conservedBlockStart
                    consSeq = theseq[conservedBlockStart:pos]
                    consStop = consStart + len(consSeq) - 1
                    consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
                    conservedBlockStart = -1
    finally:
        consOutfile.close()
def createConservation(self, conservationID="default", dbName=""):
    """ creates the conservation SQL tables (homology, alignedSequence and
        conservedSequence) and their indexes. Should only be run once per database file.

        loadConservation() is called first, so the tables are created in the
        database it selects; the log messages name that database rather than
        the raw dbName argument, which is empty when the default is used.
    """
    self.loadConservation(conservationID, dbName)
    stmtList = ["CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)",
                "CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)",
                "CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)",
                "CREATE index cons1 on homology(conservationID)",
                "CREATE index cons2 on alignedSequence(conservationID)",
                "CREATE index cons3 on conservedSequence(conservationID)",
                "CREATE index hom1 on homology(homologyGroup)",
                "CREATE index orth1 on homology(orthologyGroup)",
                "CREATE index homlocus1 on homology(genome, locus)",
                "CREATE index alignlocus2 on alignedSequence(genome, locus)",
                "CREATE index conslocus3 on conservedSequence(genome, locus, method)"]
    try:
        for stmt in stmtList:
            self.sqlcons(stmt, commit=True)

        self.mlog("Created conservation tables in database %s" % self.consDBName)
    except Exception:
        self.mlog("Could not create conservation tables in database %s" % self.consDBName)
        self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")
def sqlcons(self, stmt, commit=""):
    """ executes a SQL statement against self.consDBName and does a commit if commit
        is not-null. returns any results as a list of rows.

        This is a best-effort call: a failing statement or commit is reported
        through self.mlog() and an empty (or partial) result is returned instead
        of raising. KeyboardInterrupt and SystemExit are not intercepted.
    """
    res = []
    db = sqlite.connect(self.consDBName)
    try:
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print("Conservation->sqlcons: %s" % stmt)

            sqlc.execute(stmt)
            res = sqlc.fetchall()
            try:
                if commit != "":
                    db.commit()
            except Exception:
                self.mlog("Conservation->sqlcons (commit exception)")
        except Exception:
            self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)
        finally:
            sqlc.close()
    finally:
        db.close()

    return res


def batchsqlCons(self, stmt, batch):
    """ executes stmt once for every parameter tuple in the list batch (usually
        inserts) with a single commit. Always returns an empty list.

        Like sqlcons(), failures are logged through self.mlog() rather than
        raised, and the cursor and connection are always closed.
    """
    res = []
    db = sqlite.connect(self.consDBName)
    try:
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print("batchsqlCons: %s" % stmt)
                print("batchsqlCons: %s" % (str(batch)))

            sqlc.executemany(stmt, batch)
            try:
                db.commit()
            except Exception:
                self.mlog("Conservation->batchsqlCons (commit exception)")
        except Exception:
            self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)
        finally:
            sqlc.close()
    finally:
        db.close()

    return res
# This class contains the core code for visualizing sequences from several of the experiments.

from cistematic.core.motif import Motif
from cistematic.core.geneinfo import geneinfoDB
import math, random

pilPresent = False

try:
    import Image, ImageDraw, ImageFont
    pilPresent = True
except ImportError:
    # drawing is optional; Draw.drawable stays False without PIL
    pass


class Draw:
    """ The Draw class contains the code used to visualize the location of motifs in their
        genomic neighborhood. It is meant to be used as the parent of other classes,
        such as the orthology classes. It relies on the python imaging library, and saves
        PNG images.

        The methods below read the following from the inheriting class: genepool,
        boundToNextGene, experimentID, expFile, experimentType, analysisID, and the methods
        getResults(), getSeqParameters(), getDatasetNames(), getSetting(), motifToGene(),
        geneToMotif(), findMotif(), getFeatures() and, optionally,
        getConservedSequenceWindows() and returnHomologs().
    """
    drawable = False
    # width in pixels of the longest sequence; shorter ones are scaled and right-aligned
    maxWidth = 1200
    leftMargin = 100
    rightMargin = 50
    # vertical pixels reserved for each sequence row
    lineHeight = 110

    if pilPresent:
        theFont = ImageFont.load_default()
        drawable = True
    else:
        print("Draw: python image library missing - will not be able to draw on this system")


    def draw(self, picName, geneList=None, motifList=None, excludeGeneList=None, excludeMotifList=None,
             showHeader=True, showFooter=True, maxOccurences=100, skipSanity=False):
        """ Draws an image of motifs on the sequences and saves it as a PNG in picName.

            geneList / motifList restrict the drawing to the listed genes / motif tag IDs
            (empty or None means no restriction); excludeGeneList / excludeMotifList remove
            entries.  showHeader adds experiment information while showFooter adds a motif
            key.  Motifs occurring maxOccurences times or more in the dataset are not shown,
            and neither are motifs failing isSane() unless skipSanity is True.

            Does nothing if the python imaging library is unavailable.
        """
        if not self.drawable:
            return

        # None replaces the old shared mutable [] defaults; passing lists still works
        geneList = geneList or []
        motifList = motifList or []
        excludeGeneList = excludeGeneList or []
        excludeMotifList = excludeMotifList or []
        limitGenes = len(geneList) > 0
        limitMotifs = len(motifList) > 0

        idb = geneinfoDB()
        bound = ""
        if self.boundToNextGene:
            bound = "up to "

        expResults = self.getResults()
        (up, cds, down) = self.getSeqParameters()
        hasORF = "NO"
        if cds > 0:
            hasORF = "YES"

        if cds > 1:
            hasORF = "MASKED"

        motColor = {}
        motConsensus = {}
        motNumber = {}
        maxLength = 1
        geneLength = {}
        adjustedMotLength = {}

        # collect the genes to draw, preserving dataset order and dropping duplicates
        datasetIDs = self.getDatasetNames()
        if len(datasetIDs) > 0:
            candidateGenes = []
            for datasetName in datasetIDs:
                dataset = self.getSetting(datasetName)
                # NOTE(review): eval() of a value stored in the settings table - only
                # safe while the experiment database itself is trusted.
                candidateGenes.extend(eval(dataset[0]))
        else:
            candidateGenes = self.genepool

        orthologyList = []
        for gene in candidateGenes:
            if limitGenes and gene not in geneList:
                continue

            if gene not in excludeGeneList and gene not in orthologyList:
                orthologyList.append(gene)

        for geneID in orthologyList:
            geneLength[geneID] = len(self.genepool[geneID])
            if maxLength < geneLength[geneID]:
                maxLength = geneLength[geneID]

        # pixels per base pair: the longest sequence spans exactly maxWidth pixels
        seqScaler = float(self.maxWidth) / float(maxLength)

        # pick the motifs to show and give each a random colour
        for mot in expResults:
            if limitMotifs and mot.tagID not in motifList:
                continue

            if mot.tagID in excludeMotifList:
                continue

            if len(self.motifToGene(mot.tagID)) >= maxOccurences:
                continue

            if not skipSanity and not mot.isSane():
                continue

            motConsensus[mot.tagID] = mot.buildConsensus()
            currentRed = random.randint(5, 240)
            currentGreen = random.randint(5, 240)
            currentBlue = random.randint(5, 240)
            motColor[mot.tagID] = (currentRed, currentGreen, currentBlue)

        motKeys = sorted(motConsensus)
        motKeysLen = len(motKeys)
        for tagID in motKeys:
            motNumber[tagID] = 0
            adjustedMotLength[tagID] = int(math.ceil(len(self.findMotif(tagID)) * seqScaler))

        numLines = len(orthologyList)
        if showHeader:
            numLines += 1

        # the footer key is laid out three motifs per row, each row half a lineHeight tall
        footerLines = motKeysLen // 3
        if motKeysLen % 3:
            footerLines += 1

        imageWidth = self.maxWidth + self.leftMargin + self.rightMargin
        imsize = (imageWidth, int(round((numLines + footerLines/2.) * self.lineHeight)))
        image = Image.new("RGB", imsize, color="#ffffff")
        draw = ImageDraw.Draw(image)
        currentHeight = 0
        if showHeader:
            line1 = "Experiment: %s in %s Type: %s Analysis: %s" % (self.experimentID, self.expFile, self.experimentType, self.analysisID)
            draw.text([10, 10], line1, font=self.theFont, fill=0)
            draw.line((10, 30, imageWidth - 10, 30), fill=0)
            line2 = "Upstream: %s%s ORF: %s Downstream: %s%s" % (bound, up, hasORF, bound, down)
            draw.text([10, 40], line2, font=self.theFont, fill=0)
            draw.line((10, 60, imageWidth - 10, 60), fill=0)
            currentHeight = self.lineHeight

        for geneID in orthologyList:
            geneNames = ""
            seqLength = geneLength[geneID]
            try:
                # synonyms are decoration only; a failed lookup must not abort the drawing
                res = idb.geneIDSynonyms(geneID)
                for entry in res[1:]:
                    geneNames += "%s " % str(entry)
            except Exception:
                pass

            motList = self.geneToMotif(geneID)
            draw.text([5, currentHeight+45], str(geneID[0]), font=self.theFont, fill=0)
            draw.text([5, currentHeight+55], str(geneID[1]), font=self.theFont, fill=0)
            draw.text([5, currentHeight+65], geneNames, font=self.theFont, fill=0)
            adjustedSeqLength = seqLength * seqScaler
            # sequences are right-aligned against leftMargin + maxWidth
            seqStart = self.leftMargin + int(self.maxWidth) - adjustedSeqLength

            # grey boxes for coding sequence features
            features = self.getFeatures(geneID)
            for (ftype, fstart, fstop, forientation) in features:
                if ftype != "CDS":
                    continue

                if float(fstop) < float(fstart):
                    # fix: the original sequential assignment left both ends equal to fstop
                    fstart, fstop = fstop, fstart

                start = int(math.floor(float(fstart) * seqScaler))
                consLength = int(math.ceil((fstop - fstart) * seqScaler))
                if start + consLength > adjustedSeqLength:
                    # clip features that run past the end of the sequence
                    consLength = adjustedSeqLength - start

                start += seqStart
                draw.rectangle([start, currentHeight + 51, start + consLength, currentHeight + 64], fill="#aaaaaa")

            # outline of the sequence itself
            draw.rectangle([seqStart, currentHeight + 35, seqStart + adjustedSeqLength, currentHeight + 65], outline=0)

            tagIndex = 0
            tagPosList = []
            for (tagID, (pos, sense)) in motList:
                if tagID not in motColor:
                    continue

                tagIndex += 1
                motNumber[tagID] += 1
                start = int(math.floor(float(pos) * seqScaler)) + seqStart
                # forward hits rise above the sequence box, reverse hits hang below it;
                # labels alternate between the top and the bottom of the row
                if sense == "F":
                    top = 12
                    bottom = 0
                    if tagIndex % 2:
                        textHeight = 0
                    else:
                        textHeight = 70
                else:
                    top = 35
                    bottom = 18
                    if tagIndex % 2:
                        textHeight = 15
                    else:
                        textHeight = 85

                # nudge the label away from any earlier label it would collide with
                for (prevStart, prevHeight) in tagPosList:
                    if abs(prevStart - start) <= 7 and abs(prevHeight - textHeight) <= 7:
                        if textHeight < 50:
                            textHeight -= 7
                        else:
                            textHeight += 7

                tagPosList.append((start, textHeight))
                draw.rectangle([start, currentHeight + top, start + adjustedMotLength[tagID], currentHeight + 65 + bottom], fill=motColor[tagID])
                if tagID.count("-") == 2:
                    # shorten "a-bcd-e" style tags to "a" + "b" + "e" for the label
                    tagIDlist = tagID.split("-")
                    tempID = tagIDlist[0] + tagIDlist[1][0] + tagIDlist[2]
                else:
                    tempID = tagID

                draw.text([start - 5, currentHeight + textHeight], tempID, font=self.theFont, fill=motColor[tagID])

            conservedWindows = []
            try:
                # only some inheriting classes provide conservation data
                conservedWindows = self.getConservedSequenceWindows(geneID)
            except Exception:
                pass

            for (location, cLength, criteria) in conservedWindows:
                start = int(math.floor(float(location) * seqScaler)) + seqStart
                consLength = int(math.ceil(float(cLength) * seqScaler))
                draw.rectangle([start, currentHeight + 38, start + consLength, currentHeight + 50], fill ='#ff0000')

            # tick mark every 1000 bp
            for location in range(0, seqLength, 1000):
                start = int(math.floor(float(location) * seqScaler)) + seqStart
                draw.rectangle([start, currentHeight + 60, start + 1, currentHeight + 65], fill = '#000000')

            draw.text([self.leftMargin + self.maxWidth + 5, currentHeight + 45], str(seqLength), font=self.theFont, fill=0)
            currentHeight += self.lineHeight

        motNum = 0
        if showFooter:
            for motID in motKeys:
                x = self.leftMargin + (self.maxWidth // 3) * (motNum % 3)
                rowTop = currentHeight + ((self.lineHeight // 2) * (motNum // 3))
                draw.text([x, rowTop + 5], motID, font=self.theFont, fill=motColor[motID])
                draw.text([x, rowTop + 15], motConsensus[motID], font=self.theFont, fill=0)
                draw.text([x, rowTop + 25], str(motNumber[motID]) + " matches", font=self.theFont, fill=0)
                motNum += 1

        del draw
        image.save(picName, "PNG")


    def drawMotifs(self, picName, motifList, geneList=None, excludeGeneList=None, showHeader=True,
                   showFooter=True, maxOccurences=100, genesWithMotifOnly=True, skipSanity=False):
        """ Draws an image of one or more motifs on the sequences. Can specifically list or exclude genes.
            Options showHeader will add experiment information while showFooter will add a motif key.
            This function will not show motifs that occur more than maxOccurences in the dataset, and will
            only show sequences with the motif by default (in which case geneList is ignored).
        """
        geneList = geneList or []
        excludeGeneList = excludeGeneList or []
        if genesWithMotifOnly:
            restrictedGeneList = []
            for motID in motifList:
                for (loc, pos) in self.motifToGene(motID):
                    if loc not in restrictedGeneList and loc not in excludeGeneList:
                        restrictedGeneList.append(loc)
        else:
            restrictedGeneList = geneList

        self.draw(picName, restrictedGeneList, motifList, excludeGeneList, [], showHeader, showFooter, maxOccurences, skipSanity)


    def drawGenes(self, picName, geneList, motifList=None, excludeMotifList=None, showHeader=True,
                  showFooter=True, maxOccurences=100, includeHomologs=False, motifsOnGeneOnly=True,
                  skipSanity=False):
        """ Draws an image of one or more genes with their motifs. Can specifically list or exclude motifs.
            Options showHeader will add experiment information while showFooter will add a motif key.
            This function will not show motifs that occur more than maxOccurences in the dataset, and will
            only show motifs on the sequence by default in the footer (in which case motifList is ignored).
            With includeHomologs, homologs of each gene that are in the genepool are drawn as well.
        """
        motifList = motifList or []
        excludeMotifList = excludeMotifList or []
        if includeHomologs:
            theGeneList = []
            for geneID in geneList:
                if geneID not in theGeneList:
                    theGeneList.append(geneID)

                try:
                    # homology lookup is best-effort; genes without homologs are still drawn
                    hgenes = self.returnHomologs(geneID)
                    for gID in hgenes:
                        if gID in self.genepool and gID not in theGeneList:
                            theGeneList.append(gID)
                except Exception:
                    pass
        else:
            theGeneList = geneList

        if motifsOnGeneOnly:
            restrictedMotifList = []
            for geneID in theGeneList:
                for (motID, pos) in self.geneToMotif(geneID):
                    if motID not in restrictedMotifList and motID not in excludeMotifList:
                        restrictedMotifList.append(motID)
        else:
            restrictedMotifList = motifList

        self.draw(picName, theGeneList, restrictedMotifList, [], excludeMotifList, showHeader, showFooter, maxOccurences, skipSanity)
+########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# python parent class for experiments +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import os, time, tempfile, string + +from cistematic.core import retrieveSeq, retrieveSeqFeatures, fasta +import cistematic.core +from cistematic.core.motif import matrixRow, Motif +from cistematic.genomes import Genome +from cistematic import version +from os import environ + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" + +tempfile.tempdir = cisTemp + + +class Experiment: + """ genepool = {(genomeA, geneA1): seq1, (genomeA, geneA2): seq2, (genomeB, geneB1): seq3,...} + geneFeatures = ((genomeA, geneA1):[feature0, feature1, .....], (genomeB, geneB1):[feature0, feature1,...],....} + programs = [(program_object1, setting1), (program_object2, setting2), ....] + """ + genepool = {} + geneFeatures = {} + programs = [] + genepoolID = 0 + workdir = "" + experimentID = "" + experimentType = "generic" + expFile = "" + geneDB = "" + debugSQL = False + + + def __init__(self, expID, expDBFile, geneDBFile=""): + self.experimentID = expID + self.expFile = expDBFile + self.geneDB = geneDBFile + self.maskLowerCase = False + self.boundToNextGene = False + print "cistematic version %s" % version + self.startDB() + + + def __del__(self): + self.savePrograms() + self.saveGeneDB() + self.uncacheGeneDB() + self.removeWorkdir() + + + def setExperimentID(self, expID): + self.mlog("changing experiment ID from %s to %s" % (self.experimentID, expID)) + self.experimentID = expID + + + def setGeneDB(self, geneDBFile=""): + self.mlog("using %s as the gene database" % geneDBFile) + self.geneDB = geneDBFile + + + def cacheGeneDB(self, genome): + try: + cistematic.core.cacheGeneDB(genome) + self.mlog("cached genome %s" % genome) + except: + self.mlog("could not cache genome %s" % genome) + + + def uncacheGeneDB(self): + 
def makeMotif(self, motID):
    """ Rebuild the Motif object stored under motID for this experiment.

    Reads the consensus sequence, threshold and info from the motifs
    table, the position weight matrix (ordered by position) from
    motifPWMs, and the 'instance' sequences from motifSequences.

    Raises IndexError when no motifs row exists for motID; callers that
    are unsure should go through findMotif() / checkMotID() first.
    """
    stmt = "SELECT motifSeq, threshold, info from motifs where expID = '%s' and mTagID = '%s' " % (self.experimentID, motID)
    res = self.sqlexp(stmt)
    (motifSeq, threshold, info) = res[0]

    mPWM = []
    stmt = "SELECT aFreq, cFreq, gFreq, tFreq from motifPWMs where expID = '%s' and mTagID = '%s' order by position" % (self.experimentID, motID)
    for (aFreq, cFreq, gFreq, tFreq) in self.sqlexp(stmt):
        col = [0.0, 0.0, 0.0, 0.0]
        col[matrixRow["A"]] = aFreq
        col[matrixRow["C"]] = cFreq
        col[matrixRow["G"]] = gFreq
        col[matrixRow["T"]] = tFreq
        mPWM.append(col)

    stmt = "SELECT sequence from motifSequences where expID = '%s' and mTagID = '%s' and type = 'instance' " % (self.experimentID, motID)
    # fix: the instance loop used to reuse the name holding the consensus, so the
    # Motif was built with its last instance sequence instead of motifSeq
    mseqs = [entry[0] for entry in self.sqlexp(stmt)]

    return Motif(motID, motifSeq, mPWM, mseqs, threshold, info)
stmtList.append((self.experimentID, motID, pos, aFreq, cFreq, gFreq, tFreq)) + pos += 1 + + self.batchsqlexp(stmt, stmtList) + if len(mot.sequences) > 0: + stmt = "INSERT into motifSequences(ID, expID, mTagID, sequence, type, location) values (NULL, ?, ?, ?, ?, ?) " + stmtList = [] + for seq in mot.sequences: + stmtList.append((self.experimentID, motID, seq, "instance", "-")) + + self.batchsqlexp(stmt, stmtList) + + + def exportMotifs(self, directory=".", prefix="", suffix="mot"): + stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + if prefix == "": + prefix = self.experimentID + + for entry in res: + motID = entry[0] + mot = self.makeMotif(motID) + fileName = "%s/%s-%s.%s" % (directory, prefix, motID, suffix) + self.mlog("exporting %s as %s" % (motID, fileName)) + mot.saveMotif(fileName) + + + def exportLogos(self, directory=".", prefix=""): + stmt = "SELECT distinct mTagID from results where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + if prefix == "": + prefix = self.experimentID + for entry in res: + motID = entry[0] + mot = self.makeMotif(motID) + fileName = "%s/%s-%s" % (directory, prefix, motID) + self.mlog("saving logo for %s as %s.png" % (motID, fileName)) + mot.saveLogo(fileName) + + + def appendResults(self, mot, resultsGroup="-"): + motID = mot.tagID + self.saveMotif(mot) + stmt = "INSERT into results(ID, expID, resultsGroup, mTagID) values (NULL, '%s', '%s', '%s') " % (self.experimentID, resultsGroup, motID) + self.sqlexp(stmt, "commit") + + + def getGeneDB(self, geneDBFile=""): + return self.geneDB + + + def getLog(self): + answer = [] + stmt = "SELECT timestamp, message from expLog where expID = '%s' order by timestamp" % self.experimentID + res = self.sqlexp(stmt) + for entry in res: + (timestamp, message) = entry + answer.append((eval(timestamp), message)) + + return answer + + + def getResults(self): + answer = [] + stmt = "SELECT distinct mTagID from results where 
expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + motID = entry[0] + answer.append(self.makeMotif(motID)) + + return answer + + + def getSettings(self): + answer = {} + stmt = "SELECT settingName, data from settings where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (settingName, data) = entry + if settingName not in answer.keys(): + answer[settingName] = [] + + answer[settingName].append(data) + + return answer + + + def getSetting(self, settingName): + answer = [] + stmt = "SELECT data from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName) + res = self.sqlexp(stmt) + for entry in res: + answer.append(entry[0]) + + return answer + + + def settingsHasKey(self, thekey): + answer = False + stmt = "SELECT distinct ID from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, thekey) + res = self.sqlexp(stmt) + if len(res) > 0: + answer = True + + return answer + + + def setSettings(self, settingName, settingList): + """ insert or replace (i.e. delete previous entry) a setting with one or more setting data. + """ + # delete existing, insert new + try: + stmt = "DELETE from settings where expID = '%s' and settingName = '%s' " % (self.experimentID, settingName) + res = self.sqlexp(stmt, "commit") + except: + pass + + stmtList = [] + stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, ?, ?, ?)" + for entry in settingList: + stmtList.append((self.experimentID, settingName, entry)) + + res = self.batchsqlexp(stmt, stmtList) + + return settingName + + + def setSettingsID(self, settingName, data): + """ return the settingsID for the inserted settingName:data pair in the settings table. 
+ """ + stmt = "INSERT into settings (ID, expID, settingName, data) values (NULL, '%s', '%s', \"%s\")" % (self.experimentID, settingName, data) + res = self.sqlexp(stmt, "commit") + + return res + + + def getSettingsID(self, settingID): + """ get a setting by settingsID in the settings table. + """ + answer = "" + stmt = "SELECT settingName, data from settings where expID = '%s' and ID = %d" % (self.experimentID, int(settingID)) + res = self.sqlexp(stmt) + try: + (name, data) = res[0] + answer = (name, data) + except: + pass + + return answer + + + def setRun(self, progName, datasetID, settingsID): + values = "(NULL, '%s', '%s', '%s', %d, '%s', '%s')" % (self.experimentID, progName, datasetID, settingsID, time.localtime(), "-") + stmt = "INSERT into runs (ID, expID, progName, datasetGroup, settingsID, timestamp, resultsGroup) values %s" % values + runID = self.sqlexp(stmt, "commit") + self.mlog("run %s: program %s with settings %d and dataset %s" % (runID, progName, settingsID, datasetID)) + + return int(runID) + + + def getRun(self, rID): + res = "" + stmt = "SELECT progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' and ID = %d" % (self.experimentID, rID) + res = self.sqlexp(stmt) + (progName, datasetID, settingsID, timestamp, resultsID) = res[0] + if resultsID == "-": + resultsID = [] + if datasetID != "chromolist": + datasetID = int(datasetID) + + return (progName, datasetID, int(settingsID), timestamp, resultsID) + + + def getRunsByProg(self, prog): + runs = self.getRuns() + matchingRuns = [] + for entry in runs: + if (runs[entry][0] == prog): + matchingRuns.append(runs[entry]) + + return matchingRuns + + + def getRuns(self): + runs = {} + res = '' + stmt = "SELECT ID, progName, datasetGroup, settingsID, timestamp, resultsGroup from runs where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (ID, progName, datasetID, settingsID, timestamp, resultsID) = entry + runs[int(ID)] = (progName, 
datasetID, settingsID, eval(timestamp), resultsID) + + return runs + + + def appendDataset(self, dset): + stmtList = [] + stmt = "INSERT into dataset(ID, genome, locus, sequence, expID) values (NULL, ?, ?, ?, ?) " + for (genome, entries) in dset: + for entry in entries: + if len(entry) == 2: + (locus, sequence) = entry + else: + locus = entry + sequence = "-" + + stmtList.append((genome, locus, sequence, self.experimentID)) + + res = self.batchsqlexp(stmt, stmtList) + + + def getDataset(self): + dset = [] + stmt = "SELECT genome, locus, sequence from dataset where expID = '%s' " % (self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (genome, locus, sequence) = entry + if sequence == "-": + dset.append((genome, [locus])) + else: + dset.append((genome, [locus, sequence])) + + return dset + + + def getDatasetNames(self): + dset = [] + stmt = "select settingName from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' " + res = self.sqlexp(stmt) + for entry in res: + dset.append(entry) + + return dset + + + def getDatasetIDs(self): + dset = [] + stmt = "select ID from settings where settingName like 'dataset" + "%' and expID = '" + self.experimentID + "' " + res = self.sqlexp(stmt) + for entry in res: + dset.append(entry) + + return dset + + + def getFeatures(self, geneID): + results = [] + try: + res = [] + stmt = "select featureType, start, stop, orientation from sequenceFeatures where seqGenome = '%s' and seqID = '%s' and expID = '%s' " % (geneID[0], geneID[1], self.experimentID) + res = self.sqlexp(stmt) + for entry in res: + (ftype, start, stop, orientation) = entry + results.append((ftype, start, stop, orientation)) + except: + pass + + results.sort() + + return results + + + def absoluteLocation(self, match, featureLength, seqparams="", gidCoordinates="", customDB=""): + """ Returns the absolute location of the start of a match = ((genome, gID), (pos, sense)), given a relative + location with respect to the gene 
def resetDataset(self):
    """ Delete every dataset row belonging to this experiment. """
    stmt = "DELETE from dataset where expID = '%s'" % self.experimentID
    self.sqlexp(stmt, "commit")
    # fix: used to report "log reset"
    self.mlog("dataset reset")


def resetLog(self):
    """ Delete every expLog row belonging to this experiment. """
    stmt = "DELETE from expLog where expID = '%s'" % self.experimentID
    self.sqlexp(stmt, "commit")
    self.mlog("log reset")


def resetResults(self):
    """ Delete every results row belonging to this experiment. """
    stmt = "DELETE from results where expID = '%s'" % self.experimentID
    self.sqlexp(stmt, "commit")
    self.mlog("results reset")


def resetRuns(self):
    """ Delete every runs row belonging to this experiment. """
    # fix: used to delete from expLog, wiping the log and leaving runs untouched
    stmt = "DELETE from runs where expID = '%s'" % self.experimentID
    self.sqlexp(stmt, "commit")
    self.mlog("runs reset")


def resetPrograms(self):
    """ Forget all programs attached to this experiment (in memory only). """
    self.programs = []
    self.mlog("programs reset")


def resetSettings(self):
    """ Delete every settings row belonging to this experiment. """
    # fix: used to delete from expLog and then call the non-existent self.log()
    stmt = "DELETE from settings where expID = '%s'" % self.experimentID
    self.sqlexp(stmt, "commit")
    self.mlog("settings reset")


def appendProgram(self, program):
    """ Attach program to the experiment.

    The program's settings are stored with setSettingsID() under the
    program's name, and (program, returned settings ID) is appended to
    self.programs.
    """
    settingsID = self.setSettingsID(program.name(), program.getSettings())
    self.programs.append((program, settingsID))
    self.mlog("adding program %s" % program.name())


def removeProgram(self, progName):
    """ Remove every attached program whose name() equals progName.

    The list is edited in place, walking from the end so that deleting an
    entry does not shift the indices still to be visited (the forward walk
    over a precomputed range raised IndexError after the first deletion).
    One log line is written per removed program.
    """
    for index in range(len(self.programs) - 1, -1, -1):
        if self.programs[index][0].name() == progName:
            del self.programs[index]
            self.mlog("removed program %s" % progName)
def loadFastaFromFile(self, fastaFile, genomeName):
    """ Import every record of fastaFile into a new gene database for genomeName.

    A Genome backed by "<genomeName>.genedb" is created; each fasta record
    is added as a chromosome sequence, a chromosome entry and a single
    forward-strand gene entry spanning the whole record.  The experiment is
    then pointed at the new database through setGeneDB().

    Returns the list of record IDs (the header line without its first
    character) in file order.
    """
    seqName = []
    geneDBPath = "%s.genedb" % genomeName
    # fix: the message used to print genomeName where the fasta file name is meant
    self.mlog("Loading fasta file %s into database %s" % (fastaFile, geneDBPath))
    aGenome = Genome(genomeName, dbFile=geneDBPath)
    aGenome.createGeneDB()
    inFile = open(fastaFile, "r")
    try:
        header = inFile.readline()
        while header != "":
            seqArray = []
            seqLen = 0
            chromID = header.strip()[1:]
            # gather sequence lines until the next ">" header or end of file
            currentLine = inFile.readline()
            while currentLine != "" and currentLine[0] != ">":
                lineSeq = currentLine.strip()
                seqLen += len(lineSeq)
                seqArray.append(lineSeq)
                currentLine = inFile.readline()

            seq = "".join(seqArray)
            print("Added sequence %s to database" % chromID)
            aGenome.addSequence((genomeName, chromID), seq, "chromosome", str(seqLen))
            aGenome.addChromosomeEntry(chromID, chromID, "db")
            aGenome.addGeneEntry((genomeName, chromID), chromID, 0, seqLen - 1, "F", "IMPORT", "1")
            # the line that ended this record is the next record's header
            header = currentLine
            seqName.append(chromID)
    finally:
        # close the input even if the genome database rejects a record
        inFile.close()

    aGenome.createIndices()
    self.setGeneDB(geneDBPath)

    return seqName
+ self.genepool[geneID] = retrieveSeq(geneID, self.upstream, self.cds, self.downstream, self.geneDB, self.maskLowerCase, self.boundToNextGene) + except: + self.mlog("could not load %s" % (str(geneID))) + + self.genepoolID = self.setSettingsID("genepool", self.genepool.keys()) + + + def loadGeneFeatures(self): + stmt = "DELETE from sequenceFeatures where expID = '%s'" % self.experimentID + res = self.sqlexp(stmt, "commit") + stmtList = [] + stmt = "INSERT into sequenceFeatures (ID, expID, seqGenome, seqID, featureType, start, stop, orientation) values (NULL, ?, ?, ?, ?, ?, ?, ?)" + for geneTuple in self.getDataset(): + genome = geneTuple[0] + geneList = geneTuple[1] + for gene in geneList: + try: + geneID = (genome, gene) + results = retrieveSeqFeatures(geneID, self.upstream, self.cds, self.downstream, self.boundToNextGene, self.geneDB) + for entry in results: + (ftype, start, stop, orientation) = entry + stmtList.append((self.experimentID, genome, gene, ftype, start, stop, orientation)) + except: + self.mlog("could not find features for %s" % (gene)) + + if len(stmtList) > 0: + self.batchsqlexp(stmt, stmtList) + + + def reload(self): + pass + + + def toFile(self, geneIDList, filename, geneDict=[]): + outFile = open(filename, "w") + for geneID in geneIDList: + if geneID in geneDict: + outFile.write(fasta(geneID, geneDict[geneID])) + elif geneID in self.genepool: + outFile.write(fasta(geneID, self.genepool[geneID])) + else: + self.mlog("could not write %s to file" % str(geneID)) + + outFile.close() + + + def createDataFile(self, datasetID=-1, geneIDList=[], geneDict=[]): + oldtempdir = tempfile.tempdir + tempfile.tempdir = self.workdir + dataFile = tempfile.mktemp() + tempfile.tempdir = oldtempdir + if datasetID < 0: + datasetID = self.genepoolID + + if len(geneIDList) < 1: + settingsList = self.getSettingsID(datasetID) + geneIDList = eval(settingsList[1]) + + try: + self.toFile(geneIDList, dataFile, geneDict) + except: + self.mlog("could not create dataFile %s" % 
(dataFile)) + + return dataFile + + + def initialize(self, dataset=[], workdir=""): + if workdir != "": + self.setWorkdir(workdir) + + self.createWorkdir() + if len(dataset) > 0: + self.appendDataset(dataset) + self.loadGenepool() + self.loadGeneFeatures() + + + def run(self): + if len(self.programs) == 0: + self.mlog("Must instantiate one or more programs first") + elif len(self.genepool) == 0: + self.mlog("Must have one or more valid sequences in the dataset") + + + def sqlexp(self, stmt, commit=""): + db = sqlite.connect(self.expFile, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "sqlexp: %s" % stmt + + sqlc.execute(stmt) + res = sqlc.fetchall() + if commit != "": + db.commit() + + if stmt[0:6] == "INSERT": + res = sqlc.lastrowid + + sqlc.close() + db.close() + + return res + + + def batchsqlexp(self, stmt, batch): + """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit. + """ + res = [] + db = sqlite.connect(self.expFile, timeout=60) + sqlc = db.cursor() + if self.debugSQL: + print "batchsql: %s" % stmt + print "batchsql: %s" % str(batch) + + sqlc.executemany(stmt, batch) + db.commit() + sqlc.close() + db.close() + + return res + + + def setExternalStatus(self, status): + self.mlog("Setting status to %s" % status) + statfile = open("%s.status" % self.expFile, "w") + statfile.write(status) + statfile.close() + + + def resetExternalStatus(self): + try: + os.remove("%s.status" % self.expFile) + except: + pass + + + def startDB(self): + if not os.path.exists(self.expFile): + try: + db = sqlite.connect(self.expFile, timeout=60) + sql = db.cursor() + sql.execute("CREATE table experiment(ID INTEGER PRIMARY KEY, expID varchar, expType varchar, expStatus varchar, timestamp varchar)") + sql.execute("CREATE table dataset(ID INTEGER PRIMARY KEY, expID varchar, datasetGroup varchar, genome varchar, locus varchar, sequence varchar)") + sql.execute("CREATE table results(ID INTEGER PRIMARY KEY, expID varchar, 
resultsGroup varchar, mTagID varchar)") + sql.execute("CREATE table motifs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, motifSeq varchar, threshold varchar, info varchar)") + sql.execute("CREATE table motifPWMs(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, position int, aFreq float, cFreq float, gFreq float, tFreq float)") + sql.execute("CREATE table motifSequences(ID INTEGER PRIMARY KEY, expID varchar, mTagID varchar, sequence varchar, type varchar, location varchar)") + sql.execute("CREATE table settings(ID INTEGER PRIMARY KEY, expID varchar, settingName varchar, data varchar)") + sql.execute("CREATE table runs(ID INTEGER PRIMARY KEY, expID varchar, progName varchar, datasetGroup varchar, settingsID int, timestamp varchar, resultsGroup varchar)") + sql.execute("CREATE table expLog(ID INTEGER PRIMARY KEY, expID varchar, timestamp varchar, message varchar)") + sql.execute("CREATE table sequenceFeatures(ID INTEGER PRIMARY KEY, expID varchar, seqGenome varchar, seqID varchar, featureType varchar, start int, stop int, orientation varchar)") + + sql.execute("CREATE INDEX settingIndex1 on settings(expID, settingName)") + sql.execute("CREATE INDEX datasetIndex1 on dataset(expID, datasetGroup)") + sql.execute("CREATE INDEX motifsIndex1 on motifs(expID, mTagID)") + sql.execute("CREATE INDEX motifPWMsIndex1 on motifPWMs(expID, mTagID)") + sql.execute("CREATE INDEX motifSequencesIndex1 on motifSequences(expID, mTagID)") + sql.execute("CREATE INDEX featuresIndex1 on sequenceFeatures(expID, seqGenome, seqID)") + + sql.execute("INSERT INTO settings(ID, expID, settingName, data) values (NULL, '%s', 'experimentType', '%s')" % (self.experimentID, self.experimentType)) + + db.commit() + sql.close() + db.close() + self.mlog("Created experiment database %s" % self.expFile) + except: + self.mlog("Could not create experiment database %s" % self.expFile) + else: + self.mlog("Using existing experiment database %s" % self.expFile) + + if 
self.settingsHasKey("loaded_programs"): + self.loadPrograms() + + if self.settingsHasKey("seq_parameters"): + res = self.getSetting("seq_parameters") + (up, cds, down) = res[0].split("\t") + self.setSeqParameters(int(up), int(cds), int(down)) + else: + self.setSeqParameters() + + if self.settingsHasKey("maskLowerCase"): + res = self.getSetting("maskLowerCase") + self.setMaskLowerCase(res[0]) + else: + self.setMaskLowerCase(False) + + if self.settingsHasKey("boundToNextGene"): + res = self.getSetting("boundToNextGene") + self.setBoundToNextGene(res[0]) + else: + self.setBoundToNextGene(False) + + if self.settingsHasKey("experimentType"): + res = self.getSetting("experimentType") + self.experimentType = res[0] + + if self.settingsHasKey("geneDB"): + res = self.getSetting("geneDB") + self.geneDB = res[0] + + if self.dsetLength() > 0: + self.loadGenepool() + + self.createWorkdir() \ No newline at end of file diff --git a/cistematic/experiments/fasta.py b/cistematic/experiments/fasta.py new file mode 100644 index 0000000..1caf031 --- /dev/null +++ b/cistematic/experiments/fasta.py @@ -0,0 +1,88 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. 
class Fasta(Experiment, AnalyzeMotifs, Draw):
    """ experiment that runs the loaded programs directly on a fasta file.
    """
    experimentType = "fasta"


    def run(self, fastaFile):
        """ load fastaFile into the genepool, then run every loaded program on it
            and append each motif a program reports to the results.
        """
        self.loadFasta(fastaFile)
        Experiment.run(self)
        datasetID = self.genepoolID
        for (prog, settingsID) in self.programs:
            prog.inputFile(fastaFile)
            settings = self.getSettingsID(settingsID)[1]
            # NOTE(review): eval() of text read back from the experiment settings; only
            # safe while the experiment database is trusted.
            prog.setSettings(eval(settings))
            runID = self.setRun(prog.name(), datasetID, settingsID)
            tag = str(runID)
            prog.setTagID(tag)
            prog.run()
            theMotifs = prog.getMotifs()
            for mot in theMotifs:
                self.appendResults(mot)


    def loadFasta(self, ffile):
        """ load fasta file into genepool
            Each record is stored under the key (ffile, "seq" + header text after its
            first character). Progress dots are written to stderr.
        """
        f = open(ffile, "r")
        try:
            line = f.readline()
            i = 1
            while line != "":
                seqParts = []
                templine = f.readline()
                while templine != "" and templine[0] != ">":
                    # strip only the line terminator: slicing off the last character lost a
                    # residue when the final line had no newline and kept "\r" on CRLF files
                    seqParts.append(templine.rstrip("\r\n"))
                    templine = f.readline()

                name = "seq%s" % line.strip()[1:]
                # "progress bar" of dots...
                if (i % 10 == 0):
                    sys.stderr.write(".")

                if (i % 1000 == 0):
                    sys.stderr.write("%s\n" % i)
                else:
                    if (i % 100 == 0):
                        sys.stderr.write(" ")

                i = i + 1

                self.genepool[(ffile, name)] = "".join(seqParts)
                line = templine

            # terminate the progress output with an empty line on stdout
            print("")
        finally:
            f.close()
# root directory added to sys.path by the generated run file; overridable through
# the CISTEMATIC_ROOT environment variable
cisRoot = environ.get("CISTEMATIC_ROOT") or "/proj/genome"


class GenExp:
    """ generates (and optionally launches) a stand-alone python script that
        initializes, runs and analyzes one Cistematic experiment.
    """
    dataPath = ""
    dbFilePath = ""
    expFilePath = ""
    expName = ""
    expType = ""
    dataset = []
    expProgs = []
    expSettings = {}
    analysisName = ""


    def __init__(self, path, dbPath, newexp, type="", dset="", progs="", analysis="", createExp=False):
        """ path      - directory the generated script is run from.
            dbPath    - experiment database path; the script path replaces ".db" with "-db.py".
            newexp    - experiment name.
            type      - experiment type.
            dset      - dataset; when empty it is read from the experiment database.
            progs     - program entries; when empty they are read from the "expProgs" setting.
            analysis  - suffix appended to the experiment name to form the analysis name.
            createExp - create the experiment database first.
            The run file is written as part of construction.
        """
        self.dataPath = path
        self.dbFilePath = dbPath
        self.expName = newexp
        self.expFilePath = dbPath.replace(".db", "-db.py")
        self.expType = type
        self.dataset = dset
        self.expProgs = progs
        self.analysisName = newexp + analysis

        if createExp:
            self.createExpDB()

        if len(self.expProgs) == 0:
            exp = loadExperiment(self.expName, self.dbFilePath)
            self.expType = exp.experimentType
            self.expProgs = exp.getSetting("expProgs")

        if len(dset) == 0:
            exp = loadExperiment(self.expName, self.dbFilePath)
            self.expType = exp.experimentType
            self.dataset = exp.getDataset()

        self.createExpRunFile()


    def run(self):
        """ mark the experiment as QUEUED and start the generated script in the background.
        """
        exp = loadExperiment(self.expName, self.dbFilePath)
        exp.setExternalStatus("QUEUED")
        del exp
        # NOTE(review): the paths are interpolated into a shell command line unquoted;
        # they must not contain shell metacharacters or come from untrusted input.
        os.system("cd %s ; nohup /usr/bin/python %s & " % (self.dataPath, self.expFilePath))


    def createExpDB(self):
        """ create the experiment database by instantiating the experiment class for self.expType.
        """
        # fix: this method lacked "self" and used an undefined module name, so it could
        # never run.
        # NOTE(review): module and class names are derived from expType by lower-/upper-casing
        # its first letter (module "phyloFoot" holding class "PhyloFoot"); confirm this
        # holds for every experiment type.
        modName = self.expType[:1].lower() + self.expType[1:]
        className = self.expType[:1].upper() + self.expType[1:]
        module = __import__("cistematic.experiments.%s" % modName, fromlist=[className])
        expClass = getattr(module, className)
        expClass(self.expName, self.dbFilePath)


    def createExpRunFile(self):
        """ write the generated script, section by section, to self.expFilePath.
        """
        outFile = open(self.expFilePath, "w")
        try:
            for section in (self.expHead, self.expInit, self.expPrograms, self.expRun, self.expAnalysis, self.expStatus):
                outFile.write(section())
        finally:
            outFile.close()


    def expHead(self):
        """ return the script header: sys.path setup and one import per supported program.
        """
        strHead = ["# Experiment: %s autogenerated by Cistematic GenExp 0.9.9b\n" % self.expName]
        strHead.append("from sys import path\n")
        strHead.append("cisPath = '%s'\n" % cisRoot)
        strHead.append("if cisPath not in path:\n\tpath.append(cisPath)\n\n")
        for oneProg in supportedPrograms:
            strHead.append("from cistematic.programs.%s import %s\n" % (oneProg[0], oneProg[1]))

        strHead.append("\n")

        return "".join(strHead)


    def expInit(self):
        """ return the script section that loads and initializes the experiment.
            For the orthology, phyloFoot and phyloTest types the "expConsConfig" setting
            (window:threshold:numseq:homologs) selects the conservation calls that are
            emitted; every other type is initialized with the dataset itself.
            Note that self.dataset is reversed in place for the conservation types.
        """
        useMussa = False
        useSeqcomp = False
        startingGenome = ""
        targetGenomes = []
        consGene = []
        geneIDList = []
        # fix: bound up front so the non-conservation branch no longer fails on an
        # unbound name when the section is assembled
        preInitialize = ""

        if self.expType in ["orthology", "phyloFoot", "phyloTest"]:
            exp = loadExperiment(self.expName, self.dbFilePath)
            consSettings = exp.getSetting("expConsConfig")
            (window, threshold, numseq, homologs) = consSettings[0].split(":")
            self.dataset.reverse()
            if homologs != "1" and exp.settingsHasKey("gTargets"):
                gTargets = exp.getSetting("gTargets")
                if len(gTargets) > 0:
                    targetGenomes = gTargets[0].split(":")

            del exp
            for geneID in self.dataset:
                (genome, gid) = geneID
                if (genome not in targetGenomes) and homologs == "1":
                    targetGenomes.append(genome)

                if startingGenome == "":
                    startingGenome = genome

                gid = gid[0]
                # we are assuming that all of these genes will be from the same
                # genome - this is only true (or necessary) if these are not
                # explicitly marked as "homologs" by the user
                if gid not in consGene:
                    consGene.append(gid)

                geneIDList.append((genome, gid))

            if homologs == "1":
                preInitializeList = ["exp.createConservation()\n"]
                preInitializeList.append("exp.loadConservation()\n")
                preInitializeList.append("paralogs = %s\n" % str(geneIDList))
                preInitializeList.append('exp.insertHomologs(paralogs, "genExp")\n\n')
                preInitialize = "".join(preInitializeList)
                del consGene[1:]

            if int(numseq) > 0:
                useSeqcomp = True
            else:
                useMussa = True
        else:
            initializeArguments = str(self.dataset)

        strInit = ["# Initialize Experiment\n"]
        strInit.append("from cistematic.experiments import loadExperiment\n\n")
        strInit.append('exp = loadExperiment("%s", "%s")\n' % (self.expName, self.dbFilePath))
        strInit.append('exp.setExternalStatus("INITIALIZING")\n')
        strInit.append(preInitialize)
        if useMussa or useSeqcomp:
            if len(targetGenomes) > 0:
                strInit.append('exp.initialize("%s", %s, %s)\n\n' % (startingGenome, str(consGene), str(targetGenomes)))
            else:
                strInit.append('exp.initialize("%s", %s)\n\n' % (startingGenome, str(consGene)))

            strInit.append('exp.setExternalStatus("CONSERVATION")\n')
            strInit.append("exp.computeAlignments()\n")
        else:
            strInit.append("exp.initialize(%s)\n\n" % initializeArguments)

        if useMussa:
            strInit.append("exp.mapMussaConservation(window=%s, threshold=%s)\n\n" % (window, threshold))

        if useSeqcomp:
            strInit.append("exp.mapSeqcompConservation(window=%s, threshold=%s, minSequences=%s)\n\n" % (window, threshold, numseq))

        return "".join(strInit)


    def expPrograms(self):
        """ return the script section that instantiates and configures each program.
            Every entry of self.expProgs has the form "module:Class;option;..." followed
            by one trailing character that is dropped.
        """
        index = 1
        genomes = []
        for (genome, geneID) in self.dataset:
            if genome not in genomes:
                genomes.append(genome)

        strProg = ["# Load individual programs and settings\n"]
        for entry in self.expProgs:
            progArray = entry[:-1].split(";")
            (amod, aProg) = progArray[0].split(":")
            strProg.append("prog%s = %s()\n" % (str(index), aProg))
            if len(genomes) == 1 and self.expType == "Simple":
                # fix: "apend" typo raised AttributeError for single-genome Simple experiments
                strProg.append('prog%s.setGenome("%s")\n' % (str(index), genomes[0]))

            if len(progArray) > 1:
                strProg.append("prog%s.setGenExpOptions(%s)\n" % (str(index), str(progArray[1:])))

            strProg.append("exp.appendProgram(prog%s)\n" % (str(index)))
            index += 1

        strProg.append("\n")

        return "".join(strProg)


    def expRun(self):
        """ return the script section that runs the experiment.
        """
        strRun = ["# Run experiment\n",
                  'exp.setExternalStatus("MOTIF FINDING")\n',
                  "exp.run()\n",
                  "\n"
                 ]

        return "".join(strRun)


    def expAnalysis(self):
        """ return the script section that runs the three standard analyses
            (consensus, 1 mismatch, PWM at 90% threshold).
        """
        strAnalysis = ["# Analysis Section\n",
                       'exp.setExternalStatus("ANALYZING")\n',
                       'exp.loadAnalysis("consensus")\n',
                       "exp.annotateConsensus()\n",
                       "exp.mapConsensus()\n",
                       "exp.buildMotifSize()\n",
                       "\n",
                       'exp.loadAnalysis("1 mismatch")\n',
                       "exp.annotateConsensus(numMismatches=1)\n",
                       "exp.mapConsensus(numMismatches=1)\n",
                       "exp.buildMotifSize()\n",
                       "\n",
                       'exp.loadAnalysis("PWM - 90% threshold")\n',
                       "exp.annotateConsensus()\n",
                       "exp.mapMotifs(90.0)\n",
                       "exp.buildMotifSize()\n\n",
                       "\n"
                      ]

        return "".join(strAnalysis)


    def expStatus(self):
        """ return the final script line, which clears the external status file.
        """
        return "exp.resetExternalStatus()\n\n"
class Locate(Experiment, AnalyzeMotifs):
    """ experiment that maps known motifs onto sequences or whole chromosomes;
        run() only executes programs whose name() is "Locator".
    """
    experimentType = "locate"


    def getChromosomeList(self, genome, partition=1, slice=0, db=""):
        """ return a list of the chromosomes available for a given genome.
            The result holds (genome, chromosome name) pairs and is empty when the
            names cannot be retrieved.
        """
        try:
            chromList = getChromosomeNames(genome, db, partition, slice)
        except Exception:
            # best effort: an unreadable genome yields no chromosomes, while
            # KeyboardInterrupt/SystemExit are no longer swallowed
            chromList = []

        return [(genome, entry) for entry in chromList]


    def initialize(self, listType, compoundList):
        """ must initialize with listType='chromosome' in order to scan chromosomes
            In that case compoundList holds (genome, chromosome name) pairs whose
            sequences are read into the genepool; any other listType hands
            compoundList to Experiment.initialize() as a dataset.
        """
        self.genepool = {}
        genepoolKeys = []
        if listType == "chromosome":
            for (genome, chromName) in compoundList:
                self.genepool[(genome, chromName)] = readChromosome(genome, chromName)
                genepoolKeys.append("%s\t%s" % (genome, chromName))

            self.genepoolID = self.setSettings("chromolist", genepoolKeys)
        else:
            Experiment.initialize(self, compoundList)


    def run(self):
        """ run every loaded Locator program on the genepool and append its motifs to
            the results; other programs and failing runs are logged and skipped.
        """
        Experiment.run(self)
        datasetID = self.genepoolID
        for (prog, settingsID) in self.programs:
            try:
                if prog.name() == "Locator":
                    (settingName, settingData) = self.getSettingsID(settingsID)
                    prog.setSettings(settingData)
                    runID = self.setRun(prog.name(), datasetID, settingsID)
                    tag = str(runID)
                    prog.setTagID(tag)
                    prog.run()
                    theMotifs = prog.getMotifs()
                    for mot in theMotifs:
                        self.appendResults(mot)
                else:
                    self.mlog("Can only run Locate experiment with Locator program")
            except Exception:
                self.mlog("Error running program %s with settings %s" % (prog, settingsID))
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
class Orthology(Experiment, AnalyzeMotifs, Conservation, Draw):
    """ experiment over a set of reference genes together with their homologs.
    """
    experimentType = "orthology"
    useConservation = False


    def initialize(self, sGenome="", rGenes=None, tGenomes=None, analysisDB="", conservationDB="", maskNonConserved=True):
        """ build the dataset from the reference genes and their homologs.
            sGenome          - starting genome; empty keeps self.startingGenome.
            rGenes           - reference genes; empty keeps self.refGenes.
            tGenomes         - target genomes; empty keeps self.targetGenomes.
            analysisDB       - passed to createAnalysis().
            conservationDB   - passed to createConservation() when no conservation database is set.
            maskNonConserved - stored as self.useConservation.
        """
        wholeDataset = []
        try:
            self.createAnalysis(dbName=analysisDB)
            if self.consDBName == "":
                self.createConservation(dbName=conservationDB)
        except Exception:
            # deliberately best effort: initialization continues without these databases
            pass

        if sGenome == "":
            sGenome = self.startingGenome
        else:
            self.startingGenome = sGenome

        if not rGenes:
            rGenes = self.refGenes
        else:
            self.refGenes = rGenes

        if not tGenomes:
            tGenomes = self.targetGenomes
        else:
            self.targetGenomes = tGenomes

        self.useConservation = maskNonConserved
        try:
            for gene in rGenes:
                wholeDataset.append((sGenome, [gene]))
                hGenes = self.returnHomologs((sGenome, gene))
                for oneGene in hGenes:
                    wholeDataset.append((oneGene[0], [oneGene[1]]))
        except Exception:
            self.mlog("could not load homologs for (%s,%s,%s)" % (sGenome, str(rGenes), (tGenomes)))

        Experiment.initialize(self, wholeDataset)


    def run(self):
        """ run every loaded program on the genepool sequences, masked to their
            conserved regions when conserved sequence is available; logs a message when
            conservation is required but none is found.
        """
        # NOTE(review): the lists built here are never used afterwards; the calls are kept
        # in case returnHomologs() is relied on for a side effect - confirm and remove.
        for gene in self.refGenes:
            dset = self.returnHomologs((self.startingGenome, gene))
            dset.append((self.startingGenome, gene))

        datasetID = self.genepoolID
        if self.checkForConservedSequence():
            fastaFile = self.createDataFile(geneDict = self.maskNonConservedSequence())
            self._runPrograms(fastaFile, datasetID)
        elif self.useConservation == False:
            fastaFile = self.createDataFile()
            self._runPrograms(fastaFile, datasetID)
        else:
            self.mlog("no conserved sequences to run motif finder on")


    def _runPrograms(self, fastaFile, datasetID):
        """ run each loaded program on fastaFile and append its motifs to the results.
        """
        for (prog, settingsID) in self.programs:
            prog.inputFile(fastaFile)
            runID = self.setRun(prog.name(), datasetID, settingsID)
            tag = str(runID)
            prog.setTagID(tag)
            prog.run()
            theMotifs = prog.getMotifs()
            for mot in theMotifs:
                self.appendResults(mot)
class PhyloFoot(Experiment, AnalyzeMotifs, Conservation, Draw):
    """ experiment that runs the loaded programs once per reference gene, on that
        gene together with its homologs.
    """
    experimentType = "phyloFoot"
    startingGenome = "scerevisiae"
    targetGenomes = []
    refGenes = []
    phyloMap = {}
    useConservation = True


    def initialize(self, sGenome="", rGenes=None, tGenomes=None, analysisDB="", conservationDB="", maskNonConserved=True):
        """ build the dataset from the reference genes and their homologs and remember
            the homologs of each reference gene in self.phyloMap.
            sGenome          - starting genome; empty keeps self.startingGenome.
            rGenes           - reference genes; empty keeps self.refGenes.
            tGenomes         - target genomes; empty keeps self.targetGenomes.
            analysisDB       - passed to createAnalysis().
            conservationDB   - passed to createConservation() when no conservation database is set.
            maskNonConserved - stored as self.useConservation.
        """
        wholeDataset = []
        try:
            self.createAnalysis(dbName=analysisDB)
            if self.consDBName == "":
                self.createConservation(dbName=conservationDB)
        except Exception:
            # deliberately best effort: initialization continues without these databases
            pass

        if sGenome == "":
            sGenome = self.startingGenome
        else:
            self.startingGenome = sGenome

        if not rGenes:
            rGenes = self.refGenes
        else:
            self.refGenes = rGenes

        if not tGenomes:
            tGenomes = self.targetGenomes
        else:
            self.targetGenomes = tGenomes

        # give this instance its own map instead of writing into the dictionary
        # shared by every instance through the class attribute
        if "phyloMap" not in self.__dict__:
            self.phyloMap = {}

        self.useConservation = maskNonConserved
        try:
            for gene in rGenes:
                wholeDataset.append((sGenome, [gene]))
                hGenes = self.returnHomologs((sGenome, gene))
                self.phyloMap[(sGenome, gene)] = hGenes
                for oneGene in hGenes:
                    wholeDataset.append((oneGene[0], [oneGene[1]]))
        except Exception:
            self.mlog("could not load from homology for (%s,%s,%s)" % (sGenome, str(rGenes), str(tGenomes)))

        Experiment.initialize(self, wholeDataset)


    def run(self):
        """ for every reference gene, write the gene and its homologs to a data file
            (masked to conserved sequence when self.useConservation is set), run every
            loaded program on it and append the reported motifs to the results.
        """
        for gene in self.refGenes:
            # work on a copy: appending to the stored list added the reference gene
            # again on every call of run()
            dset = list(self.phyloMap[(self.startingGenome, gene)])
            dset.append((self.startingGenome, gene))
            datasetID = self.setSettingsID("dataset-%s-%s" % (self.startingGenome, gene), dset)
            if self.useConservation:
                fastaFile = self.createDataFile(datasetID, geneDict= self.maskNonConservedSequence(datasetID))
            else:
                fastaFile = self.createDataFile(datasetID)

            for (prog, settingsID) in self.programs:
                prog.inputFile(fastaFile)
                runID = self.setRun(prog.name(), datasetID, settingsID)
                tag = str(runID)
                prog.setTagID(tag)
                prog.run()
                theMotifs = prog.getMotifs()
                for mot in theMotifs:
                    self.appendResults(mot)
class PhyloTest(Experiment, AnalyzeMotifs, Conservation, Draw):
    """ experiment that runs the loaded programs once per genome, on the reference
        genes (starting genome) or on their homologs (each target genome).
    """
    experimentType = "phyloTest"
    phyloTestMap = {}
    useConservation = True


    def initialize(self, sGenome="", rGenes=None, tGenomes=None, analysisDB="", conservationDB="", maskNonConserved=True):
        """ build the dataset from the reference genes and their homologs, grouping the
            genes per genome in self.phyloTestMap.
            sGenome          - starting genome; empty keeps self.startingGenome.
            rGenes           - reference genes; empty keeps self.refGenes.
            tGenomes         - target genomes; empty keeps self.targetGenomes.
            analysisDB       - passed to createAnalysis().
            conservationDB   - passed to createConservation().
            maskNonConserved - stored as self.useConservation.
            Afterwards self.targetGenomes lists the target genomes followed by the
            starting genome.
        """
        wholeDataset = []
        try:
            self.createAnalysis(dbName=analysisDB)
            self.createConservation(dbName=conservationDB)
        except Exception:
            # deliberately best effort: initialization continues without these databases
            pass

        if sGenome == "":
            sGenome = self.startingGenome
        else:
            self.startingGenome = sGenome

        if not rGenes:
            rGenes = self.refGenes
        else:
            self.refGenes = rGenes

        if not tGenomes:
            tGenomes = self.targetGenomes

        # copy, and leave the starting genome out: appending to the caller's list changed
        # it behind the caller's back, and on a second initialize() the starting genome
        # was listed twice and its gene list wiped by the reset loop below
        targets = [genome for genome in tGenomes if genome != sGenome]

        # give this instance its own map instead of writing into the dictionary
        # shared by every instance through the class attribute
        if "phyloTestMap" not in self.__dict__:
            self.phyloTestMap = {}

        self.useConservation = maskNonConserved
        self.phyloTestMap[sGenome] = [(sGenome, gene) for gene in rGenes]
        for genome in targets:
            self.phyloTestMap[genome] = []

        self.targetGenomes = targets + [sGenome]
        try:
            for gene in rGenes:
                wholeDataset.append((sGenome, [gene]))
                hGenes = self.returnHomologs((sGenome, gene))
                for oneGene in hGenes:
                    wholeDataset.append((oneGene[0], [oneGene[1]]))
                    self.phyloTestMap[oneGene[0]].append(oneGene)
        except Exception:
            self.mlog("could not load from homology for (%s,%s,%s)" % (sGenome, str(rGenes), str(tGenomes)))

        Experiment.initialize(self, wholeDataset)


    def run(self):
        """ for every genome in self.targetGenomes, write its genes to a data file
            (masked to conserved sequence when self.useConservation is set), run every
            loaded program on it and append the reported motifs to the results.
        """
        for genome in self.targetGenomes:
            dset = self.phyloTestMap[genome]
            datasetID = self.setSettingsID("dataset-%s" % genome, dset)
            if self.useConservation:
                fastaFile = self.createDataFile(datasetID, geneDict = self.maskNonConservedSequence(datasetID))
            else:
                fastaFile = self.createDataFile(datasetID)

            for (prog, settingsID) in self.programs:
                prog.inputFile(fastaFile)
                runID = self.setRun(prog.name(), datasetID, settingsID)
                tag = str(runID)
                prog.setTagID(tag)
                prog.run()
                theMotifs = prog.getMotifs()
                for mot in theMotifs:
                    self.appendResults(mot)
# +########################################################################### +#
from cistematic.core import getGenomeEntries
import random


def randomEntry(genome):
    """ randomEntry() - returns (organism, entry) where entry is drawn at random
        from the genome's entries, or "" when the genome has no entries
    """
    (org, candidates) = getGenomeEntries(genome)
    if len(candidates) == 0:
        return (org, "")

    return (org, random.choice(candidates))


def randomSet(genomeList, number):
    """ randomSet() - for each genome, shuffles its entries and keeps the
        first 'number' of them; returns a dictionary keyed by genome
    """
    picked = {}
    for genome in genomeList:
        (_org, candidates) = getGenomeEntries(genome)
        if len(candidates) == 0:
            picked[genome] = []
            continue

        random.shuffle(candidates)
        picked[genome] = candidates[:number]

    return picked


def randomSetPercentage(genomeList, percentage):
    """ randomSetPercentage() - like randomSet(), but the number kept per genome
        is round(len(entries) * percentage); returns an empty dictionary
        when percentage lies outside [0, 1]
    """
    picked = {}
    if percentage > 1 or percentage < 0:
        return picked

    for genome in genomeList:
        (_org, candidates) = getGenomeEntries(genome)
        if len(candidates) == 0:
            picked[genome] = []
            continue

        keep = int(round(len(candidates) * percentage))
        random.shuffle(candidates)
        picked[genome] = candidates[:keep]

    return picked
# \ No newline at end of file diff --git a/cistematic/experiments/simple.py b/cistematic/experiments/simple.py new file mode 100644 index 0000000..b758bf2 --- /dev/null +++ b/cistematic/experiments/simple.py @@ -0,0 +1,55 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# a simple use of the experiment class
from experiment import Experiment
from analyzeMotifs import AnalyzeMotifs


class Simple(Experiment, AnalyzeMotifs):
    """ Experiment that runs every configured program once on the gene pool.
    """
    experimentType = "simple"


    def run(self):
        """ Write the gene pool to a data file, then for each
            (program, settingsID) pair apply the stored settings, register
            the run, execute the program and pass every motif it reports to
            self.appendResults().
        """
        Experiment.run(self)
        datasetID = self.genepoolID
        fastaFile = self.createDataFile()
        # The original wrapped this loop body in "if 1: ... else: mlog(...)";
        # the else branch could never execute and has been dropped.
        for (prog, settingsID) in self.programs:
            prog.inputFile(fastaFile)
            settings = self.getSettingsID(settingsID)[1]
            # NOTE(review): eval() executes whatever text is stored for this
            # settings ID; confirm the settings store is trusted.
            prog.setSettings(eval(settings))
            runID = self.setRun(prog.name(), datasetID, settingsID)
            prog.setTagID(str(runID))
            prog.run()
            for mot in prog.getMotifs():
                self.appendResults(mot)
# \ No newline at end of file diff --git a/cistematic/experiments/varyLength.py b/cistematic/experiments/varyLength.py new file mode 100644 index 0000000..ea3f039 --- /dev/null +++ b/cistematic/experiments/varyLength.py @@ -0,0 +1,68 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software.
# +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# a simple use of the experiment class
from experiment import Experiment
from analyzeMotifs import AnalyzeMotifs


class VaryLength(Experiment, AnalyzeMotifs):
    """ Experiment that re-runs every program over a grid of upstream and
        downstream sequence lengths.
    """
    experimentType = "varyLength"


    def setRange(self, step, uMin=0, uMax=0, dMin=0, dMax=0):
        """ Store the step size and the upstream/downstream bounds used by
            run(); the maxima are stored plus one so that range() includes
            them.
        """
        self.stepSize = step
        (self.upstreamMin, self.upstreamMax) = (uMin, uMax + 1)
        (self.downstreamMin, self.downstreamMax) = (dMin, dMax + 1)


    def _runPrograms(self, datasetID, fastaFile, seqLength):
        """ Run each program on fastaFile with its original settings plus the
            given sequence length, appending its motifs to self.results.
        """
        for (prog, settingsID) in self.programs:
            prog.inputFile(fastaFile)
            # start from the original settings, set the new length, then save
            prog.setSettings(self.settings[settingsID])
            prog.setSeqLength(seqLength)
            savedSettingsID = self.setSettings(prog.getSettings())
            runID = self.setRun(prog.name(), datasetID, savedSettingsID)
            prog.setTagID(str(runID))
            prog.run()
            for motif in prog.getMotifs():
                self.results.append(motif)


    def run(self):
        """ For each (downstream, upstream) length pair in the configured
            ranges, reload the gene pool with those lengths, write the data
            file and run every program on it.
        """
        Experiment.run(self)

        for downLen in range(self.downstreamMin, self.downstreamMax, self.stepSize):
            for upLen in range(self.upstreamMin, self.upstreamMax, self.stepSize):
                self.setSeqParameters(upLen, downLen)
                # the gene pool has to be reloaded for the new lengths
                self.loadGenepool()
                datasetID = self.genepoolID
                fastaFile = self.createDataFile()
                self._runPrograms(datasetID, fastaFile, str(upLen + downLen))
# \ No newline at end of file diff --git a/cistematic/genomes/__init__.py b/cistematic/genomes/__init__.py new file mode 100644 index 0000000..4dee924 --- /dev/null +++
b/cistematic/genomes/__init__.py @@ -0,0 +1,1109 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import string +from mmap import * +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +__all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri", + "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus", + "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis", + "btaurus", "drerio", "ecaballus" +] + +supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens", + "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae", + "cbrenneri", "cremanei", "dmelanogaster", "mdomestica", + "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus" +] + +geneDB = {} +chromDict = {} +chromRoot = {} +chromGeneEntries = {} +checkGene = {} +geneInfo = {} +getAllGenes = {} +annotInfo = {} +goInfo = {} +allAnnotInfo = {} +allGOInfo = {} + +def compNT(nt): + """ returns the complementary basepair to base nt + """ + compDict = {"A": "T", "T": "A", + "G": "C", "C": "G", + "S": "S", + "W": "W", + "R": "Y", "Y": "R", + "M": "K", "K": "M", + "H": "D", "D": "H", + "B": "V", "V": "B", + "N": "N", + "a": "t", "t": "a", + "g": "c", "c": "g", + "n": "n", + "z": "z" + } + + return compDict.get(nt, "N") + + +def complement(sequence, length=0): + """ returns the complement of the sequence. 
+ """ + newSeq = "" + seqLength = len(sequence) + if length == seqLength: + seqList = list(sequence) + seqList.reverse() + return "".join(map(compNT, seqList)) + + for index in range(seqLength - 1, seqLength - length - 1, -1): + try: + newSeq += compNT(sequence[index]) + except: + newSeq += "N" + + return newSeq + + +class Genome: + genome = "" + chromosome = "" + version = "" + dbFile = "" + supported = False + oldStyle = False + customAnnotations = False + + + def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False): + self.genome = genome + if chrom != "": + self.chromosome = chrom + + if version != "": + self.version = version + + if dbFile != "": + self.dbFile = dbFile + + if genome in supportedGenomes and dbFile == "": + self.dbFile = geneDB[genome] + self.supported = True + + if inRAM: + self.memoryBacked = True + self.memoryDB = sqlite.connect(":memory:") + self.createGeneDB(inMemory=True) + sql = self.memoryDB.cursor() + try: + sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000") + sql.execute("ATTACH '%s' as diskdb" % self.dbFile) + for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]: + sql.execute("insert into %s select * from diskdb.%s" % (table, table)) + + sql.execute("DETACH diskdb") + except: + if self.dbFile != "": + print "could not import %s" % self.dbFile + + sql.close() + self.createIndices(inMemory=True) + else: + self.memoryBacked = False + self.memoryDB = "" + + + def setGeneDB(self, dbFile): + self.dbFile = dbFile + self.supported = False + + + def setChromosome(self, chrom): + self.chromosome = chrom + + + def checkGene(self, geneID): + """ returns True if the geneID matches an entry in the genome database. 
+ """ + (genome, gID) = geneID + if genome != self.genome: + return False + + try: + stmt = "SELECT chromosome from gene_entries where name = '%s' " % gID + res = self.queryDB(stmt) + if len(res) > 0: + return True + except: + pass + + return False + + + def geneInfo(self, geneID, version="1"): + (genome, gID) = geneID + result = "" + if genome != self.genome: + return False + + try: + stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version) + res = self.queryDB(stmt) + if len(res) > 0: + chrom = res[0] + start = int(res[1]) + stop = int(res[2]) + if start > stop: + temp = stop + stop = start + start = temp + + length = int(res[3]) + orientation = res[4] + result = (chrom, start, stop, length, orientation) + except: + pass + + return result + + + def getallGeneInfo(self, name="", chrom="", version=""): + resultDict = {} + chromList = [] + stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, chromosome, start, stop, orientation) = entry + if name not in resultDict: + resultDict[name] = [] + + if chromosome not in chromList: + resultDict[chromosome] = [] + chromList.append(chromosome) + + resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation)) + + return resultDict + + + def leftGeneDistance(self, geneID, radius=50000, version="1"): + result = radius + res = self.geneInfo(geneID, version) + if res != "": + (chrom, start, stop, length, orientation) = res + if start > stop: + temp = stop + stop = start + start = temp + + stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + rstart = int(entry[1]) + rstop = int(entry[2]) + if 
rstart > rstop: + temp = rstop + rstop = rstart + rstart = temp + + thelength = start - rstart + if (start - rstop) > 0 and (start - rstop) < thelength: + thelength = start - rstop + + if thelength > 0 and thelength < result: + result = thelength + + return result + + def rightGeneDistance(self, geneID, radius=50000, version="1"): + result = radius + res = self.geneInfo(geneID, version) + if res != "": + (chrom, start, stop, length, orientation) = res + if start > stop: + temp = stop + stop = start + start = temp + + stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + rstart = int(entry[1]) + rstop = int(entry[2]) + if rstart > rstop: + temp = rstop + rstop = rstart + rstart = temp + + thelength = rstop - stop + if (rstart - stop) > 0 and (rstart - stop) < thelength: + thelength = rstart - stop + + if thelength > 0 and thelength < result: + result = thelength + + return result + + + def annotInfo(self, geneID): + (genome, gID) = geneID + result = [] + if genome != self.genome: + return False + + stmt = "SELECT description from gene_annotation where name = '%s'" % gID + res = self.queryDB(stmt, fetchall=True) + if len(res) > 0: + for entry in res: + result.append(entry[0]) + + return result + + + def goInfo(self, geneID): + (genome, gID) = geneID + result = [] + if genome != self.genome: + return False + + stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID + res = self.queryDB(stmt, fetchall=True) + if len(res) > 0: + for entry in res: + result.append(string.join(entry,"\t")) + + return result + + + def chromInfo(self, chrom=""): + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom + 
res = self.queryDB(stmt) + result = "%s\t%s" % (res[0], res[1]) + + return result + + + def allGIDs(self): + """ returns [ list of all orf names] + """ + result = [] + stmt = "SELECT distinct name from gene_entries" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGIDsbyGOID(self, GOID): + """ returns [ list of all orf names] that match a particular GOID + """ + result = [] + stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGOIDs(self): + """ returns the list of GOID's in the genome + """ + result = [] + stmt = "SELECT distinct GOID from gene_ontology" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGOterms(self): + """ returns the list of GOID's and their associated GO term in the genome + """ + result = {} + stmt = "SELECT distinct GOID, GOterm from gene_ontology" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result[str(entry[0])] = str(entry[1]) + + return result + + + def getGOIDCount(self, GOID): + """ returns the match count for a particular GOID + """ + stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID + res = self.queryDB(stmt, fetchall=True) + + return len(res) + + + def allAnnotInfo(self): + result = {} + stmt = "SELECT name, description from gene_annotation" + res = self.queryDB(stmt, fetchall=True) + + for entry in res: + geneID = (self.genome, entry[0]) + if geneID not in result: + result[geneID] = [] + + result[(self.genome,entry[0])].append(entry[1]) + + return result + + + def allGoInfo(self): + result = {} + stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + geneID = (self.genome, entry[0]) + if geneID 
not in result: + result[geneID] = [] + + result[(self.genome, entry[0])].append(string.join(entry[1:], "\t")) + + return result + + + def allChromNames(self, partition=1, slice=0): + result = [] + stmt = "SELECT distinct name from chromosomes" + res = self.queryDB(stmt, fetchall=True) + reslen = len(res) + for index in range(reslen): + if (index + slice) % partition == 0: + entry = res[index] + result.append(entry[0]) + + return result + + + def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"): + (chrom, start, stop, length, orientation) = self.geneInfo(gID, version) + seq = self.sequence(chrom, start, length, maskCDS, maskLower) + if orientation == "R": + seq = complement(seq, length) + + return seq + + + def getGeneFeatures(self, gID, type="", version="1"): + results = [] + featureClause = "" + (genome, geneid) = gID + if len(type) > 0: + featureClause = ' and type = "%s" ' % type + + stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s" %s order by start ' % (geneid, str(version), featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (type, chromosome, start, stop, orientation) = entry + results.append((type, chromosome, start, stop, orientation)) + + return results + + + def getallGeneFeatures(self): + resultDict = {} + stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, type, chromosome, start, stop, orientation) = entry + if name not in resultDict: + resultDict[name] = [] + + resultDict[name].append((type, chromosome, start, stop, orientation)) + + return resultDict + + + def getFeatureTypes(self, ftype=''): + """ Returns the distinct feature types available in the sequence_features + tables. Can optionally limit by feature type; the wild-card % can be + used to search feature substrings. 
+ """ + results = [] + whereClause = "" + useLike = False + if "%" in ftype: + useLike = True + + if len(ftype) > 0 and not useLike: + whereClause = 'where type = "%s" ' % ftype + elif len(ftype) > 0: + whereClause = 'where type LIKE "%s" ' % ftype + + stmt = "select distinct type from sequence_features %s" % whereClause + res = self.queryDB(stmt, fetchall=True) + for entry in res: + results.append(entry[0]) + + return results + + def getFeatures(self, ftype, name="", chrom="", version =""): + """ Get features stored in sequence_features that match a feature type, + optionally restricted by name/value, chromosome, or version. Will + search for substrings when ftype and/or name are given with a % to + indicate the location of the wildcard. Returns a dictionary of features + with chromosomes as the keys. + """ + results = {} + chromList = [] + nameClause = "" + chromClause = "" + versionClause = "" + useLike = "=" + if "%" in ftype: + useLike = "LIKE" + + if len(name) > 0: + if "%" in name: + nameLike = "LIKE" + else: + nameLike = "=" + + nameClause = ' and name %s "%s" ' % (nameLike, name) + + if len(chrom) > 0: + chromClause = ' and chromosome = "%s" ' % chrom + + if len(version) > 0: + versionClause = ' and version = "%s" ' % version + + stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, version, chromosome, start, stop, orientation, atype) = entry + if chromosome not in chromList: + results[chromosome] = [] + chromList.append(chromosome) + + results[chromosome].append((name, version, chromosome, start, stop, orientation, atype)) + + return results + + + def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""): + """ Return features that are on a particular stretch of the genome. 
Can optionally + limit by feature type; the wild-card % can be used to search feature substrings. + """ + results = [] + featureClause = "" + qstop = qstart + qlength + useLike = False + if "%" in ftype: + useLike = True + + if len(ftype) > 0 and not useLike: + featureClause = ' and type = "%s" ' % ftype + elif len(ftype) > 0: + featureClause = ' and type LIKE "%s" ' % ftype + + #select all features that encompass our start/stop + stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features that have a "stop" between start and stop that aren't yet in the dataset + stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + if (name, version, chromosome, start, stop, orientation, atype) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features on chromosome that have a "start" between start and stop + stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + if (name, version, chromosome, start, stop, orientation, atype) not in results: + results.append((name, version, 
chromosome, start, stop, orientation, atype)) + + return results + + + def getGenesIntersecting(self, chrom, qstart, qlength): + """ Return features that are on a ptarticular stretch of the genome. Can optionally + limit by feature type; the wild-card % can be used to search feature substrings. + """ + results = [] + qstop = qstart + qlength + atype = "model" + #select all features that encompass our start/stop + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features that have a "stop" between start and stop that aren't yet in the dataset + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + if (name, version, chromosome, start, stop, orientation) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features on chromosome that have a "start" between start and stop + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + if (name, version, chromosome, start, stop, orientation) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + return results + + + def sequence(self, chrom, start, length, 
maskCDS=False, maskLower=False): + seq = "" + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom + res = self.queryDB(stmt) + seqName = res[0] + seqType = res[1] + if seqType == "db": + stmt = "select sequence from sequences where name = '%s' " % seqName + res = self.queryDB(stmt) + seq = res[0][start: start + length] + res = "" + else: + chromFile = open("%s%s" % (cisRoot, seqName), "r") + mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ) + mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ) + mymap.seek(start) + seq = mymap.read(length) + chromFile.close() + + if maskCDS: + stop = start + length - 1 + seqArray = list(seq) + features = self.getFeaturesIntersecting(chrom, start, length, "CDS") + for entry in features: + (name, geneID, chromosome, fstart, fstop, forientation, type) = entry + if fstart < start: + fstart = start + + if fstop > stop: + fstop = stop + + nstart = fstart - start + nstop = fstop - start + 1 + for pos in range(nstop - nstart): + seqArray[nstart + pos] = "N" + + seq = string.join(seqArray, "") + + if maskLower: + seqArray = list(seq) + for index in range(len(seqArray)): + if seqArray[index] in ["a", "c" , "g", "t"]: + seqArray[index] = "N" + + seq = string.join(seqArray, "") + + return seq + + + def getChromosomeSequence(self, chrom=""): + seq = "" + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom + res = self.queryDB(stmt) + if res == None: + print "Could not find chromosome %s" % chrom + return '' + + seqName = res[0] + seqType = res[1] + if seqType == "db": + res = self.queryDB('select sequence from sequences where name = "%s"' % chrom) + seq = res[0] + res = "" + else: + chromFile = open("%s%s" % (cisRoot, seqName), "r") + seq = chromFile.readline() + chromFile.close() + + return seq + + + def 
chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1): + result = [] + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom + res = self.queryDB(stmt, fetchall=True) + if lowerbound > 0 and upperbound > 0: + for entry in res: + start = int(entry[0]) + stop = int(entry[1]) + if stop < start: + start, stop = stop, start + + if stop < lowerbound or start > upperbound: + continue + + result.append((start, stop, entry[2], (self.genome, entry[3]))) + else: + for entry in res: + start = int(entry[0]) + stop = int(entry[1]) + if stop < start: + start, stop = stop, start + + result.append((start, stop, entry[2], (self.genome, entry[3]))) + + return result + + + def createGeneDB(self, dbFile="", inMemory=False): + if len(dbFile) > 0: + self.dbFile = dbFile + + tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar", + "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar", + "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar", + "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar", + "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar", + "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar" + } + + for table in tableDict.keys(): + stmt = "create table %s(%s)" % (table, tableDict[table]) + self.writeDB(stmt, useMemory=inMemory) + + + def createIndices(self, inMemory=False): + indexDict = {"nameIndex1": ("gene_entries", "name"), + "nameIndex2": 
("gene_annotation", "name"), + "nameIndex3": ("gene_ontology", "name"), + "goidIndex": ("gene_ontology", "GOID"), + "nameIndex4": ("sequences", "name"), + "nameIndex5": ("chromosomes", "name"), + "geneIDIndex": ("sequence_features", "name, type"), + "posIndex": ("sequence_features", "chromosome, start, stop, type"), + "typeIndex": ("sequence_features", "type") + } + + for indexName in indexDict.keys(): + (tableName, fields) = indexDict[indexName] + stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields) + self.writeDB(stmt, useMemory=inMemory) + + + def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0): + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature) + self.writeDB(stmt) + + + def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""): + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type) + self.writeDB(stmt) + + + def addAnnotation(self, geneID, description): + (genome, gID) = geneID + stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description) + self.writeDB(stmt) + + + def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""): + (genome, gID) = geneID + stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other) + 
self.writeDB(stmt) + + + def addSequence(self, geneID, seq, seqType, seqLen="-1"): + (genome, gID) = geneID + if int(seqLen) < 0: + seqLen = str(len(seq)) + + stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq) + self.writeDB(stmt) + + + def addChromosomeEntry(self, chromo, seqName, storageType): + stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType) + self.writeDB(stmt) + + + def addSequenceBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)" + for entry in entryArray: + (geneID, seq, seqType, seqLen) = entry + (genome, gID) = geneID + if int(seqLen) < 0: + seqLen = str(len(seq)) + + stmtArray.append((gID, str(seqLen), seqType, seq)) + + self.writeBatchDB(stmt, stmtArray) + + + def addChromosomeEntryBatch(self, entryArray, inMemory=False): + stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, ?, ?, ?)" + self.writeBatchDB(stmt, entryArray, useMemory=inMemory) + + + def addGeneEntryBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " + for entry in entryArray: + (geneID, chrom, start, stop, orientation, feature, version) = entry + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addFeatureEntryBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) 
" + for entry in entryArray: + (geneID, version, chrom, start, stop, orientation, type) = entry + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addAnnotationBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) " + for entry in entryArray: + (geneID, description) = entry + (genome, gID) = geneID + stmtArray.append((gID, description)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addGoInfoBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " + for entry in entryArray: + (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry + (genome, gID) = geneID + stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def extendFeatures(self, featureFile, fileType="cistematic", replace=False): + geneEntryList = [] + geneFeatureList = [] + currentGene = "" + gstart = -1 + gstop = -1 + gchrom = "" + gsense = "" + ftype = "" + senseArray = {"+": "F", + "-": "R", + ".": "F" + } + + featfile = open(featureFile) + if fileType == "cistematic": + for line in featfile: + if line[0] == "#": + continue + + fields = line.strip().split() + if currentGene == "": + currentGene = fields[0] + gchrom = fields[2] + gstart = int(fields[3]) + gsense = senseArray[fields[5]] + + if fields[0] != currentGene: + geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1")) + currentGene = fields[0] + gchrom = fields[2] + gstart = int(fields[3]) + gsense = senseArray[fields[5]] + + lstart = int(fields[3]) + gstop = 
int(fields[4]) + ftype = fields[6] + geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype)) + elif fileType == "UCSC": + for line in featfile: + if line[0] == "#": + continue + + geneFields = line.split("\t") + exonNum = int(geneFields[8]) + exonStarts = geneFields[9].split(",") + exonStops = geneFields[10].split(",") + gchrom = geneFields[2][3:] + gsense = senseArray[geneFields[3]] + gstop = int(geneFields[7]) - 1 + gstart = int(geneFields[6]) - 1 + geneID = ("generic", geneFields[1]) + for index in range(exonNum): + estart = int(exonStarts[index]) - 1 + estop = int(exonStops[index]) - 1 + if estart >= gstart and estop <= gstop: + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS")) + elif estop <= gstart: + if gsense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType)) + elif estart >= gstop: + if gsense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType)) + elif estart <= gstop and estart > gstart: + if gsense == 'F': + fType = '3UTR' + else: + fType = '5UTR' + + geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS")) + geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType)) + elif estart < gstart and estop <= gstop: + if gsense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType)) + geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS")) + else: + if gsense == "F": + fType1 = "5UTR" + fType2 = "3UTR" + else: + fType1 = "3UTR" + fType2 = "5UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1)) + geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS")) + geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2)) + else: + print 
"%s format not supported yet" + featfile.close() + return + + featfile.close() + if replace==True: + self.queryDB("delete from sequence_features", useMemory=True) + self.queryDB("delete from gene_entries", useMemory=True) + + self.addGeneEntryBatch(geneEntryList, inMemory=True) + self.addFeatureEntryBatch(geneFeatureList, inMemory=True) + + + def queryDB(self, stmt, fetchall=False, useMemory=False): + if useMemory or self.memoryBacked: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + sql.execute(stmt) + if fetchall: + results = sql.fetchall() + else: + results = sql.fetchone() + + sql.close() + if not (useMemory or self.memoryBacked): + db.close() + + return results + + + def writeDB(self, stmt, useMemory=False): + if useMemory: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + sql.execute(stmt) + db.commit() + sql.close() + if not useMemory: + db.close() + + + def writeBatchDB(self, stmt, stmtArray, useMemory=False): + if useMemory: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + try: + sql.executemany(stmt, stmtArray) + except: + print "writeBatchDB: problem with %s" % stmt + + db.commit() + sql.close() + if not useMemory: + db.close() + + +def processSql(sqlFile, keyFields={}): + """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the + name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column + name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the + same field name will be used. 
For example, one possible keyFields dict would be: + keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'} + """ + fields = {} + infile = open(sqlFile) + line = "" + while "CREATE TABLE" not in line: + line = infile.readline() + continue + + tabFields = line.split() + tabName = tabFields[2] + if keyFields == {}: + fields["chrom"] = 1 + fields["start"] = 2 + fields["stop"] = 3 + fields["name"] = 4 + else: + index = 0 + for line in infile: + lineFields = line.split() + if lineFields[0] in keyFields.keys(): + if keyFields[lineFields[0]] == "": + outfield = lineFields[0] + else: + outfield = keyFields[lineFields[0]] + + fields[outfield] = index + + index += 1 + + infile.close() + + return (tabName, fields) + + +def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"): + """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature + type in sequence_features, the dataFields in the format returned by processSql(), and a version. + If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records, + rather than use addFeatureEntryBatch() to record the added features. + + Note that numberic values are overloaded into the name field using @ as a delimiter. 
+ """ + records = [] + if dataFields == {}: + dataFields["chrom"] = 1 + dataFields["start"] = 2 + dataFields["stop"] = 3 + dataFields["name"] = 4 + + infile = open(dataFile) + for line in infile: + fields = line.strip().split("\t") + if "name" in dataFields: + recName = fields[dataFields["name"]] + else: + recName = typeName + + if "value" in dataFields: + recName += "@" + fields[dataFields["value"]] + + start = int(fields[dataFields["start"]]) + stop = int(fields[dataFields["stop"]]) + chrom = fields[dataFields["chrom"]][3:] + chrom.replace("_random", "rand") + orientation = "F" + if "orientation" in dataFields: + if fields[dataFields["orientation"]] == "-": + orientation = "R" + + if "version" in dataFields: + version = fields[dataFields["version"]] + + records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper())) + + if genomeObject != "": + genomeObject.addFeatureEntryBatch(records) + else: + print records + + +# Voodoo to get recursive imports working +for genome in supportedGenomes: + importString = "import %s" % genome + exec importString + geneDB[genome] = eval("%s.geneDB" % genome) \ No newline at end of file diff --git a/cistematic/genomes/athaliana.py b/cistematic/genomes/athaliana.py new file mode 100644 index 0000000..628be4d --- /dev/null +++ b/cistematic/genomes/athaliana.py @@ -0,0 +1,269 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. 
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# data for Arabidopsis thaliana
import string
from cistematic.genomes import Genome
from os import environ

if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot

chromSize = {"1": 30432563,
             "2": 19705359,
             "3": 23470805,
             "4": 18585042,
             "5": 26992738
}

background = [0.3180185, 0.1819815, 0.1819815, 0.3180185]
genomeSize = 119186497


def decodeGFF3(cols):
    """ Decode one tab-split GFF3 record.

    Returns (fType, gid, chrom, start, stop, sense, otherDict) where chrom is
    column 0 with its first three characters removed, sense is "F" for "+"
    and "R" for anything else, and otherDict holds the key=value attributes
    of the last column.  gid is taken from the Name or Parent attribute
    (first parent only when several are listed).

    Raises UnboundLocalError when the record has neither a Name nor a Parent
    attribute, exactly as the original code did.
    """
    fType = cols[2]
    chrom = cols[0][3:]
    start = int(cols[3])
    stop = int(cols[4])
    sense = cols[6]
    if sense == "+":
        sense = "F"
    else:
        sense = "R"

    other = cols[-1]
    otherList = other.split(";")
    otherDict = {}
    for otherItem in otherList:
        # attributes that are not exactly "key=value" are skipped
        try:
            (name, value) = otherItem.split("=")
        except ValueError:
            continue

        otherDict[name] = value
        if name == "Name":
            gid = value.strip()

        if name == "Parent":
            if "," in value:
                value = value.split(",")[0]

            gid = value.strip()

    return (fType, gid, chrom, start, stop, sense, otherDict)


def loadChromosome(db, chromID, chromPath, chromOut):
    """ Concatenate the sequence lines of chromPath into cisRoot + chromOut
    and register that file under chromID and "chromo<chromID>".
    """
    seqArray = []
    atGenome = Genome("athaliana", dbFile=db)

    inFile = open(chromPath, "r")
    try:
        # the first line is read and discarded (presumably the FASTA header)
        inFile.readline()
        for line in inFile:
            seqArray.append(line.strip())
    finally:
        inFile.close()

    seq = "".join(seqArray)
    seqLen = len(seq)
    if seqLen < 1:
        print("Problems reading sequence from file")

    print("writing to file %s" % (chromOut))
    outFile = open(cisRoot + chromOut, "w")
    try:
        outFile.write(seq)
    finally:
        outFile.close()

    # drop the reference to the full sequence before touching the database
    seq = ""

    atGenome.addChromosomeEntry(chromID, chromOut, "file")
    # Add alternative chromID - should be A-O and 01-09
    atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")


def loadGeneEntries(db, gFile):
    """ Insert one gene entry for every "gene" record of the GFF3 file gFile.
    """
    geneEntries = []
    atGenome = Genome("athaliana", dbFile=db)
    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            if line[0] == "#" or len(line) < 10:
                continue

            fields = line.strip().split("\t")
            if fields[2] != "gene":
                continue

            (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
            geneID = ("athaliana", gid)
            version = 1
            geneEntries.append((geneID, chrom, start, stop, sense, "gene", version))
    finally:
        geneFile.close()

    print("inserting %d gene entries" % len(geneEntries))
    atGenome.addGeneEntryBatch(geneEntries)


def loadFeatureEntries(db, gFile):
    """ Insert CDS/UTR features from the GFF3 file gFile.

    The file is read twice: the first pass collects the loci that have an
    ncRNA record, the second pass emits the features.  "exon" records are
    only kept for those ncRNA loci.
    """
    featureEntries = []
    # a set: the original list was only ever used for membership tests, and
    # its "gid not in trackedGenes" guard compared a versioned id against
    # stored loci, so it never prevented duplicates
    trackedGenes = set()
    atGenome = Genome("athaliana", dbFile=db)
    featureTranslation = {"CDS": "CDS",
                          "three_prime_UTR": "3UTR",
                          "five_prime_UTR": "5UTR",
                          "miRNA": "5UTR",
                          "exon": "5UTR"
    }
    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            fields = line.split("\t")
            (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
            if fType in ["ncRNA"]:
                (locus, rev) = gid.split(".")
                trackedGenes.add(locus)
    finally:
        geneFile.close()

    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            if line[0] == "c" or len(line) < 10:
                continue

            fields = line.split("\t")
            (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
            locusField = gid.split('.')
            try:
                (locus, rev) = locusField
                rev = rev.strip()
            except ValueError:
                # id is not of the form locus.rev
                locus = gid
                rev = 1

            if fType not in featureTranslation:
                continue

            elif fType == "exon" and locus not in trackedGenes:
                continue

            geneID = ("athaliana", locus)
            featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType]))
    finally:
        geneFile.close()

    print("inserted %d feature entries" % len(featureEntries))
    atGenome.addFeatureEntryBatch(featureEntries)


def loadGeneAnnotations(db, annotPath):
    """ Insert the description (third column) of every line of annotPath,
    keyed by the locus part of the first column.  The first line of the file
    is skipped; malformed lines are ignored.
    """
    geneAnnotations = []
    annotFile = open(annotPath, "r")
    try:
        annotFile.readline()
        lines = annotFile.readlines()
    finally:
        annotFile.close()

    atGenome = Genome("athaliana", dbFile=db)
    for line in lines:
        field = line.split("\t")
        try:
            orfName = field[0].strip()
            if "." in orfName:
                (locus, rev) = orfName.split(".")
                orfName = locus

            description = field[2].strip()
            geneAnnotations.append((("athaliana", orfName), description.replace("'", "p")))
        except (IndexError, ValueError):
            # too few columns, or more than one "." in the identifier
            pass

    print("Adding %d annotations" % len(geneAnnotations))
    atGenome.addAnnotationBatch(geneAnnotations)


def loadGeneOntology(db, goPath):
    """ Insert one GO record per line of the tab-separated file goPath.
    """
    atGenome = Genome("athaliana", dbFile=db)
    goFile = open(goPath, "r")
    try:
        goEntries = goFile.readlines()
    finally:
        goFile.close()

    goArray = []
    for line in goEntries:
        fields = line.split("\t")
        gID = fields[0]
        GOID = fields[4]
        objType = fields[3].replace("'", "p")
        objName = fields[2].replace("'", "p")
        isNot = ""
        GOterm = fields[7]
        evidence = fields[8]
        # GOID[3:] drops the first three characters of the identifier
        goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9]))

    atGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """ Create the gene database file db. """
    atGenome = Genome("athaliana", dbFile=db)
    atGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db. """
    atGenome = Genome("athaliana", dbFile=db)
    atGenome.createIndices()


def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
    """ Build the whole A. thaliana database from the files in downloadDir. """
    genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
    annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
    goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir

    chromos = {"1": "%s/chr1.fas" % downloadDir,
               "2": "%s/chr2.fas" % downloadDir,
               "3": "%s/chr3.fas" % downloadDir,
               "4": "%s/chr4.fas" % downloadDir,
               "5": "%s/chr5.fas" % downloadDir,
               "C": "%s/chrC.fas" % downloadDir,
               "M": "%s/chrM.fas" % downloadDir
    }

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding feature entries")
    loadFeatureEntries(db, genePath)

    print("Adding gene annotations")
    loadGeneAnnotations(db, annotPath)

    print("Adding gene ontology")
    loadGeneOntology(db, goPath)

    for chromID in chromos.keys():
        print("Loading chromosome %s" % chromID)
        loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/btaurus.py b/cistematic/genomes/btaurus.py
# new file mode 100644
# index 0000000..c8a01b6
# --- /dev/null
# +++ b/cistematic/genomes/btaurus.py
# @@ -0,0 +1,271 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# data for Bos Taurus
import string
from cistematic.genomes import Genome
from os import environ

if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot


def loadChromosome(db, chromPath, chromOutPath):
    """ Load every record of the multi-FASTA file chromPath.

    Records shorter than 500000 bases are stored in the database itself;
    longer ones are written to cisRoot + chromOutPath + <id>.bin and only the
    file name is recorded.
    """
    btGenome = Genome("btaurus", dbFile=db)
    inFile = open(chromPath, "r")
    try:
        header = inFile.readline()
        while header != "":
            seqArray = []
            seqLen = 0
            # record id: the header line without its first character
            chromID = header.strip()[1:]
            currentLine = inFile.readline()

            while currentLine != "" and currentLine[0] != ">":
                lineSeq = currentLine.strip()
                seqLen += len(lineSeq)
                seqArray.append(lineSeq)
                currentLine = inFile.readline()

            seq = "".join(seqArray)
            if seqLen < 500000:
                print("Added contig %s to database" % chromID)
                btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
                btGenome.addChromosomeEntry(chromID, chromID, "db")
            else:
                outFileName = "%s%s.bin" % (chromOutPath, chromID)
                outFile = open("%s%s" % (cisRoot, outFileName), "w")
                try:
                    outFile.write(seq)
                finally:
                    outFile.close()

                print("Added contig file %s to database" % outFileName)
                btGenome.addChromosomeEntry(chromID, outFileName, "file")

            # the line that ended the inner loop is the next header (or "")
            header = currentLine
    finally:
        inFile.close()


def loadGeneEntries(db, gFile):
    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES

    Insert one gene entry per line of the tab-separated file gFile, using
    column 0 as id, column 1 as chromosome, column 2 as strand and columns
    5 and 6 as start and stop.
    """
    geneEntries = []
    btGenome = Genome("btaurus", dbFile=db)
    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            cols = line.split("\t")
            gid = cols[0]
            start = int(cols[5])
            stop = int(cols[6])
            sense = cols[2]
            chrom = cols[1]
            if sense == "+":
                sense = "F"
            else:
                sense = "R"

            geneID = ("btaurus", gid)
            gidVersion = 1
            geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    btGenome.addGeneEntryBatch(geneEntries)


def loadGeneAnnotations(db, annotPath):
    """ Insert the description (column 6) of every line of annotPath whose
    first column is non-empty.  Lines with too few columns are ignored.
    """
    geneAnnotations = []
    btGenome = Genome("btaurus", dbFile=db)
    annotFile = open(annotPath, "r")
    try:
        for line in annotFile:
            try:
                cols = line.split("\t")
                locID = cols[0]
                geneDesc = cols[6]
                if len(locID) > 0:
                    geneAnnotations.append((("btaurus", locID), geneDesc.strip().replace("'", "p")))
            except IndexError:
                pass
    finally:
        annotFile.close()

    print("Adding %d annotations" % len(geneAnnotations))
    btGenome.addAnnotationBatch(geneAnnotations)


def loadGeneFeatures(db, gfile):
    """ Split the exons of every gene in gfile into CDS and UTR features.

    Each line supplies strand (column 2), the coding start/stop (columns 5
    and 6), the exon count (column 7) and comma-separated exon starts/stops
    (columns 8 and 9).  One is subtracted from every coordinate.  Exons are
    classified by their position relative to the coding interval.
    """
    geneFile = open(gfile, "r")
    senseArray = {"+": "F",
                  "-": "R",
                  ".": "F"
    }

    seenGenes = set()
    insertArray = []
    try:
        for geneLine in geneFile:
            geneFields = geneLine.split("\t")
            exonNum = int(geneFields[7])
            exonStarts = geneFields[8].split(",")
            exonStops = geneFields[9].split(",")
            chrom = geneFields[1]
            sense = senseArray[geneFields[2]]
            gstop = int(geneFields[6]) - 1
            gstart = int(geneFields[5]) - 1
            geneID = ("btaurus", geneFields[0])

            gidVersion = "1"
            if geneID in seenGenes:
                gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
            else:
                seenGenes.add(geneID)

            # NOTE(review): only the fully-coding branch uses gidVersion; the
            # other branches record version 1, as in the original - confirm
            for index in range(exonNum):
                estart = int(exonStarts[index]) - 1
                estop = int(exonStops[index]) - 1
                if estart >= gstart and estop <= gstop:
                    # exon entirely inside the coding interval
                    insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
                elif estop <= gstart:
                    # exon entirely before the coding interval
                    if sense == "F":
                        fType = "5UTR"
                    else:
                        fType = "3UTR"

                    insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
                elif estart >= gstop:
                    # exon entirely after the coding interval
                    if sense == "F":
                        fType = "3UTR"
                    else:
                        fType = "5UTR"

                    insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
                elif estart >= gstart:
                    # exon starts inside the coding interval and runs past its
                    # end.  The original test here was "estart <= gstop",
                    # which is always true at this point and made the final
                    # branch unreachable.
                    if sense == "F":
                        fType = "3UTR"
                    else:
                        fType = "5UTR"

                    insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
                    insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
                else:
                    # exon starts before the coding interval and ends inside it
                    # NOTE(review): an exon spanning the whole coding interval
                    # also lands here and is not split at gstop
                    if sense == "F":
                        fType = "5UTR"
                    else:
                        fType = "3UTR"

                    insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
                    insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
    finally:
        geneFile.close()

    btGenome = Genome("btaurus", dbFile=db)
    print('Adding %d features' % len(insertArray))
    btGenome.addFeatureEntryBatch(insertArray)


def loadGeneOntology(db, goPath, goDefPath, annotPath):
    """ Insert GO records by joining three tab-separated files: goPath (locus
    id, GO id), goDefPath (GO id, term, evidence; lines starting with "!"
    are skipped) and annotPath (locus id, gene name, ..., description).
    Entries that cannot be joined are ignored.
    """
    btGenome = Genome("btaurus", dbFile=db)
    goDefFile = open(goDefPath, "r")
    try:
        goDefEntries = goDefFile.readlines()
    finally:
        goDefFile.close()

    annotFile = open(annotPath, "r")
    try:
        annotEntries = annotFile.readlines()
    finally:
        annotFile.close()

    goFile = open(goPath, "r")
    try:
        goEntries = goFile.readlines()
    finally:
        goFile.close()

    goDefs = {}
    locus = {}
    goArray = []

    for goDefEntry in goDefEntries:
        if goDefEntry[0] != "!":
            cols = goDefEntry.split("\t")
            goDefs[cols[0]] = (cols[1], cols[2].strip())

    for annotEntry in annotEntries:
        try:
            cols = annotEntry.split("\t")
            locID = cols[0]
            geneName = cols[1]
            geneDesc = cols[6]
            mimID = ""
            if len(locID) > 0:
                locus[locID] = (geneName, geneDesc, mimID)
        except IndexError:
            pass

    for entry in goEntries:
        try:
            fields = entry.split("\t")
            locID = fields[0].strip()
            (gene_name, gene_desc, mimID) = locus[locID]
            goArray.append((("btaurus", locID), fields[1], "", gene_name, "", goDefs[fields[1]][0].replace("'", "p"), goDefs[fields[1]][1], mimID))
        except (IndexError, KeyError):
            #print "locus ID %s could not be added" % locID
            pass

    print("adding %d go entries" % len(goArray))
    btGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """ Create the gene database file db. """
    btGenome = Genome("btaurus", dbFile=db)
    btGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db. """
    btGenome = Genome("btaurus", dbFile=db)
    btGenome.createIndices()


def buildBtaurusDB(db=geneDB):
    """ Build the B. taurus database from the files under cisRoot/download/bt2. """
    genePath = "%s/download/bt2/genscan.txt" % cisRoot
    chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
    chromoOutPath = "/B_taurus/"

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding gene features")
    loadGeneFeatures(db, genePath)

    print("Loading sequences")
    loadChromosome(db, chromoPath, chromoOutPath)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/cbrenneri.py b/cistematic/genomes/cbrenneri.py
# new file mode 100644
# index 0000000..a227faa
# --- /dev/null
# +++ b/cistematic/genomes/cbrenneri.py
# @@ -0,0 +1,197 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# data for Caenorhabditis brenneri
import string
from cistematic.genomes import Genome
from os import environ

if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot


def loadChromosome(db, chromPath, chromOutPath):
    """ Load every record of the multi-FASTA file chromPath.

    The record id is the first whitespace-separated word of the header line
    (without its first character).  Records shorter than 100000 bases are
    stored in the database; longer ones are written to
    cisRoot + chromOutPath + <id>.bin and only the file name is recorded.
    """
    cbGenome = Genome("cbrenneri", dbFile=db)
    inFile = open(chromPath, "r")
    try:
        header = inFile.readline()
        while header != "":
            seqArray = []
            seqLen = 0
            chromHeader = header.strip()[1:].split()
            chromID = chromHeader[0]
            currentLine = inFile.readline()

            while currentLine != "" and currentLine[0] != ">":
                lineSeq = currentLine.strip()
                seqLen += len(lineSeq)
                seqArray.append(lineSeq)
                currentLine = inFile.readline()

            seq = "".join(seqArray)
            if seqLen < 100000:
                print("Added contig %s to database" % chromID)
                cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
                cbGenome.addChromosomeEntry(chromID, chromID, "db")
            else:
                outFileName = "%s%s.bin" % (chromOutPath, chromID)
                outFile = open("%s%s" % (cisRoot, outFileName), "w")
                try:
                    outFile.write(seq)
                finally:
                    outFile.close()

                print("Added contig file %s to database" % outFileName)
                cbGenome.addChromosomeEntry(chromID, outFileName, "file")

            # the line that ended the inner loop is the next header (or "")
            header = currentLine
    finally:
        inFile.close()


def loadGeneEntries(db, gffFile):
    """ Build one gene entry per gene from the start_codon and stop_codon
    records of gffFile.  Genes with a start codon but no stop codon are
    reported and skipped.
    """
    cbGenome = Genome("cbrenneri", dbFile=db)
    geneStart = {}
    geneStop = {}
    geneChrom = {}
    geneSense = {}
    geneEntries = []
    geneFile = open(gffFile, "r")
    try:
        for line in geneFile:
            if line[0] == "#":
                continue

            if line[0] == "\n":
                continue

            field = line[:-1].split("\t")
            if field[2] != "stop_codon" and field[2] != "start_codon":
                continue

            # the gene id is the first double-quoted string of column 8
            idfield = field[8].split('"')
            gid = idfield[1]
            geneID = ("cbrenneri", gid)
            sense = field[6]
            geneChrom[geneID] = field[0].strip()
            if sense == "+":
                geneSense[geneID] = "F"
            else:
                geneSense[geneID] = "R"

            # forward strand takes column 3, any other strand column 4
            if sense == "+":
                position = int(field[3])
            else:
                position = int(field[4])

            if field[2] == "start_codon":
                geneStart[geneID] = position
            else:
                geneStop[geneID] = position
    finally:
        geneFile.close()

    for geneID in geneStart:
        if geneID not in geneStop:
            print("geneID %s not in geneStop - skipping" % str(geneID))
            continue

        geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))

    print("Adding %d gene entries" % len(geneEntries))
    cbGenome.addGeneEntryBatch(geneEntries)


def loadFeatureEntries(db, gffFile):
    """ Insert one CDS feature for every CDS record of gffFile. """
    cbGenome = Genome("cbrenneri", dbFile=db)
    featureEntries = []
    featureFile = open(gffFile, "r")
    try:
        for line in featureFile:
            if line[0] == "#":
                continue

            if line[0] == "\n":
                continue

            field = line.split("\t")
            if field[2].strip() != "CDS":
                continue

            gidrev = field[8].split('"')
            gid = gidrev[1]
            geneID = ("cbrenneri", gid)
            gidVersion = 1

            start = int(field[3])
            stop = int(field[4])

            sense = field[6]
            chrom = field[0].strip()
            if sense == "+":
                sense = "F"
            else:
                sense = "R"

            featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
    finally:
        featureFile.close()

    print("Adding %d feature entries" % len(featureEntries))
    cbGenome.addFeatureEntryBatch(featureEntries)


def createDBFile(db):
    """ Create the gene database file db. """
    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
    cbGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db. """
    cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
    cbGenome.createIndices()


def buildCbrenneriDB(db=geneDB):
    """ Build the C. brenneri database from the files under cisRoot/download. """
    gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
    chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
    chromoOutPath = "/C_brenneri/"

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, gffPath)

    print("Adding feature entries")
    loadFeatureEntries(db, gffPath)

    print("Loading genomic sequence")
    loadChromosome(db, chromoPath, chromoOutPath)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/cbriggsae.py b/cistematic/genomes/cbriggsae.py
# new file mode 100644
# index 0000000..b913728
# --- /dev/null
# +++ b/cistematic/genomes/cbriggsae.py
# @@ -0,0 +1,194 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# data for Caenorhabditis briggsae
import string
from cistematic.genomes import Genome
from os import environ

# Root of the genome data tree; CISTEMATIC_ROOT overrides the built-in default.
if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot


def _senseCode(strand):
    """ Translate a GFF strand column into the "F"/"R" code stored in the
        database. Anything other than "+" is treated as reverse.
    """
    if strand == "+":
        return "F"

    return "R"


def _readFastaRecords(inFile):
    """ Yield (chromID, sequence) for each record of an open FASTA file.

        chromID is the stripped header line minus its first character;
        sequence is the record's lines, each stripped, joined together.
    """
    header = inFile.readline()
    while header != "":
        chromID = header.strip()[1:]
        seqArray = []
        currentLine = inFile.readline()
        while currentLine != "" and currentLine[0] != ">":
            seqArray.append(currentLine.strip())
            currentLine = inFile.readline()

        yield chromID, "".join(seqArray)
        # the line that ended this record is the next record's header
        header = currentLine


def loadChromosome(db, chromPath, chromOutPath):
    """ Load every FASTA record of chromPath into the cbriggsae database db.

        Records shorter than 900000 bases are stored in the database itself.
        Longer ones are written to <cisRoot><chromOutPath><chromID>.bin and
        registered as file-backed chromosome entries.
    """
    cbGenome = Genome("cbriggsae", dbFile=db)
    inFile = open(chromPath, "r")
    # try/finally rather than "with" so the module keeps parsing on old Python 2
    try:
        for chromID, seq in _readFastaRecords(inFile):
            seqLen = len(seq)
            if seqLen < 900000:
                print("Added contig %s to database" % chromID)
                cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
                cbGenome.addChromosomeEntry(chromID, chromID, "db")
            else:
                outFileName = "%s%s.bin" % (chromOutPath, chromID)
                outFile = open("%s%s" % (cisRoot, outFileName), "w")
                try:
                    outFile.write(seq)
                finally:
                    outFile.close()

                print("Added contig file %s to database" % outFileName)
                cbGenome.addChromosomeEntry(chromID, outFileName, "file")
    finally:
        inFile.close()


def loadGeneEntries(db, gffFile):
    """ Add one gene entry and one annotation per distinct "hybrid" CDS line
        of the tab-separated file gffFile to the database db.

        Coordinates are stored as the file's start/stop minus one. Lines with
        fewer than nine columns are ignored.
    """
    cbGenome = Genome("cbriggsae", dbFile=db)
    geneEntries = []
    geneAnnotations = []
    # sets give constant-time duplicate checks; the lists keep file order
    seenEntries = set()
    seenAnnotations = set()
    geneFile = open(gffFile, "r")
    try:
        for line in geneFile:
            # rstrip instead of [:-1] so an unterminated last line is not truncated
            field = line.rstrip("\n").split("\t")
            if len(field) < 9 or field[1] != "hybrid" or field[2] != "CDS":
                continue

            annot = field[8].split('"')
            geneID = ("cbriggsae", annot[1])
            annotation = "".join(annot[3:])
            # The original tried to read a version from an undefined name and
            # always fell back to 1, so 1 is stated directly here.
            gidVersion = 1
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            sense = _senseCode(field[6])
            chrom = field[0].strip()
            entry = (geneID, chrom, start, stop, sense, "CDS", gidVersion)
            if entry not in seenEntries:
                seenEntries.add(entry)
                geneEntries.append(entry)

            if (geneID, annotation) not in seenAnnotations:
                seenAnnotations.add((geneID, annotation))
                geneAnnotations.append((geneID, annotation))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    cbGenome.addGeneEntryBatch(geneEntries)

    print("Adding %d annotations" % len(geneAnnotations))
    cbGenome.addAnnotationBatch(geneAnnotations)


def loadFeatureEntries(db, gffFile):
    """ Add the distinct "hybrid" coding_exon / UTR features of gffFile to
        the database db, renamed to CDS / 3UTR / 5UTR.
    """
    cbGenome = Genome("cbriggsae", dbFile=db)
    featureTranslation = {"coding_exon": "CDS",
                          "three_prime_UTR": "3UTR",
                          "five_prime_UTR": "5UTR"
    }
    featureEntries = []
    seenFeatures = set()
    featureFile = open(gffFile, "r")
    try:
        for line in featureFile:
            field = line.split("\t")
            if len(field) < 9 or field[1] != "hybrid":
                continue

            featureName = field[2].strip()
            if featureName not in featureTranslation:
                continue

            featureType = featureTranslation[featureName]
            geneID = ("cbriggsae", field[8].split('"')[1])
            gidVersion = 1
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            sense = _senseCode(field[6])
            chrom = field[0].strip()
            # a feature is a duplicate when gene, version, span and type all match
            featureKey = (geneID, gidVersion, start, stop, featureType)
            if featureKey not in seenFeatures:
                seenFeatures.add(featureKey)
                featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
    finally:
        featureFile.close()

    print("Adding %d feature entries" % len(featureEntries))
    cbGenome.addFeatureEntryBatch(featureEntries)


def createDBFile(db):
    """ Create the gene database file db for genome version CB25.
    """
    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
    cbGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db.
    """
    cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
    cbGenome.createIndices()


def buildCbriggsaeDB(db=geneDB):
    """ Build the whole C. briggsae database from the files expected under
        <cisRoot>/download: genes, features, sequence, then indices.
    """
    gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
    chromoPath = "%s/download/briggsae_25.fa" % cisRoot
    chromoOutPath = "/C_briggsae/"

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, gffPath)

    print("Adding feature entries")
    loadFeatureEntries(db, gffPath)

    print("Loading genomic sequence")
    loadChromosome(db, chromoPath, chromoOutPath)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/celegans.py b/cistematic/genomes/celegans.py
# new file mode 100644
# index 0000000..e3df297
# --- /dev/null
# +++ b/cistematic/genomes/celegans.py
# @@ -0,0 +1,313 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# data for Caenorhabditis elegans
import string
from cistematic.genomes import Genome
from os import environ

# Root of the genome data tree; CISTEMATIC_ROOT overrides the built-in default.
if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/C_elegans/celegans.genedb" % cisRoot


def _senseCode(strand):
    """ Translate a GFF strand column into the "F"/"R" code stored in the
        database. Anything other than "+" is treated as reverse.
    """
    if strand == "+":
        return "F"

    return "R"


def _readLines(path):
    """ Return all lines of the text file at path, closing it even on error.
    """
    inFile = open(path, "r")
    try:
        return inFile.readlines()
    finally:
        inFile.close()


def _parseGeneModel(attributes):
    """ Extract (gid, gidVersion) from a GFF attribute column.

        The model name is the first double-quoted value, e.g. "F01D4.1b".
        A trailing letter on the part after the first "." is dropped from
        the gid and turned into the version (a -> 1, b -> 2, ...); without
        a letter the version is 1. Returns None when the column has no
        quoted value or the name has no non-empty part after the ".".
    """
    gidrev = attributes.split('"')
    if len(gidrev) < 2:
        return None

    giddots = gidrev[1].split(".")
    if len(giddots) < 2 or giddots[1] == "":
        return None

    # we are ignoring gene models!!!!
    gidGene = giddots[1]
    gidLetter = "a"
    if gidGene[-1] in string.ascii_letters:
        gidLetter = gidGene[-1]
        gidGene = gidGene[:-1]

    gidVersion = ord(gidLetter.lower()) - 96
    return "%s.%s" % (giddots[0], gidGene), gidVersion


def loadChromosome(db, chromID, chromPath, chromOut):
    """ Copy the sequence in chromPath to <cisRoot><chromOut> as one
        unbroken string and register it in db as a file-backed chromosome.

        The first line of chromPath (presumably the FASTA header) is not
        part of the sequence.
    """
    ceGenome = Genome("celegans", dbFile=db)
    inFile = open(chromPath, "r")
    # try/finally rather than "with" so the module keeps parsing on old Python 2
    try:
        inFile.readline()
        seq = "".join([line.strip() for line in inFile])
    finally:
        inFile.close()

    if len(seq) < 1:
        print("Problems reading sequence from file")

    print("writing to file %s" % chromOut)
    outFile = open("%s%s" % (cisRoot, chromOut), "w")
    try:
        outFile.write(seq)
    finally:
        outFile.close()

    ceGenome.addChromosomeEntry(chromID, chromOut, "file")


def loadGeneEntries(db, gffFile):
    """ Add a gene entry to db for every Coding_transcript "Transcript" and
        miRNA "miRNA_primary_transcript" line of gffFile.

        Comment lines, lines with fewer than nine columns and lines whose
        model name cannot be parsed are skipped.
    """
    ceGenome = Genome("celegans", dbFile=db)
    geneEntries = []
    geneFile = open(gffFile, "r")
    try:
        for line in geneFile:
            if line[0] == "#":
                continue

            field = line.split("\t")
            if len(field) < 9:
                continue

            if field[1] not in ("Coding_transcript", "miRNA"):
                continue

            if field[2] not in ("Transcript", "miRNA_primary_transcript"):
                continue

            model = _parseGeneModel(field[8])
            if model is None:
                print("problem processing %s - skipping" % field[8].strip())
                continue

            gid, gidVersion = model
            geneID = ("celegans", gid)
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            sense = _senseCode(field[6])
            chrom = field[0].strip()
            geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    ceGenome.addGeneEntryBatch(geneEntries)


def loadFeatureEntries(db, gffFile):
    """ Add the distinct exon/UTR features of gffFile to db.

        Coding_transcript features are renamed to CDS / 3UTR / 5UTR; every
        miRNA and tRNAscan-SE-1.23 line is stored as CDS.
    """
    ceGenome = Genome("celegans", dbFile=db)
    featureTranslation = {"coding_exon": "CDS",
                          "three_prime_UTR": "3UTR",
                          "five_prime_UTR": "5UTR"
    }
    featureEntries = []
    seenFeatures = set()
    featureFile = open(gffFile, "r")
    try:
        for line in featureFile:
            if line[0] == "#":
                continue

            field = line.split("\t")
            if len(field) < 9:
                continue

            if field[1] not in ("Coding_transcript", "miRNA", "tRNAscan-SE-1.23"):
                continue

            if field[1] == "Coding_transcript":
                featureName = field[2].strip()
                if featureName not in featureTranslation:
                    continue

                featureType = featureTranslation[featureName]
            else:
                # identifying these as CDS will force their masking later on
                featureType = "CDS"

            model = _parseGeneModel(field[8])
            if model is None:
                print("problem processing %s - skipping" % field[8].strip())
                continue

            gid, gidVersion = model
            geneID = ("celegans", gid)
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            sense = _senseCode(field[6])
            chrom = field[0].strip()
            # a feature is a duplicate when gene, version, span and type all match
            featureKey = (geneID, gidVersion, start, stop, featureType)
            if featureKey not in seenFeatures:
                seenFeatures.add(featureKey)
                featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
    finally:
        featureFile.close()

    print("Adding %d feature entries" % len(featureEntries))
    ceGenome.addFeatureEntryBatch(featureEntries)


def loadGeneAnnotations(db, geneIDPath):
    """ Add one annotation per line of the comma-separated file geneIDPath.

        The third column is the gid; the annotation text is the first two
        columns joined by a tab. Lines with fewer than three columns are
        ignored.
    """
    ceGenome = Genome("celegans", dbFile=db)
    geneAnnotations = []
    for line in _readLines(geneIDPath):
        field = line.split(",")
        if len(field) < 3:
            continue

        gid = field[2].strip()
        geneAnnotations.append((("celegans", gid), "%s\t%s" % (field[0], field[1])))

    print("Adding %d annotations" % len(geneAnnotations))
    ceGenome.addAnnotationBatch(geneAnnotations)


def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
    """ Add the first occurrence of each (gene, GO ID) pair of the
        association file goPath to db.

        goDefPath supplies the GO term text; geneIDPath supplies a map from
        the names used in goPath to gids.
    """
    ceGenome = Genome("celegans", version="WS200", dbFile=db)
    # ugly C elegans hack - GO may refer to a gene by the second column of
    # the gene ID table, so map that column to the gid in the third column.
    # NOTE(review): the original comment spoke of mapping both fields, but
    # only the second column has ever been mapped.
    geneIDmap = {}
    for line in _readLines(geneIDPath):
        field = line.split(",")
        if len(field) > 2 and len(field[2].strip()) > 1:
            geneIDmap[field[1]] = field[2].strip()

    goDefs = {}
    for goDefEntry in _readLines(goDefPath):
        if goDefEntry[0] == "!":
            continue

        cols = goDefEntry.split("\t")
        if len(cols) > 2:
            goDefs[cols[0]] = (cols[1], cols[2].strip())

    goArray = []
    seenGO = set()
    for line in _readLines(goPath):
        if line[0] == "!":
            continue

        fields = line.split("\t")
        if len(fields) < 11:
            continue

        name = geneIDmap.get(fields[2], fields[2])
        # drop a trailing isoform letter "a"
        if name[-1:] == "a":
            name = name[:-1]

        gID = name
        GOID = fields[4].split(" ")[0]
        objType = fields[8]
        isNot = fields[3]
        evidence = fields[9]
        # when column 11 lists several names, append the whole column
        if len(fields[10].split("|")) > 1:
            name = "%s|%s" % (name.strip(), fields[10])

        try:
            GOterm = goDefs[GOID][0].replace("'", "p")
        except KeyError:
            print("could no map %s - using GOID only" % GOID)
            GOterm = ""

        if (gID, GOID) not in seenGO:
            seenGO.add((gID, GOID))
            goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))

    print("Adding %d GO entries" % len(goArray))
    ceGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """ Create the gene database file db for genome version WS200.
    """
    ceGenome = Genome("celegans", version="WS200", dbFile=db)
    ceGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db.
    """
    ceGenome = Genome("celegans", version="WS200", dbFile=db)
    ceGenome.createIndices()


def buildCelegansDB(db=geneDB, downloadRoot=""):
    """ Build the whole C. elegans database from the files in downloadRoot,
        which defaults to <cisRoot>/download/ and must end with a separator.
    """
    if downloadRoot == "":
        downloadRoot = "%s/download/" % cisRoot

    geneIDPath = "%sgeneIDs.WS200" % downloadRoot
    goDefPath = "%sGO.terms_and_ids" % downloadRoot
    goPath = "%sgene_association.wb" % downloadRoot

    # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
    chromNames = ["I", "II", "III", "IV", "V", "X"]
    chromos = dict([(name, "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, name)) for name in chromNames])

    # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
    gffPath = "%selegansWS200.gff" % downloadRoot

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, gffPath)

    print("Adding feature entries")
    loadFeatureEntries(db, gffPath)

    print("Adding gene annotations")
    loadGeneAnnotations(db, geneIDPath)

    print("Adding gene ontology")
    loadGeneOntology(db, goPath, goDefPath, geneIDPath)

    for chromID in chromNames:
        print("Loading chromosome %s" % chromID)
        loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/cfamiliaris.py b/cistematic/genomes/cfamiliaris.py
# new file mode 100644
# index 0000000..75c31d1
# --- /dev/null
# +++ b/cistematic/genomes/cfamiliaris.py
# @@ -0,0 +1,264 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software.
#
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE. #
###########################################################################
#
# data for Canis familiaris
import string
from cistematic.genomes import Genome
from os import environ

# Root of the genome data tree; CISTEMATIC_ROOT overrides the built-in default.
if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot


def _senseCode(strand):
    """ Translate a strand column into the "F"/"R" code stored in the
        database. Anything other than "+" is treated as reverse.
    """
    if strand == "+":
        return "F"

    return "R"


def loadChromosome(db, chromID, chromPath, chromOut):
    """ Copy the sequence in chromPath to <cisRoot><chromOut> as one
        unbroken string and register it in db as a file-backed chromosome.

        The first line of chromPath (presumably the FASTA header) is not
        part of the sequence.
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    inFile = open(chromPath, "r")
    # try/finally rather than "with" so the module keeps parsing on old Python 2
    try:
        inFile.readline()
        seq = "".join([line.strip() for line in inFile])
    finally:
        inFile.close()

    if len(seq) < 1:
        print("Problems reading sequence from file")

    print("writing to file %s" % chromOut)
    outFile = open("%s%s" % (cisRoot, chromOut), "w")
    try:
        outFile.write(seq)
    finally:
        outFile.close()

    cfGenome.addChromosomeEntry(chromID, chromOut, "file")


def loadGeneEntries(db, gFile):
    """ Add one gene entry to db for the first GENE line of each gene id
        in the tab-separated file gFile. The first line of gFile is skipped.

        FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    geneEntries = []
    alreadySeen = set()
    geneFile = open(gFile, "r")
    try:
        geneFile.readline()
        for line in geneFile:
            cols = line.split("\t")
            if len(cols) < 12 or cols[11].strip() != "GENE":
                continue

            # the gene id is the part after the first ":" of column 11
            name = cols[10].split(":")
            if len(name) < 2:
                continue

            gid = name[1]
            if gid == "" or gid in alreadySeen:
                continue

            alreadySeen.add(gid)
            start = int(cols[2]) - 1
            stop = int(cols[3]) - 1
            sense = _senseCode(cols[4])
            chrom = cols[1].strip()
            geneID = ("cfamiliaris", gid)
            gidVersion = 1
            geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    cfGenome.addGeneEntryBatch(geneEntries)


def loadGeneFeatures(db, gFile):
    """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
        The first line of gFile is skipped.
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    featureEntries = []
    featureFile = open(gFile, "r")
    try:
        featureFile.readline()
        for line in featureFile:
            cols = line.split("\t")
            if len(cols) < 12:
                continue

            # store the stripped type, i.e. the same value the filter tests
            fType = cols[11].strip()
            if fType not in ("CDS", "UTR", "PSEUDO"):
                continue

            name = cols[10].split(":")
            if len(name) < 2 or name[1] == "":
                continue

            start = int(cols[2]) - 1
            stop = int(cols[3]) - 1
            sense = _senseCode(cols[4])
            chrom = cols[1].strip()
            geneID = ("cfamiliaris", name[1])
            gidVersion = 1
            featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
    finally:
        featureFile.close()

    print("Adding %d feature entries" % len(featureEntries))
    cfGenome.addFeatureEntryBatch(featureEntries)


def loadGeneAnnotations(db, annotPath):
    """ Add one annotation per line of the tab-separated file annotPath:
        column 1 is the locus id, column 7 the description. Apostrophes in
        the description are replaced by "p". Short lines are ignored.
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    geneAnnotations = []
    annotFile = open(annotPath, "r")
    try:
        for line in annotFile:
            cols = line.split("\t")
            if len(cols) < 7:
                continue

            locID = cols[0]
            if len(locID) > 0:
                geneAnnotations.append((("cfamiliaris", locID), cols[6].strip().replace("'", "p")))
    finally:
        annotFile.close()

    print("Adding %d annotations" % len(geneAnnotations))
    cfGenome.addAnnotationBatch(geneAnnotations)


def loadGeneOntology(db, goPath, goDefPath):
    """ Add a GO entry to db for every line of goPath whose first column is
        "9615". goDefPath supplies the GO term definitions.

        Lines that cannot be processed are reported and skipped.
    """
    # geneinfoDB was used here without ever being imported. It is imported
    # locally so that a wrong path can only affect this function.
    # NOTE(review): module path assumed to be cistematic.core.geneinfo - confirm.
    from cistematic.core.geneinfo import geneinfoDB

    cfGenome = Genome("cfamiliaris", dbFile=db)
    idb = geneinfoDB()
    goDefs = {}
    goDefFile = open(goDefPath, "r")
    try:
        for goDefEntry in goDefFile:
            if goDefEntry[0] != "!":
                cols = goDefEntry.split("\t")
                goDefs[cols[0]] = (cols[1], cols[2].strip())
    finally:
        goDefFile.close()

    goArray = []
    prevGID = ""
    geneName = ""
    goFile = open(goPath, "r")
    try:
        for entry in goFile:
            fields = entry.split("\t")
            if fields[0] != "9615":
                continue

            # placeholder so the failure message below always has a value
            locID = "?"
            try:
                locID = fields[1].strip()
                gID = ("cfamiliaris", locID)
                if prevGID != gID:
                    # consecutive lines for one gene reuse its synonym list
                    prevGID = gID
                    geneName = ""
                    geneName = ",".join(idb.geneIDSynonyms(gID))

                goID = fields[2]
                goArray.append((gID, goID, "", geneName, "", goDefs[goID][0].replace("'", "p"), goDefs[goID][1], ""))
            except Exception:
                # best effort, as before: report the line and carry on
                print("locus ID %s could not be added" % locID)
    finally:
        goFile.close()

    print("adding %d go entries" % len(goArray))
    cfGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """ Create the gene database file db.
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    cfGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db.
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)
    cfGenome.createIndices()


def buildDogDB(db=geneDB):
    """ Build the dog database from <cisRoot>/download: genes and features
        from seq_gene.md, then chromosomes 1-38, X and Un, then indices.
    """
    genePath = "%s/download/seq_gene.md" % cisRoot
    chromNames = [str(number) for number in range(1, 39)] + ["X", "Un"]
    chromos = dict([(name, "%s/download/chr%s.fa" % (cisRoot, name)) for name in chromNames])

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding gene features")
    loadGeneFeatures(db, genePath)

    for chromID in chromNames:
        print("Loading chromosome %s" % chromID)
        loadChromosome(db, chromID, chromos[chromID], "/C_familiaris/chromo%s.bin" % chromID)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/cremanei.py b/cistematic/genomes/cremanei.py
# new file mode 100644
# index 0000000..86e5991
# --- /dev/null
# +++ b/cistematic/genomes/cremanei.py
# @@ -0,0 +1,188 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# data for Caenorhabditis remanei
import string
import os
from cistematic.genomes import Genome
from os import environ

# Root of the genome data tree; CISTEMATIC_ROOT overrides the built-in default.
if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot


def _senseCode(strand):
    """ Translate a GFF strand column into the "F"/"R" code stored in the
        database. Anything other than "+" is treated as reverse.
    """
    if strand == "+":
        return "F"

    return "R"


def _readFastaRecords(inFile):
    """ Yield (chromID, sequence) for each record of an open FASTA file.

        chromID is the stripped header line minus its first character;
        sequence is the record's lines, each stripped, joined together.
    """
    header = inFile.readline()
    while header != "":
        # Take the ID from this record's own header. The original read it
        # once per file, so later records reused the first record's ID.
        chromID = header.strip()[1:]
        seqArray = []
        currentLine = inFile.readline()
        while currentLine != "" and currentLine[0] != ">":
            seqArray.append(currentLine.strip())
            currentLine = inFile.readline()

        yield chromID, "".join(seqArray)
        # the line that ended this record is the next record's header
        header = currentLine


def loadChromosomes(db, inPath, chromOutPath):
    """ Load every FASTA record of every file in directory inPath into db.

        Records shorter than 100000 bases are stored in the database itself.
        Longer ones are written to <cisRoot><chromOutPath><chromID>.bin and
        registered as file-backed chromosome entries.
    """
    crGenome = Genome("cremanei", dbFile=db)
    for scontig in os.listdir(inPath):
        inFile = open("%s/%s" % (inPath, scontig), "r")
        # try/finally rather than "with" so the module keeps parsing on old Python 2
        try:
            for chromID, seq in _readFastaRecords(inFile):
                seqLen = len(seq)
                if seqLen < 100000:
                    print("Added contig %s to database" % chromID)
                    crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
                    crGenome.addChromosomeEntry(chromID, chromID, "db")
                else:
                    outFileName = "%s%s.bin" % (chromOutPath, chromID)
                    outFile = open("%s%s" % (cisRoot, outFileName), "w")
                    try:
                        outFile.write(seq)
                    finally:
                        outFile.close()

                    print("Added contig file %s to database" % outFileName)
                    crGenome.addChromosomeEntry(chromID, outFileName, "file")
        finally:
            inFile.close()


def loadGeneEntries(db, gffFile):
    """ Add one gene entry to db per gene id found on the CDS lines of
        gffFile. When a gene id occurs on several CDS lines, the last line
        supplies its coordinates.
    """
    crGenome = Genome("cremanei", dbFile=db)
    # geneID -> (chrom, start, stop, sense); a later line overwrites an earlier one
    geneModels = {}
    geneFile = open(gffFile, "r")
    try:
        for line in geneFile:
            if line[0] == "#" or line[0] == "\n":
                continue

            # rstrip instead of [:-1] so an unterminated last line is not truncated
            field = line.rstrip("\n").split("\t")
            if len(field) < 9 or field[2] != "CDS":
                continue

            geneID = ("cremanei", field[8].split('"')[1])
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            geneModels[geneID] = (field[0].strip(), start, stop, _senseCode(field[6]))
    finally:
        geneFile.close()

    geneEntries = []
    for geneID in geneModels:
        chrom, start, stop, sense = geneModels[geneID]
        geneEntries.append((geneID, chrom, start, stop, sense, "CDS", 1))

    print("Adding %d gene entries" % len(geneEntries))
    crGenome.addGeneEntryBatch(geneEntries)


def loadFeatureEntries(db, gffFile):
    """ Add every coding_exon line of gffFile to db as a CDS feature.
    """
    crGenome = Genome("cremanei", dbFile=db)
    featureEntries = []
    featureFile = open(gffFile, "r")
    try:
        for line in featureFile:
            if line[0] == "#" or line[0] == "\n":
                continue

            field = line.split("\t")
            if len(field) < 9 or field[2].strip() != "coding_exon":
                continue

            geneID = ("cremanei", field[8].split('"')[1])
            gidVersion = 1
            start = int(field[3]) - 1
            stop = int(field[4]) - 1
            sense = _senseCode(field[6])
            chrom = field[0].strip()
            featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
    finally:
        featureFile.close()

    print("Adding %d feature entries" % len(featureEntries))
    crGenome.addFeatureEntryBatch(featureEntries)


def createDBFile(db):
    """ Create the gene database file db for genome version CR20050824.
    """
    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
    crGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db.
    """
    crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
    crGenome.createIndices()


def buildCremaneiDB(db=geneDB):
    """ Build the whole C. remanei database from the files expected under
        <cisRoot>/download: genes, features, sequence, then indices.
    """
    gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
    chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
    chromoOutPath = "/C_remanei/"

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, gffPath)

    print("Adding feature entries")
    loadFeatureEntries(db, gffPath)

    print("Loading genomic sequence")
    loadChromosomes(db, chromoPath, chromoOutPath)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/dmelanogaster.py b/cistematic/genomes/dmelanogaster.py
# new file mode 100644
# index 0000000..5c9f53a
# --- /dev/null
# +++ b/cistematic/genomes/dmelanogaster.py
# @@ -0,0 +1,330 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# data for Drosophila melanogaster
import string
from cistematic.genomes import Genome
from os import environ

# Root of the genome data tree; CISTEMATIC_ROOT overrides the built-in default.
if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot

# Maps the letter that follows "-R" in a transcript name to its version
# string: "A" -> "1", "B" -> "2", ... "Z" -> "26".
version = dict([(letter, str(rank + 1)) for rank, letter in enumerate(string.ascii_uppercase)])


def _readLines(path):
    """ Return all lines of the text file at path, closing it even on error.
    """
    inFile = open(path, "r")
    try:
        return inFile.readlines()
    finally:
        inFile.close()


def _stripDmelPrefix(locID):
    """ Return locID without a leading "Dmel_" tag.
    """
    if locID.startswith("Dmel_"):
        return locID[5:]

    return locID


def _exonFeatures(estart, estop, gstart, gstop, sense):
    """ Split one exon into (start, stop, featureType) pieces relative to the
        span gstart..gstop.

        The part inside the span is "CDS". The part below gstart is 5UTR on
        the forward strand and 3UTR on the reverse strand; the part above
        gstop is the opposite. The branch order matters and is unchanged.
    """
    if sense == "F":
        lowSideType = "5UTR"
        highSideType = "3UTR"
    else:
        lowSideType = "3UTR"
        highSideType = "5UTR"

    if estart >= gstart and estop <= gstop:
        return [(estart, estop, "CDS")]

    if estop <= gstart:
        return [(estart, estop, lowSideType)]

    if estart >= gstop:
        return [(estart, estop, highSideType)]

    if estart <= gstop and estart > gstart:
        return [(estart, gstop, "CDS"), (gstop + 1, estop, highSideType)]

    if estart < gstart and estop <= gstop:
        return [(estart, gstart - 1, lowSideType), (gstart, estop, "CDS")]

    # The exon extends past both ends of the span.
    # NOTE(review): "estop - 1" differs from the two-piece branch above, which
    # ends its high-side piece at estop; kept as found - confirm the intent.
    return [(estart, gstart - 1, lowSideType),
            (gstart, gstop, "CDS"),
            (gstop + 1, estop - 1, highSideType)]


def loadChromosome(db, chromID, chromPath, chromOut):
    """ Copy the sequence in chromPath to <cisRoot><chromOut> as one
        unbroken string and register it in db as a file-backed chromosome.

        The first line of chromPath (presumably the FASTA header) is not
        part of the sequence.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    inFile = open(chromPath, "r")
    # try/finally rather than "with" so the module keeps parsing on old Python 2
    try:
        inFile.readline()
        seq = "".join([line.strip() for line in inFile])
    finally:
        inFile.close()

    if len(seq) < 1:
        print("Problems reading sequence from file")

    print("writing to file %s" % chromOut)
    outFile = open("%s%s" % (cisRoot, chromOut), "w")
    try:
        outFile.write(seq)
    finally:
        outFile.close()

    dmGenome.addChromosomeEntry(chromID, chromOut, "file")


def loadGeneEntries(db, gFile):
    """ Add a gene entry to db for every line of the tab-separated file
        gFile whose name (column 2) has the form <gid>-R<letter>.

        Names without a recognised version letter are skipped.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    geneEntries = []
    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            cols = line.split("\t")
            name = cols[1].split("-R")
            gid = name[0]
            start = int(cols[4])
            stop = int(cols[5])
            # drop the first three characters of the chromosome column
            chrom = cols[2][3:]
            if cols[3] == "-":
                sense = "R"
            else:
                sense = "F"

            geneID = ("dmelanogaster", gid)
            try:
                gidVersion = version[name[1]]
            except (IndexError, KeyError):
                continue

            geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    dmGenome.addGeneEntryBatch(geneEntries)


def loadGeneFeatures(db, gfile):
    """ Add CDS / 5UTR / 3UTR features to db for every exon of every
        versioned transcript in the tab-separated file gfile.

        Columns 7 and 8 minus one bound the CDS part of each exon.
        NOTE(review): presumably the coding start/end of a UCSC gene table -
        confirm against the file format.
    """
    senseArray = {"+": "F",
                  "-": "R",
                  ".": "F"
    }

    insertArray = []
    geneFile = open(gfile, "r")
    try:
        for geneLine in geneFile:
            geneFields = geneLine.split("\t")
            exonNum = int(geneFields[8])
            exonStarts = geneFields[9].split(",")
            exonStops = geneFields[10].split(",")
            chrom = geneFields[2][3:]
            sense = senseArray[geneFields[3]]
            gstop = int(geneFields[7]) - 1
            gstart = int(geneFields[6]) - 1
            name = geneFields[1].split("-R")
            geneID = ("dmelanogaster", name[0])
            try:
                gidVersion = version[name[1]]
            except (IndexError, KeyError):
                continue

            # index by exonNum: only the first exonNum list items are exons
            for index in range(exonNum):
                estart = int(exonStarts[index]) - 1
                estop = int(exonStops[index]) - 1
                for start, stop, fType in _exonFeatures(estart, estop, gstart, gstop, sense):
                    insertArray.append((geneID, gidVersion, chrom, start, stop, sense, fType))
    finally:
        geneFile.close()

    dmGenome = Genome("dmelanogaster", dbFile=db)
    print("Adding %d features" % len(insertArray))
    dmGenome.addFeatureEntryBatch(insertArray)


def loadGeneAnnotations(db, annotPath):
    """ Add an annotation to db for every line of annotPath whose first
        column is "7227": column 4 (minus a "Dmel_" prefix) is the locus id,
        column 5 the description. Apostrophes in the description are
        replaced by "p". Short lines are ignored.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    geneAnnotations = []
    annotFile = open(annotPath, "r")
    try:
        for line in annotFile:
            cols = line.split("\t")
            if len(cols) < 5 or cols[0] != "7227":
                continue

            locID = _stripDmelPrefix(cols[3])
            if len(locID) > 0:
                geneAnnotations.append((("dmelanogaster", locID), cols[4].strip().replace("'", "p")))
    finally:
        annotFile.close()

    print("Adding %d annotations" % len(geneAnnotations))
    dmGenome.addAnnotationBatch(geneAnnotations)


def loadGeneOntology(db, goPath, goDefPath, annotPath):
    """ Add a GO entry to db for every "7227" line of goPath whose gene
        (column 2) has a locus id in annotPath and whose GO ID (column 3)
        is defined in goDefPath. Other lines are silently skipped.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    goDefs = {}
    for goDefEntry in _readLines(goDefPath):
        if goDefEntry[0] == "!":
            continue

        cols = goDefEntry.split("\t")
        goDefs[cols[0]] = (cols[1], cols[2].strip())

    # column 2 of the annotation file -> locus id (column 4)
    locus = {}
    for annotEntry in _readLines(annotPath):
        cols = annotEntry.split("\t")
        if len(cols) < 4 or cols[0] != "7227":
            continue

        locID = cols[3].strip()
        if len(locID) > 0:
            locus[cols[1].strip()] = locID

    goArray = []
    for entry in _readLines(goPath):
        if entry[0] == "!" or entry[:4] != "7227":
            continue

        fields = entry.split("\t")
        try:
            geneName = fields[1].strip()
            locID = _stripDmelPrefix(locus[geneName])
            GOID = fields[2]
            goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", goDefs[GOID][0].replace("'", "p"), goDefs[GOID][1], ""))
        except (IndexError, KeyError):
            # unknown gene, unknown GO ID or short line: skip, as before
            pass

    print("adding %d go entries" % len(goArray))
    dmGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """ Create the gene database file db.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    dmGenome.createGeneDB(db)


def createDBindices(db):
    """ Create the indices of the gene database db.
    """
    dmGenome = Genome("dmelanogaster", dbFile=db)
    dmGenome.createIndices()


def buildDmelanogasterDB(db=geneDB):
    """ genes and annotations are from UCSC. GO association file is from geneontology.org.
    """
    genePath = "%s/download/flyBaseGene.txt" % cisRoot
    annotPath = "%s/download/gene_info" % cisRoot
    goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
    goPath = "%s/download/gene2go" % cisRoot
    chromNames = ["2L", "2R", "2LHet", "2RHet", "3L", "3LHet", "3R", "3RHet",
                  "4", "X", "XHet", "YHet", "U", "Uextra", "M"]
    chromos = dict([(name, "%s/download/chr%s.fa" % (cisRoot, name)) for name in chromNames])

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding gene features")
    loadGeneFeatures(db, genePath)

    print("Adding gene annotations")
    loadGeneAnnotations(db, annotPath)

    print("Adding gene ontology")
    loadGeneOntology(db, goPath, goDefPath, annotPath)

    for chromID in chromNames:
        print("Loading chromosome %s" % chromID)
        loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# \ No newline at end of file
# diff --git a/cistematic/genomes/drerio.py b/cistematic/genomes/drerio.py
# new file mode 100644
# index 0000000..7541db8
# --- /dev/null
# +++ b/cistematic/genomes/drerio.py
# @@ -0,0 +1,250 @@
+########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Danio Rerio +import string, os +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/D_rerio/drerio.genedb" % cisRoot + + +def loadChromosome(db, chromPath, chromOutPath): + seqArray = [] + seqLen = 0 + drGenome = Genome("drerio", dbFile=db) + files = os.listdir(chromPath) + for filename in files: + inFile = open("%s/%s" % (chromPath, filename), "r") + header = inFile.readline() + while header != "": + seqArray = [] + seqLen = 0 + chromID = header.strip()[1:] + currentLine = inFile.readline() + while currentLine != "" and currentLine[0] != ">": + lineSeq = currentLine.strip() + seqLen += len(lineSeq) + seqArray.append(lineSeq) + currentLine = inFile.readline() + + seq = string.join(seqArray, "") + if seqLen < 250000: + print "Added contig %s to database" % chromID + drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen)) + drGenome.addChromosomeEntry(chromID, chromID, "db") + else: + outFileName = "%s%s.bin" % (chromOutPath, chromID) + outFile = open("%s%s" % (cisRoot, outFileName), "w") + outFile.write(seq) + outFile.close() + print "Added contig file %s to database" % outFileName + drGenome.addChromosomeEntry(chromID, outFileName, "file") + + header = currentLine + + inFile.close() + + +def loadGeneEntries(db, gFile): + geneEntries = [] + seenGIDs = [] + drGenome = Genome("drerio", dbFile=db) + geneFile = open(gFile, "r") + idb = geneinfoDB() + for line in geneFile: + cols = line.split("\t") + gid = cols[0] + try: + tempID = idb.getGeneID("drerio", gid) + if not len(tempID): + print "could not find %s" % gid + continue + + geneInfo = idb.getGeneInfo(tempID) + gid = geneInfo[1] + except: + continue + + if gid == "": + continue + + start = int(cols[6]) + stop = 
int(cols[7]) + sense = cols[3] + chrom = cols[2] + if sense == "-": + sense = "R" + else: + sense = "F" + + geneID = ("drerio", gid) + if geneID in seenGIDs: + gidVersion = "2" + else: + gidVersion = "1" + seenGIDs.append(geneID) + + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + drGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneFeatures(db, gfile): + geneFile = open(gfile, "r") + idb = geneinfoDB() + seenGIDs = [] + senseArray = {"+": "F", + "-": "R", + ".": "F" + } + + insertArray = [] + for geneLine in geneFile: + geneFields = geneLine.split("\t") + exonNum = int(geneFields[8]) + exonStarts = geneFields[9].split(",") + exonStops = geneFields[10].split(",") + chrom = geneFields[2] + sense = senseArray[geneFields[3]] + gstart = int(geneFields[6]) - 1 + gstop = int(geneFields[7]) - 1 + gid = geneFields[0] + try: + tempID = idb.getGeneID("drerio", gid) + if not len(tempID): + print "could not find %s" % gid + continue + + geneInfo = idb.getGeneInfo(tempID) + gid = geneInfo[1] + except: + continue + + if gid == "": + continue + + geneID = ("drerio", gid) + if geneID in seenGIDs: + gidVersion = "2" + else: + gidVersion = "1" + + for index in range(exonNum): + estart = int(exonStarts[index]) - 1 + estop = int(exonStops[index]) - 1 + if estart >= gstart and estop <= gstop: + insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS")) + elif estop <= gstart: + if sense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType)) + elif estart >= gstop: + if sense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType)) + elif estart <= gstop and estart > gstart: + if sense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS")) + 
insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType)) + elif estart < gstart and estop <= gstop: + if sense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType)) + insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS")) + else: + if sense == "F": + fType1 = "5UTR" + fType2 = "3UTR" + else: + fType1 = "3UTR" + fType2 = "5UTR" + + insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1)) + insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS")) + insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2)) + + geneFile.close() + drGenome = Genome("drerio", dbFile=db) + print "Adding %d features" % len(insertArray) + drGenome.addFeatureEntryBatch(insertArray) + + +def createDBFile(db): + drGenome = Genome("drerio", dbFile=db) + drGenome.createGeneDB(db) + + +def createDBindices(db): + drGenome = Genome("drerio", dbFile=db) + drGenome.createIndices() + + +def buildDrerioDB(db=geneDB): + """ genes and annotations are from UCSC (dr3). 
+ """ + #genePath = "%s/download/xenoRefFlat.txt" % cisRoot + chromoPath = "%s/download/dr3" % cisRoot + chromoOutPath = "/D_rerio/" + print "Creating database %s" % db + createDBFile(db) + + #print "Adding gene entries" + #loadGeneEntries(db, genePath) + + #print "Adding gene features" + #loadGeneFeatures(db, genePath) + + print "Loading chromosomes" + loadChromosome(db, chromoPath, chromoOutPath) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/ecaballus.py b/cistematic/genomes/ecaballus.py new file mode 100644 index 0000000..e845c42 --- /dev/null +++ b/cistematic/genomes/ecaballus.py @@ -0,0 +1,198 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# data for Equus Caballus +import string +from cistematic.genomes import Genome +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + ecGenome = Genome("ecaballus", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + ecGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + alreadySeen = [] + ecGenome = Genome("ecaballus", dbFile=db) + geneFile = open(gFile, "r") + geneFile.readline() + for line in geneFile: + cols = line.split("\t") + if cols[11].strip() != "GENE": + continue + + name = cols[10].split(":") + gid = name[1] + if gid == "" or gid in alreadySeen: + continue + + alreadySeen.append(gid) + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("ecaballus", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + 
ecGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneFeatures(db, gFile): + """ Load gene features such as CDS, UTR, and PSEUDO from the gene file. + """ + featureEntries = [] + ecGenome = Genome("ecaballus", dbFile=db) + featureFile = open(gFile, "r") + featureFile.readline() + for line in featureFile: + cols = line.split("\t") + if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]: + continue + + fType = cols[11] + name = cols[10].split(":") + gid = name[1] + if gid == "": + continue + + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("ecaballus", gid) + gidVersion = 1 + featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType)) + + print "Adding %d feature entries" % len(featureEntries) + ecGenome.addFeatureEntryBatch(featureEntries) + + +def createDBFile(db): + ecGenome = Genome("ecaballus", dbFile=db) + ecGenome.createGeneDB(db) + + +def createDBindices(db): + ecGenome = Genome("ecaballus", dbFile=db) + ecGenome.createIndices() + + +def buildHorseDB(db=geneDB): + genePath = "%s/download/seq_gene.md" % cisRoot + chromos = {"1": "%s/download/chr1.fa" % cisRoot, + "2": "%s/download/chr2.fa" % cisRoot, + "3": "%s/download/chr3.fa" % cisRoot, + "4": "%s/download/chr4.fa" % cisRoot, + "5": "%s/download/chr5.fa" % cisRoot, + "6": "%s/download/chr6.fa" % cisRoot, + "7": "%s/download/chr7.fa" % cisRoot, + "8": "%s/download/chr8.fa" % cisRoot, + "9": "%s/download/chr9.fa" % cisRoot, + "10": "%s/download/chr10.fa" % cisRoot, + "11": "%s/download/chr11.fa" % cisRoot, + "12": "%s/download/chr12.fa" % cisRoot, + "13": "%s/download/chr13.fa" % cisRoot, + "14": "%s/download/chr14.fa" % cisRoot, + "15": "%s/download/chr15.fa" % cisRoot, + "16": "%s/download/chr16.fa" % cisRoot, + "17": "%s/download/chr17.fa" % cisRoot, + "18": "%s/download/chr18.fa" % cisRoot, + "19": "%s/download/chr19.fa" % cisRoot, + "20": "%s/download/chr20.fa" % 
cisRoot, + "21": "%s/download/chr21.fa" % cisRoot, + "22": "%s/download/chr22.fa" % cisRoot, + "23": "%s/download/chr23.fa" % cisRoot, + "24": "%s/download/chr24.fa" % cisRoot, + "25": "%s/download/chr25.fa" % cisRoot, + "26": "%s/download/chr26.fa" % cisRoot, + "27": "%s/download/chr27.fa" % cisRoot, + "28": "%s/download/chr28.fa" % cisRoot, + "29": "%s/download/chr29.fa" % cisRoot, + "30": "%s/download/chr30.fa" % cisRoot, + "31": "%s/download/chr31.fa" % cisRoot, + "M": "%s/download/chrM.fa" % cisRoot, + "X": "%s/download/chrX.fa" % cisRoot, + "Un": "%s/download/chrUn.fa" % cisRoot + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding gene features" + loadGeneFeatures(db, genePath) + + for chromID in chromos.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromos[chromID], "/E_caballus/chromo%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/ggallus.py b/cistematic/genomes/ggallus.py new file mode 100644 index 0000000..551a116 --- /dev/null +++ b/cistematic/genomes/ggallus.py @@ -0,0 +1,274 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. 
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Gallus gallus +import string +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/G_gallus/ggallus.genedb" % cisRoot + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + ggGenome = Genome("ggallus", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + ggGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + alreadySeen = [] + ggGenome = Genome("ggallus", dbFile=db) + geneFile = open(gFile, "r") + geneFile.readline() + for line in geneFile: + if "|" in line: + continue + + cols = line.split("\t") + if cols[11].strip() != "GENE": + continue + + name = cols[10].split(":") + gid = name[1] + if gid == "" or gid in alreadySeen: + continue + + alreadySeen.append(gid) + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("ggallus", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + ggGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneFeatures(db, gFile): + """ Load gene features such as CDS, UTR, and PSEUDO from the gene file. 
+ """ + featureEntries = [] + ggGenome = Genome("ggallus", dbFile=db) + featureFile = open(gFile, "r") + featureFile.readline() + for line in featureFile: + if "|" in line: + continue + + cols = line.split("\t") + if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]: + continue + + fType = cols[11] + name = cols[10].split(":") + gid = name[1] + if gid == "": + continue + + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("ggallus", gid) + gidVersion = 1 + featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType)) + + print "Adding %d feature entries" % len(featureEntries) + ggGenome.addFeatureEntryBatch(featureEntries) + + +def loadGeneOntology(db, goPath, goDefPath): + ggGenome = Genome("ggallus", dbFile=db) + goDefFile = open(goDefPath, "r") + goFile = open(goPath, "r") + idb = geneinfoDB() + goDefs = {} + goArray = [] + for goDefEntry in goDefFile: + if goDefEntry[0] != "!": + cols = goDefEntry.split("\t") + goDefs[cols[0]] = (cols[1], cols[2].strip()) + + goEntries = goFile.readlines() + prevGID = "" + for entry in goEntries: + try: + fields = entry.split("\t") + if fields[0] != "9031": + continue + + locID = fields[1].strip() + gID = ("ggallus", locID) + if prevGID != gID: + prevGID = gID + gene_name = "" + synonyms = idb.geneIDSynonyms(gID) + if len(synonyms) >0: + gene_name = string.join(synonyms, ",") + + goArray.append((gID, fields[2], "", gene_name, "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], "")) + except: + print "locus ID %s could not be added" % locID + pass + + print "adding %d go entries" % len(goArray) + ggGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + ggGenome = Genome("ggallus", dbFile=db) + ggGenome.createGeneDB(db) + + +def createDBindices(db): + ggGenome = Genome("ggallus", dbFile=db) + ggGenome.createIndices() + + +def buildChickenDB(db=geneDB): + genePath = 
"%s/download/seq_gene.md" % cisRoot + goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids + goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz + chromos = {"1": "%s/download/chr1.fa" % cisRoot, + "2": "%s/download/chr2.fa" % cisRoot, + "3": "%s/download/chr3.fa" % cisRoot, + "4": "%s/download/chr4.fa" % cisRoot, + "5": "%s/download/chr5.fa" % cisRoot, + "6": "%s/download/chr6.fa" % cisRoot, + "7": "%s/download/chr7.fa" % cisRoot, + "8": "%s/download/chr8.fa" % cisRoot, + "9": "%s/download/chr9.fa" % cisRoot, + "10": "%s/download/chr10.fa" % cisRoot, + "11": "%s/download/chr11.fa" % cisRoot, + "12": "%s/download/chr12.fa" % cisRoot, + "13": "%s/download/chr13.fa" % cisRoot, + "14": "%s/download/chr14.fa" % cisRoot, + "15": "%s/download/chr15.fa" % cisRoot, + "16": "%s/download/chr16.fa" % cisRoot, + "17": "%s/download/chr17.fa" % cisRoot, + "18": "%s/download/chr18.fa" % cisRoot, + "19": "%s/download/chr19.fa" % cisRoot, + "20": "%s/download/chr20.fa" % cisRoot, + "21": "%s/download/chr21.fa" % cisRoot, + "22": "%s/download/chr22.fa" % cisRoot, + "23": "%s/download/chr23.fa" % cisRoot, + "24": "%s/download/chr24.fa" % cisRoot, + "25": "%s/download/chr25.fa" % cisRoot, + "26": "%s/download/chr26.fa" % cisRoot, + "27": "%s/download/chr27.fa" % cisRoot, + "28": "%s/download/chr28.fa" % cisRoot, + "32": "%s/download/chr32.fa" % cisRoot, + "W": "%s/download/chrW.fa" % cisRoot, + "Z": "%s/download/chrZ.fa" % cisRoot, + "M": "%s/download/chrM.fa" % cisRoot, + "E22C19W28_E50C23": "%s/download/chrE22C19W28_E50C23.fa" % cisRoot, + "E64": "%s/download/chrE64.fa" % cisRoot, + "1_random": "%s/download/chr1_random.fa" % cisRoot, + "2_random": "%s/download/chr2_random.fa" % cisRoot, + "4_random": "%s/download/chr4_random.fa" % cisRoot, + "5_random": "%s/download/chr5_random.fa" % cisRoot, + "6_random": "%s/download/chr6_random.fa" % cisRoot, + "7_random": "%s/download/chr7_random.fa" % 
cisRoot, + "8_random": "%s/download/chr8_random.fa" % cisRoot, + "10_random": "%s/download/chr10_random.fa" % cisRoot, + "11_random": "%s/download/chr11_random.fa" % cisRoot, + "12_random": "%s/download/chr12_random.fa" % cisRoot, + "13_random": "%s/download/chr13_random.fa" % cisRoot, + "16_random": "%s/download/chr16_random.fa" % cisRoot, + "17_random": "%s/download/chr17_random.fa" % cisRoot, + "18_random": "%s/download/chr18_random.fa" % cisRoot, + "20_random": "%s/download/chr20_random.fa" % cisRoot, + "22_random": "%s/download/chr22_random.fa" % cisRoot, + "25_random": "%s/download/chr25_random.fa" % cisRoot, + "28_random": "%s/download/chr28_random.fa" % cisRoot, + "Un_random": "%s/download/chrUn_random.fa" % cisRoot, + "W_random": "%s/download/chrW_random.fa" % cisRoot, + "E64_random": "%s/download/chrE64_random.fa" % cisRoot, + "Z_random": "%s/download/chrZ_random.fa" % cisRoot, + "E22C19W28_E50C23_random": "%s/download/chrE22C19W28_E50C23_random.fa" % cisRoot + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + #print "Adding gene annotations" + #loadGeneAnnotations(db, annotPath) + + print "Adding gene features" + loadGeneFeatures(db, genePath) + + for chromID in chromos.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromos[chromID], "/G_gallus/chromo%s.bin" % chromID) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/hsapiens.py b/cistematic/genomes/hsapiens.py new file mode 100644 index 0000000..e9c677c --- /dev/null +++ b/cistematic/genomes/hsapiens.py @@ -0,0 +1,264 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# 
All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Homo sapiens +import string +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + hsGenome = Genome("hsapiens", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + hsGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile, cDict): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + hsGenome = Genome("hsapiens", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + cols = line.split("\t") + if cols[11] != "GENE": + continue + + if cols[12] == "Celera": + continue + + chrom = cols[1].strip() + if chrom not in cDict: + continue + + name = cols[10].split(":") + gid = name[1] + start = int(cols[2]) + stop = int(cols[3]) + sense = cols[4] + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("hsapiens", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + hsGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneFeatures(db, gFile, cDict): + """ Load gene features such as CDS, UTR, and PSEUDO from the gene file. 
+ """ + featureEntries = [] + hsGenome = Genome("hsapiens", dbFile=db) + featureFile = open(gFile, "r") + for line in featureFile: + cols = line.split("\t") + if cols[11] not in ["CDS", "UTR", "PSEUDO"]: + continue + if cols[12] == "Celera": + continue + chrom = cols[1].strip() + if chrom not in cDict: + continue + + fType = cols[11] + name = cols[10].split(":") + gid = name[1] + start = int(cols[2]) + stop = int(cols[3]) + sense = cols[4] + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("hsapiens", gid) + gidVersion = 1 + featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType)) + + print "Adding %d feature entries" % len(featureEntries) + hsGenome.addFeatureEntryBatch(featureEntries) + + +def loadGeneAnnotations(db): + geneAnnotations = [] + idb = geneinfoDB() + hsGenome = Genome("hsapiens", dbFile=db) + gidList = hsGenome.allGIDs() + for locID in gidList: + gID = ("hsapiens", locID) + geneDescArray = idb.getDescription(gID) + geneDesc = "" + for entry in geneDescArray: + geneDesc += "," + geneDesc += entry.strip() + + if len(geneDescArray) > 0: + geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p"))) + + print "Adding %d annotations" % len(geneAnnotations) + hsGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneOntology(db, goPath, goDefPath): + hsGenome = Genome("hsapiens", dbFile=db) + goDefFile = open(goDefPath, "r") + goFile = open(goPath, "r") + idb = geneinfoDB() + goDefs = {} + goArray = [] + for goDefEntry in goDefFile: + if goDefEntry[0] != "!": + cols = goDefEntry.split("\t") + goDefs[cols[0]] = (cols[1], cols[2].strip()) + + prevGID = "" + index = 0 + for entry in goFile: + try: + fields = entry.split("\t") + if fields[0] != "9606": + continue + + index += 1 + if index % 1000 == 0: + print "adding 1000 go entries" + hsGenome.addGoInfoBatch(goArray) + goArray = [] + + locID = fields[1].strip() + gID = ("hsapiens", locID) + if prevGID != gID: + prevGID = gID + gene_name = "" + synonyms = 
idb.geneIDSynonyms(gID) + if len(synonyms) >0: + for entry in synonyms: + gene_name += "," + gene_name += entry + else: + gene_name = " " + + goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], "")) + except: + print "locus ID %s could not be added" % locID + pass + + print "adding %d go entries" % len(goArray) + hsGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + hsGenome = Genome("hsapiens", dbFile=db) + hsGenome.createGeneDB(db) + + +def createDBindices(db): + hsGenome = Genome("hsapiens", dbFile=db) + hsGenome.createIndices() + + +def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot): + genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz + goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids + goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz + # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes + chromDict = {"1": "%s/chr1.fa" % downloadDir, + "2": "%s/chr2.fa" % downloadDir, + "3": "%s/chr3.fa" % downloadDir, + "4": "%s/chr4.fa" % downloadDir, + "5": "%s/chr5.fa" % downloadDir, + "6": "%s/chr6.fa" % downloadDir, + "7": "%s/chr7.fa" % downloadDir, + "8": "%s/chr8.fa" % downloadDir, + "9": "%s/chr9.fa" % downloadDir, + "10": "%s/chr10.fa" % downloadDir, + "11": "%s/chr11.fa" % downloadDir, + "12": "%s/chr12.fa" % downloadDir, + "13": "%s/chr13.fa" % downloadDir, + "14": "%s/chr14.fa" % downloadDir, + "15": "%s/chr15.fa" % downloadDir, + "16": "%s/chr16.fa" % downloadDir, + "17": "%s/chr17.fa" % downloadDir, + "18": "%s/chr18.fa" % downloadDir, + "19": "%s/chr19.fa" % downloadDir, + "20": "%s/chr20.fa" % downloadDir, + "21": "%s/chr21.fa" % downloadDir, + "22": "%s/chr22.fa" % downloadDir, + "X": "%s/chrX.fa" % downloadDir, + "Y": "%s/chrY.fa" % downloadDir + } + + print "Creating 
database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath, chromDict) + + print "Adding gene features" + loadGeneFeatures(db, genePath, chromDict) + + print "Adding gene annotations" + loadGeneAnnotations(db) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath) + + for chromID in chromDict.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db diff --git a/cistematic/genomes/mdomestica.py b/cistematic/genomes/mdomestica.py new file mode 100644 index 0000000..516e784 --- /dev/null +++ b/cistematic/genomes/mdomestica.py @@ -0,0 +1,224 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# data for Monodelphis domestica +import string +from cistematic.genomes import Genome +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot + + +def loadChromosome(db, chromPath, chromOutPath): + seqArray = [] + seqLen = 0 + mdGenome = Genome("mdomestica", dbFile=db) + inFile = open(chromPath, "r") + header = inFile.readline() + while header != "": + seqArray = [] + seqLen = 0 + chromID = header.strip()[1:] + currentLine = inFile.readline() + while currentLine != "" and currentLine[0] != ">": + lineSeq = currentLine.strip() + seqLen += len(lineSeq) + seqArray.append(lineSeq) + currentLine = inFile.readline() + + seq = string.join(seqArray, "") + if seqLen < 500000: + print "Added contig %s to database" % chromID + mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen)) + mdGenome.addChromosomeEntry(chromID, chromID, "db") + else: + outFileName = "%s%s.bin" % (chromOutPath, chromID) + outFile = open("%s%s" % (cisRoot, outFileName), "w") + outFile.write(seq) + outFile.close() + print "Added contig file %s to database" % outFileName + mdGenome.addChromosomeEntry(chromID, outFileName, "file") + + header = currentLine + + inFile.close() + + +def loadGeneEntries(db, gFile): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + mdGenome = Genome("mdomestica", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + cols = line.split("\t") + gid = cols[0] + start = int(cols[5]) + stop = int(cols[6]) + sense = 
cols[2] + chrom = cols[1] + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("mdomestica", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + mdGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneAnnotations(db, annotPath): + geneAnnotations = [] + annotFile = open(annotPath, "r") + mdGenome = Genome("mdomestica", dbFile=db) + for line in annotFile: + try: + cols = line.split("\t") + locID = cols[0] + geneDesc = cols[6] + if len(locID) > 0: + geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p"))) + except: + pass + + print "Adding %d annotations" % len(geneAnnotations) + mdGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneFeatures(db, gfile): + geneFile = open(gfile, "r") + senseArray = {"+": "F", + "-": "R", + ".": "F" + } + + seenArray = [] + insertArray = [] + for geneLine in geneFile: + geneFields = geneLine.split("\t") + exonNum = int(geneFields[7]) + exonStarts = geneFields[8].split(",") + exonStops = geneFields[9].split(",") + chrom = geneFields[1] + sense = senseArray[geneFields[2]] + gstop = int(geneFields[6]) - 1 + gstart = int(geneFields[5]) - 1 + geneid = geneFields[0] + try: + geneID = ("mdomestica", geneid) + except: + continue + + gidVersion = "1" + if geneID in seenArray: + gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet. 
+ else: + seenArray.append(geneID) + + for index in range(exonNum): + estart = int(exonStarts[index]) - 1 + estop = int(exonStops[index]) - 1 + if estart >= gstart and estop <= gstop: + insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS")) + elif estop <= gstart: + if sense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + insertArray.append((geneID, 1, chrom, estart, estop, sense, fType)) + elif estart >= gstop: + if sense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + insertArray.append((geneID, 1, chrom, estart, estop, sense, fType)) + elif estart <= gstop: + if sense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS")) + insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType)) + else: + if sense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS")) + insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType)) + + geneFile.close() + mdGenome = Genome("mdomestica", dbFile=db) + print "Adding %d features" % len(insertArray) + mdGenome.addFeatureEntryBatch(insertArray) + + +def createDBFile(db): + mdGenome = Genome("mdomestica", dbFile=db) + mdGenome.createGeneDB(db) + + +def createDBindices(db): + mdGenome = Genome("mdomestica", dbFile=db) + mdGenome.createIndices() + + +def buildMdomesticaDB(db=geneDB): + genePath = "%s/download/mondom/genscan.txt" % cisRoot + chromoPath = "%s/download/mondom/softMask.fa" % cisRoot + chromoOutPath = "/M_domestica/" + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding gene features" + loadGeneFeatures(db, genePath) + + print "Loading chromosomes" + loadChromosome(db, chromoPath, chromoOutPath) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git 
a/cistematic/genomes/mmusculus.py b/cistematic/genomes/mmusculus.py new file mode 100644 index 0000000..4e90cfe --- /dev/null +++ b/cistematic/genomes/mmusculus.py @@ -0,0 +1,263 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Mus musculus +import string +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/M_musculus/mmusculus.genedb" % cisRoot + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + mmGenome = Genome("mmusculus", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + mmGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile, cDict): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + alreadySeen = [] + mmGenome = Genome("mmusculus", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + cols = line.split("\t") + if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J": + continue + + chrom = cols[1].strip() + if chrom not in cDict: + continue + + name = cols[10].split(":") + gid = name[1] + if gid == "" or gid in alreadySeen: + continue + + alreadySeen.append(gid) + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("mmusculus", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + mmGenome.addGeneEntryBatch(geneEntries) + + +def loadGeneFeatures(db, gFile, cDict): + """ Load gene features such as CDS, UTR, and PSEUDO from the gene file. 
+ """ + featureEntries = [] + mmGenome = Genome("mmusculus", dbFile=db) + featureFile = open(gFile, "r") + for line in featureFile: + cols = line.split("\t") + if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J": + continue + + chrom = cols[1].strip() + if chrom not in cDict: + continue + + fType = cols[11] + name = cols[10].split(":") + gid = name[1] + if gid == "": + continue + + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("mmusculus", gid) + gidVersion = 1 + featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType)) + + print "Adding %d feature entries" % len(featureEntries) + mmGenome.addFeatureEntryBatch(featureEntries) + + +def loadGeneAnnotations(db): + geneAnnotations = [] + idb = geneinfoDB() + mmGenome = Genome("mmusculus", dbFile=db) + gidList = mmGenome.allGIDs() + for locID in gidList: + gID = ("mmusculus", locID) + geneDescArray = idb.getDescription(gID) + geneDesc = "" + for entry in geneDescArray: + geneDesc += "," + geneDesc += entry.strip() + + if len(geneDescArray) > 0: + geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p"))) + + print "Adding %d annotations" % len(geneAnnotations) + mmGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneOntology(db, goPath, goDefPath): + mmGenome = Genome("mmusculus", dbFile=db) + goDefFile = open(goDefPath, "r") + goFile = open(goPath, "r") + idb = geneinfoDB() + goDefs = {} + goArray = [] + for goDefEntry in goDefFile: + if goDefEntry[0] != "!": + cols = goDefEntry.split("\t") + goDefs[cols[0]] = (cols[1], cols[2].strip()) + + goEntries = goFile.readlines() + prevGID = "" + for entry in goEntries: + try: + fields = entry.split("\t") + if fields[0] != "10090": # NCBI taxonomy ID of Mus musculus (10116 is rat) + continue + + locID = fields[1].strip() + gID = ("mmusculus", locID) + if prevGID != gID: + prevGID = gID + gene_name = "" + synonyms = idb.geneIDSynonyms(gID) + if len(synonyms) >0: + for entry
in synonyms: + gene_name += "," + gene_name += entry + else: + gene_name = " " + + goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], "")) + except: + print "locus ID %s could not be added" % locID + pass + + print "adding %d go entries" % len(goArray) + mmGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + mmGenome = Genome("mmusculus", dbFile=db) + mmGenome.createGeneDB(db) + + +def createDBindices(db): + mmGenome = Genome("mmusculus", dbFile=db) + mmGenome.createIndices() + + +def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot): + genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md + goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids + goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz + # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes + # but ignoring all random chromosomes + chromDict = {"1": "%s/chr1.fa" % downloadDir, + "2": "%s/chr2.fa" % downloadDir, + "3": "%s/chr3.fa" % downloadDir, + "4": "%s/chr4.fa" % downloadDir, + "5": "%s/chr5.fa" % downloadDir, + "6": "%s/chr6.fa" % downloadDir, + "7": "%s/chr7.fa" % downloadDir, + "8": "%s/chr8.fa" % downloadDir, + "9": "%s/chr9.fa" % downloadDir, + "10": "%s/chr10.fa" % downloadDir, + "11": "%s/chr11.fa" % downloadDir, + "12": "%s/chr12.fa" % downloadDir, + "13": "%s/chr13.fa" % downloadDir, + "14": "%s/chr14.fa" % downloadDir, + "15": "%s/chr15.fa" % downloadDir, + "16": "%s/chr16.fa" % downloadDir, + "17": "%s/chr17.fa" % downloadDir, + "18": "%s/chr18.fa" % downloadDir, + "19": "%s/chr19.fa" % downloadDir, + "X": "%s/chrX.fa" % downloadDir, + "Y": "%s/chrY.fa" % downloadDir, + "M": "%s/chrM.fa" % downloadDir + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath, chromDict) + + 
print "Adding gene features" + loadGeneFeatures(db, genePath, chromDict) + + print "Adding gene annotations" + loadGeneAnnotations(db) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath) + + for chromID in chromDict.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/rnorvegicus.py b/cistematic/genomes/rnorvegicus.py new file mode 100644 index 0000000..b0fae70 --- /dev/null +++ b/cistematic/genomes/rnorvegicus.py @@ -0,0 +1,255 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# data for Rattus norvegicus +import string +from cistematic.genomes import Genome +from cistematic.core.geneinfo import geneinfoDB +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + rnGenome = Genome("rnorvegicus", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + rnGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile): + """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + """ + geneEntries = [] + rnGenome = Genome("rnorvegicus", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + cols = line.split("\t") + if cols[11] != "GENE": + continue + + if cols[12] == "Celera": + continue + + name = cols[10].split(":") + gid = name[1] + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("rnorvegicus", gid) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion)) + + print "Adding %d gene entries" % len(geneEntries) + rnGenome.addGeneEntryBatch(geneEntries) + + +def
loadGeneFeatures(db, gFile): + """ Load gene features such as CDS, UTR, and PSEUDO from the gene file. + """ + featureEntries = [] + rnGenome = Genome("rnorvegicus", dbFile=db) + featureFile = open(gFile, "r") + for line in featureFile: + cols = line.split("\t") + if cols[11] not in ["CDS", "UTR", "PSEUDO"]: + continue + + if cols[12] == "Celera": + continue + + fType = cols[11] + name = cols[10].split(":") + gid = name[1] + start = int(cols[2]) - 1 + stop = int(cols[3]) - 1 + sense = cols[4] + chrom = cols[1].strip() + if sense == "+": + sense = "F" + else: + sense = "R" + + geneID = ("rnorvegicus", gid) + gidVersion = 1 + featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType)) + + print "Adding %d feature entries" % len(featureEntries) + rnGenome.addFeatureEntryBatch(featureEntries) + + +def loadGeneAnnotations(db): + geneAnnotations = [] + idb = geneinfoDB() + rnGenome = Genome("rnorvegicus", dbFile=db) + gidList = rnGenome.allGIDs() + for locID in gidList: + gID = ("rnorvegicus", locID) + geneDescArray = idb.getDescription(gID) + geneDesc = "" + for entry in geneDescArray: + geneDesc += "," + geneDesc += entry.strip() + + if len(geneDescArray) > 0: + geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p"))) + + print "Adding %d annotations" % len(geneAnnotations) + rnGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneOntology(db, goPath, goDefPath): + rnGenome = Genome("rnorvegicus", dbFile=db) + goDefFile = open(goDefPath, "r") + goFile = open(goPath, "r") + idb = geneinfoDB() + goDefs = {} + goArray = [] + for goDefEntry in goDefFile: + if goDefEntry[0] != "!": + cols = goDefEntry.split("\t") + goDefs[cols[0]] = (cols[1], cols[2].strip()) + + goEntries = goFile.readlines() + prevGID = '' + for entry in goEntries: + try: + fields = entry.split("\t") + if fields[0] != "10116": # NCBI taxonomy ID of Rattus norvegicus (10090 is mouse) + continue + + locID = fields[1].strip() + gID = ("rnorvegicus", locID) + if prevGID != gID: + prevGID = gID + gene_name = "" + synonyms =
idb.geneIDSynonyms(gID) + if len(synonyms) >0: + for entry in synonyms: + gene_name += "," + gene_name += entry + else: + gene_name = " " + + goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], "")) + except: + print "locus ID %s could not be added" % locID + pass + + print "adding %d go entries" % len(goArray) + rnGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + rnGenome = Genome("rnorvegicus", dbFile=db) + rnGenome.createGeneDB(db) + + +def createDBindices(db): + rnGenome = Genome("rnorvegicus", dbFile=db) + rnGenome.createIndices() + + +def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot): + genePath = "%s/seq_gene.md" % downloadDir + goDefPath = "%s/GO.terms_and_ids" % downloadDir + goPath = "%s/gene2go" % downloadDir + # ignoring all random chromosomes + chromos = {"1": "%s/chr1.fa" % downloadDir, + "2": "%s/chr2.fa" % downloadDir, + "3": "%s/chr3.fa" % downloadDir, + "4": "%s/chr4.fa" % downloadDir, + "5": "%s/chr5.fa" % downloadDir, + "6": "%s/chr6.fa" % downloadDir, + "7": "%s/chr7.fa" % downloadDir, + "8": "%s/chr8.fa" % downloadDir, + "9": "%s/chr9.fa" % downloadDir, + "10": "%s/chr10.fa" % downloadDir, + "11": "%s/chr11.fa" % downloadDir, + "12": "%s/chr12.fa" % downloadDir, + "13": "%s/chr13.fa" % downloadDir, + "14": "%s/chr14.fa" % downloadDir, + "15": "%s/chr15.fa" % downloadDir, + "16": "%s/chr16.fa" % downloadDir, + "17": "%s/chr17.fa" % downloadDir, + "18": "%s/chr18.fa" % downloadDir, + "19": "%s/chr19.fa" % downloadDir, + "Un": "%s/chrUn.fa" % downloadDir, + "X": "%s/chrX.fa" % downloadDir, + "20": "%s/chr20.fa" % downloadDir, + "M": "%s/chrM.fa" % downloadDir + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding gene features" + loadGeneFeatures(db, genePath) + + print "Adding gene annotations" + loadGeneAnnotations(db) + + print "Adding gene ontology" + 
loadGeneOntology(db, goPath, goDefPath) + + for chromID in chromos.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/scerevisiae.py b/cistematic/genomes/scerevisiae.py new file mode 100644 index 0000000..5866f80 --- /dev/null +++ b/cistematic/genomes/scerevisiae.py @@ -0,0 +1,216 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Saccharomyces cerevisiae +import string +from cistematic.genomes import Genome +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + scGenome = Genome("scerevisiae", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + seq = "" + print "calling scGenome()" + scGenome.addChromosomeEntry(chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile): + geneEntries = [] + geneFeatures = [] + scGenome = Genome("scerevisiae", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + field = line.split("\t") + if field[1] != "ORF": + continue + + orfName = field[3].strip() + sense = field[11] + chrom = field[8].strip() + if sense == "W": + sense = "F" + try: + start = int(field[9].strip()) - 1 + stop = int(field[10].strip()) - 1 + except: + start = 0 + stop = 0 + else: + sense = "R" + try: + start = int(field[10].strip()) - 1 + stop = int(field[9].strip()) - 1 + except: + start = 0 + stop = 0 + + geneID = ("scerevisiae", orfName) + gidVersion = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "chromosomal_feature", gidVersion)) + geneFeatures.append((geneID, gidVersion, chrom, start, stop, sense, "CDS")) + + print "loading %d gene entries" % len(geneEntries) + scGenome.addGeneEntryBatch(geneEntries) + print "loading %d gene features" % len(geneFeatures) + scGenome.addFeatureEntryBatch(geneFeatures) + + +def 
loadGeneAnnotations(db, annotPath): + geneAnnotations = [] + annotFile = open(annotPath, "r") + lines = annotFile.readlines() + annotFile.close() + scGenome = Genome("scerevisiae", dbFile=db) + for line in lines: + field = line.split("\t") + if field[1] != "ORF": + continue + + try: + orfName = field[6].strip() + description = field[15].strip() + geneAnnotations.append((("scerevisiae", orfName), string.replace(description, "'", "p"))) + except: + pass + + print "Adding %d annotations" % len(geneAnnotations) + scGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneOntology(db, goPath, goDefPath): + scGenome = Genome("scerevisiae", version="SGD1", dbFile=db) + goDefFile = open(goDefPath, "r") + goFile = open(goPath, "r") + goDefEntries = goDefFile.readlines() + goDefs = {} + for goDefEntry in goDefEntries: + if goDefEntry[0] != "!": + cols = goDefEntry.split("\t") + goDefs[cols[0]] = (cols[1], cols[2].strip()) + + goEntries = goFile.readlines() + goArray = [] + for line in goEntries: + if line[0] == "!": + continue + + fields = line.split("\t") + genes = fields[10].split("|") + gID = genes[0] + GOID = fields[4] + objType = fields[11] + objNameArray = fields[10].split("|") + objName = objNameArray[0] + isNot = fields[3] + try: + GOterm = string.replace(goDefs[GOID][0], "'", "p") + except: + print "Could not translate %s" % (GOID) + GOterm = "" + + evidence = fields[6] + goArray.append((("scerevisiae", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[1])) + + scGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + scGenome = Genome("scerevisiae", version="SGD1", dbFile=db) + scGenome.createGeneDB(db) + + +def createDBindices(db): + scGenome = Genome("scerevisiae", version="SGD1", dbFile=db) + scGenome.createIndices() + + +def buildScerevisiaeDB(db=geneDB): + genePath = "%s/download/SGD_features.tab" % cisRoot + goDefPath = "%s/download/GO.terms_and_ids" % cisRoot + goPath = "%s/download/gene_association.sgd" % cisRoot + chromos = {"1": 
"%s/download/chr01.fsa" % cisRoot, + "2": "%s/download/chr02.fsa" % cisRoot, + "3": "%s/download/chr03.fsa" % cisRoot, + "4": "%s/download/chr04.fsa" % cisRoot, + "5": "%s/download/chr05.fsa" % cisRoot, + "6": "%s/download/chr06.fsa" % cisRoot, + "7": "%s/download/chr07.fsa" % cisRoot, + "8": "%s/download/chr08.fsa" % cisRoot, + "9": "%s/download/chr09.fsa" % cisRoot, + "10": "%s/download/chr10.fsa" % cisRoot, + "11": "%s/download/chr11.fsa" % cisRoot, + "12": "%s/download/chr12.fsa" % cisRoot, + "13": "%s/download/chr13.fsa" % cisRoot, + "14": "%s/download/chr14.fsa" % cisRoot, + "15": "%s/download/chr15.fsa" % cisRoot, + "16": "%s/download/chr16.fsa" % cisRoot + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding gene annotations" + loadGeneAnnotations(db, genePath) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath) + + for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]: + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/spurpuratus.py b/cistematic/genomes/spurpuratus.py new file mode 100644 index 0000000..6098562 --- /dev/null +++ b/cistematic/genomes/spurpuratus.py @@ -0,0 +1,108 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. 
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
# +########################################################################### +# +# data for Strongylocentrotus purpuratus +import string +from cistematic.genomes import Genome +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/S_purpuratus/spurpuratus.genedb" % cisRoot + + +def loadChromosome(db, chromPath, chromOutPath): + seqArray = [] + seqLen = 0 + spGenome = Genome("spurpuratus", dbFile=db) + inFile = open(chromPath, "r") + header = inFile.readline() + while header != "": + seqArray = [] + seqLen = 0 + fields = header.split() + if "purpuratus" in header: + idpart = fields[3] + else: + idpart = fields[-1].strip() + + parts = idpart.split("_") + chromID = parts[-1] + currentLine = inFile.readline() + while currentLine != "" and currentLine[0] != ">": + lineSeq = currentLine.strip() + seqLen += len(lineSeq) + seqArray.append(lineSeq) + currentLine = inFile.readline() + + seq = string.join(seqArray, "") + if seqLen < 50000: + print "Added contig %s to database" % chromID + spGenome.addSequence(("spurpuratus", chromID), seq, "chromosome", str(seqLen)) + spGenome.addChromosomeEntry(chromID, chromID, "db") + else: + outFileName = "%s%s.bin" % (chromOutPath, chromID) + outFile = open("%s%s" % (cisRoot, outFileName), "w") + outFile.write(seq) + outFile.close() + print "Added contig file %s to database" % outFileName + spGenome.addChromosomeEntry(chromID, outFileName, "file") + + header = currentLine + + inFile.close() + + +def createDBFile(db): + spGenome = Genome("spurpuratus", version="2.1", dbFile=db) + spGenome.createGeneDB(db) + + +def createDBindices(db): + spGenome = Genome("spurpuratus", version="2.1", dbFile=db) + spGenome.createIndices() + + +def buildSpurpuratusDB(db=geneDB): + chromoPath = "%s/download/Spur2.1_Nmasked.txt" % cisRoot + chromoOutPath = "/S_purpuratus/" + + print "Creating database %s" % db + createDBFile(db) + + print "Loading genomic 
sequence" + loadChromosome(db, chromoPath, chromoOutPath) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file diff --git a/cistematic/genomes/xtropicalis.py b/cistematic/genomes/xtropicalis.py new file mode 100644 index 0000000..eb37500 --- /dev/null +++ b/cistematic/genomes/xtropicalis.py @@ -0,0 +1,266 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# data for Xenopus tropicalis
import string  # no longer referenced below; kept so the module namespace is unchanged
from cistematic.genomes import Genome
from os import environ

if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT")
else:
    cisRoot = "/proj/genome"

geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot


def _readLines(path):
    """Return all lines of the text file at path, closing it afterwards."""
    handle = open(path, "r")
    try:
        return handle.readlines()
    finally:
        handle.close()


def loadChromosome(db, chromPath, chromOutPath):
    """Load every record of the FASTA file chromPath into the genome database.

    db           -- path of the gene database file handed to Genome()
    chromPath    -- FASTA file to read; its first line is taken to be a header
    chromOutPath -- directory, relative to cisRoot, for large sequence files

    The contig id is the stripped header line without its first character.
    Contigs shorter than 500000 bases are stored in the database itself;
    longer ones are written to <cisRoot><chromOutPath><id>.bin and only the
    file name is registered.
    """
    xtGenome = Genome("xtropicalis", dbFile=db)
    inFile = open(chromPath, "r")
    # try/finally so the input handle is released even if a database call raises
    try:
        header = inFile.readline()
        while header != "":
            seqArray = []
            seqLen = 0
            chromID = header.strip()[1:]
            # gather sequence lines up to the next ">" header or end of file
            currentLine = inFile.readline()
            while currentLine != "" and currentLine[0] != ">":
                lineSeq = currentLine.strip()
                seqLen += len(lineSeq)
                seqArray.append(lineSeq)
                currentLine = inFile.readline()

            seq = "".join(seqArray)
            if seqLen < 500000:
                print("Added contig %s to database" % chromID)
                xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
                xtGenome.addChromosomeEntry(chromID, chromID, "db")
            else:
                outFileName = "%s%s.bin" % (chromOutPath, chromID)
                outFile = open("%s%s" % (cisRoot, outFileName), "w")
                try:
                    outFile.write(seq)
                finally:
                    outFile.close()

                print("Added contig file %s to database" % outFileName)
                xtGenome.addChromosomeEntry(chromID, outFileName, "file")

            # the line that ended the inner loop is the next record's header
            header = currentLine
    finally:
        inFile.close()


def loadGeneEntries(db, gFile):
    """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES

    Add one "gene" entry per line of the tab separated file gFile.
    Columns read: 0 = gene id, 1 = chromosome, 2 = strand ("+" is stored as
    "F", anything else as "R"), 5 = start, 6 = stop.  Every entry is given
    version 1.
    """
    geneEntries = []
    xtGenome = Genome("xtropicalis", dbFile=db)
    geneFile = open(gFile, "r")
    try:
        for line in geneFile:
            cols = line.split("\t")
            gid = cols[0]
            start = int(cols[5])
            stop = int(cols[6])
            chrom = cols[1]
            if cols[2] == "+":
                sense = "F"
            else:
                sense = "R"

            geneID = ("xtropicalis", gid)
            gidVersion = 1
            geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
    finally:
        geneFile.close()

    print("Adding %d gene entries" % len(geneEntries))
    xtGenome.addGeneEntryBatch(geneEntries)


def loadGeneAnnotations(db, annotPath):
    """Add one annotation per usable line of the tab separated file annotPath.

    Column 0 is the locus id and column 6 the description; single quotes in
    the description are replaced by "p".  Lines with fewer than 7 columns or
    an empty locus id are skipped.
    """
    geneAnnotations = []
    annotFile = open(annotPath, "r")
    try:
        xtGenome = Genome("xtropicalis", dbFile=db)
        for line in annotFile:
            cols = line.split("\t")
            # short lines are skipped on purpose (best-effort load)
            if len(cols) < 7:
                continue

            locID = cols[0]
            geneDesc = cols[6]
            if len(locID) > 0:
                geneAnnotations.append((("xtropicalis", locID), geneDesc.strip().replace("'", "p")))
    finally:
        annotFile.close()

    print("Adding %d annotations" % len(geneAnnotations))
    xtGenome.addAnnotationBatch(geneAnnotations)


def loadGeneFeatures(db, gfile):
    """Add CDS / UTR feature entries for every exon listed in gfile.

    gfile is tab separated.  Columns read: 0 = gene id, 1 = chromosome,
    2 = strand ("+", "-" or "."), 5 and 6 = start and stop of the interval
    whose inside is labelled CDS (presumably the coding region -- confirm
    against the file format), 7 = exon count, 8 and 9 = comma separated exon
    starts and stops.  All coordinates are decremented by one before use.

    Exons inside the interval become "CDS"; exons outside it become "5UTR"
    or "3UTR" depending on strand; exons crossing a boundary are split there.
    """
    senseArray = {"+": "F",
                  "-": "R",
                  ".": "F"
    }

    seenGenes = set()
    insertArray = []
    geneFile = open(gfile, "r")
    try:
        for geneLine in geneFile:
            geneFields = geneLine.split("\t")
            exonNum = int(geneFields[7])
            exonStarts = geneFields[8].split(",")
            exonStops = geneFields[9].split(",")
            chrom = geneFields[1]
            sense = senseArray[geneFields[2]]
            gstop = int(geneFields[6]) - 1
            gstart = int(geneFields[5]) - 1
            geneID = ("xtropicalis", geneFields[0])

            gidVersion = "1"
            if geneID in seenGenes:
                gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
            else:
                seenGenes.add(geneID)

            # feature type of sequence lying before / after the interval
            if sense == "F":
                beforeType = "5UTR"
                afterType = "3UTR"
            else:
                beforeType = "3UTR"
                afterType = "5UTR"

            # NOTE(review): only fully contained exons carry gidVersion; every
            # other entry is written with the literal version 1, exactly as
            # before.  Confirm whether that is intended.
            for index in range(exonNum):
                estart = int(exonStarts[index]) - 1
                estop = int(exonStops[index]) - 1
                if estart >= gstart and estop <= gstop:
                    insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
                elif estop <= gstart:
                    insertArray.append((geneID, 1, chrom, estart, estop, sense, beforeType))
                elif estart >= gstop:
                    insertArray.append((geneID, 1, chrom, estart, estop, sense, afterType))
                elif estart >= gstart:
                    # exon starts inside and runs past gstop.  The previous
                    # test here ("estart <= gstop") was always true, which
                    # made the branch below unreachable.
                    insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
                    insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, afterType))
                else:
                    # exon starts before gstart and ends inside or after the interval
                    insertArray.append((geneID, 1, chrom, gstart, min(estop, gstop), sense, "CDS"))
                    insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, beforeType))
                    if estop > gstop:
                        # a single exon spanning the whole interval also has a trailing part
                        insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, afterType))
    finally:
        geneFile.close()

    xtGenome = Genome("xtropicalis", dbFile=db)
    print("Adding %d features" % len(insertArray))
    xtGenome.addFeatureEntryBatch(insertArray)


def loadGeneOntology(db, goPath, goDefPath, annotPath):
    """Add GO entries by joining three tab separated files.

    goDefPath -- GO definitions: column 0 = GO id, columns 1 and 2 = two
                 descriptive fields; lines starting with "!" are ignored
    annotPath -- column 0 = locus id, column 1 = gene name, column 6 =
                 description
    goPath    -- column 0 = locus id, column 1 = GO id

    GO lines whose locus or GO id is unknown, and lines with too few
    columns, are skipped.
    """
    xtGenome = Genome("xtropicalis", dbFile=db)
    # read in the same order the files used to be opened, so a missing file
    # is reported for the same path as before
    goDefEntries = _readLines(goDefPath)
    goEntries = _readLines(goPath)
    annotEntries = _readLines(annotPath)
    goDefs = {}
    locus = {}
    goArray = []
    for goDefEntry in goDefEntries:
        if goDefEntry[0] != "!":
            cols = goDefEntry.split("\t")
            goDefs[cols[0]] = (cols[1], cols[2].strip())

    for annotEntry in annotEntries:
        cols = annotEntry.split("\t")
        # short lines are skipped on purpose (best-effort load)
        if len(cols) < 7:
            continue

        locID = cols[0]
        geneName = cols[1]
        geneDesc = cols[6]
        mimID = ""
        if len(locID) > 0:
            locus[locID] = (geneName, geneDesc, mimID)

    for entry in goEntries:
        try:
            fields = entry.split("\t")
            locID = fields[0].strip()
            (gene_name, gene_desc, mimID) = locus[locID]
            goDef = goDefs[fields[1]]
            goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", goDef[0].replace("'", "p"), goDef[1], mimID))
        except (IndexError, KeyError):
            # unknown locus / GO id or truncated line: skip the entry
            pass

    print("adding %d go entries" % len(goArray))
    xtGenome.addGoInfoBatch(goArray)


def createDBFile(db):
    """Create the gene database file db."""
    xtGenome = Genome("xtropicalis", dbFile=db)
    xtGenome.createGeneDB(db)


def createDBindices(db):
    """Create the indices of the gene database file db."""
    xtGenome = Genome("xtropicalis", dbFile=db)
    xtGenome.createIndices()


def buildXtropicalisDB(db=geneDB):
    """Build the database from scratch: genes, features, sequence, indices.

    Gene models come from <cisRoot>/download/xt1/jgiFilteredModels.txt and
    sequence from <cisRoot>/download/xt1/xenTro1.softmask2.fa.
    """
    genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
    chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
    chromoOutPath = "/X_tropicalis/"

    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding gene features")
    loadGeneFeatures(db, genePath)

    print("Loading sequences")
    loadChromosome(db, chromoPath, chromoOutPath)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)

# ---- patch metadata carried on these lines, preserved as comments ----
# \ No newline at end of file
# diff --git a/cistematic/programs/Consensus.py b/cistematic/programs/Consensus.py
# new file mode 100755
# index 0000000..3bb39a8
# --- /dev/null
# +++ b/cistematic/programs/Consensus.py
# @@ -0,0 +1,1110 @@
##Sarah Aerni
##Created: July 12, 2005
##Modified: June 22, 2006
##Modified: Oct 22, 2008 by Ali (Minor modifications to outputs)
##This is a greedy motif finding program.
+##Using background models it determines which motifs are overrepresented +############################################################################### +##include a way to go back and +##uses the new way of doing background and the new scoring rather than simple +##consensus it uses the log likelihood +############################################################################### + +import hotshot +import random +import math +import sys +import copy +import time + +from math import log +from math import e +from math import ceil +from random import shuffle +from random import randint + +ConsensusScore= {True:2, False:1} + +NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3} +IndexNTDs = {0:'A', 1:'C', 2:'G', 3:'T'} +INSERTION_N = "N" + +NTDA=0 +NTDC=1 +NTDG=2 +NTDT=3 + +INSERTION_MARKER="S" +INSERTION_FILLER="N" + +#markov size - 1 = markov model +#example: +#if you want a markov 1 you want to check 2 ntds total +#markov_size = 2 +#declare global variables +global MARKOV_WINDOW +global MW1 +global percID +global Founders +global UseRC +global Frequency +global NumOfMismatch +Frequency={} +global sizeOfMotif +global sequences +global founderPercID +#MotifFinder with 3 arguments +#input: sequences sequences in which we are to find a motif +# sizeOfMotif size of the motif to be identified in the sequences +# numberOfMotifs the number of motifs of the indicated size to be +# identified in the sequences +# Iterations The number of iterations that will be perfdormed in +# order to determine an optimal motif +# Frequency Markov Model for background +# UseRC a boolean indicating whether the reverse complement +# should be used i the finding of a motif +# (True = yes) +# Founders a boolean indicating whether the first 2 sequences +# should always be the founder sequences +# percID percent of the aligned sequences that must be +# identical in order to be considered for a motif to +# be considered (between the founder sequences then +# between founder sequence consensus and each 
other +# subsequence being considered) +# founderPercID The minimum percent identity two aligned sequences +# must be to qualify as founder sequences +# MARKOV_SIZE Size of the markov model (corresponds to length of +# the words in the dictionary Frequency) +#output: (List1, List2) List1 contains PSFMs of the indicated length and up +# to the indicated number (there may be fewer) in the +# standard cistematic format, while List2 holds the +# corresponding sequences. +# +#This function will call the overloaded MotifFinder function, passing in a +#default amount for the number of iterations (currently set at 100) +def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations, + Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE, + model,masking): + + #declare global variables + global MARKOV_WINDOW + global MW1 + global NumOfMismatch + global sizeOfMotif + numOfSequences=len(sequences) + + #the minimum window size permitted by this program is a motif of length 6 + if minSize < 6: + minSize = 6 + + #sanity check for reasonable motif sizes + if minSize > maxSize: + print "Please input a minimum size that is smaller than a maximum size" + print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize) + return None + + #sanity check for reasonable percentID + if percID> founderPercID: + print "Your input %i for percent identity for founder sequences" % founderPercID, + print "is less than your input for general percent identity at",percID + print "Please adjust your input" + return None + + #a user may input a number of iterations which exceeds the possible + #for a given set of input values, in which case the number of iterations + #should be decreased to another size + maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2) + if maxIterations < Iterations: + print "The number of Iterations you entered", Iterations, + print "is being reduced to",maxIterations, + Iterations = maxIterations + + 
#adjusting the window as remarked above + MARKOV_WINDOW = MARKOV_SIZE + 1 + MW1=MARKOV_SIZE + print len(sequences), "sequences search for",numberOfMotifs, + print " motifs of size",minSize,"to",maxSize + print "The percent Identity input is",percID,"%" + + #this list will contain motifs found (up to the NumberOfMotifs passed in + sequencesToCompare = [i.upper() for i in sequences] + #as a preprocessing steps, a string of Ns will be replaced by one N + for i in xrange(len(sequences)): + splitByN=sequencesToCompare[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequencesToCompare[i]=finalSequence + originalSeqs = [] + aveCalcSeqs = [] + + if UseRC: + for seqIndex in xrange(len(sequencesToCompare)): + sequence = revComp(sequencesToCompare[seqIndex]) + aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence])) + originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence])) + sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]]) + sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]]) + sequencesToCompare[seqIndex] = sequence + + AllPSFMS = [] + AllPSFMseqs = [] + for motif_i in xrange(numberOfMotifs): + empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))]) + if empty < maxSize: + print "A sequence has been depleted of all options" + return (AllPSFMS, AllPSFMseqs) + + OverallBest = "Infinity" + print "MOTIF NUMBER %i\n" % motif_i + #multiple iterations are attempted to avoid local maxima + for step in xrange(Iterations): + sys.stdout.flush() + numIncluded = 2 #used in order to create PSFMs from PWMs + #CompareList will be used to randomly generate the sequences to be + #compared + CompareList = 
range(len(sequencesToCompare)) + BestScore = "Infinity" + #find the best shared motif between the first two sequences + #the motif must be completely contained within the sequence + #or its reverse complement + NumberOfTries = 0 + while (BestScore == "Infinity"): + #PWM created from all sequences included in the motif + PWM = [] + #pick motif size randomly within the range provided by the user + sizeOfMotif = random.randint(minSize,maxSize) + NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif) + #if the motif average is not yet established create the average + #and store tis value in the dictionary + + #best motif startpoints will be stored in this list of start + #locations + BestMotif = [-1] * len(sequencesToCompare) + #these are start locations are used in order to test new best + #start indices + startLocs = BestMotif[:] + #if it appears there are no more motifs possible quit + if (NumberOfTries > Choose(len(sequencesToCompare), 2)): + print "No more motifs to be found after %i tries" % NumberOfTries + return (AllPSFMS, AllPSFMseqs) + NumberOfTries +=1 + #randomize sequence order + #if the sequence founders are input then you only want to + #randomize the remaining sequences + if Founders: + CompareList = [0,1] + remaining = range(2,len(sequencesToCompare)) + shuffle(remaining) + CompareList.extend(remaining) + #if no founders are specified, randomize all + else: + shuffle(CompareList) + + SeqIndex0 = CompareList[0] + SeqIndex1 = CompareList[1] + #founder sequences - first 2 sequences - are selected + #create random start sites to avoid favoring the 5' region + startLocs[SeqIndex0] = -1 + startLocs[SeqIndex1] = -1 + BestMotif[SeqIndex0] = -1 + BestMotif[SeqIndex1] = -1 + BestScore = "Infinity" + #define the best motif between the first two sequences the + #inital set of comparisons are performed at the same indices in + #both "founder" sequences + + #to ensure that the startpoint is random we will rearrange the + # sequences + start0 = 
randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1) + accessSeq=sequencesToCompare[SeqIndex0] + seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]]) + start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1) + accessSeq=sequencesToCompare[SeqIndex1] + seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]]) + + #create shifted score, then check to see if it meets all + #criteria to become the best. + + #Perform a time efficient alignment of the founder sequences + (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID) + + if BestScore=="Infinity": + continue + NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1 + NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1 + + #remap to the correct location + if startLocs[SeqIndex0] > NLoc0: + startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0]) + else: + startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1) + + if startLocs[SeqIndex1] > NLoc1: + startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1]) + else: + startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1) + + #if we found a good score get all the info necessary to + #continue, including a PWM and updating the startLocs + if BestScore != "Infinity": + index0 = startLocs[SeqIndex0] + index1 = startLocs[SeqIndex1] + seq0 = sequencesToCompare[SeqIndex0] + seq1 = sequencesToCompare[SeqIndex1] + motif0 = seq0[index0:index0+sizeOfMotif] + motif1 = seq1[index1:index1+sizeOfMotif] + BestMotifSeqs=[motif0,motif1] + PWM = convert2PWM([motif0, motif1],sizeOfMotif) + + #this PWM will be used in order to establish a criterion + #for future sequences, whether they meet a threshold + #similiarty to the founder sequences + + #if it exists, find the best scoring motifs in the remaining + #sequences. 
+ + for seqCompIndex in xrange(2, len(sequencesToCompare)): + maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))] + BestScore = "Infinity" + seq=CompareList[seqCompIndex] + sequenceToAdd = sequencesToCompare[seq] + max_i=len(sequenceToAdd)-sizeOfMotif + bestMismatch = NumOfMismatch + #to ensure that the startpoint is random we will rearrange the + #sequences + start_i = randint(0,max_i-1) + seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]]) + i=0 + while (i<=max_i): + motif=seq_i[i:i+sizeOfMotif] + #skip this area if there are any masked regions found + nLoc=motif.rfind('N') + if nLoc >=0: + i+=nLoc+1 + continue + + mmNum = 0 + j = 0 + while (j < sizeOfMotif): + mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j]) + if mmNum > bestMismatch: + break + j+=1 + + #set a new best if we found it (always keep originals) + if mmNum < bestMismatch: + bestMismatch = mmNum + startLocs[seq]=i + + #we cannot improve upon a perfect match so we break end our + #search if this occurs + if mmNum == 0: + startLocs[seq]=i + break + i+=1 + + #if somethigng was found we add it to our PWM + if startLocs[seq]!= -1: + NLoc_i=len(sequenceToAdd)-start_i-1 + if startLocs[seq] > NLoc_i: + startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd) + else: + startLocs[seq]=(startLocs[seq]+start_i+1) + + starti=startLocs[seq] + sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif] + checkPWM=add2PWMReturn(sAdd,PWM) + add=True + checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))] + for seq in BestMotifSeqs: + mismatches=0 + for i in range(len(seq)): + mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i]) + if mismatches > NumOfMismatch: + add=False + break + if add: + BestMotifSeqs.append(sAdd) + PWM=copy.deepcopy(checkPWM) + numIncluded+=1 + elif model=="oops": + break + #if we require one occurence per sequence then if we do not + #have a motif in this sequence we stop our search + elif model=="oops": + break + + GroupScore =0 
+ #if we have an oops model and not all sequences are being included, + #we have to continue without recording what we have so far + if model=="oops" and numIncluded OverallBest): + nextToCompare = sequencesToCompare[:] + MotifSeqs = [] + OrderOfBest = CompareList[:] + BestIndices = startLocs[:] + bestMotifSeqYet = BestMotifSeqs + OverallBest = GroupScore + PSFM = convert2PSFM(PWM, numIncluded) + OverallPWM=PWM[:] + for i in xrange(len(sequencesToCompare)): + SeqIndexi = CompareList[i] + #some sequences may not contain the motifs, if so you do no + #want to include them + if BestIndices[SeqIndexi] != -1: + sAdd=sequencesToCompare[SeqIndexi] + sAddI=BestIndices[SeqIndexi] + Motifi = sAdd[sAddI:sAddI+sizeOfMotif] + MotifSeqs.append(Motifi) + + #mask out original location + nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]]) + sAdd=nextToCompare[SeqIndexi] + sAddLen=len(sAdd) + nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]]) + TopMotif = convert2motif(MotifSeqs, sizeOfMotif) + + #if not more motifs are found (best score is -1) then it is likely that + #we have found as many as we can + if OverallBest == "Infinity": + print "No more motifs were found!" + return (AllPSFMS, AllPSFMseqs) + + #if a motif was found report it here and add it to the total motifs + #Print out your results if any were reached! 
+ for i in xrange(len(sequencesToCompare)): + SeqIndexi = OrderOfBest[i] + if BestIndices[SeqIndexi] != -1: + Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif] + MotifSeqs.append(Motifi) + + #add the current best motif to the total of all motifs for returning + #to the user + AllPSFMS.append(PSFM) + AllPSFMseqs.append(bestMotifSeqYet) + print OverallPWM + + NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'} + for Pntd in [0,1,2,3]: + print "" + print NTDSTUFF[Pntd], + for Pj in xrange(len(PSFM)): + print "\t %.003f"%(PSFM[Pj][Pntd]), + + print "\n**********\n" + print TopMotif + print "\n**********\n" + print "Score:",OverallBest + maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))] + for maski in xrange(numOfSequences): + maskThis=nextToCompare[maski] + i = 0 + max_i = len(maskThis)-sizeOfMotif + while (i<=max_i): + motif=maskThis[i:i+sizeOfMotif] + #skip this area if there are any masked regions found + nLoc=motif.rfind('N') + if nLoc >=0: + i+=nLoc+1 + continue + mmNum = 0 + j = 0 + while (j < sizeOfMotif): + if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]: + mmNum+=1 + if mmNum > NumOfMismatch: + break + j+=1 + #if the location is below threshold we mask it out also + if mmNum>NumOfMismatch: + i+=1 + continue + + i+=1 + + #replace old sequences with the new one + sequencesToCompare = nextToCompare[:] + #reduce masked regions to single N + for i in xrange(len(sequences)): + splitByN=sequencesToCompare[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequencesToCompare[i]=finalSequence + + return (AllPSFMS, AllPSFMseqs) + + +#MaskLocation +#input: sequence sequence to be masked +# location location at which to mask the sequence +# length length of area to mask +#output string masked 
#                   sequence
def MaskLocation(sequence,start,length):
    """Return sequence with two windows of `length` characters overwritten
    by INSERTION_FILLER: the one starting at `start`, and the one at the same
    offset measured from the end of the sequence (presumably the matching
    site on an appended reverse complement -- confirm against the caller).
    """
    filler = INSERTION_FILLER * length
    finalsequence = ''.join([sequence[:start], filler, sequence[start + length:]])
    newLen = len(finalsequence)
    finalsequence = ''.join([finalsequence[:newLen - start - length], filler, finalsequence[newLen - start:]])

    return finalsequence


#TimeEfficientAlignment
#input: seq0        string sequence in which to calculate a best location
#       seq1        string second sequence
#       SeqIndex0   index in startLocs at which the start found in seq0 is stored
#       SeqIndex1   index in startLocs at which the start found in seq1 is stored
#       startLocs   list updated in place with the best pair of start positions
#       percID      percent of the aligned sequences that must be identical
#                   in order to be considered for a motif
#output "Infinity" when no acceptable pair of windows exists, otherwise the
#       score 2*sizeOfMotif - (number of mismatches) of the best pair found
#
#will be calculated as an ungapped consensus between two sequences as the
#algorithm steps through each window.  The window length is read from the
#module level variable sizeOfMotif.  Windows containing 'N' are skipped.
def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
    NumOfMismatch = sizeOfMotif - ceil((percID) / 100.0 * sizeOfMotif)
    bestMismatch = NumOfMismatch

    maxi = len(seq0) - sizeOfMotif
    maxj = len(seq1) - sizeOfMotif

    Score = "Infinity"

    i = 0
    while i <= maxi:
        motif = seq0[i:sizeOfMotif + i]
        Nloc = motif.rfind('N')
        if Nloc >= 0:
            # jump past the last masked base in this window
            i += Nloc + 1
            continue

        j = 0
        while j <= maxj:
            Nloc = seq1[j:j + sizeOfMotif].rfind('N')
            if Nloc >= 0:
                j += Nloc + 1
                continue

            mismatchedNTDs = 0
            for k in range(sizeOfMotif):
                if motif[k] != seq1[k + j]:
                    mismatchedNTDs += 1
                    # cannot beat the current best any more
                    if mismatchedNTDs > bestMismatch:
                        break

            if mismatchedNTDs == 0:
                # a perfect match cannot be improved upon: stop searching.
                # The score used to subtract the stale allowance still held
                # in bestMismatch, under-reporting a perfect match.
                startLocs[SeqIndex0] = i
                startLocs[SeqIndex1] = j
                return 2 * sizeOfMotif

            # only strictly better than the best so far counts; initially
            # that means strictly fewer mismatches than percID allows
            if mismatchedNTDs < bestMismatch:
                bestMismatch = mismatchedNTDs
                Score = 2 * sizeOfMotif - bestMismatch
                startLocs[SeqIndex0] = i
                startLocs[SeqIndex1] = j

            j += 1

        i += 1

    return (Score)


#AlignmentScore
#input: sequenceToCompare   List of sequences whose substrings will be
#                           aligned
#       sizeOfMotif         integer size of the motif being found
#                           (length of the subsequence that will be aligned
#                           from each sequence in above list (window size)
#       startLocs           start locations of the motifs to be aligned to
#                           each other
#       CompareList         the indices of sequencesToCompare to be aligned
#       numSeqs             the number of sequences -1 from
#                           sequencesToCompare to be aligned. ie, the indices
#                           of sequenceToCompare stored in the first numSeqs
#                           indices of in CompareList.
#       Frequency           markov model being used to calculate background
#       originalSeqs        contain the unmasked sequences used for checking
#                           the markov score
#       MARKOV_WINDOW       length of the words used as keys of Frequency
#       NumOfMismatch       number of mismatches allowable (not used here)
#output: tuple (TotalScore, Scores, ConsensusScore, PWM, Log)
#       TotalScore          sum of the per column scores
#       Scores              list with one score per column
#       ConsensusScore      sum over columns of the largest nucleotide count
#       PWM                 2D-List of per column nucleotide counts (A,C,G,T)
#       Log                 2D-List of per column summed background values
#
#will be calculated as an ungapped consensus between those elements in the
#CompareList Consensus score is calculated by choosing the largest number of the
#same elements in each column, and adding all these numbers up across all
#columns
def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
    TotalScore = 0
    Scores = []
    PWM = []
    Log = []
    # NOTE(review): the source read "ConsensusScore = 0" followed by a bare
    # "len(sequencesToCompare)"; it is taken here as two statements, the
    # second having no effect.  Confirm it was not meant as one expression.
    ConsensusScore = 0
    len(sequencesToCompare)
    #traverse each column individually
    for i in range(sizeOfMotif):
        divisor = 0.0
        PWMi = [0.0, 0.0, 0.0, 0.0]
        Logi = [0.0, 0.0, 0.0, 0.0]
        #traverse this index in each sequence
        for j in range(numSeqs + 1):
            SequenceIndex = CompareList[j]
            CurrSeq = sequencesToCompare[SequenceIndex]
            CurrOr = originalSeqs[SequenceIndex]
            start = startLocs[SequenceIndex]
            #some sequences may not contain the motifs, if so you do not want
            #to include them in the consensus. These have uninitialized start
            #locations (ie startLocs would be -1
            if start == -1:
                continue

            divisor += 1.0
            column = start + i
            if CurrSeq[column] == 'N':
                # diagnostic dump before the NTDIndices lookup below is made
                # with 'N'
                print(sequencesToCompare)
                print("\nBAD HERE!")
                print(CurrSeq)
                print(startLocs)
                print(start)
                print(CompareList)
                print(j)
                print(SequenceIndex)
                print(numSeqs)
                print(sizeOfMotif)
                print(CurrSeq[start:start + sizeOfMotif])

            ntd = NTDIndices[CurrSeq[column]]
            PWMi[ntd] += 1.0
            # background value of the word ending at this column, taken
            # from the unmasked sequence
            Logi[ntd] += Frequency[CurrOr[column - MARKOV_WINDOW + 1:column + 1]]

        Scores.append(0)
        Top = -1
        for NTD in ['A', 'C', 'G', 'T']:
            count = PWMi[NTDIndices[NTD]]
            #avoid ln(0) errors
            if count > 0:
                Scores[i] += count / divisor * log(count / (divisor * Logi[NTDIndices[NTD]] / count), e)
            if count > Top:
                Top = count

        TotalScore += Scores[i]
        ConsensusScore += Top
        PWM.append(PWMi)
        Log.append(Logi)

    return (TotalScore, Scores, ConsensusScore,PWM, Log)


#MarkovFreq
#input: prefix      string of length MARKOV_SIZE - 1 prefix used for model
#       actualNTD   character NTD at the end of the prefix being calculated
#       Frequency   Markov model for calculations
#output: float      M_Score of prefix+actualNTD divided by the sum of the
#                   M_Scores of the prefix followed by each of A, C, G, T
#
#The helper function will run through and find all possible words with the
#prefix and determine the markov score based on this.  Uses the module level
#MARKOV_WINDOW; raises ZeroDivisionError when all four scores are zero.
def MarkovFreq (prefix, actualNTD, Frequency):
    denominator = 0.0
    numerator = 0.0
    for NTD in ['A', 'C', 'G', 'T']:
        value = M_Score(prefix + NTD, Frequency, False, MARKOV_WINDOW)
        if NTD == actualNTD:
            numerator = value
        denominator += value

    return numerator / denominator


#revComp
#input: sequence    DNA sequence to be converted to reverse complement
#output: string     reverse complement of input sequence, in upper case
#
#obtains the reverse complement of an input sequence; raises KeyError for
#characters other than A, C, G, T and N
def revComp (sequence):
    #base pairs
    RevDict = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    # one join over the reversed sequence instead of prepending one
    # character at a time, which copied the growing string on every step
    return "".join([RevDict[ntd] for ntd in reversed(sequence.upper())])


#Markov
#input: sequences   list that are being used to create the background
#       IncludeRC   boolean, also count the reverse complement of each word
#       MARKOV      order of the model; words of length MARKOV + 1 are counted
#output: dictionary mapping every word seen (reverse complements too when
#        IncludeRC is set) to the fraction of all counted words it makes up
#
#background will build a markov model of the background in order to be able
#to differentiate the motifs from the pure background of a certain size.
#The values stored are plain fractions; no logarithm is taken here.
def Markov(sequences, IncludeRC,MARKOV):
    MARKOV_WINDOW = MARKOV + 1
    WordDict = {}
    totalWindows = 0

    #take each sequence and use it in order to separately determine background
    for seq in sequences:
        #all sequences for the background must be full-length
        for index in range(len(seq) - MARKOV):
            subseq = seq[index:index + MARKOV_WINDOW].upper()
            if "N" in subseq:
                continue

            totalWindows += 1
            WordDict[subseq] = WordDict.get(subseq, 0.0) + 1.0
            if IncludeRC:
                totalWindows += 1
                RC = revComp(subseq)
                WordDict[RC] = WordDict.get(RC, 0.0) + 1.0

    #convert counts to fractions
    for key in WordDict:
        WordDict[key] = 1.0 * WordDict[key] / totalWindows

    return WordDict


#Average_M
#input: sequences   List of sequences on which to find the average
#                   markov score
#       Model       Dictionary containing pvalues for seeing 3mers
#       l           integer designating the word sizes from which to
#                   determine average pvalue
#output: average probability of all input lmers in the sequences in the Model
#
#finds the probability of seeing all subsequence in the total strings
#using the markov model created using the background. Markov3 is used
#(window size of 3) and from this determine the average.
#This function will
#also screen the background model
#
#Every window of MARKOV_WINDOW characters ending at positions
#MARKOV_WINDOW-1 .. len(seq)-l of each sequence is scored with M_Score;
#windows containing 'N' are skipped.  The number of windows scored is
#printed.  Raises ZeroDivisionError when no window could be scored.
def Average_M (sequence, Model, l,MARKOV_WINDOW):
    totalSum = 0.0
    totalWords = 0.0
    MW1 = MARKOV_WINDOW - 1
    for seq in sequence:
        for i in range(MW1, len(seq) - l + 1):
            sequenceCheck = seq[i - MW1:i + 1]
            if 'N' in sequenceCheck:
                continue

            totalWords += 1.0 #increase number of words
            #add current word to the running total of scores
            totalSum += M_Score(sequenceCheck, Model, True, MARKOV_WINDOW)

    retVal = totalSum / totalWords
    print(totalWords)

    return retVal


#M_Score
#input: sequence        string for which the score is to be determined
#       Model           Dictionary mapping words of length MARKOV_WINDOW to
#                       their probability
#       check           Boolean which determines whether to also check for
#                       completeness of markov model
#       MARKOV_WINDOW   length of the words looked up in Model
#output: sum of -ln(Model[word]) over every word of length MARKOV_WINDOW in
#        the input sequence
#
#Words missing from Model are skipped when check is False.  When check is
#True a missing word terminates the program: the explanation is written to
#stderr and the exit status is non-zero (it used to be printed to stdout
#and followed by exit status 0, which reported the failure as a success).
def M_Score (sequence, Model, check,MARKOV_WINDOW):
    PVal = 0.0
    for j in range(len(sequence) - MARKOV_WINDOW + 1):
        word = sequence[j:j + MARKOV_WINDOW]
        if word not in Model:
            if check:
                #the background model does not cover the input: ask the
                #user to revise it
                sys.exit("The Markov Model is inadequate for your input sequences\n"
                         " %s is not contained in model provided\n"
                         "Please revise your provided model or consider "
                         "using Background Modelling provided" % word)
            continue

        #calculates score
        PVal += -log(Model[word], e)

    return PVal

#LogOdds
#input: sequences   sequences for which to find the log odds score
#       startLocs   start locations at which to start computing score
#       sizeOfMotif size of the motif being created
#       Frequency   Markov3 Model
#       Index       Current Indices
#
#          CompareList     Order of Comparison
#output    returns the log odds score for the consensus
#the equation used is as follow:
#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index, CompareList, MARKOV_WINDOW):
    # NOTE(review): the tallies below are re-created for every
    # (column, sequence) pair and nothing is returned, so despite the header
    # above this function has no observable result.  Behaviour is kept; the
    # only change is that a prefix with no extension in Frequency is skipped
    # instead of dividing by zero.
    MW1 = MARKOV_WINDOW - 1
    for i in range(sizeOfMotif):
        for j in range(Index + 1):
            seqIndex = CompareList[j]
            lnValue = [0.0, 0.0, 0.0, 0.0]
            totalNum = [0.0, 0.0, 0.0, 0.0]
            # sequences without a motif have a start location of -1
            if startLocs[seqIndex] == -1:
                continue
            index = startLocs[seqIndex] + i
            seq = sequences[seqIndex]
            previous = seq[index - MW1:index + 1]
            nextNTD = seq[index + 1]
            denominator = 0.0
            numerator = 0.0
            # determine probabilities for each path
            for NTD in ['A', 'C', 'G', 'T']:
                full = previous + NTD
                if full in Frequency:
                    value = Frequency[full]
                    if NTD == nextNTD:
                        numerator = value
                    denominator += value
            if denominator == 0.0:
                continue
            # increase the summation of each location
            lnValue[NTDIndices[nextNTD]] += numerator / denominator
            # increase number of given nucleotides at the index
            totalNum[NTDIndices[nextNTD]] += 1.0


#LogOdds
#input:    PWM             counts of the sequences already in the motif
#                          (one 4-entry list per column; at least one column)
#          LogsM           accumulated background values, same layout as PWM
#          sequence        relevant part of the sequence being added; it
#                          starts MARKOV_WINDOW-1 characters before the motif
#          Frequency       markov model for background, keyed by
#                          MARKOV_WINDOW-long words
#          MARKOV_WINDOW   length of the words looked up in Frequency
#output    (Score, PWMout, LogsMout): the log odds score for the consensus
#          and updated deep copies of PWM and LogsM (the inputs are not
#          modified)
#the equation used is as follow:
#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
def LogOdds(PWM, LogsM, sequence, Frequency, MARKOV_WINDOW):
    Score = 0
    PWMout = copy.deepcopy(PWM)
    LogsMout = copy.deepcopy(LogsM)
    MW1 = MARKOV_WINDOW - 1
    # since each column of the PWM must add up to the total number of
    # sequences in that PWM, one must be added for the current sequence.
    # float() keeps the ratios below exact when the PWM holds integer counts.
    totalSeqs = float(PWM[0][0] + PWM[0][1] + PWM[0][2] + PWM[0][3] + 1)
    for j in range(len(PWMout)):
        counts = PWMout[j]
        background = LogsMout[j]
        added = sequence[j + MW1]
        if added in ('A', 'C', 'G', 'T'):
            k = NTDIndices[added]
            counts[k] += 1.0
            # frequency of the word ending on the added nucleotide
            background[k] += Frequency[sequence[j:j + MARKOV_WINDOW]]
        for NTD in ['A', 'C', 'G', 'T']:
            k = NTDIndices[NTD]
            if counts[k] > 0:
                Score += counts[k] / totalSeqs * log(
                    counts[k] / (totalSeqs * background[k] / counts[k]), e)

    return Score, PWMout, LogsMout


# column composition (bases present, in A,C,G,T order) -> IUPAC symbol
_IUPAC_SYMBOLS = {'CGT': 'B', 'AGT': 'D', 'ACT': 'H', 'GT': 'K', 'AC': 'M',
                  'ACGT': 'N', 'AG': 'R', 'CG': 'S', 'ACG': 'V', 'AT': 'W',
                  'CT': 'Y', 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T'}


#convert2motif
#input:    sequences       list containing the sequences in A,C,G,T alphabet
#                          comprising the motif to be converted into symbols
#          size            size of the motif
#output:   string          motif converted into descriptive symbols
#
#A base is reported for a column when more than 10% of the sequences carry it
#there.  Characters other than A, C, G, T are ignored (they used to be
#counted as T); a column with no base above the threshold, or an empty
#sequence list, yields 'N'.
def convert2motif(sequences, size):
    numSeqs = len(sequences)
    if numSeqs == 0:
        return 'N' * size

    Motif = []
    for i in range(size):
        counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        for seq in sequences:
            base = seq[i].upper()
            if base in counts:
                counts[base] += 1
        characterCode = "".join(
            [b for b in "ACGT" if float(counts[b]) / numSeqs > 0.1])
        Motif.append(_IUPAC_SYMBOLS.get(characterCode, 'N'))

    return "".join(Motif)


#PWM2Motif
#input:    array           PWM; each column is indexed with NTDA/NTDC/NTDG/NTDT
#output:   string          motif converted into descriptive symbols
#
#A base is reported for a column when its entry is greater than 0.1; a column
#with no such entry yields 'N' (it used to raise KeyError).
def PWM2Motif(PWM):
    Motif = []
    for column in PWM:
        characterCode = ""
        if column[NTDA] > 0.1:
            characterCode += "A"
        if column[NTDC] > 0.1:
            characterCode += "C"
        if column[NTDG] > 0.1:
            characterCode += "G"
        if column[NTDT] > 0.1:
            characterCode += "T"
        Motif.append(_IUPAC_SYMBOLS.get(characterCode, 'N'))

    return "".join(Motif)



#convert2PSFM
#input:    PWM             list of 4-entry count lists (A,C,G,T per column)
#          NumOfSeqs       number of sequences the counts were taken from
#output:   2Darray         the PSFM: every count divided by NumOfSeqs
#
#Division is always floating point, so integer counts no longer truncate to 0
#under Python 2.  NumOfSeqs == 0 raises ZeroDivisionError.
def convert2PSFM(PWM, NumOfSeqs):
    PSFM = []
    for column in PWM:
        PSFM.append([float(column[j]) / NumOfSeqs for j in range(4)])

    return PSFM


#convert2PWM
#input:    sequences       list containing the sequences in A,C,G,T alphabet
#
#                          comprising the motif to be converted into symbols
#          size            size of the motif
#output:   2Darray         will contain the PWM (counts) where indices 0-3 of
#                          each list will be A,C,G,T respectively
#
#takes in a list of motifs that were found at each point and converts them to
#a PWM
def convert2PWM(sequences, size):
    # count matrix to be returned, one 4-entry list per column
    PWM = []
    for i in range(size):
        indices = [0.0, 0.0, 0.0, 0.0]
        for seq in sequences:
            indices[NTDIndices[seq[i].upper()]] += 1.0
        PWM.append(indices)

    return PWM


#add2PWM
#input:    sequence        sequence to be added to PWM
#          PWM             PWM being modified
#
#mutator method: adds the sequence to the PWM in place, returns nothing
def add2PWM(sequence, PWM):
    for i in range(len(PWM)):
        PWM[i][NTDIndices[sequence[i].upper()]] += 1.0


#add2PWMReturn
#input:    sequence        sequence to be added to PWM
#          PWM             PWM used as the starting point (left untouched)
#output:   2Darray         deep copy of PWM with the sequence added
def add2PWMReturn(sequence, PWM):
    retPWM = copy.deepcopy(PWM)
    for i in range(len(retPWM)):
        retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0

    return retPWM


#Align2PSFM
#input:    Motifi          Sequence being aligned to PWM
#          PWM             PWM to which the sequence is being aligned
#output:   float           alignment score to the matrix
#
#adds the sequence to a copy of every column and sums the largest count of
#each column (the consensus score); PWM itself is not modified
def Align2PSFM(Motifi, PWM):
    Best = 0.0
    for i in range(len(PWM)):
        column = PWM[i][:]
        column[NTDIndices[Motifi[i].upper()]] += 1.0
        Best += max(column[0], column[1], column[2], column[3])

    return Best


#Factorial
#input:    n               Number for which we will calculate the factorial
#output:   integer         factorial of input number (1 for n < 2)
#
#calculates the factorial of input number n
def Factorial(n):
    Fact = 1
    for i in range(2, n + 1):
        Fact *= i

    return Fact


#Choose
#input:    n               integer for total number of elements in a set
#          k               integer for total number of elements to be chosen
#output:   integer         the binomial coefficient of the above, ie nCk
#
#calculates the number of ways to choose k elements from a set of n.  Floor
#division keeps the result an exact integer on every Python version ("/"
#yields floats, and overflows for large n, under true division).
def Choose(n, k):
    return Factorial(n) // Factorial(n - k) // Factorial(k)


#GetMotif
#input: sequencesToCompare sequences where the motif is being found
#       sequencesLast      original version of the sequences
#       CompareList        Order in which sequences are examined
#       BestMotif          The location where the motifs score highest
#       startLocs          The current locations for the motifs being examined
#       sizeOfMotif        Size of the motif to be found in the sequences
#       Index              Index up to which the motif has been optimized so
#                          far
#       BestScore          The best score of aligned motifs
#       Frequency          Markov3 Model
#       PWM                PWM which includes all the sequences which have
#                          been included so far
#       PWMFounder         This is the "founder" PWM, includes only the
#                          founder sequences' information
#       ConScoreTop        This value stores the Consensus score of the
#                          founder sequences only
#       normalization      normalization value for the current words
#       LogsM              frequency of markov words
#       percID             percent of the aligned sequences that must be
#                          identical in order for a motif to be considered
#                          (between the founder sequences then between
#                          founder sequence consensus and each other
#                          subsequence being considered)
#       originalSeqs       contain unmasked sequence for markov scores
#       origLast           not used by this function
#       MARKOV_WINDOW      length of the words of the markov model
#output: tuple (BestScore, NewBestDetermined, PWM, LogsM)
#
#this function will align the Indexth item in the sequencesToCompare to all the
#sequences previously aligned from their best indices in a way to optimize the
#alignment score (consensus score).  BestMotif and startLocs are updated in
#place for the examined sequence.
def GetMotif(sequencesToCompare, sequencesLast, CompareList, BestMotif,
             startLocs, sizeOfMotif, Index, BestScore, Frequency,
             PWM, PWMFounder, ConScoreTop, normalization, LogsM, percID,
             originalSeqs, origLast, MARKOV_WINDOW):

    MW1 = MARKOV_WINDOW - 1
    # indicates whether a better position was found for this sequence
    NewBestDetermined = False
    SeqIndexi = CompareList[Index]
    candidates = sequencesToCompare[SeqIndexi]

    # search through all positions for this index
    starti = MW1
    endi = len(candidates) - sizeOfMotif
    while starti <= endi:
        Motifi = candidates[starti:starti + sizeOfMotif]
        # same window plus the MARKOV_WINDOW-1 preceding characters, taken
        # from the unmasked sequence
        MMotifi = originalSeqs[SeqIndexi][starti - MW1:starti + sizeOfMotif]

        # if this area has already been earmarked as a motif it cannot be
        # used again: jump past the masked position and re-read the window.
        # (The original fell through here and scored the stale, N-containing
        # window.)  NOTE(review): the jump distance is kept as written.
        Nlocation = Motifi.rfind('N')
        if Nlocation >= 0:
            starti += 1 + Nlocation + MARKOV_WINDOW
            continue

        # check to see if the current sequence has a high enough consensus
        # with the founder sequences in order to be considered further
        ScoreTop = Align2PSFM(Motifi, PWMFounder)
        GOF = float(ScoreTop - ConScoreTop) / sizeOfMotif
        if GOF < percID / 100.0:
            starti += 1
            continue

        # if the word is not above noise
        if M_Score(MMotifi, Frequency, False, MARKOV_WINDOW) - normalization < 0:
            starti += 1
            continue

        startLocs[SeqIndexi] = starti
        # new best scores are assigned by aligning the current sequence to
        # the PWM
        (Score, PWM2, LogsM2) = LogOdds(PWM, LogsM, MMotifi, Frequency,
                                        MARKOV_WINDOW)

        # new best scores are evaluated; "Infinity" marks "no score yet"
        if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore:
            # record best position
            BestMotif[SeqIndexi] = startLocs[SeqIndexi]
            BestScore = Score
            LogsMhold = LogsM2[:]
            PWMhold = PWM2[:]
            NewBestDetermined = True

        starti += 1

    startLocs[SeqIndexi] = BestMotif[SeqIndexi]
    if NewBestDetermined:
        PWM = PWMhold[:]
        LogsM = LogsMhold[:]

    return (BestScore, NewBestDetermined, PWM, LogsM)

# Patch boundary text retained from the original dump:
# \ No newline at end of file
# diff --git a/cistematic/programs/Gibbs.py b/cistematic/programs/Gibbs.py
# new file mode 100644
# index 0000000..c4aaff8
# --- /dev/null
+++ b/cistematic/programs/Gibbs.py @@ -0,0 +1,650 @@ +##Sarah Aerni +##Created: June 25, 2005 +##Modified: July 11, 2005 +##Motif Finder using Gibbs sampling +##convergence is measured when the difference in the sequences is negligible + +import random +import math +import sys +import copy +from time import time + +from math import ceil + +Frequency = {} +ConsensusScore= {True:2, False:1} + +NTDIndices = {"A": 0, "C": 1, "G": 2, "T": 3} +IndexNTDs = {0: "A", 1: "C", 2: "G", 3: "T"} +INSERTION_N = "N" +global minSize +global maxSize +global thresholdPercent +global sequences +global sizeOfMotif +global numOfMismatches +global maxIterations +global maxLoopsWithoutImprovement + +""" +markov size - 1 = markov model +example: +if you want a markov 1 you want to check 2 ntds total +markov_size = 2 +MARKOV_SIZE = 2 + +AlignmentScore +input: sequenceToCompare List of sequences whose substrings will be + sizeOfMotif aligned integer size of the motif being found + (length of the subseqeunce that will be aligned + from each sequence in above list (window size) + startLocs start locations of the motifs to be aligned to + each other + CompareList the indices of sequencesToCompare to be aligned + numSeqs the number of sequences -1 from + sequencesToCompare to be aligned, the indices + of sequenceToCompare stored in the first + numSeqs indices of in CompareList. 
+ Frequency markov model being used to calculate background + originalSeqs contain unmasked sequences used for checking + the markov score +output: integer Score indicating the consensus score of these + sequences + 2D-List contains the PSFM + 2D-List contains the log markov scores + +will be calculated as an ungapped consensus between those elements in the +CompareList Consensus score is calculated by choosing the largest number of +the same elements in each column, and adding all these numbers up across all +columns +""" + + +def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs): + TotalScore = 0; + PWM = [] + maxScores=[] + len(sequencesToCompare) + for i in range (sizeOfMotif): + PWMi = [0.0, 0.0, 0.0, 0.0] + for j in range(numSeqs+1): + SequenceIndex = CompareList[j] + CurrSeq = sequencesToCompare[SequenceIndex] + + #some sequences may not contain the motifs, if so you do not want + #to include them in the consensus. These have uninitialized start + #locations (ie startLocs would be -1 + if startLocs[SequenceIndex] != -1: + if sequencesToCompare[SequenceIndex]\ + [startLocs[SequenceIndex]+i] == "N": + print sequencesToCompare + print "\nBAD HERE!" 
+ print CurrSeq + print startLocs + print startLocs[SequenceIndex] + print CompareList + print j + print SequenceIndex + print numSeqs + print sizeOfMotif + print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif] + PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0 + maxHere=max(PWMi) + TotalScore += maxHere + maxScores.append(maxHere) + PWM.append(PWMi) + + return (TotalScore, PWM,maxScores) + + +def MarkovFreq (prefix, actualNTD, Frequency,MARKOV_WINDOW): + """ MarkovFreq + input: prefix string of length MARKV_SIZE - 1 prefix used for model + actualNTD character NTD at the en dof the prefix being calculated + Frequency Markov model for calculations + output: float that gives the markov score for this specific sequence + + The helper function will run through and find all possible words with the + prefix and determine the markov score based on this + """ + denominator = 0.0 + numerator = 0.0 + for NTD in ["A", "C", "G", "T"]: + value = M_Score(prefix+NTD, Frequency, False, MARKOV_WINDOW) + if NTD == actualNTD : + numerator = value + denominator += value + retVal = numerator/denominator + + return retVal + + +def revComp (sequence): + """ revComp + input: sequence DNA sequence to be converted to reverse complement + output: string reverse complement of input sequence + + obtains the reverse complement of an input sequence + """ + RevDict={"A": "T", + "T": "A", + "C": "G", + "G": "C", + "N": "N" + } + reverse = "" + for i in range(len(sequence)): + reverse = RevDict[sequence[i].upper()]+reverse + + return reverse + + +def Markov(sequences, IncludeRC, MARKOV): + """ Markov3 + input: sequences list that are being used to create the background + output: dictionary of all 6mers (reverse complement also) and their -log2 + proportion seen + + background will build a markov model of the background in order to be able + to differentiate the motifs from the pure background of a certain size + they will be stored as -log(fraction) + """ + 
MARKOV_WINDOW = MARKOV + 1 + WordDict = {} + totalWindows = 0 + for i in sequences: + totalWindows += (len(i)-MARKOV_WINDOW+1)*2 + + for seq in sequences: + for index in range(len(seq)-MARKOV_WINDOW+1): + subseq = seq[index:index+MARKOV_WINDOW].upper() + if subseq not in WordDict: + WordDict[subseq] = 0.0 + + WordDict[subseq] += 1.0 + if IncludeRC: + RC = revComp(subseq) + if RC not in WordDict: + WordDict[RC] = 0.0 + + WordDict[RC] += 1.0 + + for key in WordDict: + WordDict[key] = 1.0*WordDict[key]/totalWindows + + return WordDict + + +def Average_M (sequence, Model, l, MARKOV_WINDOW): + """ Average_M + input: sequences List of sequences on which to find the average + markov score + Model Dictionary containing pvalues for seeing 3mers + l integer designating the word sizes from which to + determine average pvalue + output: average probability of all input lmers in sequences in the Model + + finds the probability of seeing all subsequence in the total strings + using the markov model created using the background. Markov3 is used + (window size of 3) and from this determine the average. This function will + also screen the background model + """ + totalSum = 0.0; + totalWords = 0.0; + for seq in sequence: + for i in range(MARKOV_WINDOW-1,len(seq)-l+1): + totalWords += 1.0 + PVal = M_Score(seq[i-MARKOV_WINDOW+1:i+l], Model, True, MARKOV_WINDOW) + totalSum += PVal + + retVal = totalSum/totalWords + print totalWords + return retVal + + +def M_Score (sequence, Model, check, MARKOV_WINDOW): + """ M_Score + input: sequence string for which the Pvalue is to be determined + Model Dictionary containing log2 pvalues for seeing 6mers + check Boolean which determines whether to also check for + completeness of markov model + output: log2 probability of seeing the input seqeunce in the Model + + gives the probability of seeing the given subsequence in the total strings + using the markov model created using the background. 
Markov6 is used + (window size of 3) + """ + PVal = 0.0 + for j in range(len(sequence)-MARKOV_WINDOW+1): + if sequence[j:j+MARKOV_WINDOW] not in Model: + if check: + print "The Markov Model is inadequate for your input", + print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW], + print "not contained in model provided\n", + print "Please revise your provided model or consider", + print "using Background Modelling provided" + sys.exit(0) + + continue + + PVal += -math.log(Model[sequence[j:j+MARKOV_WINDOW]],math.e) + + return PVal + + +def LogOdds(PWM, LogsM, sequence, Frequency, MARKOV_WINDOW): + """ LogOdds + input: sequence relevant part of the sequence being added to the PWM + PWM information on sequences already in the motif + LogsM frequency information on sequences already in motif + Frequency markov model for background + sizeOfMotif size f the motif being found + output returns the log odds score for the consensus + the equation used is as follow: + S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path)))) + """ + Score = 0 + PWMout = copy.deepcopy(PWM) + LogsMout = copy.deepcopy(LogsM) + #since each column of the PWM must add up to the total umber of sequences + #in that PWM, in addition one must be added for the current sequence + totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1 + for j in range(len(PWMout)): + for i in ['A', 'C', 'G', 'T']: + if i == sequence[j+MARKOV_WINDOW-1]: + PWMout[j][NTDIndices[i]] += 1.0 + word = sequence[j:j+MARKOV_WINDOW] + LogsMout[j][NTDIndices[i]] += Frequency[word] + + if PWMout[j][NTDIndices[i]]> 0: + Score += PWMout[j][NTDIndices[i]]/totalSeqs*math.log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),math.e) + + return Score, PWMout, LogsMout + + +def convert2motif(sequences, size): + """ convert2motif + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + maybe add 
threshold?!?! + output: string motif converted into descriptive symbols + + takes in a list of motifs that were found at each point and converts them to + an actual motif + """ + #column composition is replaced by symbols + SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', 'CT':'Y','A':'A','C':'C','G':'G','T':'T'} + Motif = "" + for i in range(size): + A = 0 + C = 0 + G = 0 + T = 0 + for seq in sequences: + if seq[i].upper() == "A": + A += 1 + elif seq[i].upper() == "C": + C += 1 + elif seq[i].upper() == "G": + G += 1 + else: + T += 1 + + characterCode = "" + + #translate column composition into symbols + ###########should we use percentages?! ie. A >certain percent################## + if (A > 0): + characterCode += "A" + + if (C > 0): + characterCode += "C" + + if (G > 0): + characterCode += "G" + + if (T > 0): + characterCode += "T" + + Motif += SymbolDict[characterCode] + + return Motif + + +def convert2PSFM (sequences, NumOfSeqs): + """ convert2PSFM + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + output: 2Darray will contain the PSFM where indices 0-3 of each list + will be A,C,G,T respectively + + takes in a list of motifs that were found at each point and converts them to + a PSFM + """ + PSFM = [] + PWM = convert2PWM(sequences, len(sequences[0])) + for i in xrange(len(PWM)): + index = [] + for j in [0,1,2,3]: + index.append(PWM[i][j]/NumOfSeqs) + + PSFM.append(index) + + return PSFM + + +def convert2PWM (sequences, size): + """ convert2PWM + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + output: 2Darray will contain the PSFM where indices 0-3 of each list + will be A,C,G,T respectively + + takes in a list of motifs that were found at each point and converts them to + a PWM + """ + PWM = [] + for i in 
range(size): + indices = [0.0, 0.0, 0.0, 0.0] + for seq in sequences: + indices[NTDIndices[seq[i].upper()]] += 1.0 + + PWM.append(indices) + + return PWM + + +def add2PWM(sequence, PWM): + """ add2PWM + input: sequence sequence to be added to PWM + PWM PWM being modiifed + + takes in a sequence and adds it to the PWM + """ + #determine the composition to add + for i in range(len(PWM)): + PWM[i][NTDIndices[sequence[i].upper()]] += 1.0 + + +def Align2PWM(Motifi,PWM): + """ Align2PWM + input: Motifi Sequence being aligned to PWM + PWM PWM to which the sequence is being aligned + output: float alignment score to the matrix + + takes in a PWM and aligns the sequence to this PWM and returns + the consensus scoreo + """ + Score = 0.0 + for i in range(len(PWM)): + Score += PWM[i][NTDIndices[Motifi[i]]] + return Score + + +def Factorial(n): + """ Factorial + input: n Number for which we will calculate the factorial + output: float factorial of input number + + calculates the factorial of input number n + """ + #by definition factorial should be 1 + Fact = 1 + for i in range(2,n+1): + Fact *= i + return Fact + + +def Choose(n, k): + """ Choose + input: n integer for total number of elements in a set + k integer for total number of elements to be chose from set + output: float the binomial coefficient of the above number, ie nCk + + calculates the number of ways to choose k elements from a set of n + """ + return Factorial(n)/Factorial(n-k)/Factorial(k) + + +getNTDVals = {0:'A',1:'C',2:'G', 3:'T'} + +getNTDIndex = {'A':0,'C':1,'G':2, 'T':3} + + +def MotifFinder(InputSequences, minS, maxS, numberOfMotifs, Markov_Size, + UseRC, Frequency,excludeBelowAve, percID, + maxIter, maxLoopsW_outImprovement): + global minSize + minSize=minS + global maxSize + maxSize=maxS + global thresholdPercent + thresholdPercent=percID + global sequences + global sizeOfMotif + global numOfMismatches + global maxIterations + maxIterations=maxIter + + global maxLoopsWithoutImprovement + 
maxLoopsWithoutImprovement=maxLoopsW_outImprovement + + print "Running Sampler with %i sequences"%(len(InputSequences)) + print "Finding %i motifs of size %i to %i using markov size %i" % (numberOfMotifs,minSize,maxSize,Markov_Size) + + sequences = [InputSequences[i].upper() for i in range(len(InputSequences))] + PSFMs = [] + if UseRC: + for seqIndex in xrange(len(sequences)): + RCSeq = revComp(sequences[seqIndex]) + sequences[seqIndex] += INSERTION_N+ RCSeq + + #this will track the movement of each sequence + #if a movement exceeds a certain threshold we are not finished + for motifNum in range(numberOfMotifs): + + #to improve speed shrink sequences by replacing strings of Ns by + #a single N + for i in xrange(len(sequences)): + splitByN=sequences[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequences[i]=finalSequence + + print "MOTIF NUMBER %i" %motifNum + empty=min([len(sequences[i]) for i in xrange(len(sequences))]) + #pick motif size randomly within the range provided by the user + sizeOfMotif = random.randint(minSize,maxSize) + if empty < maxSize: + return ALLPSFMS + + numOfMismatches=sizeOfMotif-ceil(thresholdPercent/100.0*sizeOfMotif) + (PWM,PWMScores,startLocs)=GibbsRunner(100) + MaxVals=[0 for i in xrange(len(PWM))] + for ConsI in xrange(len(PWM)): + MaxVals[ConsI] = max(PWM[ConsI]) + PWMScores = [0 for i in range(len(sequences))] + for SIndex in range(len(sequences)): + subseq = sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif] + PWMScores[SIndex] = 0 + #######################start here########## + for subIndex in range(len(subseq)): + PWMScores[SIndex] += PWM[subIndex][NTDIndices[subseq[subIndex]]] + + maxScore = max(PWMScores) + #get rid of all the sequences that do not achieve a certain consensus + 
#score defined by the top one + thresh = thresholdPercent/100.0 * maxScore + FinalPWMSeqs = [] + for SIndex in range(len(PWMScores)): + if PWMScores[SIndex] > thresh: + FinalPWMSeqs.append(sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif]) + else: + startLocs[SIndex] = -1 + + FinalPSFM= convert2PSFM (FinalPWMSeqs, len(FinalPWMSeqs)) + PSFMs.append(FinalPSFM) + for i in xrange(len(sequences)): + if startLocs[i] != -1: + sequences[i] = sequences[i][:startLocs[i]]+INSERTION_N*sizeOfMotif+sequences[i][startLocs[i]+sizeOfMotif:] + sequences[i] = sequences[i][:len(sequences[i])-startLocs[i]-sizeOfMotif]+INSERTION_N*sizeOfMotif+sequences[i][len(sequences[i])-startLocs[i]:] + return PSFMs + + +def GibbsRunner(iterIn): + iterAll = 0 + BestPWM=[] + BestScore=0 + BestLocs=[] + global minSize + global maxSize + global thresholdPercent + global sequences + global sizeOfMotif + global numOfMismatches + global maxIterations + global maxLoopsWithoutImprovement + maxScore=sizeOfMotif*len(sequences) + st=time() + + while iterAll < iterIn: + en=time() + print "%.03f\t"%(en-st), + st=time() + iterAll+=1 + startLocs = [-1 ] * len(sequences) + + for i in range(len(sequences)): + startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif) + while "N" in sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]: + startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif) + + (TotalScore, PWM,dummy)= AlignmentScore(sequences, sizeOfMotif, startLocs, [i for i in range(len(sequences))], len(sequences)-1) + PWMScore=[Align2PWM(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif],PWM) for i in range(len(sequences))] + print "PWM is right now" + print PWM + print "scores for each" + print PWMScore + SOi = -1 + ConsensusScore = 0 + PreviousBestScore = 0 + PreviousBestTime = -1 + iterations = 0 + while iterations < maxIterations and (ConsensusScore > PreviousBestScore or PreviousBestTime <= maxLoopsWithoutImprovement): + iterations += 1 + SOi = random.randint(0,len(sequences)-1) 
+ SeqMotifs = [] + locs=startLocs[:] + for i in range(len(sequences)): + if(SOi == i): + locs[i]=-1 + continue + SeqMotifs.append(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]) + + (TotalScore, PWM,maxScores)= AlignmentScore(sequences, sizeOfMotif, locs, [i for i in range(len(sequences))], len(sequences)-1) + startLocsProb = [] + startLocsI= [] + SOSeq = sequences[SOi] + total = 0 + start = 0 + endloc=len(SOSeq)-sizeOfMotif + while(start<=endloc): + Motif = SOSeq[start:start+sizeOfMotif] + locOfN=Motif.rfind("N") + if locOfN>=0: + start+=locOfN+1 + continue + probAtPosn=0 + j=0 + mmNum=0 + while (jnumOfMismatches: + probAtPosn=0 + break + j+=1 + + if probAtPosn == 0: + start+=1 + continue + startLocsI.append(start) + startLocsProb.append(probAtPosn) + total += probAtPosn + start+=1 + + if len(startLocsProb) == 0: + continue + + choice = random.random() + choiceLoc = choice*total + totalToHere = 0 + for PrefI in range(len(startLocsProb)): + if totalToHere+startLocsProb[PrefI] == 0: + continue + if choiceLoc < totalToHere+startLocsProb[PrefI]: + break + totalToHere += startLocsProb[PrefI] + + startLocs[SOi] = startLocsI[PrefI] + PWMScore[SOi] = startLocsProb[PrefI] + newMotif=SOSeq[startLocs[SOi]:startLocs[SOi]+sizeOfMotif] + add2PWM (newMotif, PWM) + + NewScores=[] + PercentChange = [] + for i in range(len(sequences)): + NewScore = 0 + Motif_i=sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif] + for j in xrange(sizeOfMotif): + NewScore+=PWM[j][NTDIndices[Motif_i[j]]] + + NewScores.append(NewScore) + PercentChange.append(math.fabs(NewScore - PWMScore[i])/PWMScore[i]) + + TotConsensusScore = sum(NewScores) + AveConsensusScore = TotConsensusScore/(len(sequences)) + if AveConsensusScore > PreviousBestScore: + PreviousBestScore = AveConsensusScore + PreviousBestTime = 0 + else: + PreviousBestTime += 1 + + PWMScore=NewScores[:] + + Consensus=sum([max(PWM[i]) for i in xrange(len(PWM))]) + if Consensus> BestScore: + BestScore=Consensus + BestPWM=PWM[:] + 
BestLocs=startLocs[:] + if BestScore==maxScore: + break + + print "iterated %i times to find"%iterAll + print BestPWM + + return(BestPWM,BestScore,BestLocs) + + +def probFromPSFM(sequence, PSFM): + probability = 1 + for i in range(len (sequence)): + probability *= PSFM[i][getNTDIndex[sequence[i]]] + + return probability \ No newline at end of file diff --git a/cistematic/programs/__init__.py b/cistematic/programs/__init__.py new file mode 100644 index 0000000..efb49d2 --- /dev/null +++ b/cistematic/programs/__init__.py @@ -0,0 +1,128 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
__all__ = ["meme", "locator","mafft", "paircomp", "cisGreedy", "gibbs"]

from os import environ

# (module name, class name, default options) for every supported program
supportedPrograms = [("locator" ,"Locator", {"consensus": "",
                                             "identifier": ""}),
                     ("meme" ,"Meme", {"model": "zoops",
                                       "nmotifs": 10,
                                       "maxwidth": 16}),
                     ("cisGreedy", "CisGreedy", {"model": "zoops",
                                                 "nmotifs": 10,
                                                 "maxwidth": 16,
                                                 "minwidth": 6,
                                                 "reverse": True,
                                                 "background": "None",
                                                 "iterations": 100,
                                                 "founder": False,
                                                 "percentID": 75,
                                                 "Markov size": 3}),
                     ("gibbs", "Gibbs", {"model": "zoops",
                                         "nmotifs": 10,
                                         "maxwidth":16,
                                         "minwidth": 6,
                                         "reverse": True,
                                         "background": "None",
                                         "percentID": 50,
                                         "Markov size": 3,
                                         "iterations": 100})
]


class Program:
    """ Program is the Super class for all of the programs supported by cistematic and
        that are typically called by other classes in cistematic, such as the Experiments.
        Children will override the methods that they need to instantiate.
    """
    # programs live under $CISTEMATIC_ROOT/programs when that variable is set
    if environ.get("CISTEMATIC_ROOT"):
        programRoot = "%s/programs" % environ.get("CISTEMATIC_ROOT")
    else:
        programRoot = "/proj/genome/programs"

    # class-level defaults; load() replaces contents with a list of lines
    contents = ""
    tagID = ""

    def __init__(self, tagID="", inputFilePath="", outputFilePath=""):
        """ store the tag and the input/output paths; the tagID argument used
            to be discarded and the tag reset to "".
        """
        self.tagID = tagID
        self.inputFilePath = inputFilePath
        self.outputFilePath = outputFilePath


    def setTagID(self, tid):
        """ set the tag identifying this program instance.
        """
        self.tagID = tid


    def name(self):
        """ return the name of the (sub)class of this instance.
        """
        return self.__class__.__name__


    def inputFile(self, inputFilePath):
        """ set the path of the input file.
        """
        self.inputFilePath = inputFilePath


    def outputFile(self, outputFilePath):
        """ set the path save() writes to.
        """
        self.outputFilePath = outputFilePath


    def setSeqLength(self, length):
        """ hook for subclasses; does nothing here.
        """
        pass


    def setGenome(self, genome):
        """ hook for subclasses; does nothing here.
        """
        pass


    def setGenExpOptions(self, optionArray):
        """ hook for subclasses; does nothing here.
        """
        pass


    def run(self):
        """ hook for subclasses; does nothing here.
        """
        pass


    def display(self):
        """ print every entry of contents on stdout.
        """
        for line in self.contents:
            print(line)


    def load(self, inFilePath):
        """ read inFilePath and keep its lines (line endings included) in
            contents; the file is closed even if reading fails.
        """
        with open(inFilePath, "r") as inFile:
            self.contents = inFile.readlines()


    def save(self):
        """ write contents to outputFilePath; the file is closed even if
            writing fails.
        """
        with open(self.outputFilePath, "w") as outFile:
            outFile.writelines(self.contents)

# Patch boundary text retained from the original dump:
# \ No newline at end of file
# diff --git a/cistematic/programs/cisGreedy.py b/cistematic/programs/cisGreedy.py
# new file mode 100644
# index 0000000..271b6bb
# --- /dev/null
# +++ b/cistematic/programs/cisGreedy.py
# @@ -0,0 +1,201 @@
###########################################################################
#                                                                         #
# C O P Y R I G H T   N O T I C E                                         #
#  Copyright (c) 2003-10 by:                                              #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# cisGreedy.py
from cistematic.programs import Program
import time
import Consensus
from cistematic.core.motif import Motif


class CisGreedy(Program):
    """ Program wrapper that runs Consensus.MotifFinder on the sequences loaded
        with getSequences(), using the options set through the set*() methods
        or setGenExpOptions().
    """

    def getSettings(self):
        """ return the current options as a tuple that setSettings() accepts.
        """
        return (self.model, self.nmotifs, self.minwidth, self.maxwidth, self.bON,
                self.iterations, self.Founder, self.bfile, self.percID,
                self.founderPercID, self.MarkovSize)

    def setSettings(self, settings):
        """ restore options from a tuple produced by getSettings() and clear the motif list.
        """
        self.motifs = []
        (self.model, self.nmotifs, self.minwidth, self.maxwidth, self.bON,
         self.iterations, self.Founder, self.bfile, self.percID,
         self.founderPercID, self.MarkovSize) = settings

    def setGenExpOptions(self, optionArray):
        """ reset every option to its default, then apply the "tag:value" strings
            in optionArray.

            Entries without a ":" are skipped. Only the first ":" separates tag
            from value, so values such as file paths may contain ":" themselves.
            Numeric values that cannot be parsed still raise ValueError.
        """
        self.setModel("zoops")
        self.setNumMotifs("10")
        self.setMaxWidth("15")
        self.setMinWidth("6")
        self.setBON(True)
        self.setIterations("100")
        self.bfile = "None"
        self.TCM = "All"
        self.Founder = False
        self.setPercID("75")
        self.setMarkovSize("3")
        self.setFounderPID("75")
        self.setBackground("None")

        # The width tags are accepted in both spellings: "minWidth"/"maxWidth"
        # (historical) and "minwidth"/"maxwidth" (lower-case variant).
        setters = {"Markov size": self.setMarkovSize,
                   "percentID": self.setPercID,
                   "founderPercentID": self.setFounderPID,
                   "model": self.setModel,
                   "nmotifs": self.setNumMotifs,
                   "minWidth": self.setMinWidth,
                   "minwidth": self.setMinWidth,
                   "maxWidth": self.setMaxWidth,
                   "maxwidth": self.setMaxWidth,
                   "reverse": self.setBON,
                   "iterations": self.setIterations,
                   "background": self.setBackground}

        for option in optionArray:
            (optionTag, separator, optionValue) = option.partition(":")
            if not separator:
                continue

            if optionTag == "width":
                # a single width fixes both ends of the range
                self.setMinWidth(optionValue)
                self.setMaxWidth(optionValue)
            elif optionTag == "founder":
                # honour an explicit "founder:False" instead of always enabling it
                self.setFounder(str(optionValue) != "False")
            elif optionTag in setters:
                setters[optionTag](optionValue)

        #if no background file is provided the default setting
        #is to use the input sequences as a background
        print(self.bfile)

    def setMarkovSize(self, val):
        """ set the Markov size passed to Consensus (converted to int).
        """
        self.MarkovSize = int(str(val))

    def setFounderPID(self, val):
        """ set the founder percent identity (converted to int).
        """
        self.founderPercID = int(str(val))

    def setFounder(self, boolVal):
        """ turn the founder option on or off according to the truth value of boolVal.
        """
        self.Founder = bool(boolVal)

    def setPercID(self, percVal):
        """ set the percent identity (converted to int).
        """
        self.percID = int(str(percVal))

    def setModel(self, modelType):
        """ set the model to "oops", "zoops" or "tcm"; anything else selects "zoops".
        """
        if modelType in ["oops", "zoops", "tcm"]:
            self.model = str(modelType)
        else:
            self.model = "zoops"

    def setIterations(self, number):
        """ set the number of iterations (converted to int).
        """
        self.iterations = int(str(number))

    def setNumMotifs(self, motifNum):
        """ set the number of motifs to look for (converted to int).
        """
        self.nmotifs = int(str(motifNum))

    def setBON(self, B_val):
        """ set the bON flag; only the value "False" (as text) turns it off.
        """
        self.bON = str(B_val) != "False"

    def getSequences(self, infile):
        """ load self.sequences from a Fasta file: every non-header, non-blank
            line becomes one entry, with its line ending removed.
        """
        seqFile = open(infile, "r")
        try:
            lines = seqFile.readlines()
        finally:
            seqFile.close()

        self.sequences = []
        for line in lines:
            # strip only the line ending so that an unterminated final line
            # does not lose its last base
            sequence = line.rstrip("\r\n")
            if sequence and not sequence.startswith(">"):
                self.sequences.append(sequence)

    def setMinWidth(self, width):
        """ set the minimum motif width (converted to int).
        """
        self.minwidth = int(str(width))

    def setMaxWidth(self, width):
        """ set the maximum motif width (converted to int).
        """
        self.maxwidth = int(str(width))

    def setBackground(self, backFile):
        """ set the background; the text "None" means it is computed from the
            input sequences in run().
        """
        self.bfile = str(backFile)

    def run(self):
        """ run Consensus.MotifFinder with the current options and store its
            result in self.contents. Requires getSequences() and
            setGenExpOptions() (or equivalent setters) to have been called.
        """
        print(self.nmotifs)
        startTime = time.time()
        if self.bfile == "None":
            self.bfile = Consensus.Markov(self.sequences, self.bON, self.MarkovSize)

        print("geting consensus score")
        self.contents = Consensus.MotifFinder(self.sequences, self.minwidth, self.maxwidth,
                                              self.nmotifs, self.iterations, self.bfile,
                                              self.bON, self.Founder, self.percID,
                                              self.founderPercID, self.MarkovSize, self.model,
                                              self.TCM)
        stopTime = time.time()
        # operands are given as stop - start so the printed arithmetic is correct
        print("\nThis run took %.3f - %.3f = %.3f seconds and produced %d motifs" % (stopTime, startTime, stopTime - startTime, len(self.contents[0])))

    def getMotifs(self):
        """ build one Motif per sequence set in self.contents and return the list.
            Errors are reported on stdout and whatever was built so far is returned.
        """
        self.motifs = []
        try:
            (PWMs, Seqs) = self.contents
            for index, motifSeqs in enumerate(Seqs):
                self.motifs.append(Motif("%s-cisGreedy-%s" % (self.tagID, str(index + 1)), seqs=motifSeqs))
        except Exception:
            # best effort, but no longer swallows KeyboardInterrupt/SystemExit
            print("error returning motifs")

        return self.motifs
# \ No newline at end of file
# diff --git a/cistematic/programs/cisSampler.py b/cistematic/programs/cisSampler.py
# new file mode 100644
# index 0000000..059032f
# --- /dev/null
# +++ b/cistematic/programs/cisSampler.py
# @@ -0,0 +1,198 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# cisSampler.py
from cistematic.programs import Program
import time
import cistematic.programs.gibbs as Gibbs
from cistematic.core.motif import Motif


class CisSampler(Program):
    """ Program wrapper that runs Gibbs.MotifFinder on the sequences loaded
        with getSequences(), using the options set through the set*() methods
        or setGenExpOptions().
    """

    def getSettings(self):
        """ return the current options as a tuple that setSettings() accepts.
        """
        return (self.model, self.nmotifs, self.minwidth, self.maxwidth, self.bON,
                self.iterations, self.bfile, self.percID, self.MarkovSize, self.findBest)

    def setSettings(self, settings):
        """ restore options from a tuple produced by getSettings() and clear the motif list.
        """
        self.motifs = []
        (self.model, self.nmotifs, self.minwidth, self.maxwidth, self.bON,
         self.iterations, self.bfile, self.percID, self.MarkovSize, self.findBest) = settings

    def setGenExpOptions(self, optionArray):
        """ reset every option to its default, then apply the "tag:value" strings
            in optionArray.

            Entries without a ":" are skipped. Only the first ":" separates tag
            from value, so values such as file paths may contain ":" themselves.
            Numeric values that cannot be parsed still raise ValueError.
        """
        self.setModel("zoops")
        self.setNumMotifs("10")
        self.setMaxWidth("15")
        self.setIterations("1000")
        self.setMinWidth("6")
        self.setBON(True)
        self.bfile = "None"
        self.setPercID("85")
        self.setMarkovSize("2")
        self.setFindBest("100")

        # The width tags are accepted in both spellings: "minWidth"/"maxWidth"
        # (historical) and "minwidth"/"maxwidth" (lower-case variant).
        setters = {"Markov size": self.setMarkovSize,
                   "percentID": self.setPercID,
                   "founderPercentID": self.setFounderPID,
                   "model": self.setModel,
                   "nmotifs": self.setNumMotifs,
                   "minWidth": self.setMinWidth,
                   "minwidth": self.setMinWidth,
                   "maxWidth": self.setMaxWidth,
                   "maxwidth": self.setMaxWidth,
                   "reverse": self.setBON,
                   "iterations": self.setIterations,
                   "background": self.setBackground}

        for option in optionArray:
            (optionTag, separator, optionValue) = option.partition(":")
            if not separator:
                continue

            if optionTag == "width":
                # a single width fixes both ends of the range
                self.setMinWidth(optionValue)
                self.setMaxWidth(optionValue)
            elif optionTag == "founder":
                # honour an explicit "founder:False" instead of always enabling it
                self.setFounder(str(optionValue) != "False")
            elif optionTag in setters:
                setters[optionTag](optionValue)

        #if no background file is provided the default setting
        #is to use the input sequences as a background
        print(self.bfile)

    def setMarkovSize(self, val):
        """ set the Markov size passed to Gibbs (converted to int).
        """
        self.MarkovSize = int(str(val))

    def setFindBest(self, val):
        """ set the findBest value passed to Gibbs.MotifFinder (converted to int).
        """
        self.findBest = int(str(val))

    def setFounderPID(self, val):
        """ set the founder percent identity (converted to int).
        """
        self.founderPercID = int(str(val))

    def setFounder(self, boolVal):
        """ turn the founder option on or off according to the truth value of boolVal.
        """
        self.Founder = bool(boolVal)

    def setPercID(self, percVal):
        """ set the percent identity (converted to int).
        """
        self.percID = int(str(percVal))

    def setModel(self, modelType):
        """ set the model to "oops", "zoops" or "tcm"; anything else selects "zoops".
        """
        if modelType in ["oops", "zoops", "tcm"]:
            self.model = str(modelType)
        else:
            self.model = "zoops"

    def setIterations(self, number):
        """ set the number of iterations (converted to int).
        """
        self.iterations = int(str(number))

    def setNumMotifs(self, motifNum):
        """ set the number of motifs to look for (converted to int).
        """
        self.nmotifs = int(str(motifNum))

    def setBON(self, B_val):
        """ set the bON flag; only the value "False" (as text) turns it off.
        """
        self.bON = str(B_val) != "False"

    def getSequences(self, infile):
        """ load self.sequences from a Fasta file: every non-header, non-blank
            line becomes one entry, with its line ending removed.
        """
        seqFile = open(infile, "r")
        try:
            lines = seqFile.readlines()
        finally:
            seqFile.close()

        self.sequences = []
        for line in lines:
            # strip only the line ending so that an unterminated final line
            # does not lose its last base
            sequence = line.rstrip("\r\n")
            if sequence and not sequence.startswith(">"):
                self.sequences.append(sequence)

    def setMinWidth(self, width):
        """ set the minimum motif width (converted to int).
        """
        self.minwidth = int(str(width))

    def setMaxWidth(self, width):
        """ set the maximum motif width (converted to int).
        """
        self.maxwidth = int(str(width))

    # will likely start to use this, but not yet
    # set background frequency file
    def setBackground(self, backFile):
        """ set the background; the text "None" means it is computed from the
            input sequences in run().
        """
        self.bfile = str(backFile)

    def run(self):
        """ run Gibbs.MotifFinder with the current options and store its result
            in self.contents. Requires getSequences() and setGenExpOptions()
            (or equivalent setters) to have been called.
        """
        print(self.nmotifs)
        startTime = time.time()
        if self.bfile == "None":
            self.bfile = Gibbs.Markov(self.sequences, self.bON, self.MarkovSize)

        print("geting consensus score")
        self.contents = Gibbs.MotifFinder(self.sequences, self.minwidth, self.maxwidth, self.nmotifs,
                                          self.MarkovSize, self.bON, self.bfile, False, self.percID,
                                          self.iterations, self.findBest)
        stopTime = time.time()
        print(self.contents)
        print("right here!\n \n\n")
        # operands are given as stop - start so the printed arithmetic is correct
        print("\nThis run took %.3f - %.3f = %.3f seconds and produced %d motifs" % (stopTime, startTime, stopTime - startTime, len(self.contents)))

    def getMotifs(self):
        """ build one Motif per entry of self.contents and return the list.
            Errors are reported on stdout and whatever was built so far is returned.
        """
        self.motifs = []
        try:
            for index, motifPWM in enumerate(self.contents):
                info = ""
                self.motifs.append(Motif("%s-cisSampler-%s" % (self.tagID, str(index + 1)), "", motifPWM, [], 0.0, info))
        except Exception:
            # best effort, but no longer swallows KeyboardInterrupt/SystemExit
            print("error returning motifs")

        return self.motifs
# \ No newline at end of file
# diff --git a/cistematic/programs/fastcomp.py b/cistematic/programs/fastcomp.py
# new file mode 100644
# index 0000000..2307578
# --- /dev/null
# +++ b/cistematic/programs/fastcomp.py
# @@ -0,0 +1,224 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# fastcomp.py
from cistematic.programs import Program
import os
import time
import tempfile
from cistematic.core.motif import Motif
from cistematic.core import complement, sanitize
from os import environ

# Scratch directory: CISTEMATIC_TEMP when set (and non-empty), otherwise /tmp.
if environ.get("CISTEMATIC_TEMP"):
    cisTemp = environ.get("CISTEMATIC_TEMP")
else:
    cisTemp = "/tmp"

tempfile.tempdir = cisTemp


class Fastcomp(Program):
    """ Program wrapper around the external fastcomp executable: compares the
        first two sequences of a Fasta file and turns the reported windows
        into motifs.
    """
    fastcompPath = Program.programRoot + "/fastcomp/"
    motifs = []
    argDict = {}
    seq1name = ""
    seq2name = ""
    pairdir = ""
    argLetter = {"windowsize": "-w", "threshold": "-t"}

    # files this class creates inside its working directory
    workFiles = ("seq1.fsa", "seq2.fsa", "fastcomp.out")

    def __init__(self, tagID="", inputFilePath="", outputFilePath=""):
        """ same arguments as Program.__init__.
        """
        Program.__init__(self, tagID, inputFilePath, outputFilePath)
        # Per-instance containers: the class-level argDict would otherwise be
        # shared, and mutated, by every instance.
        self.motifs = []
        self.argDict = {}
        self.ownsWorkDir = False

    def getSettings(self):
        """ return the option dictionary.
        """
        return self.argDict

    def setSettings(self, settings):
        """ replace the option dictionary and clear the motif list.
        """
        self.motifs = []
        self.argDict = settings

    def _readSequencePair(self):
        """ return (names, seqs) for the first two records of the input Fasta
            file: names holds up to two header lines, seqs always holds two
            (possibly empty) sequence strings.
        """
        names = []
        seqs = ["", ""]
        inFile = open(self.inputFilePath, "r")
        try:
            for line in inFile:
                if line.startswith(">"):
                    if len(names) == 2:
                        # a third record starts: only the first two are used
                        break

                    names.append(line.rstrip("\r\n"))
                elif names:
                    # text before the first header is ignored
                    seqs[len(names) - 1] += line.strip()
        finally:
            inFile.close()

        return (names, seqs)

    def _writeFile(self, path, text):
        """ write text to path, closing the file even on error.
        """
        outFile = open(path, "w")
        try:
            outFile.write(text)
        finally:
            outFile.close()

    def buildInputFiles(self):
        """ given a Fasta file (via the constructor or the inputFile() method), grab the
            first two sequences and save them as two temporary sequences.

            Returns the two sanitized sequences, or ("", "") after printing an
            error when the file does not provide a second sequence.
        """
        (names, seqs) = self._readSequencePair()
        if len(names) > 0:
            self.seq1name = names[0]

        if len(names) > 1:
            self.seq2name = names[1]

        if not seqs[1]:
            print("Error processing input file")
            return ("", "")

        try:
            # mkdtemp creates the directory atomically (no mktemp race)
            self.fastdir = tempfile.mkdtemp()
            self.ownsWorkDir = True
        except OSError:
            # fall back to the shared scratch directory; run() must then only
            # remove the files it created itself
            self.fastdir = cisTemp
            self.ownsWorkDir = False

        self.inputFile1 = self.fastdir + "/seq1.fsa"
        self.inputFile2 = self.fastdir + "/seq2.fsa"
        seq1 = sanitize(seqs[0])
        seq2 = sanitize(seqs[1])
        self._writeFile(self.inputFile1, seq1 + "\n")
        self._writeFile(self.inputFile2, seq2 + "\n")

        return (seq1, seq2)

    def buildCommand(self):
        """ write the input files and return the fastcomp command line.
        """
        self.buildInputFiles()
        cmd = self.fastcompPath + "fastcomp -1 " + self.inputFile1 + " -2 " + self.inputFile2
        for arg in ["windowsize", "threshold"]:
            cmd = cmd + " " + self.argLetter[arg] + " " + str(self.argDict[arg])

        cmd += " -o %s/fastcomp.out" % self.fastdir
        print("cmd is %s" % (cmd))

        return cmd

    def setWindowSize(self, wsize):
        """ set the window size (stored as text for the command line).
        """
        self.argDict["windowsize"] = str(wsize)

    def setThreshold(self, thresh):
        """ set the match threshold.
        """
        self.argDict["threshold"] = thresh

    def checkDefaults(self):
        """ fill in threshold (0.9) and windowsize (10) when they are not set.
        """
        if "threshold" not in self.argDict:
            self.argDict["threshold"] = 0.9

        if "windowsize" not in self.argDict:
            self.argDict["windowsize"] = 10

    def _cleanUp(self):
        """ remove the working files; the directory itself only when this
            instance created it.
        """
        try:
            if self.ownsWorkDir:
                for entry in os.listdir(self.fastdir):
                    os.remove(self.fastdir + "/" + entry)

                os.rmdir(self.fastdir)
            else:
                # shared directory: never touch files that are not ours
                for entry in self.workFiles:
                    path = self.fastdir + "/" + entry
                    if os.path.exists(path):
                        os.remove(path)
        except OSError:
            print("error cleaning up directory %s" % self.fastdir)

    # run the program - preferrably after some of the other options have been set
    def run(self):
        """ run fastcomp and load its output file into self.contents.
        """
        startTime = time.time()
        self.checkDefaults()
        # NOTE(review): the command is a shell string; paths containing spaces
        # or shell metacharacters are not quoted.
        pipe = os.popen(Fastcomp.buildCommand(self))
        try:
            self.contents = pipe.readlines()
        finally:
            pipe.close()

        outFile = open(self.fastdir + "/fastcomp.out")
        try:
            self.contents = outFile.readlines()
        finally:
            outFile.close()

        self._cleanUp()
        stopTime = time.time()

        # operands are given as stop - start so the printed arithmetic is correct
        print("\nThis run took %.3f - %.3f = %.3f seconds" % (stopTime, startTime, stopTime - startTime))

    def getWindows(self, seqNum="1"):
        """ return the result lines as (pos, otherPos, matches, sense) tuples,
            with the position of sequence seqNum first.
        """
        results = []
        for line in self.contents:
            (seq1pos, seq2pos, matches, sense) = line.rstrip("\r\n").split("\t")
            if seqNum == "1":
                results.append((seq1pos, seq2pos, matches, sense))
            else:
                results.append((seq2pos, seq1pos, matches, sense))

        return results

    def _appendMotif(self, thePWM, seqNum):
        """ turn the counts in thePWM into frequencies and append the motif.
        """
        for column in thePWM:
            for row in range(len(column)):
                column[row] /= seqNum

        self.motifs.append(Motif(self.tagID + "-FASTCOMP-" + str(len(self.motifs) + 1), "", thePWM, [], 1.0, str(self.argDict["threshold"])))

    def getMotifs(self):
        """ build one motif per distinct window position on sequence 1 from
            the fastcomp results in self.contents, and return the list.
        """
        self.motifs = []
        thePWM = []
        seqNum = 1.0
        # the window size may have been stored as text by setWindowSize()
        motSize = int(self.argDict["windowsize"])
        matrixRow = {"A": 0, "C": 1, "G": 2, "T": 3}
        # read the sequences directly: no temporary files are needed here
        (names, seqs) = self._readSequencePair()
        if not seqs[1]:
            print("Error processing input file")
            return self.motifs

        seq1 = sanitize(seqs[0])
        seq2 = sanitize(seqs[1])
        currentPos = -1
        for line in self.contents:
            (seq1pos, seq2pos, matches, sense) = line.rstrip("\r\n").split("\t")
            if int(seq1pos) > currentPos:
                # a new window on sequence 1: close the previous motif first
                currentPos = int(seq1pos)
                if len(thePWM) > 0:
                    self._appendMotif(thePWM, seqNum)

                thePWM = []
                motseq1 = seq1[currentPos:currentPos + motSize]
                seqNum = 1.0
                for pos in range(motSize):
                    thePWM.append([0.0, 0.0, 0.0, 0.0])
                    NT = motseq1[pos]
                    thePWM[pos][matrixRow[NT]] += 1

            if sense == "1":
                motseq2 = seq2[int(seq2pos):int(seq2pos) + motSize]
            else:
                motseq2 = complement(seq2[int(seq2pos) + 1 - motSize:int(seq2pos) + 1], motSize)

            for pos in range(motSize):
                NT = motseq2[pos]
                thePWM[pos][matrixRow[NT]] += 1

            seqNum += 1.0

        if len(thePWM) > 0:
            self._appendMotif(thePWM, seqNum)

        return self.motifs
# \ No newline at end of file
# diff --git a/cistematic/programs/locator.py b/cistematic/programs/locator.py
# new file mode 100644
# index 0000000..6634602
# --- /dev/null
# +++ b/cistematic/programs/locator.py
# @@ -0,0 +1,94 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved.
# +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. 
#
###########################################################################
#
# locator.py
import ast

from cistematic.programs import Program
from cistematic.core.motif import Motif


class Locator(Program):
    """ Program that does not search for anything: it wraps a known consensus,
        PWM, sequence list or motif file as a Motif.
    """
    motifs = []
    consensus = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    identifier = "DEFAULT"
    PWM = []
    seqs = []
    motFile = ""

    def getSettings(self):
        """ return (consensus, identifier, PWM, seqs, motFile).
        """
        return (self.consensus, self.identifier, self.PWM, self.seqs, self.motFile)

    def setSettings(self, settings):
        """ restore the settings from a tuple produced by getSettings(), or
            from its text representation, and clear the motif list.

            The text form is parsed with ast.literal_eval, which only accepts
            Python literals, so a settings string can no longer execute code.
        """
        self.motifs = []
        progsettings = ast.literal_eval(str(settings))
        (self.consensus, self.identifier, self.PWM, self.seqs, self.motFile) = progsettings

    def setGenExpOptions(self, optionArray):
        """ apply "consensus:value" and "identifier:value" options; entries
            without a ":" are skipped and only the first ":" separates tag
            from value.
        """
        for option in optionArray:
            (optionTag, separator, optionValue) = option.partition(":")
            if not separator:
                continue

            if optionTag == "consensus":
                self.setConsensus(optionValue)

            if optionTag == "identifier":
                self.setID(optionValue)

    def setConsensus(self, cons):
        """ set the consensus sequence.
        """
        self.consensus = cons

    def setID(self, ident):
        """ set the identifier used in the motif name.
        """
        self.identifier = ident

    def setPWM(self, aPWM):
        """ set the PWM handed to Motif by getMotifs().
        """
        # PWM is a list passed as-is to Motif; indexing it with the identifier
        # string, as was done before, always raised TypeError.
        self.PWM = aPWM

    def useMotifFile(self, mFile):
        """ take the motif from mFile instead of consensus/PWM/seqs.
        """
        self.motFile = mFile

    def run(self):
        """ nothing to run.
        """
        pass

    def getMotifs(self):
        """ return a one-element list with the motif built from the motif file
            when one is set, otherwise from consensus, PWM and seqs.
        """
        motifName = "%s-LOC-%s" % (self.tagID, self.identifier)
        if self.motFile != "":
            motif = Motif(motifName, motifFile=self.motFile)
        else:
            motif = Motif(motifName, self.consensus, self.PWM, self.seqs)

        self.motifs = [motif]

        return self.motifs
# \ No newline at end of file
# diff --git a/cistematic/programs/mafft.py b/cistematic/programs/mafft.py
# new file mode 100644
# index 0000000..e2d725f
# --- /dev/null
# +++ b/cistematic/programs/mafft.py
# @@ -0,0 +1,121 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved.
#
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE. #
###########################################################################
#
# mafft.py
from cistematic.programs import Program
import os
import time
from cistematic.core.motif import Motif


class Mafft(Program):
    """ Multiple Alignment using Fast Fourier Transform. Uses fftnsi as described in:
        K. Katoh, K. Misawa, K. Kuma and T. Miyata (2002)
        Nucleic Acids Research 30: 3059-3066.
    """
    mafftPath = "%s/mafft/" % Program.programRoot
    motifs = []
    argDict = {}

    def __init__(self, tagID="", inputFilePath="", outputFilePath=""):
        """ same arguments as Program.__init__.
        """
        Program.__init__(self, tagID, inputFilePath, outputFilePath)
        # Per-instance containers: the class-level argDict would otherwise be
        # shared, and mutated, by every instance.
        self.motifs = []
        self.argDict = {}

    def getSettings(self):
        """ return the option dictionary.
        """
        return self.argDict

    def setSettings(self, settings):
        """ replace the option dictionary and clear the motif list.
        """
        self.motifs = []
        self.argDict = settings

    def buildCommand(self):
        """ return the fftnsi command line: one "--name value" pair per option,
            followed by --quiet and the input file.
        """
        cmd = self.mafftPath + "fftnsi "
        for arg in self.argDict:
            cmd = cmd + " --" + arg + " %s" % str(self.argDict[arg])

        cmd += " --quiet " + self.inputFilePath
        print("cmd is %s" % (cmd))

        return cmd

    def setGapOpening(self, op):
        """ Gap opening penalty. Default is 1.58
        """
        self.argDict["op"] = op

    def setOffset(self, ep):
        """ Offset - like a gap expansion penalty. Default is 0.120
        """
        self.argDict["ep"] = ep

    def setScoringMatrix(self, bl):
        """ set Blossum scoring matrix. Choices are 30, 45, 62, and 80.
        """
        self.argDict["bl"] = bl

    def setMaxiterate(self, maxi):
        """ maximum number of iterations in progressive method.
        """
        self.argDict["maxiterate"] = maxi

    def setRetree(self, tnum):
        """ number of tree building in progressive method.
        """
        self.argDict["retree"] = tnum

    def run(self):
        """ run fftnsi and keep its standard output lines in self.contents.
        """
        startTime = time.time()
        # NOTE(review): the command is a shell string; paths containing spaces
        # or shell metacharacters are not quoted.
        pipe = os.popen(Mafft.buildCommand(self))
        try:
            self.contents = pipe.readlines()
        finally:
            pipe.close()

        stopTime = time.time()

        # operands are given as stop - start so the printed arithmetic is correct
        print("\nThis run took %.3f - %.3f = %.3f seconds" % (stopTime, startTime, stopTime - startTime))

    def getAlignment(self):
        """ take the results stored in self.contents and return a dictionary for inclusion into the genepool.
        """
        alignedDict = {}
        dictKey = ""
        sequence = ""
        for line in self.contents:
            text = line.rstrip("\r\n")
            if text.startswith(">"):
                if len(dictKey) > 0 and len(sequence) > 0:
                    alignedDict[dictKey] = sequence

                # The key starts at the third character of the header line.
                # NOTE(review): presumably the header is "> name"; confirm
                # against the output of the installed fftnsi.
                dictKey = text[2:]
                sequence = ""
            else:
                sequence += text

        if len(dictKey) > 0 and len(sequence) > 0:
            alignedDict[dictKey] = sequence

        return alignedDict
# \ No newline at end of file
# diff --git a/cistematic/programs/meme.py b/cistematic/programs/meme.py
# new file mode 100644
# index 0000000..3e4bba6
# --- /dev/null
# +++ b/cistematic/programs/meme.py
# @@ -0,0 +1,182 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE.
#
###########################################################################
#
# meme.py
import os
import time
import string
from cistematic.programs import Program
from cistematic.core.motif import Motif
from erange.commoncode import getConfigParser, getConfigOption

SUPPORTED_MODELS = ["oops", "zoops", "tcm"]

class Meme(Program):
    """ Program wrapper around the external meme executable.
    """

    def __init__(self, tagID="", inputFilePath="", outputFilePath=""):
        """ same (optional) arguments as Program.__init__; the meme directory
            name comes from the "meme" option of the "programs" section of the
            configuration, defaulting to meme.3.0.8.
        """
        # initialise inputFilePath/outputFilePath, which buildCommand() needs
        Program.__init__(self, tagID, inputFilePath, outputFilePath)
        parser = getConfigParser()
        memeProgramName = getConfigOption(parser, "programs", "meme", default="meme.3.0.8")
        self.memePath = "/".join([Program.programRoot, memeProgramName])
        self.model = "zoops"
        self.background = ""
        self.numMotifs = 10
        self.minWidth = 6
        self.maxWidth = 15
        self.bfile = ""
        self.motifs = []
        self.contents = []

    def getSettings(self):
        """ return (model, background, numMotifs, bfile).
        """
        return (self.model, self.background, self.numMotifs, self.bfile)

    def setSettings(self, settings):
        """ restore the settings from a tuple produced by getSettings() and
            clear the motif list; a tuple of the wrong size changes nothing.
        """
        self.clearMotifList()
        try:
            (self.model, self.background, self.numMotifs, self.bfile) = settings
        except ValueError:
            print("Error unpacking settings for Meme.  No parameters changed.")

    def setGenExpOptions(self, optionArray):
        """ apply "model", "nmotifs" and "maxwidth" options given as
            "name:value" strings; entries that do not split into exactly two
            parts are skipped.
        """
        for option in optionArray:
            try:
                (optionName, optionValue) = option.split(":")
            except ValueError:
                continue

            if optionName == "model":
                self.setModel(optionValue)

            if optionName == "nmotifs":
                self.setNumMotifs(optionValue)

            if optionName == "maxwidth":
                self.setMaxWidth(optionValue)

    def buildCommand(self):
        """ return the meme command line for the current options.
        """
        argList = ["%s/bin/meme" % self.memePath,
                   self.inputFilePath,
                   "-dna -maxsize 1000000",
                   "-maxw %d" % self.maxWidth,
                   "-minw %d" % self.minWidth,
                   "-mod %s" % self.model,
                   "-revcomp -nmotifs %d" % self.numMotifs
        ]

        if self.bfile != "":
            argList.append("-bfile %s" % self.bfile)

        cmd = " ".join(argList)

        return cmd

    def setModel(self, modelType):
        """ set the model when it is one of SUPPORTED_MODELS; otherwise keep the current one.
        """
        if modelType in SUPPORTED_MODELS:
            self.model = modelType

    # The numeric setters convert to int because options arrive as text from
    # setGenExpOptions(), while buildCommand() formats them with %d and
    # getMotifs() passes numMotifs to range().
    def setNumMotifs(self, motifNum):
        """ set the number of motifs to look for (converted to int).
        """
        self.numMotifs = int(str(motifNum))

    def setMinWidth(self, width):
        """ set the minimum motif width (converted to int).
        """
        self.minWidth = int(str(width))

    def setMaxWidth(self, width):
        """ set the maximum motif width (converted to int).
        """
        self.maxWidth = int(str(width))

    def setBackground(self, backgroundFileName):
        """ set the background file passed to meme with -bfile.
        """
        self.bfile = backgroundFileName

    def clearMotifList(self):
        """ forget the motifs collected so far.
        """
        self.motifs = []

    def setContents(self, motifFile):
        """ load self.contents from the lines of an open file-like object.
        """
        self.contents = motifFile.readlines()

    def run(self):
        """ run meme and keep its standard output lines in self.contents.
        """
        startTime = time.time()
        memeResultFile = os.popen(Meme.buildCommand(self))
        try:
            self.setContents(memeResultFile)
        finally:
            memeResultFile.close()

        stopTime = time.time()

        print("\nThis run took %.3f seconds and produced %d lines" % (stopTime - startTime, len(self.contents)))

    def getMotifs(self):
        """ parse up to numMotifs sequence blocks out of self.contents and
            return them as a list of Motif objects. Parsing stops quietly at
            the first missing or truncated block.
        """
        index = 0
        self.clearMotifList()
        for motif in range(0, int(self.numMotifs)):
            PWM = []
            seqList = []
            index = self.locateMotifSeqs(index)
            if index < 0:
                # no further block: fewer motifs were found than requested
                break

            try:
                info = self.contents[index]
                index += 1
                while "//" not in self.contents[index]:
                    fields = self.contents[index].split()
                    seqList.append(fields[-2])
                    index += 1
            except IndexError:
                # truncated or malformed block: keep what was parsed so far
                break

            motifName = "%s-meme-%d" % (self.tagID, motif + 1)
            self.motifs.append(Motif(motifName, "", PWM, seqList, 0.0, info))

        return self.motifs

    def locateMotif(self, startingLineNum):
        """ return the data line number after the next "pspm_doc" header, or -1.
        """
        return self.getDataLineNumberAfterHeader("pspm_doc", startingLineNum)

    def locateMotifSeqs(self, startingLineNum):
        """ return the data line number after the next "BLOCKS_doc" header, or -1.
        """
        return self.getDataLineNumberAfterHeader("BLOCKS_doc", startingLineNum)

    def getDataLineNumberAfterHeader(self, headerText, startingLineNum):
        """ search self.contents from startingLineNum for the first line that
            contains headerText; return that line number plus 3, or -1 when
            no such line exists.
        """
        numLines = len(self.contents)
        currentLineNum = startingLineNum
        while currentLineNum < numLines and headerText not in self.contents[currentLineNum]:
            currentLineNum += 1

        if currentLineNum < numLines:
            return currentLineNum + 3

        return -1
# \ No newline at end of file
# diff --git a/cistematic/programs/paircomp.py b/cistematic/programs/paircomp.py
# new file mode 100644
# index 0000000..3793311
# --- /dev/null
# +++ b/cistematic/programs/paircomp.py
# @@ -0,0 +1,225 @@
###########################################################################
# #
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# #
# All Rights Reserved. #
# #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT.
# paircomp.py
from cistematic.programs import Program
import os, time, tempfile
from cistematic.core.motif import Motif
from cistematic.core import complement
from os import environ

if environ.get("CISTEMATIC_TEMP"):
    cisTemp = environ.get("CISTEMATIC_TEMP")
else:
    cisTemp = "/tmp"

tempfile.tempdir = cisTemp


class Paircomp(Program):
    """Runs the external paircomp binary on the first two sequences of a
    FASTA file and converts the reported windows into Motif objects.
    """
    paircompPath = Program.programRoot + "/paircomp/"
    # Class-level values are defaults only.  Methods that change settings go
    # through _ownSettings() so one instance never edits another's settings.
    motifs = []
    argDict = {}
    seq1name = ""
    seq2name = ""
    pairdir = ""
    # True only while self.pairdir is a scratch directory created by this
    # instance; _cleanup() must never empty a directory it does not own.
    _ownsPairdir = False


    def getSettings(self):
        """Return the current settings dictionary."""
        return self.argDict


    def setSettings(self, settings):
        """Replace the settings dictionary and discard any cached motifs."""
        self.motifs = []
        self.argDict = settings


    def _ownSettings(self):
        """Return a settings dict owned by this instance, copying the
        class-level default the first time it is about to be modified.
        """
        if "argDict" not in self.__dict__:
            self.argDict = dict(self.argDict)

        return self.argDict


    def _readFirstTwoSequences(self):
        """Read at most the first two FASTA records of self.inputFilePath.

        Returns a list of (headerLine, sequence) tuples; lines before the
        first ">" header are ignored and reading stops at a third header.
        """
        names = []
        chunks = []
        inFile = open(self.inputFilePath, "r")
        try:
            for line in inFile:
                if line.startswith(">"):
                    if len(names) == 2:
                        break

                    names.append(line.rstrip("\r\n"))
                    chunks.append([])
                elif names:
                    chunks[-1].append(line.strip())
        finally:
            # closed on every path, including a read error
            inFile.close()

        return [(names[index], "".join(chunks[index])) for index in range(len(names))]


    def _writeFasta(self, path, header, sequence):
        """Write a single-record FASTA file."""
        outFile = open(path, "w")
        try:
            outFile.write(header + "\n")
            outFile.write(sequence + "\n")
        finally:
            outFile.close()


    def buildInputFiles(self):
        """ given a Fasta file (via the constructor or the inputFile() method), grab the
            first two sequences and save them as two temporary sequences.

            Returns (seq1, seq2), or ("", "") after printing an error when the
            file does not hold a second, non-empty sequence.
        """
        records = self._readFirstTwoSequences()
        if len(records) < 2 or len(records[1][1]) == 0:
            print("Error processing input file")
            return ("", "")

        (self.seq1name, seq1) = records[0]
        (self.seq2name, seq2) = records[1]

        # mkdtemp() creates the directory atomically; the old mktemp() +
        # mkdir() pair left a window in which the name could be taken.
        try:
            self.pairdir = tempfile.mkdtemp()
            self._ownsPairdir = True
        except (OSError, IOError):
            # best-effort fallback to the shared temp dir, flagged as not
            # ours so that cleanup only removes the files written here
            self.pairdir = cisTemp
            self._ownsPairdir = False

        self.inputFile1 = self.pairdir + "/seq1.fsa"
        self.inputFile2 = self.pairdir + "/seq2.fsa"
        self._writeFasta(self.inputFile1, self.seq1name, seq1)
        self._writeFasta(self.inputFile2, self.seq2name, seq2)

        return (seq1, seq2)


    def buildCommand(self):
        """Write the input files and return the paircomp command line:
        binary, the two sequence files, window size, threshold, output file.
        """
        self.buildInputFiles()
        cmd = self.paircompPath + "paircomp " + self.inputFile1 + " " + self.inputFile2
        for arg in ["windowsize", "threshold"]:
            cmd = cmd + " " + str(self.argDict[arg])

        cmd += " %s/paircomp.out" % self.pairdir
        print("cmd is %s" % (cmd))

        return cmd


    #set window size
    def setWindowSize(self, wsize):
        self._ownSettings()["windowsize"] = str(wsize)


    # set threshold
    def setThreshold(self, thresh):
        self._ownSettings()["threshold"] = thresh


    def checkDefaults(self):
        """Fill in threshold (0.9) and windowsize (10) when they are unset."""
        settings = self._ownSettings()
        if "threshold" not in settings:
            settings["threshold"] = 0.9

        if "windowsize" not in settings:
            settings["windowsize"] = 10


    def _cleanup(self):
        """Remove the scratch files of the last run; failures are reported,
        not raised.
        """
        try:
            if self._ownsPairdir:
                for entry in os.listdir(self.pairdir):
                    os.remove(self.pairdir + "/" + entry)

                os.rmdir(self.pairdir)
                self._ownsPairdir = False
            else:
                # shared directory: delete only what this run wrote
                for path in (self.inputFile1, self.inputFile2, self.pairdir + "/paircomp.out"):
                    if os.path.exists(path):
                        os.remove(path)
        except (OSError, IOError):
            print("error cleaning up directory %s" % self.pairdir)


    def run(self):
        """Run paircomp and load its output file into self.contents.

        Raises IOError when the output file cannot be opened; the scratch
        files are removed either way.
        """
        startTime = time.time()
        self.checkDefaults()
        pipe = os.popen(Paircomp.buildCommand(self))
        try:
            # reading to EOF waits for the command to finish
            self.contents = pipe.readlines()
        finally:
            pipe.close()

        try:
            outFile = open(self.pairdir + "/paircomp.out")
            try:
                self.contents = outFile.readlines()
            finally:
                outFile.close()
        finally:
            self._cleanup()

        stopTime = time.time()
        print("\nThis run took %.3f - %.3f = %.3f seconds" % (startTime, stopTime, stopTime - startTime))


    def _parseWindow(self, line):
        """Split one output line into [seq1pos, seq2pos, matches, sense]."""
        # strip only the line terminator: line[:-1] would eat a real
        # character when the last line has no trailing newline
        return line.rstrip("\r\n").split("\t")


    def getWindows(self, seqNum="1"):
        """Return the windows in self.contents as string 4-tuples, with the
        seq1 position first when seqNum is "1" and the seq2 position first
        otherwise.
        """
        results = []
        for line in self.contents:
            (seq1pos, seq2pos, matches, sense) = self._parseWindow(line)
            if seqNum == "1":
                results.append((seq1pos, seq2pos, matches, sense))
            else:
                results.append((seq2pos, seq1pos, matches, sense))

        return results


    def _appendMotif(self, thePWM, seqNum):
        """Turn the counts in thePWM into frequencies over seqNum sequences
        and append the resulting Motif; an empty matrix is ignored.
        """
        if len(thePWM) == 0:
            return

        for column in thePWM:
            for row in range(4):
                column[row] /= seqNum

        self.motifs.append(Motif(self.tagID + "-PAIRCOMP-" + str(len(self.motifs) + 1), "", thePWM, [], 1.0, str(self.argDict["threshold"])))


    def getMotifs(self):
        """Build one Motif per increasing seq1 window position found in
        self.contents and return the list.

        A new matrix is started from the seq1 window whenever a line's seq1
        position exceeds the current one; every line then adds its seq2
        window (passed through complement() unless sense is "1").
        Returns an empty list, after printing an error, when the input file
        has no second sequence.
        """
        self.motifs = []
        thePWM = []
        seqNum = 0.0
        # setWindowSize() stores a string, checkDefaults() an int
        motSize = int(self.argDict["windowsize"])
        matrixRow = {"A": 0, "C": 1, "G": 2, "T": 3}

        # Only the sequences are needed here.  Calling buildInputFiles()
        # would also create a scratch directory that nothing removes.
        records = self._readFirstTwoSequences()
        if len(records) < 2 or len(records[1][1]) == 0:
            print("Error processing input file")
            return self.motifs

        seq1 = records[0][1]
        seq2 = records[1][1]
        currentPos = -1
        for line in self.contents:
            (seq1pos, seq2pos, matches, sense) = self._parseWindow(line)
            if int(seq1pos) > currentPos:
                currentPos = int(seq1pos)
                self._appendMotif(thePWM, seqNum)
                thePWM = []
                motseq1 = seq1[currentPos:currentPos + motSize]
                seqNum = 1.0
                for pos in range(motSize):
                    thePWM.append([0.0, 0.0, 0.0, 0.0])
                    NT = motseq1[pos]
                    thePWM[pos][matrixRow[NT]] += 1

            if sense == "1":
                motseq2 = seq2[int(seq2pos):int(seq2pos) + motSize]
            else:
                motseq2 = complement(seq2[int(seq2pos) + 1 - motSize:int(seq2pos) + 1], motSize)

            for pos in range(motSize):
                NT = motseq2[pos]
                thePWM[pos][matrixRow[NT]] += 1

            seqNum += 1.0

        self._appendMotif(thePWM, seqNum)

        return self.motifs
# parent.py - defines Parent class used by all children classes in cistematic.programs
#
class Parent:
    """Holds the tag, the input/output file paths and the text contents of
    a program wrapper, with helpers to load, show and save those contents.
    """
    contents = ""
    tagID = ""


    def __init__(self, tagID="", inputFilePath="", outputFilePath=""):
        """Store the tag and both file paths on the instance."""
        # the tagID argument used to be discarded in favour of ""
        self.tagID = tagID
        self.inputFilePath = inputFilePath
        self.outputFilePath = outputFilePath


    def setTagID(self, tid):
        """Set the tag of this instance."""
        self.tagID = tid


    def name(self):
        """Return the name of the instance's class."""
        return self.__class__.__name__


    def inputFile(self, inputFilePath):
        """Set the input file path."""
        self.inputFilePath = inputFilePath


    def outputFile(self, outputFilePath):
        """Set the output file path."""
        self.outputFilePath = outputFilePath


    def setSeqLength(self, length):
        """Do nothing; placeholder that subclasses may override."""
        pass


    def run(self):
        """Do nothing; placeholder that subclasses may override."""
        pass


    def display(self):
        """Print every element of self.contents."""
        for line in self.contents:
            print(line)


    def load(self, inFilePath):
        """Read inFilePath into self.contents as a list of lines."""
        inFile = open(inFilePath, "r")
        try:
            self.contents = inFile.readlines()
        finally:
            # close even when reading fails
            inFile.close()


    def save(self):
        """Write self.contents to self.outputFilePath, overwriting it."""
        outFile = open(self.outputFilePath, "w")
        try:
            outFile.writelines(self.contents)
        finally:
            # close even when writing fails
            outFile.close()
def getParser(usage):
    """Build the optparse parser for the RNAPATH command line.

    Registers --prefix (dest pathPrefix) and --overlap (int, dest overlap),
    then sets their defaults from the values returned by getConfigOption()
    and getConfigIntOption() for the "RNAPATH" section, which are given
    "RNAPATH" and 30 as their last argument (presumably the fallback used
    when the option is not configured; confirm in erange.commoncode).
    """
    optionParser = optparse.OptionParser(usage=usage)
    optionTable = (
        ("--prefix", {"dest": "pathPrefix"}),
        ("--overlap", {"type": "int", "dest": "overlap"}),
    )
    for flag, settings in optionTable:
        optionParser.add_option(flag, **settings)

    config = getConfigParser()
    configSection = "RNAPATH"
    defaults = {}
    defaults["pathPrefix"] = getConfigOption(config, configSection, "pathPrefix", "RNAPATH")
    defaults["overlap"] = getConfigIntOption(config, configSection, "overlap", 30)
    optionParser.set_defaults(**defaults)

    return optionParser
dest="filterFileName") + parser.add_option("--min", type="int", dest="minSize") + parser.add_option("--keepcov", action="store_true", dest="keepCoverage") + + configParser = getConfigParser() + section = "processvelvet" + contigPrefix = getConfigOption(configParser, section, "contigPrefix", "chr") + filterFileName = getConfigOption(configParser, section, "filterFileName", "") + minSize = getConfigIntOption(configParser, section, "minSize", 0) + keepCoverage = getConfigBoolOption(configParser, section, "keepCoverage", False) + + parser.set_defaults(contigPrefix=contigPrefix, filterFileName=filterFileName, minSize=minSize, keepCoverage=keepCoverage) + + return parser + + def processvelvet(inFileName, outFileName, contigPrefix="chr", filterFileName="", minSize=0, keepCoverage=False): infile = open(inFileName) outfile = open(outFileName, "w")