X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2Fathaliana.py;fp=cistematic%2Fgenomes%2Fathaliana.py;h=628be4dc4789ec61f8775429fd1d261020209063;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/genomes/athaliana.py b/cistematic/genomes/athaliana.py new file mode 100644 index 0000000..628be4d --- /dev/null +++ b/cistematic/genomes/athaliana.py @@ -0,0 +1,269 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +# data for Arabidopsis thaliana +import string +from cistematic.genomes import Genome +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot + +chromSize = {"1": 30432563, + "2": 19705359, + "3": 23470805, + "4": 18585042, + "5": 26992738 +} + +background = [0.3180185, 0.1819815, 0.1819815, 0.3180185] +genomeSize = 119186497 + + +def decodeGFF3(cols): + fType = cols[2] + chrom = cols[0][3:] + start = int(cols[3]) + stop = int(cols[4]) + sense = cols[6] + if sense == "+": + sense = "F" + else: + sense = "R" + + other = cols[-1] + otherList = other.split(";") + otherDict = {} + for otherItem in otherList: + try: + (name, value) = otherItem.split("=") + except: + continue + + otherDict[name] = value + if name == "Name": + gid = value.strip() + + if name == "Parent": + if "," in value: + value = value.split(",")[0] + + gid = value.strip() + + return (fType, gid, chrom, start, stop, sense, otherDict) + + +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + atGenome = Genome("athaliana", dbFile=db) + + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray,"") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % (chromOut) + outFile = open(cisRoot + chromOut, "w") + outFile.write(seq) + outFile.close() + seq = "" + + atGenome.addChromosomeEntry(chromID, chromOut, "file") + # Add alternative chromID - should be A-O and 01-09 + atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file") + + +def loadGeneEntries(db, gFile): + geneEntries = [] + atGenome = Genome("athaliana", dbFile=db) + geneFile = open(gFile, "r") + for line in geneFile: + if line[0] == "#" or len(line) < 10: + continue + + fields = line.strip().split("\t") + if fields[2] != "gene": + continue + + (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields) + geneID = ("athaliana", gid) + version = 1 + geneEntries.append((geneID, chrom, start, stop, sense, "gene", version)) + + print "inserting %d gene entries" % len(geneEntries) + atGenome.addGeneEntryBatch(geneEntries) + + +def loadFeatureEntries(db, gFile): + featureEntries = [] + trackedGenes = [] + atGenome = Genome("athaliana", dbFile=db) + featureTranslation = {"CDS": "CDS", + "three_prime_UTR": "3UTR", + "five_prime_UTR": "5UTR", + "miRNA": "5UTR", + "exon": "5UTR" + } + geneFile = open(gFile, "r") + for line in geneFile: + fields = line.split("\t") + (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields) + if fType in ["ncRNA"]: + (locus, rev) = gid.split(".") + if gid not in trackedGenes: + trackedGenes.append(locus) + + geneFile = open(gFile, "r") + for line in geneFile: + if line[0] == "c" or len(line) < 10: + continue + + fields = line.split("\t") + (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields) + locusField = gid.split('.') + try: + (locus, rev) = locusField + rev = rev.strip() + except: + locus = gid + rev = 1 + + if fType not in featureTranslation: + continue + + elif fType == "exon" and locus not in trackedGenes: + continue + + geneID = ("athaliana", locus) + featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType])) + + print "inserted %d feature entries" % len(featureEntries) + atGenome.addFeatureEntryBatch(featureEntries) + + +def loadGeneAnnotations(db, annotPath): + geneAnnotations = [] + annotFile = open(annotPath, "r") + annotFile.readline() + lines = annotFile.readlines() + annotFile.close() + + atGenome = Genome("athaliana", dbFile=db) + for line in lines: + field = line.split("\t") + try: + orfName = field[0].strip() + if "." in orfName: + (locus, rev) = orfName.split(".") + orfName = locus + + description = field[2].strip() + geneAnnotations.append((("athaliana", orfName), string.replace(description, "'", "p"))) + except: + pass + + print "Adding %d annotations" % len(geneAnnotations) + atGenome.addAnnotationBatch(geneAnnotations) + + +def loadGeneOntology(db, goPath): + atGenome = Genome("athaliana", dbFile=db) + goFile = open(goPath, "r") + goEntries = goFile.readlines() + goArray = [] + for line in goEntries: + fields = line.split("\t") + gID = fields[0] + GOID = fields[4] + objType = string.replace(fields[3], "'", "p") + objName = string.replace(fields[2], "'", "p") + isNot = "" + GOterm = fields[7] + evidence = fields[8] + goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9])) + + atGenome.addGoInfoBatch(goArray) + + +def createDBFile(db): + atGenome = Genome("athaliana", dbFile=db) + atGenome.createGeneDB(db) + + +def createDBindices(db): + atGenome = Genome("athaliana", dbFile=db) + atGenome.createIndices() + + +def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot): + genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir + annotPath = "%s/TAIR9_functional_descriptions" % downloadDir + goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir + + chromos = {"1": "%s/chr1.fas" % downloadDir, + "2": "%s/chr2.fas" % downloadDir, + "3": "%s/chr3.fas" % downloadDir, + "4": "%s/chr4.fas" % downloadDir, + "5": "%s/chr5.fas" % downloadDir, + "C": "%s/chrC.fas" % downloadDir, + "M": "%s/chrM.fas" % downloadDir + } + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding feature entries" + loadFeatureEntries(db, genePath) + + print "Adding gene annotations" + loadGeneAnnotations(db, annotPath) + + print "Adding gene ontology" + loadGeneOntology(db, goPath) + + for chromID in chromos.keys(): + print "Loading chromosome %s" % chromID + loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db \ No newline at end of file