--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# data for Drosophila melanogaster
+import string
+from cistematic.genomes import Genome
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+ cisRoot = environ.get("CISTEMATIC_ROOT")
+else:
+ cisRoot = "/proj/genome"
+
+geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot
+
+version = {"A": "1",
+ "B": "2",
+ "C": "3",
+ "D": "4",
+ "E": "5",
+ "F": "6",
+ "G": "7",
+ "H": "8",
+ "I": "9",
+ "J": "10",
+ "K": "11",
+ "L": "12",
+ "M": "13",
+ "N": "14",
+ "O": "15",
+ "P": "16",
+ "Q": "17",
+ "R": "18",
+ "S": "19",
+ "T": "20",
+ "U": "21",
+ "V": "22",
+ "W": "23",
+ "X": "24",
+ "Y": "25",
+ "Z": "26"
+}
+
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ dmGenome.addChromosomeEntry(chromID, chromOut, "file")
+
+
+def loadGeneEntries(db, gFile):
+ geneEntries = []
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ geneFile = open(gFile, "r")
+
+ for line in geneFile:
+ cols = line.split("\t")
+ name = cols[1].split("-R")
+ gid = name[0]
+ start = int(cols[4])
+ stop = int(cols[5])
+ sense = cols[3]
+ chrom = cols[2][3:]
+ if sense == "-":
+ sense = "R"
+ else:
+ sense = "F"
+
+ geneID = ("dmelanogaster", gid)
+ try:
+ gidVersion = version[name[1]]
+ except:
+ continue
+
+ geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
+
+ print "Adding %d gene entries" % len(geneEntries)
+ dmGenome.addGeneEntryBatch(geneEntries)
+
+
+def loadGeneFeatures(db, gfile):
+ geneFile = open(gfile, "r")
+ senseArray = {"+": "F",
+ "-": "R",
+ ".": "F"
+ }
+
+ insertArray = []
+ for geneLine in geneFile:
+ geneFields = geneLine.split("\t")
+ exonNum = int(geneFields[8])
+ exonStarts = geneFields[9].split(",")
+ exonStops = geneFields[10].split(",")
+ chrom = geneFields[2][3:]
+ sense = senseArray[geneFields[3]]
+ gstop = int(geneFields[7]) - 1
+ gstart = int(geneFields[6]) - 1
+ name = geneFields[1].split("-R")
+ geneID = ("dmelanogaster", name[0])
+ try:
+ gidVersion = version[name[1]]
+ except:
+ continue
+
+ for index in range(exonNum):
+ estart = int(exonStarts[index]) - 1
+ estop = int(exonStops[index]) - 1
+ if estart >= gstart and estop <= gstop:
+ insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
+ elif estop <= gstart:
+ if sense == "F":
+ fType = "5UTR"
+ else:
+ fType = "3UTR"
+
+ insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
+ elif estart >= gstop:
+ if sense == "F":
+ fType = "3UTR"
+ else:
+ fType = "5UTR"
+
+ insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
+ elif estart <= gstop and estart > gstart:
+ if sense == "F":
+ fType = "3UTR"
+ else:
+ fType = "5UTR"
+
+ insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
+ insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
+ elif estart < gstart and estop <= gstop:
+ if sense == "F":
+ fType = "5UTR"
+ else:
+ fType = "3UTR"
+
+ insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
+ insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
+ else:
+ if sense == "F":
+ fType1 = "5UTR"
+ fType2 = "3UTR"
+ else:
+ fType1 = "3UTR"
+ fType2 = "5UTR"
+
+ insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
+ insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
+ insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
+
+ geneFile.close()
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ print "Adding %d features" % len(insertArray)
+ dmGenome.addFeatureEntryBatch(insertArray)
+
+
+def loadGeneAnnotations(db, annotPath):
+ geneAnnotations = []
+ annotFile = open(annotPath, "r")
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ for line in annotFile:
+ try:
+ cols = line.split("\t")
+ if cols[0] != "7227":
+ continue
+
+ locID = cols[3]
+ if "Dmel_" in locID:
+ locID = locID[5:]
+
+ geneDesc = cols[4]
+ if len(locID) > 0:
+ geneAnnotations.append((("dmelanogaster", locID), string.replace(geneDesc.strip(), "'", "p")))
+ except:
+ pass
+
+ print "Adding %d annotations" % len(geneAnnotations)
+ dmGenome.addAnnotationBatch(geneAnnotations)
+
+
+def loadGeneOntology(db, goPath, goDefPath, annotPath):
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ goDefFile = open(goDefPath, "r")
+ goFile = open(goPath, "r")
+ annotFile = open(annotPath, "r")
+ annotEntries = annotFile.readlines()
+ annotFile.close()
+ goDefEntries = goDefFile.readlines()
+ goDefs = {}
+ locus = {}
+ goArray = []
+ for goDefEntry in goDefEntries:
+ if goDefEntry[0] != "!":
+ cols = goDefEntry.split("\t")
+ goDefs[cols[0]] = (cols[1], cols[2].strip())
+
+ goEntries = goFile.readlines()
+ for annotEntry in annotEntries:
+ try:
+ cols = annotEntry.split("\t")
+ if cols[0] != "7227":
+ continue
+
+ locID = cols[3].strip()
+ geneName = cols[1].strip()
+ if len(locID) > 0:
+ locus[geneName] = locID
+ except:
+ pass
+
+ for entry in goEntries:
+ if entry[0] == "!":
+ continue
+
+ if entry[:4] != "7227":
+ continue
+
+ try:
+ fields = entry.split("\t")
+ geneName = fields[1].strip()
+ locID = locus[geneName]
+ if "Dmel_" in locID:
+ locID = locID[5:]
+
+ GOID = fields[2]
+ goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", string.replace(goDefs[GOID][0], "'", "p"), goDefs[GOID][1], ""))
+ except:
+ pass
+
+ print "adding %d go entries" % len(goArray)
+ dmGenome.addGoInfoBatch(goArray)
+
+
+def createDBFile(db):
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ dmGenome.createGeneDB(db)
+
+
+def createDBindices(db):
+ dmGenome = Genome("dmelanogaster", dbFile=db)
+ dmGenome.createIndices()
+
+
+def buildDmelanogasterDB(db=geneDB):
+ """ genes and annotations are from UCSC. GO association file is from geneontology.org.
+ """
+ genePath = "%s/download/flyBaseGene.txt" % cisRoot
+ annotPath = "%s/download/gene_info" % cisRoot
+ goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+ goPath = "%s/download/gene2go" % cisRoot
+ chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
+ "2R": "%s/download/chr2R.fa" % cisRoot,
+ "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
+ "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
+ "3L": "%s/download/chr3L.fa" % cisRoot,
+ "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
+ "3R": "%s/download/chr3R.fa" % cisRoot,
+ "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
+ "4": "%s/download/chr4.fa" % cisRoot,
+ "X": "%s/download/chrX.fa" % cisRoot,
+ "XHet": "%s/download/chrXHet.fa" % cisRoot,
+ "YHet": "%s/download/chrYHet.fa" % cisRoot,
+ "U": "%s/download/chrU.fa" % cisRoot,
+ "Uextra": "%s/download/chrUextra.fa" % cisRoot,
+ "M": "%s/download/chrM.fa" % cisRoot
+ }
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene features"
+ loadGeneFeatures(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, annotPath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath, annotPath)
+
+ for chromID in chromos.keys():
+ print "Loading chromosome %s" % chromID
+ loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
\ No newline at end of file