1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Xenopus tropicalis
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
43 def loadChromosome(db, chromPath, chromOutPath):
46 xtGenome = Genome("xtropicalis", dbFile=db)
47 inFile = open(chromPath, "r")
48 header = inFile.readline()
52 chromID = header.strip()[1:]
53 currentLine = inFile.readline()
54 while currentLine != "" and currentLine[0] != ">":
55 lineSeq = currentLine.strip()
56 seqLen += len(lineSeq)
57 seqArray.append(lineSeq)
58 currentLine = inFile.readline()
60 seq = string.join(seqArray, "")
62 print "Added contig %s to database" % chromID
63 xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
64 xtGenome.addChromosomeEntry(chromID, chromID, "db")
66 outFileName = "%s%s.bin" % (chromOutPath, chromID)
67 outFile = open("%s%s" % (cisRoot, outFileName), "w")
70 print "Added contig file %s to database" % outFileName
71 xtGenome.addChromosomeEntry(chromID, outFileName, "file")
78 def loadGeneEntries(db, gFile):
79 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
82 xtGenome = Genome("xtropicalis", dbFile=db)
83 geneFile = open(gFile, "r")
85 cols = line.split("\t")
96 geneID = ("xtropicalis", gid)
98 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
100 print "Adding %d gene entries" % len(geneEntries)
101 xtGenome.addGeneEntryBatch(geneEntries)
104 def loadGeneAnnotations(db, annotPath):
106 annotFile = open(annotPath, "r")
107 xtGenome = Genome("xtropicalis", dbFile=db)
108 for line in annotFile:
110 cols = line.split("\t")
114 geneAnnotations.append((("xtropicalis", locID), string.replace(geneDesc.strip(), "'", "p")))
118 print "Adding %d annotations" % len(geneAnnotations)
119 xtGenome.addAnnotationBatch(geneAnnotations)
122 def loadGeneFeatures(db, gfile):
123 geneFile = open(gfile, "r")
124 senseArray = {"+": "F",
131 for geneLine in geneFile:
132 geneFields = geneLine.split("\t")
133 exonNum = int(geneFields[7])
134 exonStarts = geneFields[8].split(",")
135 exonStops = geneFields[9].split(",")
136 chrom = geneFields[1]
137 sense = senseArray[geneFields[2]]
138 gstop = int(geneFields[6]) - 1
139 gstart = int(geneFields[5]) - 1
140 geneid = geneFields[0]
142 geneID = ("xtropicalis", geneid)
147 if geneID in seenArray:
148 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
150 seenArray.append(geneID)
152 for index in range(exonNum):
153 estart = int(exonStarts[index]) - 1
154 estop = int(exonStops[index]) - 1
155 if estart >= gstart and estop <= gstop:
156 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
157 elif estop <= gstart:
163 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164 elif estart >= gstop:
170 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
171 elif estart <= gstop:
177 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
178 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
185 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
186 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
189 xtGenome = Genome("xtropicalis", dbFile=db)
190 print "Adding %d features" % len(insertArray)
191 xtGenome.addFeatureEntryBatch(insertArray)
194 def loadGeneOntology(db, goPath, goDefPath, annotPath):
195 xtGenome = Genome("xtropicalis", dbFile=db)
196 goDefFile = open(goDefPath, "r")
197 goFile = open(goPath, "r")
198 annotFile = open(annotPath, "r")
199 annotEntries = annotFile.readlines()
201 goDefEntries = goDefFile.readlines()
205 for goDefEntry in goDefEntries:
206 if goDefEntry[0] != "!":
207 cols = goDefEntry.split("\t")
208 goDefs[cols[0]] = (cols[1], cols[2].strip())
210 goEntries = goFile.readlines()
211 for annotEntry in annotEntries:
213 cols = annotEntry.split("\t")
219 locus[locID] = (geneName, geneDesc, mimID)
223 for entry in goEntries:
225 fields = entry.split("\t")
226 locID = fields[0].strip()
227 (gene_name, gene_desc, mimID) = locus[locID]
228 goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
232 print "adding %d go entries" % len(goArray)
233 xtGenome.addGoInfoBatch(goArray)
236 def createDBFile(db):
237 xtGenome = Genome("xtropicalis", dbFile=db)
238 xtGenome.createGeneDB(db)
241 def createDBindices(db):
242 xtGenome = Genome("xtropicalis", dbFile=db)
243 xtGenome.createIndices()
246 def buildXtropicalisDB(db=geneDB):
247 genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
248 chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
249 chromoOutPath = "/X_tropicalis/"
251 print "Creating database %s" % db
254 print "Adding gene entries"
255 loadGeneEntries(db, genePath)
257 print "Adding gene features"
258 loadGeneFeatures(db, genePath)
260 print "Loading sequences"
261 loadChromosome(db, chromoPath, chromoOutPath)
263 print "Creating Indices"
266 print "Finished creating database %s" % db