1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Xenopus tropicalis
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
43 def buildXtropicalisDB(db=geneDB):
44 genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
45 chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
46 chromoOutPath = "/X_tropicalis/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, genePath)
54 print "Adding gene features"
55 loadGeneFeatures(db, genePath)
57 print "Loading sequences"
58 loadChromosome(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 xtGenome = Genome("xtropicalis", dbFile=db)
68 xtGenome.createGeneDB(db)
71 def loadGeneEntries(db, gFile):
72 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
75 xtGenome = Genome("xtropicalis", dbFile=db)
76 geneFile = open(gFile, "r")
78 cols = line.split("\t")
89 geneID = ("xtropicalis", gid)
91 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
93 print "Adding %d gene entries" % len(geneEntries)
94 xtGenome.addGeneEntryBatch(geneEntries)
97 def loadGeneAnnotations(db, annotPath):
99 annotFile = open(annotPath, "r")
100 xtGenome = Genome("xtropicalis", dbFile=db)
101 for line in annotFile:
103 cols = line.split("\t")
107 geneAnnotations.append((("xtropicalis", locID), string.replace(geneDesc.strip(), "'", "p")))
111 print "Adding %d annotations" % len(geneAnnotations)
112 xtGenome.addAnnotationBatch(geneAnnotations)
115 def loadGeneFeatures(db, gfile):
116 geneFile = open(gfile, "r")
117 senseArray = {"+": "F",
124 for geneLine in geneFile:
125 geneFields = geneLine.split("\t")
126 exonNum = int(geneFields[7])
127 exonStarts = geneFields[8].split(",")
128 exonStops = geneFields[9].split(",")
129 chrom = geneFields[1]
130 sense = senseArray[geneFields[2]]
131 gstop = int(geneFields[6]) - 1
132 gstart = int(geneFields[5]) - 1
133 geneid = geneFields[0]
135 geneID = ("xtropicalis", geneid)
140 if geneID in seenArray:
141 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
143 seenArray.append(geneID)
145 for index in range(exonNum):
146 estart = int(exonStarts[index]) - 1
147 estop = int(exonStops[index]) - 1
148 if estart >= gstart and estop <= gstop:
149 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
150 elif estop <= gstart:
156 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
157 elif estart >= gstop:
163 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164 elif estart <= gstop:
170 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
171 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
178 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
179 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
182 xtGenome = Genome("xtropicalis", dbFile=db)
183 print "Adding %d features" % len(insertArray)
184 xtGenome.addFeatureEntryBatch(insertArray)
187 def loadGeneOntology(db, goPath, goDefPath, annotPath):
188 xtGenome = Genome("xtropicalis", dbFile=db)
189 goDefFile = open(goDefPath, "r")
190 goFile = open(goPath, "r")
191 annotFile = open(annotPath, "r")
192 annotEntries = annotFile.readlines()
194 goDefEntries = goDefFile.readlines()
198 for goDefEntry in goDefEntries:
199 if goDefEntry[0] != "!":
200 cols = goDefEntry.split("\t")
201 goDefs[cols[0]] = (cols[1], cols[2].strip())
203 goEntries = goFile.readlines()
204 for annotEntry in annotEntries:
206 cols = annotEntry.split("\t")
212 locus[locID] = (geneName, geneDesc, mimID)
216 for entry in goEntries:
218 fields = entry.split("\t")
219 locID = fields[0].strip()
220 (gene_name, gene_desc, mimID) = locus[locID]
221 goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
225 print "adding %d go entries" % len(goArray)
226 xtGenome.addGoInfoBatch(goArray)
229 def loadChromosome(db, chromPath, chromOutPath):
232 xtGenome = Genome("xtropicalis", dbFile=db)
233 inFile = open(chromPath, "r")
234 header = inFile.readline()
238 chromID = header.strip()[1:]
239 currentLine = inFile.readline()
240 while currentLine != "" and currentLine[0] != ">":
241 lineSeq = currentLine.strip()
242 seqLen += len(lineSeq)
243 seqArray.append(lineSeq)
244 currentLine = inFile.readline()
246 seq = string.join(seqArray, "")
248 print "Added contig %s to database" % chromID
249 xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
250 xtGenome.addChromosomeEntry(chromID, chromID, "db")
252 outFileName = "%s%s.bin" % (chromOutPath, chromID)
253 outFile = open("%s%s" % (cisRoot, outFileName), "w")
256 print "Added contig file %s to database" % outFileName
257 xtGenome.addChromosomeEntry(chromID, outFileName, "file")
264 def createDBindices(db):
265 xtGenome = Genome("xtropicalis", dbFile=db)
266 xtGenome.createIndices()