1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis remanei
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
43 def buildCbrenneriDB(db=geneDB):
44 gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
45 chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
46 chromoOutPath = "/C_brenneri/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, gffPath)
54 print "Adding feature entries"
55 loadFeatureEntries(db, gffPath)
57 print "Loading genomic sequence"
58 loadChromosome(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
68 cbGenome.createGeneDB(db)
71 def loadGeneEntries(db, gffFile):
72 cbGenome = Genome("cbrenneri", dbFile=db)
73 geneFile = open(gffFile, "r")
86 field = line[:-1].split("\t")
87 if field[2] != "stop_codon" and field[2] != "start_codon":
90 idfield = field[8].split('"')
92 geneID = ("cbrenneri", gid)
94 geneChrom[geneID] = field[0].strip()
96 geneSense[geneID] = "F"
98 geneSense[geneID] = "R"
100 if field[2] == "start_codon":
102 geneStart[geneID] = int(field[3])
104 geneStart[geneID] = int(field[4])
107 geneStop[geneID] = int(field[3])
109 geneStop[geneID] = int(field[4])
111 for geneID in geneStart:
112 if geneID not in geneStop:
113 print "geneID %s not in geneStop - skipping" % str(geneID)
116 geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
118 print "Adding %d gene entries" % len(geneEntries)
119 cbGenome.addGeneEntryBatch(geneEntries)
122 def loadFeatureEntries(db, gffFile):
123 cbGenome = Genome("cbrenneri", dbFile=db)
124 featureFile = open(gffFile, "r")
126 for line in featureFile:
133 field = line.split("\t")
134 if field[2].strip() != "CDS":
137 gidrev = field[8].split('"')
139 geneID = ("cbrenneri", gid)
142 start = int(field[3])
146 chrom = field[0].strip()
152 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
154 print "Adding %d feature entries" % len(featureEntries)
155 cbGenome.addFeatureEntryBatch(featureEntries)
158 def loadChromosome(db, chromPath, chromOutPath):
161 cbGenome = Genome("cbrenneri", dbFile=db)
162 inFile = open(chromPath, "r")
163 header = inFile.readline()
167 chromHeader = header.strip()[1:].split()
168 chromID = chromHeader[0]
169 currentLine = inFile.readline()
171 while currentLine != "" and currentLine[0] != ">":
172 lineSeq = currentLine.strip()
173 seqLen += len(lineSeq)
174 seqArray.append(lineSeq)
175 currentLine = inFile.readline()
177 seq = string.join(seqArray, "")
179 print "Added contig %s to database" % chromID
180 cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
181 cbGenome.addChromosomeEntry(chromID, chromID, "db")
183 outFileName = "%s%s.bin" % (chromOutPath, chromID)
184 outFile = open( "%s%s" % (cisRoot, outFileName), "w")
187 print "Added contig file %s to database" % outFileName
188 cbGenome.addChromosomeEntry(chromID, outFileName, "file")
195 def createDBindices(db):
196 cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
197 cbGenome.createIndices()