1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis briggsae
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
43 def loadChromosome(db, chromPath, chromOutPath):
46 cbGenome = Genome("cbriggsae", dbFile=db)
47 inFile = open(chromPath, "r")
48 header = inFile.readline()
52 chromID = header.strip()[1:]
53 currentLine = inFile.readline()
54 while currentLine != "" and currentLine[0] != ">":
55 lineSeq = currentLine.strip()
56 seqLen += len(lineSeq)
57 seqArray.append(lineSeq)
58 currentLine = inFile.readline()
60 seq = string.join(seqArray, "")
62 print "Added contig %s to database" % chromID
63 cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
64 cbGenome.addChromosomeEntry(chromID, chromID, "db")
66 outFileName = "%s%s.bin" % (chromOutPath, chromID)
67 outFile = open("%s%s" % (cisRoot, outFileName), "w")
70 print "Added contig file %s to database" % outFileName
71 cbGenome.addChromosomeEntry(chromID, outFileName, "file")
78 def loadGeneEntries(db, gffFile):
79 cbGenome = Genome("cbriggsae", dbFile=db)
80 geneFile = open(gffFile, "r")
84 field = line[:-1].split("\t")
85 if field[1] != "hybrid":
91 annot = field[8].split('"')
93 geneID = ("cbriggsae", gid)
94 annotation = string.join(annot[3:], "")
97 gidVersion = giddots[2]
101 start = int(field[3]) - 1
102 stop = int(field[4]) - 1
104 chrom = field[0].strip()
110 if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
111 geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
112 if (geneID, annotation) not in geneAnnotations:
113 geneAnnotations.append((geneID, annotation))
115 print "Adding %d gene entries" % len(geneEntries)
116 cbGenome.addGeneEntryBatch(geneEntries)
118 print "Adding %d annotations" % len(geneAnnotations)
119 cbGenome.addAnnotationBatch(geneAnnotations)
122 def loadFeatureEntries(db, gffFile):
123 cbGenome = Genome("cbriggsae", dbFile=db)
124 featureFile = open(gffFile, "r")
127 featureTranslation = {"coding_exon": "CDS",
128 "three_prime_UTR": "3UTR",
129 "five_prime_UTR": "5UTR"
132 for line in featureFile:
133 field = line.split("\t")
134 if field[1] != "hybrid":
137 if field[2].strip() not in featureTranslation:
140 featureType = featureTranslation[field[2].strip()]
141 gidrev = field[8].split('"')
143 geneID = ("cbriggsae", gid)
145 start = int(field[3]) - 1
146 stop = int(field[4]) - 1
148 chrom = field[0].strip()
154 if geneID not in seenFeatures:
155 seenFeatures[geneID] = []
156 if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
157 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
158 seenFeatures[geneID].append((gidVersion, start, stop, featureType))
160 print "Adding %d feature entries" % len(featureEntries)
161 cbGenome.addFeatureEntryBatch(featureEntries)
164 def createDBFile(db):
165 cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
166 cbGenome.createGeneDB(db)
169 def createDBindices(db):
170 cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
171 cbGenome.createIndices()
174 def buildCbriggsaeDB(db=geneDB):
175 gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
176 chromoPath = "%s/download/briggsae_25.fa" % cisRoot
177 chromoOutPath = "/C_briggsae/"
179 print "Creating database %s" % db
182 print "Adding gene entries"
183 loadGeneEntries(db, gffPath)
185 print "Adding feature entries"
186 loadFeatureEntries(db, gffPath)
188 print "Loading genomic sequence"
189 loadChromosome(db, chromoPath, chromoOutPath)
191 print "Creating Indices"
194 print "Finished creating database %s" % db