1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis briggsae
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
43 def buildCbriggsaeDB(db=geneDB):
44 gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
45 chromoPath = "%s/download/briggsae_25.fa" % cisRoot
46 chromoOutPath = "/C_briggsae/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, gffPath)
54 print "Adding feature entries"
55 loadFeatureEntries(db, gffPath)
57 print "Loading genomic sequence"
58 loadChromosome(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
68 cbGenome.createGeneDB(db)
71 def loadGeneEntries(db, gffFile):
72 cbGenome = Genome("cbriggsae", dbFile=db)
73 geneFile = open(gffFile, "r")
77 field = line[:-1].split("\t")
78 if field[1] != "hybrid":
84 annot = field[8].split('"')
86 geneID = ("cbriggsae", gid)
87 annotation = string.join(annot[3:], "")
90 gidVersion = giddots[2]
94 start = int(field[3]) - 1
95 stop = int(field[4]) - 1
97 chrom = field[0].strip()
103 if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
104 geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
105 if (geneID, annotation) not in geneAnnotations:
106 geneAnnotations.append((geneID, annotation))
108 print "Adding %d gene entries" % len(geneEntries)
109 cbGenome.addGeneEntryBatch(geneEntries)
111 print "Adding %d annotations" % len(geneAnnotations)
112 cbGenome.addAnnotationBatch(geneAnnotations)
115 def loadFeatureEntries(db, gffFile):
116 cbGenome = Genome("cbriggsae", dbFile=db)
117 featureFile = open(gffFile, "r")
120 featureTranslation = {"coding_exon": "CDS",
121 "three_prime_UTR": "3UTR",
122 "five_prime_UTR": "5UTR"
125 for line in featureFile:
126 field = line.split("\t")
127 if field[1] != "hybrid":
130 if field[2].strip() not in featureTranslation:
133 featureType = featureTranslation[field[2].strip()]
134 gidrev = field[8].split('"')
136 geneID = ("cbriggsae", gid)
138 start = int(field[3]) - 1
139 stop = int(field[4]) - 1
141 chrom = field[0].strip()
147 if geneID not in seenFeatures:
148 seenFeatures[geneID] = []
149 if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
150 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
151 seenFeatures[geneID].append((gidVersion, start, stop, featureType))
153 print "Adding %d feature entries" % len(featureEntries)
154 cbGenome.addFeatureEntryBatch(featureEntries)
157 def loadChromosome(db, chromPath, chromOutPath):
160 cbGenome = Genome("cbriggsae", dbFile=db)
161 inFile = open(chromPath, "r")
162 header = inFile.readline()
166 chromID = header.strip()[1:]
167 currentLine = inFile.readline()
168 while currentLine != "" and currentLine[0] != ">":
169 lineSeq = currentLine.strip()
170 seqLen += len(lineSeq)
171 seqArray.append(lineSeq)
172 currentLine = inFile.readline()
174 seq = string.join(seqArray, "")
176 print "Added contig %s to database" % chromID
177 cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
178 cbGenome.addChromosomeEntry(chromID, chromID, "db")
180 outFileName = "%s%s.bin" % (chromOutPath, chromID)
181 outFile = open("%s%s" % (cisRoot, outFileName), "w")
184 print "Added contig file %s to database" % outFileName
185 cbGenome.addChromosomeEntry(chromID, outFileName, "file")
192 def createDBindices(db):
193 cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
194 cbGenome.createIndices()