1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis remanei
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
43 def loadChromosomes(db, inPath, chromOutPath):
44 crGenome = Genome("cremanei", dbFile=db)
45 scontigList = os.listdir(inPath)
46 for scontig in scontigList:
50 inFile = open("%s/%s" % (inPath, scontig), "r")
52 header = inFile.readline()
53 chromID = header.strip()[1:]
57 currentLine = inFile.readline()
58 while currentLine != "" and currentLine[0] != ">":
59 lineSeq = currentLine.strip()
60 seqLen += len(lineSeq)
61 seqArray.append(lineSeq)
62 currentLine = inFile.readline()
64 seq = string.join(seqArray, "")
66 print "Added contig %s to database" % chromID
67 crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
68 crGenome.addChromosomeEntry(chromID, chromID, "db")
70 outFileName = "%s%s.bin" % (chromOutPath, chromID)
71 outFile = open("%s%s" % (cisRoot, outFileName), "w")
74 print "Added contig file %s to database" % outFileName
75 crGenome.addChromosomeEntry(chromID, outFileName, "file")
83 def loadGeneEntries(db, gffFile):
84 crGenome = Genome("cremanei", dbFile=db)
85 geneFile = open(gffFile, "r")
98 field = line[:-1].split("\t")
102 idfield = field[8].split('"')
104 geneID = ("cremanei", gid)
105 geneStart[geneID] = int(field[3]) - 1
106 geneStop[geneID] = int(field[4]) - 1
108 geneChrom[geneID] = field[0].strip()
110 geneSense[geneID] = "F"
112 geneSense[geneID] = "R"
114 for geneID in geneStart:
115 if geneID not in geneStop:
116 print "geneID %s not in geneStop - skipping" % str(geneID)
118 geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
120 print "Adding %d gene entries" % len(geneEntries)
121 crGenome.addGeneEntryBatch(geneEntries)
124 def loadFeatureEntries(db, gffFile):
125 crGenome = Genome("cremanei", dbFile=db)
126 featureFile = open(gffFile, "r")
128 for line in featureFile:
135 field = line.split("\t")
136 if field[2].strip() != "coding_exon":
139 gidrev = field[8].split('"')
141 geneID = ("cremanei", gid)
143 start = int(field[3]) - 1
144 stop = int(field[4]) - 1
146 chrom = field[0].strip()
152 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
154 print "Adding %d feature entries" % len(featureEntries)
155 crGenome.addFeatureEntryBatch(featureEntries)
158 def createDBFile(db):
159 crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
160 crGenome.createGeneDB(db)
163 def createDBindices(db):
164 crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
165 crGenome.createIndices()
168 def buildCremaneiDB(db=geneDB):
169 gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
170 chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
171 chromoOutPath = "/C_remanei/"
173 print "Creating database %s" % db
176 print "Adding gene entries"
177 loadGeneEntries(db, gffPath)
179 print "Adding feature entries"
180 loadFeatureEntries(db, gffPath)
182 print "Loading genomic sequence"
183 loadChromosomes(db, chromoPath, chromoOutPath)
185 print "Creating Indices"
188 print "Finished creating database %s" % db