1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis remanei
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
43 def buildCremaneiDB(db=geneDB):
44 gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
45 chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
46 chromoOutPath = "/C_remanei/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, gffPath)
54 print "Adding feature entries"
55 loadFeatureEntries(db, gffPath)
57 print "Loading genomic sequence"
58 loadChromosomes(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
68 crGenome.createGeneDB(db)
71 def loadGeneEntries(db, gffFile):
72 crGenome = Genome("cremanei", dbFile=db)
73 geneFile = open(gffFile, "r")
86 field = line[:-1].split("\t")
90 idfield = field[8].split('"')
92 geneID = ("cremanei", gid)
93 geneStart[geneID] = int(field[3]) - 1
94 geneStop[geneID] = int(field[4]) - 1
96 geneChrom[geneID] = field[0].strip()
98 geneSense[geneID] = "F"
100 geneSense[geneID] = "R"
102 for geneID in geneStart:
103 if geneID not in geneStop:
104 print "geneID %s not in geneStop - skipping" % str(geneID)
106 geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
108 print "Adding %d gene entries" % len(geneEntries)
109 crGenome.addGeneEntryBatch(geneEntries)
112 def loadFeatureEntries(db, gffFile):
113 crGenome = Genome("cremanei", dbFile=db)
114 featureFile = open(gffFile, "r")
116 for line in featureFile:
123 field = line.split("\t")
124 if field[2].strip() != "coding_exon":
127 gidrev = field[8].split('"')
129 geneID = ("cremanei", gid)
131 start = int(field[3]) - 1
132 stop = int(field[4]) - 1
134 chrom = field[0].strip()
140 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
142 print "Adding %d feature entries" % len(featureEntries)
143 crGenome.addFeatureEntryBatch(featureEntries)
146 def loadChromosomes(db, inPath, chromOutPath):
147 crGenome = Genome("cremanei", dbFile=db)
148 scontigList = os.listdir(inPath)
149 for scontig in scontigList:
153 inFile = open("%s/%s" % (inPath, scontig), "r")
155 header = inFile.readline()
156 chromID = header.strip()[1:]
160 currentLine = inFile.readline()
161 while currentLine != "" and currentLine[0] != ">":
162 lineSeq = currentLine.strip()
163 seqLen += len(lineSeq)
164 seqArray.append(lineSeq)
165 currentLine = inFile.readline()
167 seq = string.join(seqArray, "")
169 print "Added contig %s to database" % chromID
170 crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
171 crGenome.addChromosomeEntry(chromID, chromID, "db")
173 outFileName = "%s%s.bin" % (chromOutPath, chromID)
174 outFile = open("%s%s" % (cisRoot, outFileName), "w")
177 print "Added contig file %s to database" % outFileName
178 crGenome.addChromosomeEntry(chromID, outFileName, "file")
186 def createDBindices(db):
187 crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
188 crGenome.createIndices()