1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
43 def buildBtaurusDB(db=geneDB):
44 genePath = "%s/download/bt2/genscan.txt" % cisRoot
45 chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
46 chromoOutPath = "/B_taurus/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, genePath)
54 print "Adding gene features"
55 loadGeneFeatures(db, genePath)
57 print "Loading sequences"
58 loadChromosome(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 btGenome = Genome("btaurus", dbFile=db)
68 btGenome.createGeneDB(db)
71 def loadGeneEntries(db, gFile):
72 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
75 btGenome = Genome("btaurus", dbFile=db)
76 geneFile = open(gFile, "r")
79 cols = line.split("\t")
90 geneID = ("btaurus", gid)
92 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
94 print "Adding %d gene entries" % len(geneEntries)
95 btGenome.addGeneEntryBatch(geneEntries)
98 def loadGeneFeatures(db, gfile):
99 geneFile = open(gfile, "r")
100 senseArray = {"+": "F",
107 for geneLine in geneFile:
108 geneFields = geneLine.split("\t")
109 exonNum = int(geneFields[7])
110 exonStarts = geneFields[8].split(",")
111 exonStops = geneFields[9].split(",")
112 chrom = geneFields[1]
113 sense = senseArray[geneFields[2]]
114 gstop = int(geneFields[6]) - 1
115 gstart = int(geneFields[5]) - 1
116 geneid = geneFields[0]
118 geneID = ("btaurus", geneid)
123 if geneID in seenArray:
124 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
126 seenArray.append(geneID)
128 for index in range(exonNum):
129 estart = int(exonStarts[index]) - 1
130 estop = int(exonStops[index]) - 1
131 if estart >= gstart and estop <= gstop:
132 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
133 elif estop <= gstart:
139 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
140 elif estart >= gstop:
146 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
147 elif estart <= gstop:
153 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
154 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
161 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
162 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
166 btGenome = Genome("btaurus", dbFile=db)
167 print 'Adding %d features' % len(insertArray)
168 btGenome.addFeatureEntryBatch(insertArray)
171 def loadGeneAnnotations(db, annotPath):
173 annotFile = open(annotPath, "r")
174 btGenome = Genome("btaurus", dbFile=db)
175 for line in annotFile:
177 cols = line.split("\t")
181 geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
185 print "Adding %d annotations" % len(geneAnnotations)
186 btGenome.addAnnotationBatch(geneAnnotations)
189 def loadGeneOntology(db, goPath, goDefPath, annotPath):
190 btGenome = Genome("btaurus", dbFile=db)
191 goDefFile = open(goDefPath, "r")
192 goFile = open(goPath, "r")
193 annotFile = open(annotPath, "r")
194 annotEntries = annotFile.readlines()
196 goDefEntries = goDefFile.readlines()
201 for goDefEntry in goDefEntries:
202 if goDefEntry[0] != "!":
203 cols = goDefEntry.split("\t")
204 goDefs[cols[0]] = (cols[1], cols[2].strip())
206 goEntries = goFile.readlines()
207 for annotEntry in annotEntries:
209 cols = annotEntry.split("\t")
215 locus[locID] = (geneName, geneDesc, mimID)
219 for entry in goEntries:
221 fields = entry.split("\t")
222 locID = fields[0].strip()
223 (gene_name, gene_desc, mimID) = locus[locID]
224 goArray.append((("btaurus", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
226 #print "locus ID %s could not be added" % locID
229 print "adding %d go entries" % len(goArray)
230 btGenome.addGoInfoBatch(goArray)
233 def loadChromosome(db, chromPath, chromOutPath):
236 btGenome = Genome("btaurus", dbFile=db)
237 inFile = open(chromPath, "r")
238 header = inFile.readline()
242 chromID = header.strip()[1:]
243 currentLine = inFile.readline()
245 while currentLine != "" and currentLine[0] != ">":
246 lineSeq = currentLine.strip()
247 seqLen += len(lineSeq)
248 seqArray.append(lineSeq)
249 currentLine = inFile.readline()
251 seq = string.join(seqArray, "")
253 print "Added contig %s to database" % chromID
254 btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
255 btGenome.addChromosomeEntry(chromID, chromID, "db")
257 outFileName = "%s%s.bin" % (chromOutPath, chromID)
258 outFile = open("%s%s" % (cisRoot, outFileName), "w")
261 print "Added contig file %s to database" % outFileName
262 btGenome.addChromosomeEntry(chromID, outFileName, "file")
269 def createDBindices(db):
270 btGenome = Genome("btaurus", dbFile=db)
271 btGenome.createIndices()