1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
43 def loadChromosome(db, chromPath, chromOutPath):
46 btGenome = Genome("btaurus", dbFile=db)
47 inFile = open(chromPath, "r")
48 header = inFile.readline()
52 chromID = header.strip()[1:]
53 currentLine = inFile.readline()
55 while currentLine != "" and currentLine[0] != ">":
56 lineSeq = currentLine.strip()
57 seqLen += len(lineSeq)
58 seqArray.append(lineSeq)
59 currentLine = inFile.readline()
61 seq = string.join(seqArray, "")
63 print "Added contig %s to database" % chromID
64 btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
65 btGenome.addChromosomeEntry(chromID, chromID, "db")
67 outFileName = "%s%s.bin" % (chromOutPath, chromID)
68 outFile = open("%s%s" % (cisRoot, outFileName), "w")
71 print "Added contig file %s to database" % outFileName
72 btGenome.addChromosomeEntry(chromID, outFileName, "file")
79 def loadGeneEntries(db, gFile):
80 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
83 btGenome = Genome("btaurus", dbFile=db)
84 geneFile = open(gFile, "r")
87 cols = line.split("\t")
98 geneID = ("btaurus", gid)
100 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
102 print "Adding %d gene entries" % len(geneEntries)
103 btGenome.addGeneEntryBatch(geneEntries)
106 def loadGeneAnnotations(db, annotPath):
108 annotFile = open(annotPath, "r")
109 btGenome = Genome("btaurus", dbFile=db)
110 for line in annotFile:
112 cols = line.split("\t")
116 geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
120 print "Adding %d annotations" % len(geneAnnotations)
121 btGenome.addAnnotationBatch(geneAnnotations)
124 def loadGeneFeatures(db, gfile):
125 geneFile = open(gfile, "r")
126 senseArray = {"+": "F",
133 for geneLine in geneFile:
134 geneFields = geneLine.split("\t")
135 exonNum = int(geneFields[7])
136 exonStarts = geneFields[8].split(",")
137 exonStops = geneFields[9].split(",")
138 chrom = geneFields[1]
139 sense = senseArray[geneFields[2]]
140 gstop = int(geneFields[6]) - 1
141 gstart = int(geneFields[5]) - 1
142 geneid = geneFields[0]
144 geneID = ("btaurus", geneid)
149 if geneID in seenArray:
150 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
152 seenArray.append(geneID)
154 for index in range(exonNum):
155 estart = int(exonStarts[index]) - 1
156 estop = int(exonStops[index]) - 1
157 if estart >= gstart and estop <= gstop:
158 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
159 elif estop <= gstart:
165 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
166 elif estart >= gstop:
172 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
173 elif estart <= gstop:
179 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
180 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
187 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
188 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
192 btGenome = Genome("btaurus", dbFile=db)
193 print 'Adding %d features' % len(insertArray)
194 btGenome.addFeatureEntryBatch(insertArray)
197 def loadGeneOntology(db, goPath, goDefPath, annotPath):
198 btGenome = Genome("btaurus", dbFile=db)
199 goDefFile = open(goDefPath, "r")
200 goFile = open(goPath, "r")
201 annotFile = open(annotPath, "r")
202 annotEntries = annotFile.readlines()
204 goDefEntries = goDefFile.readlines()
209 for goDefEntry in goDefEntries:
210 if goDefEntry[0] != "!":
211 cols = goDefEntry.split("\t")
212 goDefs[cols[0]] = (cols[1], cols[2].strip())
214 goEntries = goFile.readlines()
215 for annotEntry in annotEntries:
217 cols = annotEntry.split("\t")
223 locus[locID] = (geneName, geneDesc, mimID)
227 for entry in goEntries:
229 fields = entry.split("\t")
230 locID = fields[0].strip()
231 (gene_name, gene_desc, mimID) = locus[locID]
232 goArray.append((("btaurus", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
234 #print "locus ID %s could not be added" % locID
237 print "adding %d go entries" % len(goArray)
238 btGenome.addGoInfoBatch(goArray)
241 def createDBFile(db):
242 btGenome = Genome("btaurus", dbFile=db)
243 btGenome.createGeneDB(db)
246 def createDBindices(db):
247 btGenome = Genome("btaurus", dbFile=db)
248 btGenome.createIndices()
251 def buildBtaurusDB(db=geneDB):
252 genePath = "%s/download/bt2/genscan.txt" % cisRoot
253 chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
254 chromoOutPath = "/B_taurus/"
256 print "Creating database %s" % db
259 print "Adding gene entries"
260 loadGeneEntries(db, genePath)
262 print "Adding gene features"
263 loadGeneFeatures(db, genePath)
265 print "Loading sequences"
266 loadChromosome(db, chromoPath, chromoOutPath)
268 print "Creating Indices"
271 print "Finished creating database %s" % db