1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Monodelphis domestica
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
43 def loadChromosome(db, chromPath, chromOutPath):
46 mdGenome = Genome("mdomestica", dbFile=db)
47 inFile = open(chromPath, "r")
48 header = inFile.readline()
52 chromID = header.strip()[1:]
53 currentLine = inFile.readline()
54 while currentLine != "" and currentLine[0] != ">":
55 lineSeq = currentLine.strip()
56 seqLen += len(lineSeq)
57 seqArray.append(lineSeq)
58 currentLine = inFile.readline()
60 seq = string.join(seqArray, "")
62 print "Added contig %s to database" % chromID
63 mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
64 mdGenome.addChromosomeEntry(chromID, chromID, "db")
66 outFileName = "%s%s.bin" % (chromOutPath, chromID)
67 outFile = open("%s%s" % (cisRoot, outFileName), "w")
70 print "Added contig file %s to database" % outFileName
71 mdGenome.addChromosomeEntry(chromID, outFileName, "file")
78 def loadGeneEntries(db, gFile):
79 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
82 mdGenome = Genome("mdomestica", dbFile=db)
83 geneFile = open(gFile, "r")
85 cols = line.split("\t")
96 geneID = ("mdomestica", gid)
98 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
100 print "Adding %d gene entries" % len(geneEntries)
101 mdGenome.addGeneEntryBatch(geneEntries)
104 def loadGeneAnnotations(db, annotPath):
106 annotFile = open(annotPath, "r")
107 mdGenome = Genome("mdomestica", dbFile=db)
108 for line in annotFile:
110 cols = line.split("\t")
114 geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p")))
118 print "Adding %d annotations" % len(geneAnnotations)
119 mdGenome.addAnnotationBatch(geneAnnotations)
122 def loadGeneFeatures(db, gfile):
123 geneFile = open(gfile, "r")
124 senseArray = {"+": "F",
131 for geneLine in geneFile:
132 geneFields = geneLine.split("\t")
133 exonNum = int(geneFields[7])
134 exonStarts = geneFields[8].split(",")
135 exonStops = geneFields[9].split(",")
136 chrom = geneFields[1]
137 sense = senseArray[geneFields[2]]
138 gstop = int(geneFields[6]) - 1
139 gstart = int(geneFields[5]) - 1
140 geneid = geneFields[0]
142 geneID = ("mdomestica", geneid)
147 if geneID in seenArray:
148 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
150 seenArray.append(geneID)
152 for index in range(exonNum):
153 estart = int(exonStarts[index]) - 1
154 estop = int(exonStops[index]) - 1
155 if estart >= gstart and estop <= gstop:
156 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
157 elif estop <= gstart:
163 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164 elif estart >= gstop:
170 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
171 elif estart <= gstop:
177 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
178 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
185 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
186 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
189 mdGenome = Genome("mdomestica", dbFile=db)
190 print "Adding %d features" % len(insertArray)
191 mdGenome.addFeatureEntryBatch(insertArray)
194 def createDBFile(db):
195 mdGenome = Genome("mdomestica", dbFile=db)
196 mdGenome.createGeneDB(db)
199 def createDBindices(db):
200 mdGenome = Genome("mdomestica", dbFile=db)
201 mdGenome.createIndices()
204 def buildMdomesticaDB(db=geneDB):
205 genePath = "%s/download/mondom/genscan.txt" % cisRoot
206 chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
207 chromoOutPath = "/M_domestica/"
209 print "Creating database %s" % db
212 print "Adding gene entries"
213 loadGeneEntries(db, genePath)
215 print "Adding gene features"
216 loadGeneFeatures(db, genePath)
218 print "Loading chromosomes"
219 loadChromosome(db, chromoPath, chromoOutPath)
221 print "Creating Indices"
224 print "Finished creating database %s" % db