1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Monodelphis domestica
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
43 def buildMdomesticaDB(db=geneDB):
44 genePath = "%s/download/mondom/genscan.txt" % cisRoot
45 chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
46 chromoOutPath = "/M_domestica/"
48 print "Creating database %s" % db
51 print "Adding gene entries"
52 loadGeneEntries(db, genePath)
54 print "Adding gene features"
55 loadGeneFeatures(db, genePath)
57 print "Loading chromosomes"
58 loadChromosome(db, chromoPath, chromoOutPath)
60 print "Creating Indices"
63 print "Finished creating database %s" % db
67 mdGenome = Genome("mdomestica", dbFile=db)
68 mdGenome.createGeneDB(db)
71 def loadGeneEntries(db, gFile):
72 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
75 mdGenome = Genome("mdomestica", dbFile=db)
76 geneFile = open(gFile, "r")
78 cols = line.split("\t")
89 geneID = ("mdomestica", gid)
91 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
93 print "Adding %d gene entries" % len(geneEntries)
94 mdGenome.addGeneEntryBatch(geneEntries)
97 def loadGeneAnnotations(db, annotPath):
99 annotFile = open(annotPath, "r")
100 mdGenome = Genome("mdomestica", dbFile=db)
101 for line in annotFile:
103 cols = line.split("\t")
107 geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p")))
111 print "Adding %d annotations" % len(geneAnnotations)
112 mdGenome.addAnnotationBatch(geneAnnotations)
115 def loadGeneFeatures(db, gfile):
116 geneFile = open(gfile, "r")
117 senseArray = {"+": "F",
124 for geneLine in geneFile:
125 geneFields = geneLine.split("\t")
126 exonNum = int(geneFields[7])
127 exonStarts = geneFields[8].split(",")
128 exonStops = geneFields[9].split(",")
129 chrom = geneFields[1]
130 sense = senseArray[geneFields[2]]
131 gstop = int(geneFields[6]) - 1
132 gstart = int(geneFields[5]) - 1
133 geneid = geneFields[0]
135 geneID = ("mdomestica", geneid)
140 if geneID in seenArray:
141 gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
143 seenArray.append(geneID)
145 for index in range(exonNum):
146 estart = int(exonStarts[index]) - 1
147 estop = int(exonStops[index]) - 1
148 if estart >= gstart and estop <= gstop:
149 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
150 elif estop <= gstart:
156 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
157 elif estart >= gstop:
163 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164 elif estart <= gstop:
170 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
171 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
178 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
179 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
182 mdGenome = Genome("mdomestica", dbFile=db)
183 print "Adding %d features" % len(insertArray)
184 mdGenome.addFeatureEntryBatch(insertArray)
187 def loadChromosome(db, chromPath, chromOutPath):
190 mdGenome = Genome("mdomestica", dbFile=db)
191 inFile = open(chromPath, "r")
192 header = inFile.readline()
196 chromID = header.strip()[1:]
197 currentLine = inFile.readline()
198 while currentLine != "" and currentLine[0] != ">":
199 lineSeq = currentLine.strip()
200 seqLen += len(lineSeq)
201 seqArray.append(lineSeq)
202 currentLine = inFile.readline()
204 seq = string.join(seqArray, "")
206 print "Added contig %s to database" % chromID
207 mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
208 mdGenome.addChromosomeEntry(chromID, chromID, "db")
210 outFileName = "%s%s.bin" % (chromOutPath, chromID)
211 outFile = open("%s%s" % (cisRoot, outFileName), "w")
214 print "Added contig file %s to database" % outFileName
215 mdGenome.addChromosomeEntry(chromID, outFileName, "file")
222 def createDBindices(db):
223 mdGenome = Genome("mdomestica", dbFile=db)
224 mdGenome.createIndices()