1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
27 # SOFTWARE. #
28 ###########################################################################
30 # data for Canis familiaris
32 from cistematic.genomes import Genome
33 from os import environ
# Root directory of the genome data. The CISTEMATIC_ROOT environment variable
# is consulted first; "/proj/genome" is the other value assigned below.
# NOTE(review): original line 37 is absent from this view -- presumably the
# `else:` that makes "/proj/genome" a fallback only. Confirm against the full file.
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
# Path of the dog gene database under cisRoot; used as the default `db`
# argument of buildDogDB().
40 geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
# buildDogDB: build the Canis familiaris gene database at `db`
# (default: the module-level geneDB path).
# Steps visible in this view:
#   1. load gene entries, then gene features, from <cisRoot>/download/seq_gene.md
#   2. load every chromosome named in chromList from <cisRoot>/download/chr<ID>.fa
# Progress is reported with print statements.
# NOTE(review): several original lines are absent from this view (46-47, 58,
# 65-66 among others), so whatever follows the "Creating database" and
# "Creating Indices" messages, and the closing bracket of chromList, cannot be
# seen here -- confirm against the full file.
43 def buildDogDB(db=geneDB):
44 genePath = "%s/download/seq_gene.md" % cisRoot
45 print "Creating database %s" % db
48 print "Adding gene entries"
49 loadGeneEntries(db, genePath)
51 print "Adding gene features"
52 loadGeneFeatures(db, genePath)
# Chromosome IDs to load: "1" through "38", plus "X" and "Un".
54 chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
55 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
56 "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
57 "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
59 for chromID in chromList:
60 print "Loading chromosome %s" % chromID
61 chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
# The last argument is the output path; loadChromosome() prefixes it with cisRoot.
62 loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
64 print "Creating Indices"
67 print "Finished creating database %s" % db
# NOTE(review): the `def` line that owns these two statements is not visible
# in this view (original lines 68-70 are absent) -- confirm the enclosing
# function and its signature against the full file.
# Instantiates Genome("cfamiliaris", dbFile=db) and calls createGeneDB(db) on it.
71 cfGenome = Genome("cfamiliaris", dbFile=db)
72 cfGenome.createGeneDB(db)
# loadGeneEntries: read gene records from the tab-separated file `gFile` and
# add them to the cfamiliaris gene database `db` in a single batch.
#   db    -- path of the gene database, passed to Genome(..., dbFile=db)
#   gFile -- path of the gene file (buildDogDB passes seq_gene.md)
# NOTE(review): this view is missing original lines, including the loop over
# geneFile, the statements guarded by the two `if` tests, and the assignments
# of geneEntries, alreadySeen, gid, sense and gidVersion -- confirm against
# the full file before relying on the notes below.
75 def loadGeneEntries(db, gFile):
76 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
80 cfGenome = Genome("cfamiliaris", dbFile=db)
81 geneFile = open(gFile, "r")
84 cols = line.split("\t")
# Column 11 holds the record type; presumably rows other than "GENE" are skipped.
85 if cols[11].strip() != "GENE":
88 name = cols[10].split(":")
# Empty or previously seen gene IDs are tested here (presumably skipped).
# NOTE(review): alreadySeen is appended to like a list, so `in` is a linear
# scan per row; a set would make the duplicate check cheaper -- verify how it
# is initialised.
90 if gid == "" or gid in alreadySeen:
93 alreadySeen.append(gid)
# Both coordinates are shifted down by one (presumably 1-based -> 0-based).
94 start = int(cols[2]) - 1
95 stop = int(cols[3]) - 1
97 chrom = cols[1].strip()
103 geneID = ("cfamiliaris", gid)
# Entry layout: (geneID, chrom, start, stop, sense, "gene", gidVersion).
105 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
107 print "Adding %d gene entries" % len(geneEntries)
108 cfGenome.addGeneEntryBatch(geneEntries)
# loadGeneFeatures: read feature records from the tab-separated file `gFile`
# and add them to the cfamiliaris gene database `db` in a single batch.
#   db    -- path of the gene database, passed to Genome(..., dbFile=db)
#   gFile -- path of the gene file; its first line is consumed with readline()
#            before the loop (presumably a header row)
# Column 11 is tested against "CDS", "UTR" and "PSEUDO"; start and stop are
# int(cols[2]) - 1 and int(cols[3]) - 1. Each appended entry has the layout
# (geneID, gidVersion, chrom, start, stop, sense, fType), where geneID is
# ("cfamiliaris", gid).
# NOTE(review): this view is missing original lines, including the closing
# delimiter of the docstring below, the statement guarded by the `if` test, and
# the assignments of featureEntries, gid, gidVersion, sense and fType --
# confirm against the full file. Comments are kept above the `def` because the
# docstring terminator is not visible.
111 def loadGeneFeatures(db, gFile):
112 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
115 cfGenome = Genome("cfamiliaris", dbFile=db)
116 featureFile = open(gFile, "r")
117 featureFile.readline()
118 for line in featureFile:
119 cols = line.split("\t")
120 if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
124 name = cols[10].split(":")
129 start = int(cols[2]) - 1
130 stop = int(cols[3]) - 1
132 chrom = cols[1].strip()
138 geneID = ("cfamiliaris", gid)
140 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
142 print "Adding %d feature entries" % len(featureEntries)
143 cfGenome.addFeatureEntryBatch(featureEntries)
# loadGeneAnnotations: read gene descriptions from the tab-separated file
# `annotPath` and add them to the cfamiliaris gene database `db` in one batch.
#   db        -- path of the gene database, passed to Genome(..., dbFile=db)
#   annotPath -- path of the annotation file
# NOTE(review): this view is missing original lines, including the
# initialisation of geneAnnotations and the assignments of locID and geneDesc
# -- confirm against the full file.
# NOTE(review): `string` is used below but is not imported in the visible
# import lines -- verify that the full file imports it.
146 def loadGeneAnnotations(db, annotPath):
148 annotFile = open(annotPath, "r")
149 cfGenome = Genome("cfamiliaris", dbFile=db)
150 for line in annotFile:
152 cols = line.split("\t")
# Each annotation is (("cfamiliaris", locID), description); apostrophes in the
# stripped description are replaced with the letter "p".
156 geneAnnotations.append((("cfamiliaris", locID), string.replace(geneDesc.strip(), "'", "p")))
160 print "Adding %d annotations" % len(geneAnnotations)
161 cfGenome.addAnnotationBatch(geneAnnotations)
# loadGeneOntology: combine GO term definitions from `goDefPath` with gene/GO
# associations from `goPath` and add them to the cfamiliaris gene database
# `db` in one batch.
#   db        -- path of the gene database, passed to Genome(..., dbFile=db)
#   goPath    -- tab-separated association file; fields[0] is compared with
#                "9615", fields[1] is the locus ID, fields[2] is the key used
#                to look up goDefs
#   goDefPath -- tab-separated definition file; lines starting with "!" are
#                ignored
# NOTE(review): this view is missing original lines, including the
# initialisation of goDefs and goArray, the statement guarded by the "9615"
# test, the definition of `idb` and `gene_name`, and the branch that prints the
# "could not be added" message -- confirm against the full file.
164 def loadGeneOntology(db, goPath, goDefPath):
165 cfGenome = Genome("cfamiliaris", dbFile=db)
166 goDefFile = open(goDefPath, "r")
167 goFile = open(goPath, "r")
# Build the lookup: goDefs[column 0] = (column 1, stripped column 2).
171 for goDefEntry in goDefFile:
172 if goDefEntry[0] != "!":
173 cols = goDefEntry.split("\t")
174 goDefs[cols[0]] = (cols[1], cols[2].strip())
# The whole association file is read into memory before it is processed.
176 goEntries = goFile.readlines()
178 for entry in goEntries:
180 fields = entry.split("\t")
# "9615" is presumably the taxonomy ID selecting dog rows -- TODO confirm.
181 if fields[0] != "9615":
184 locID = fields[1].strip()
185 gID = ("cfamiliaris", locID)
189 synonyms = idb.geneIDSynonyms(gID)
# NOTE(review): this inner loop reuses the name `entry` from the outer loop
# over goEntries, rebinding it for the rest of the outer iteration.
191 for entry in synonyms:
# Apostrophes in the first goDefs field are replaced with the letter "p".
197 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
199 print "locus ID %s could not be added" % locID
202 print "adding %d go entries" % len(goArray)
203 cfGenome.addGoInfoBatch(goArray)
# loadChromosome: read a chromosome sequence from `chromPath`, write it to
# <cisRoot><chromOut>, and register it in the cfamiliaris gene database `db`.
#   db        -- path of the gene database, passed to Genome(..., dbFile=db)
#   chromID   -- chromosome identifier stored with the entry
#   chromPath -- input sequence file (buildDogDB passes a chr<ID>.fa path)
#   chromOut  -- output path relative to cisRoot; also stored in the entry
# NOTE(review): this view is missing original lines, including the
# initialisation of seqArray, the loop that reads the remaining lines, the
# branch that prints the "Problems reading" message, and the write/close of
# outFile -- confirm against the full file.
206 def loadChromosome(db, chromID, chromPath, chromOut):
208 cfGenome = Genome("cfamiliaris", dbFile=db)
209 inFile = open(chromPath, "r")
# The first line is read separately (presumably the FASTA header -- verify).
210 line = inFile.readline()
212 seqArray.append(line.strip())
# The stripped lines are concatenated into one sequence string.
214 seq = string.join(seqArray, "")
217 print "Problems reading sequence from file"
219 print "writing to file %s" % chromOut
220 outFile = open("%s%s" % (cisRoot, chromOut), "w")
# The entry records the relative output path with the type string "file".
223 cfGenome.addChromosomeEntry(chromID, chromOut, "file")
def createDBindices(db):
    """ Create the indices of the cfamiliaris gene database.

    db is the path of the gene database; it is passed to Genome as dbFile,
    and createIndices() is then called on the resulting object.
    """
    dogGenome = Genome("cfamiliaris", dbFile=db)
    dogGenome.createIndices()