1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Canis familiaris
32 from cistematic.genomes import Genome
33 from os import environ
# Root of the Cistematic data tree: the CISTEMATIC_ROOT environment variable
# when it is set to a non-empty value, otherwise the default location.
cisRoot = environ.get("CISTEMATIC_ROOT") or "/proj/genome"

# Gene database file for this genome, located under the data root.
geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
def loadChromosome(db, chromID, chromPath, chromOut):
    """Flatten one chromosome sequence file and register it in the gene database.

    The first line of chromPath is discarded; every remaining line is
    stripped and the pieces are concatenated into a single string. That
    string is written to cisRoot + chromOut, and the chromosome is then
    registered through addChromosomeEntry(chromID, chromOut, "file").

    db        -- gene database file handed to Genome as dbFile
    chromID   -- chromosome identifier stored with the entry
    chromPath -- path of the input sequence file
    chromOut  -- output location relative to cisRoot; it is appended to
                 cisRoot with no separator, so it needs its own leading "/"
    """
    cfGenome = Genome("cfamiliaris", dbFile=db)

    with open(chromPath, "r") as inFile:
        # The first line is read and thrown away; only the lines after it
        # contribute to the sequence.
        inFile.readline()
        seqArray = [line.strip() for line in inFile]

    # str.join replaces string.join, which exists only in Python 2.
    seq = "".join(seqArray)

    # NOTE(review): the exact guard in front of this message is not visible
    # in the reviewed listing; an empty-sequence check is assumed -- confirm.
    # The message is a warning only: the (empty) file is still written and
    # the chromosome is still registered.
    if not seq:
        print("Problems reading sequence from file")

    print("writing to file %s" % chromOut)
    with open("%s%s" % (cisRoot, chromOut), "w") as outFile:
        outFile.write(seq)

    cfGenome.addChromosomeEntry(chromID, chromOut, "file")
# loadGeneEntries(db, gFile): collect gene records from the tab-separated
# file gFile and hand them to cfGenome.addGeneEntryBatch() as one batch.
#
# What the visible lines establish:
#   - each record is split on tabs; column 11 (stripped) is compared with
#     "GENE", and column 10 is split on ":" into `name`;
#   - `gid` is tested for being empty or already present in `alreadySeen`,
#     and is appended to `alreadySeen` afterwards;
#   - start/stop are columns 2 and 3 converted to int and decremented by 1;
#     chrom is column 1, stripped;
#   - one tuple (("cfamiliaris", gid), chrom, start, stop, sense, "gene",
#     gidVersion) is appended to `geneEntries` for each kept record;
#   - the number of collected entries is printed before the batch insert.
#
# NOTE(review): this listing does not show the loop header, the bodies of
# the two `if` tests, or where geneEntries, alreadySeen, gid, sense and
# gidVersion are assigned -- confirm those against the full file.
# NOTE(review): alreadySeen is filled with .append() and probed with `in`,
# so it is presumably a list and every lookup is a linear scan; a set would
# avoid that on large inputs -- confirm before changing.
# NOTE(review): no close() of geneFile appears in the visible lines.
63 def loadGeneEntries(db, gFile):
64 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
68 cfGenome = Genome("cfamiliaris", dbFile=db)
69 geneFile = open(gFile, "r")
72 cols = line.split("\t")
73 if cols[11].strip() != "GENE":
76 name = cols[10].split(":")
78 if gid == "" or gid in alreadySeen:
81 alreadySeen.append(gid)
82 start = int(cols[2]) - 1
83 stop = int(cols[3]) - 1
85 chrom = cols[1].strip()
91 geneID = ("cfamiliaris", gid)
93 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
95 print "Adding %d gene entries" % len(geneEntries)
96 cfGenome.addGeneEntryBatch(geneEntries)
# loadGeneFeatures(db, gFile): collect feature records from the
# tab-separated file gFile and hand them to cfGenome.addFeatureEntryBatch()
# as one batch.
#
# What the visible lines establish:
#   - the first line of the file is read and discarded before the loop;
#   - each remaining line is split on tabs; column 11 (stripped) is tested
#     against the list ["CDS", "UTR", "PSEUDO"];
#   - column 10 is split on ":" into `name`;
#   - start/stop are columns 2 and 3 converted to int and decremented by 1;
#     chrom is column 1, stripped;
#   - one tuple (("cfamiliaris", gid), gidVersion, chrom, start, stop,
#     sense, fType) is appended to `featureEntries` for each kept record;
#   - the number of collected entries is printed before the batch insert.
#
# NOTE(review): this listing does not show the body of the `not in` test or
# where featureEntries, gid, gidVersion, sense and fType are assigned --
# confirm those against the full file.
# NOTE(review): no close() of featureFile appears in the visible lines.
99 def loadGeneFeatures(db, gFile):
100 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
103 cfGenome = Genome("cfamiliaris", dbFile=db)
104 featureFile = open(gFile, "r")
105 featureFile.readline()
106 for line in featureFile:
107 cols = line.split("\t")
108 if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
112 name = cols[10].split(":")
117 start = int(cols[2]) - 1
118 stop = int(cols[3]) - 1
120 chrom = cols[1].strip()
126 geneID = ("cfamiliaris", gid)
128 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
130 print "Adding %d feature entries" % len(featureEntries)
131 cfGenome.addFeatureEntryBatch(featureEntries)
# loadGeneAnnotations(db, annotPath): read the tab-separated file annotPath
# and pass the collected annotations to cfGenome.addAnnotationBatch().
#
# NOTE(review): this listing does not show where geneAnnotations, locID and
# geneDesc are assigned (in particular which columns they come from), nor
# any statements around the append -- confirm against the full file.
# NOTE(review): no close() of annotFile appears in the visible lines.
134 def loadGeneAnnotations(db, annotPath):
136 annotFile = open(annotPath, "r")
137 cfGenome = Genome("cfamiliaris", dbFile=db)
138 for line in annotFile:
140 cols = line.split("\t")
# Each annotation is (("cfamiliaris", locID), description), where the
# description is geneDesc stripped and with every "'" replaced by "p".
# string.replace is a Python 2 `string` module function.
144 geneAnnotations.append((("cfamiliaris", locID), string.replace(geneDesc.strip(), "'", "p")))
# Report how many annotations were collected, then insert them in one batch.
148 print "Adding %d annotations" % len(geneAnnotations)
149 cfGenome.addAnnotationBatch(geneAnnotations)
# loadGeneOntology(db, goPath, goDefPath): combine the GO definition file
# goDefPath with the gene-to-GO file goPath and pass the resulting entries
# to cfGenome.addGoInfoBatch().
#
# NOTE(review): this listing does not show where goDefs, goArray, idb and
# gene_name are assigned, the bodies of several branches, or what guards the
# "could not be added" message -- confirm against the full file.
# NOTE(review): no close() of goDefFile or goFile appears in the visible lines.
152 def loadGeneOntology(db, goPath, goDefPath):
153 cfGenome = Genome("cfamiliaris", dbFile=db)
154 goDefFile = open(goDefPath, "r")
155 goFile = open(goPath, "r")
# Definition table: lines whose first character is "!" are skipped; for the
# rest, column 0 is the key and maps to (column 1, column 2 stripped).
159 for goDefEntry in goDefFile:
160 if goDefEntry[0] != "!":
161 cols = goDefEntry.split("\t")
162 goDefs[cols[0]] = (cols[1], cols[2].strip())
# The whole gene-to-GO file is read into memory before it is processed.
164 goEntries = goFile.readlines()
166 for entry in goEntries:
168 fields = entry.split("\t")
# Column 0 is compared with "9615" -- presumably the NCBI taxonomy ID for
# dog, used to select this organism's rows; TODO confirm.
169 if fields[0] != "9615":
172 locID = fields[1].strip()
173 gID = ("cfamiliaris", locID)
177 synonyms = idb.geneIDSynonyms(gID)
# NOTE(review): this inner loop reuses the name `entry` of the enclosing
# loop over goEntries, so `entry` no longer refers to the current file line
# after it runs.
179 for entry in synonyms:
# fields[2] is used as the key into goDefs; the first element of the stored
# pair has every "'" replaced by "p", the second is passed through unchanged.
185 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
187 print "locus ID %s could not be added" % locID
# Report how many entries were collected, then insert them in one batch.
190 print "adding %d go entries" % len(goArray)
191 cfGenome.addGoInfoBatch(goArray)
def createDBFile(db):
    """Call createGeneDB(db) on a cfamiliaris Genome bound to database file db."""
    Genome("cfamiliaris", dbFile=db).createGeneDB(db)
def createDBindices(db):
    """Call createIndices() on a cfamiliaris Genome bound to database file db."""
    Genome("cfamiliaris", dbFile=db).createIndices()
def buildDogDB(db=geneDB):
    """Build the Canis familiaris gene database from the downloaded files.

    Steps, in order: create the database file, load gene entries and gene
    features from <cisRoot>/download/seq_gene.md, flatten and register every
    chromosome from <cisRoot>/download/chr<ID>.fa (written to
    /C_familiaris/chromo<ID>.bin under cisRoot), then create the indices.
    Progress is reported on standard output.

    db -- database file to build; defaults to the module-level geneDB.
    """
    genePath = "%s/download/seq_gene.md" % cisRoot

    # Chromosome IDs "1".."38" followed by "X" and "Un". Generating the IDs
    # replaces a 40-line hand-written path table and gives a fixed load
    # order instead of dictionary order.
    chromIDs = [str(number) for number in range(1, 39)] + ["X", "Un"]

    # NOTE(review): the createDBFile/createDBindices calls below are not
    # visible in the reviewed listing; they are inferred from the progress
    # messages that precede them -- confirm against the full file.
    print("Creating database %s" % db)
    createDBFile(db)

    print("Adding gene entries")
    loadGeneEntries(db, genePath)

    print("Adding gene features")
    loadGeneFeatures(db, genePath)

    for chromID in chromIDs:
        print("Loading chromosome %s" % chromID)
        chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
        loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)

    print("Creating Indices")
    createDBindices(db)

    print("Finished creating database %s" % db)