1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Mus musculus
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/M_musculus/mmusculus.genedb" % cisRoot
44 def loadChromosome(db, chromID, chromPath, chromOut):
46 mmGenome = Genome("mmusculus", dbFile=db)
47 inFile = open(chromPath, "r")
48 line = inFile.readline()
50 seqArray.append(line.strip())
52 seq = string.join(seqArray, "")
55 print "Problems reading sequence from file"
57 print "writing to file %s" % chromOut
58 outFile = open("%s%s" % (cisRoot, chromOut), "w")
61 mmGenome.addChromosomeEntry(chromID, chromOut, "file")
64 def loadGeneEntries(db, gFile, cDict):
65 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
69 mmGenome = Genome("mmusculus", dbFile=db)
70 geneFile = open(gFile, "r")
72 cols = line.split("\t")
73 if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
76 chrom = cols[1].strip()
77 if chrom not in cDict:
80 name = cols[10].split(":")
82 if gid == "" or gid in alreadySeen:
85 alreadySeen.append(gid)
86 start = int(cols[2]) - 1
87 stop = int(cols[3]) - 1
94 geneID = ("mmusculus", gid)
96 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
98 print "Adding %d gene entries" % len(geneEntries)
99 mmGenome.addGeneEntryBatch(geneEntries)
102 def loadGeneFeatures(db, gFile, cDict):
103 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
106 mmGenome = Genome("mmusculus", dbFile=db)
107 featureFile = open(gFile, "r")
108 for line in featureFile:
109 cols = line.split("\t")
110 if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
113 chrom = cols[1].strip()
114 if chrom not in cDict:
118 name = cols[10].split(":")
123 start = int(cols[2]) - 1
124 stop = int(cols[3]) - 1
131 geneID = ("mmusculus", gid)
133 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
135 print "Adding %d feature entries" % len(featureEntries)
136 mmGenome.addFeatureEntryBatch(featureEntries)
139 def loadGeneAnnotations(db):
142 mmGenome = Genome("mmusculus", dbFile=db)
143 gidList = mmGenome.allGIDs()
144 for locID in gidList:
145 gID = ("mmusculus", locID)
146 geneDescArray = idb.getDescription(gID)
148 for entry in geneDescArray:
150 geneDesc += entry.strip()
152 if len(geneDescArray) > 0:
153 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
155 print "Adding %d annotations" % len(geneAnnotations)
156 mmGenome.addAnnotationBatch(geneAnnotations)
159 def loadGeneOntology(db, goPath, goDefPath):
160 mmGenome = Genome("mmusculus", dbFile=db)
161 goDefFile = open(goDefPath, "r")
162 goFile = open(goPath, "r")
166 for goDefEntry in goDefFile:
167 if goDefEntry[0] != "!":
168 cols = goDefEntry.split("\t")
169 goDefs[cols[0]] = (cols[1], cols[2].strip())
171 goEntries = goFile.readlines()
173 for entry in goEntries:
175 fields = entry.split("\t")
176 if fields[0] != "10116":
179 locID = fields[1].strip()
180 gID = ("mmusculus", locID)
184 synonyms = idb.geneIDSynonyms(gID)
186 for entry in synonyms:
192 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
194 print "locus ID %s could not be added" % locID
197 print "adding %d go entries" % len(goArray)
198 mmGenome.addGoInfoBatch(goArray)
201 def createDBFile(db):
202 mmGenome = Genome("mmusculus", dbFile=db)
203 mmGenome.createGeneDB(db)
206 def createDBindices(db):
207 mmGenome = Genome("mmusculus", dbFile=db)
208 mmGenome.createIndices()
211 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
212 genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
213 goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
214 goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
215 # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
216 # but ignoring all random chromosomes
217 chromDict = {"1": "%s/chr1.fa" % downloadDir,
218 "2": "%s/chr2.fa" % downloadDir,
219 "3": "%s/chr3.fa" % downloadDir,
220 "4": "%s/chr4.fa" % downloadDir,
221 "5": "%s/chr5.fa" % downloadDir,
222 "6": "%s/chr6.fa" % downloadDir,
223 "7": "%s/chr7.fa" % downloadDir,
224 "8": "%s/chr8.fa" % downloadDir,
225 "9": "%s/chr9.fa" % downloadDir,
226 "10": "%s/chr10.fa" % downloadDir,
227 "11": "%s/chr11.fa" % downloadDir,
228 "12": "%s/chr12.fa" % downloadDir,
229 "13": "%s/chr13.fa" % downloadDir,
230 "14": "%s/chr14.fa" % downloadDir,
231 "15": "%s/chr15.fa" % downloadDir,
232 "16": "%s/chr16.fa" % downloadDir,
233 "17": "%s/chr17.fa" % downloadDir,
234 "18": "%s/chr18.fa" % downloadDir,
235 "19": "%s/chr19.fa" % downloadDir,
236 "X": "%s/chrX.fa" % downloadDir,
237 "Y": "%s/chrY.fa" % downloadDir,
238 "M": "%s/chrM.fa" % downloadDir
241 print "Creating database %s" % db
244 print "Adding gene entries"
245 loadGeneEntries(db, genePath, chromDict)
247 print "Adding gene features"
248 loadGeneFeatures(db, genePath, chromDict)
250 print "Adding gene annotations"
251 loadGeneAnnotations(db)
253 print "Adding gene ontology"
254 loadGeneOntology(db, goPath, goDefPath)
256 for chromID in chromDict.keys():
257 print "Loading chromosome %s" % chromID
258 loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
260 print "Creating Indices"
263 print "Finished creating database %s" % db