1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Mus musculus
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/M_musculus/mmusculus.genedb" % cisRoot
44 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45 genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
46 goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
47 goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
48 # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
49 # but ignoring all random chromosomes
50 chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
51 "11", "12", "13", "14", "15", "16", "17", "18", "19",
55 print "Creating database %s" % db
58 print "Adding gene entries"
59 loadGeneEntries(db, genePath, chromList)
61 print "Adding gene features"
62 loadGeneFeatures(db, genePath, chromList)
64 print "Adding gene annotations"
65 loadGeneAnnotations(db)
67 print "Adding gene ontology"
68 loadGeneOntology(db, goPath, goDefPath)
70 for chromID in chromList:
71 print "Loading chromosome %s" % chromID
72 chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
73 loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
75 print "Creating Indices"
78 print "Finished creating database %s" % db
82 mmGenome = Genome("mmusculus", dbFile=db)
83 mmGenome.createGeneDB(db)
86 def loadGeneEntries(db, gFile, cDict):
87 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
91 mmGenome = Genome("mmusculus", dbFile=db)
92 geneFile = open(gFile, "r")
94 cols = line.split("\t")
95 if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
98 chrom = cols[1].strip()
99 if chrom not in cDict:
102 name = cols[10].split(":")
104 if gid == "" or gid in alreadySeen:
107 alreadySeen.append(gid)
108 start = int(cols[2]) - 1
109 stop = int(cols[3]) - 1
116 geneID = ("mmusculus", gid)
118 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
120 print "Adding %d gene entries" % len(geneEntries)
121 mmGenome.addGeneEntryBatch(geneEntries)
124 def loadGeneFeatures(db, gFile, cDict):
125 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
128 mmGenome = Genome("mmusculus", dbFile=db)
129 featureFile = open(gFile, "r")
130 for line in featureFile:
131 cols = line.split("\t")
132 if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
135 chrom = cols[1].strip()
136 if chrom not in cDict:
140 name = cols[10].split(":")
145 start = int(cols[2]) - 1
146 stop = int(cols[3]) - 1
153 geneID = ("mmusculus", gid)
155 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
157 print "Adding %d feature entries" % len(featureEntries)
158 mmGenome.addFeatureEntryBatch(featureEntries)
161 def loadGeneAnnotations(db):
164 mmGenome = Genome("mmusculus", dbFile=db)
165 gidList = mmGenome.allGIDs()
166 for locID in gidList:
167 gID = ("mmusculus", locID)
168 geneDescArray = idb.getDescription(gID)
170 for entry in geneDescArray:
172 geneDesc += entry.strip()
174 if len(geneDescArray) > 0:
175 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
177 print "Adding %d annotations" % len(geneAnnotations)
178 mmGenome.addAnnotationBatch(geneAnnotations)
181 def loadGeneOntology(db, goPath, goDefPath):
182 mmGenome = Genome("mmusculus", dbFile=db)
183 goDefFile = open(goDefPath, "r")
184 goFile = open(goPath, "r")
188 for goDefEntry in goDefFile:
189 if goDefEntry[0] != "!":
190 cols = goDefEntry.split("\t")
191 goDefs[cols[0]] = (cols[1], cols[2].strip())
193 goEntries = goFile.readlines()
195 for entry in goEntries:
197 fields = entry.split("\t")
198 if fields[0] != "10116":
201 locID = fields[1].strip()
202 gID = ("mmusculus", locID)
206 synonyms = idb.geneIDSynonyms(gID)
208 for entry in synonyms:
214 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
216 print "locus ID %s could not be added" % locID
219 print "adding %d go entries" % len(goArray)
220 mmGenome.addGoInfoBatch(goArray)
223 def loadChromosome(db, chromID, chromPath, chromOut):
225 mmGenome = Genome("mmusculus", dbFile=db)
226 inFile = open(chromPath, "r")
227 line = inFile.readline()
229 seqArray.append(line.strip())
231 seq = string.join(seqArray, "")
234 print "Problems reading sequence from file"
236 print "writing to file %s" % chromOut
237 outFile = open("%s%s" % (cisRoot, chromOut), "w")
240 mmGenome.addChromosomeEntry(chromID, chromOut, "file")
243 def createDBindices(db):
244 mmGenome = Genome("mmusculus", dbFile=db)
245 mmGenome.createIndices()