1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Homo sapiens
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
43 def loadChromosome(db, chromID, chromPath, chromOut):
45 hsGenome = Genome("hsapiens", dbFile=db)
46 inFile = open(chromPath, "r")
47 line = inFile.readline()
49 seqArray.append(line.strip())
51 seq = string.join(seqArray, "")
54 print "Problems reading sequence from file"
55 print "writing to file %s" % chromOut
56 outFile = open("%s%s" % (cisRoot, chromOut), "w")
59 hsGenome.addChromosomeEntry(chromID, chromOut, "file")
62 def loadGeneEntries(db, gFile, cDict):
63 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
66 hsGenome = Genome("hsapiens", dbFile=db)
67 geneFile = open(gFile, "r")
69 cols = line.split("\t")
70 if cols[11] != "GENE":
73 if cols[12] == "Celera":
76 chrom = cols[1].strip()
77 if chrom not in cDict:
80 name = cols[10].split(":")
90 geneID = ("hsapiens", gid)
92 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
94 print "Adding %d gene entries" % len(geneEntries)
95 hsGenome.addGeneEntryBatch(geneEntries)
98 def loadGeneFeatures(db, gFile, cDict):
99 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
102 hsGenome = Genome("hsapiens", dbFile=db)
103 featureFile = open(gFile, "r")
104 for line in featureFile:
105 cols = line.split("\t")
106 if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
108 if cols[12] == "Celera":
110 chrom = cols[1].strip()
111 if chrom not in cDict:
115 name = cols[10].split(":")
125 geneID = ("hsapiens", gid)
127 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
129 print "Adding %d feature entries" % len(featureEntries)
130 hsGenome.addFeatureEntryBatch(featureEntries)
133 def loadGeneAnnotations(db):
136 hsGenome = Genome("hsapiens", dbFile=db)
137 gidList = hsGenome.allGIDs()
138 for locID in gidList:
139 gID = ("hsapiens", locID)
140 geneDescArray = idb.getDescription(gID)
142 for entry in geneDescArray:
144 geneDesc += entry.strip()
146 if len(geneDescArray) > 0:
147 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
149 print "Adding %d annotations" % len(geneAnnotations)
150 hsGenome.addAnnotationBatch(geneAnnotations)
153 def loadGeneOntology(db, goPath, goDefPath):
154 hsGenome = Genome("hsapiens", dbFile=db)
155 goDefFile = open(goDefPath, "r")
156 goFile = open(goPath, "r")
160 for goDefEntry in goDefFile:
161 if goDefEntry[0] != "!":
162 cols = goDefEntry.split("\t")
163 goDefs[cols[0]] = (cols[1], cols[2].strip())
169 fields = entry.split("\t")
170 if fields[0] != "9606":
174 if index % 1000 == 0:
175 print "adding 1000 go entries"
176 hsGenome.addGoInfoBatch(goArray)
179 locID = fields[1].strip()
180 gID = ("hsapiens", locID)
184 synonyms = idb.geneIDSynonyms(gID)
186 for entry in synonyms:
192 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
194 print "locus ID %s could not be added" % locID
197 print "adding %d go entries" % len(goArray)
198 hsGenome.addGoInfoBatch(goArray)
201 def createDBFile(db):
202 hsGenome = Genome("hsapiens", dbFile=db)
203 hsGenome.createGeneDB(db)
206 def createDBindices(db):
207 hsGenome = Genome("hsapiens", dbFile=db)
208 hsGenome.createIndices()
211 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
212 genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
213 goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
214 goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
215 # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
216 chromDict = {"1": "%s/chr1.fa" % downloadDir,
217 "2": "%s/chr2.fa" % downloadDir,
218 "3": "%s/chr3.fa" % downloadDir,
219 "4": "%s/chr4.fa" % downloadDir,
220 "5": "%s/chr5.fa" % downloadDir,
221 "6": "%s/chr6.fa" % downloadDir,
222 "7": "%s/chr7.fa" % downloadDir,
223 "8": "%s/chr8.fa" % downloadDir,
224 "9": "%s/chr9.fa" % downloadDir,
225 "10": "%s/chr10.fa" % downloadDir,
226 "11": "%s/chr11.fa" % downloadDir,
227 "12": "%s/chr12.fa" % downloadDir,
228 "13": "%s/chr13.fa" % downloadDir,
229 "14": "%s/chr14.fa" % downloadDir,
230 "15": "%s/chr15.fa" % downloadDir,
231 "16": "%s/chr16.fa" % downloadDir,
232 "17": "%s/chr17.fa" % downloadDir,
233 "18": "%s/chr18.fa" % downloadDir,
234 "19": "%s/chr19.fa" % downloadDir,
235 "20": "%s/chr20.fa" % downloadDir,
236 "21": "%s/chr21.fa" % downloadDir,
237 "22": "%s/chr22.fa" % downloadDir,
238 "X": "%s/chrX.fa" % downloadDir,
239 "Y": "%s/chrY.fa" % downloadDir
242 print "Creating database %s" % db
245 print "Adding gene entries"
246 loadGeneEntries(db, genePath, chromDict)
248 print "Adding gene features"
249 loadGeneFeatures(db, genePath, chromDict)
251 print "Adding gene annotations"
252 loadGeneAnnotations(db)
254 print "Adding gene ontology"
255 loadGeneOntology(db, goPath, goDefPath)
257 for chromID in chromDict.keys():
258 print "Loading chromosome %s" % chromID
259 loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
261 print "Creating Indices"
264 print "Finished creating database %s" % db