1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Homo sapiens
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
44 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45 genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
46 goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
47 goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
48 # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
49 chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
50 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
54 print "Creating database %s" % db
57 print "Adding gene entries"
58 loadGeneEntries(db, genePath, chromList)
60 print "Adding gene features"
61 loadGeneFeatures(db, genePath, chromList)
63 print "Adding gene annotations"
64 loadGeneAnnotations(db)
66 print "Adding gene ontology"
67 loadGeneOntology(db, goPath, goDefPath)
69 for chromID in chromList:
70 print "Loading chromosome %s" % chromID
71 chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
72 loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
74 print "Creating Indices"
77 print "Finished creating database %s" % db
81 hsGenome = Genome("hsapiens", dbFile=db)
82 hsGenome.createGeneDB(db)
85 def loadGeneEntries(db, gFile, cDict):
86 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
88 hsGenome = Genome("hsapiens", dbFile=db)
89 geneFile = open(gFile, "r")
91 cols = line.split("\t")
92 if cols[11] != "GENE":
95 if cols[12] == "Celera":
98 chrom = cols[1].strip()
99 if chrom not in cDict:
102 name = cols[10].split(":")
112 geneID = ("hsapiens", gid)
114 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
116 print "Adding %d gene entries" % len(geneEntries)
117 hsGenome.addGeneEntryBatch(geneEntries)
120 def loadGeneFeatures(db, gFile, cDict):
121 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
124 hsGenome = Genome("hsapiens", dbFile=db)
125 featureFile = open(gFile, "r")
126 for line in featureFile:
127 cols = line.split("\t")
128 if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
131 if cols[12] == "Celera":
134 chrom = cols[1].strip()
135 if chrom not in cDict:
139 name = cols[10].split(":")
149 geneID = ("hsapiens", gid)
151 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
153 print "Adding %d feature entries" % len(featureEntries)
154 hsGenome.addFeatureEntryBatch(featureEntries)
157 def loadGeneAnnotations(db):
160 hsGenome = Genome("hsapiens", dbFile=db)
161 gidList = hsGenome.allGIDs()
162 for locID in gidList:
163 gID = ("hsapiens", locID)
164 geneDescArray = idb.getDescription(gID)
166 for entry in geneDescArray:
168 geneDesc += entry.strip()
170 if len(geneDescArray) > 0:
171 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
173 print "Adding %d annotations" % len(geneAnnotations)
174 hsGenome.addAnnotationBatch(geneAnnotations)
177 def loadGeneOntology(db, goPath, goDefPath):
178 hsGenome = Genome("hsapiens", dbFile=db)
179 goDefFile = open(goDefPath, "r")
180 goFile = open(goPath, "r")
184 for goDefEntry in goDefFile:
185 if goDefEntry[0] != "!":
186 cols = goDefEntry.split("\t")
187 goDefs[cols[0]] = (cols[1], cols[2].strip())
193 fields = entry.split("\t")
194 if fields[0] != "9606":
198 if index % 1000 == 0:
199 print "adding 1000 go entries"
200 hsGenome.addGoInfoBatch(goArray)
203 locID = fields[1].strip()
204 gID = ("hsapiens", locID)
208 synonyms = idb.geneIDSynonyms(gID)
210 for entry in synonyms:
216 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
218 print "locus ID %s could not be added" % locID
221 print "adding %d go entries" % len(goArray)
222 hsGenome.addGoInfoBatch(goArray)
225 def loadChromosome(db, chromID, chromPath, chromOut):
227 hsGenome = Genome("hsapiens", dbFile=db)
228 inFile = open(chromPath, "r")
229 line = inFile.readline()
231 seqArray.append(line.strip())
233 seq = string.join(seqArray, "")
236 print "Problems reading sequence from file"
237 print "writing to file %s" % chromOut
238 outFile = open("%s%s" % (cisRoot, chromOut), "w")
241 hsGenome.addChromosomeEntry(chromID, chromOut, "file")
244 def createDBindices(db):
245 hsGenome = Genome("hsapiens", dbFile=db)
246 hsGenome.createIndices()