erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / hsapiens.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Homo sapiens
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT") 
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
42
43 def loadChromosome(db, chromID, chromPath, chromOut):
44     seqArray = []
45     hsGenome = Genome("hsapiens", dbFile=db)
46     inFile = open(chromPath, "r")
47     line = inFile.readline()
48     for line in inFile:
49         seqArray.append(line.strip())
50
51     seq = string.join(seqArray, "")
52     seqLen = len(seq)
53     if seqLen < 1:
54         print "Problems reading sequence from file"
55     print "writing to file %s" % chromOut
56     outFile = open("%s%s" % (cisRoot, chromOut), "w")
57     outFile.write(seq)
58     outFile.close()
59     hsGenome.addChromosomeEntry(chromID, chromOut, "file")
60
61
62 def loadGeneEntries(db, gFile, cDict):
63     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
64     """
65     geneEntries = []
66     hsGenome = Genome("hsapiens", dbFile=db)
67     geneFile = open(gFile, "r")
68     for line in geneFile:
69         cols = line.split("\t")
70         if cols[11] != "GENE":
71             continue
72
73         if cols[12] == "Celera":
74             continue
75
76         chrom = cols[1].strip()
77         if chrom not in cDict:
78             continue
79   
80         name = cols[10].split(":")
81         gid = name[1]
82         start = int(cols[2])
83         stop = int(cols[3])
84         sense = cols[4]
85         if sense == "+":
86             sense = "F"
87         else:
88             sense = "R"
89
90         geneID = ("hsapiens", gid)
91         gidVersion = 1
92         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
93
94     print "Adding %d gene entries" % len(geneEntries)
95     hsGenome.addGeneEntryBatch(geneEntries)
96
97
98 def loadGeneFeatures(db, gFile, cDict):
99     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
100     """
101     featureEntries = []
102     hsGenome = Genome("hsapiens", dbFile=db)
103     featureFile = open(gFile, "r")
104     for line in featureFile:
105         cols = line.split("\t")
106         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
107             continue
108         if cols[12] == "Celera":
109             continue
110         chrom = cols[1].strip()
111         if chrom not in cDict:
112             continue
113
114         fType = cols[11]
115         name = cols[10].split(":")
116         gid = name[1]
117         start = int(cols[2])
118         stop = int(cols[3])
119         sense = cols[4]
120         if sense == "+":
121             sense = "F"
122         else:
123             sense = "R"
124
125         geneID = ("hsapiens", gid)
126         gidVersion = 1
127         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
128
129     print "Adding %d feature entries" % len(featureEntries)
130     hsGenome.addFeatureEntryBatch(featureEntries)
131
132
133 def loadGeneAnnotations(db):
134     geneAnnotations = []
135     idb = geneinfoDB()
136     hsGenome = Genome("hsapiens", dbFile=db)
137     gidList = hsGenome.allGIDs()
138     for locID in gidList:
139         gID = ("hsapiens", locID)
140         geneDescArray = idb.getDescription(gID)
141         geneDesc = ""
142         for entry in geneDescArray:
143             geneDesc += ","
144             geneDesc += entry.strip()
145
146         if len(geneDescArray) > 0:
147             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
148
149     print "Adding %d annotations" % len(geneAnnotations)
150     hsGenome.addAnnotationBatch(geneAnnotations)
151
152
153 def loadGeneOntology(db, goPath, goDefPath):
154     hsGenome = Genome("hsapiens", dbFile=db)
155     goDefFile = open(goDefPath, "r")
156     goFile = open(goPath, "r")
157     idb = geneinfoDB()
158     goDefs = {}
159     goArray = []
160     for goDefEntry in goDefFile:
161         if goDefEntry[0] != "!":
162             cols = goDefEntry.split("\t")
163             goDefs[cols[0]] = (cols[1], cols[2].strip())
164
165     prevGID = ""
166     index = 0
167     for entry in goFile:
168         try:
169             fields = entry.split("\t")
170             if fields[0] != "9606":
171                 continue
172
173             index += 1
174             if index % 1000 == 0:
175                 print "adding 1000 go entries"
176                 hsGenome.addGoInfoBatch(goArray)
177                 goArray = []
178
179             locID = fields[1].strip()
180             gID = ("hsapiens", locID)
181             if prevGID != gID:
182                 prevGID = gID
183                 gene_name = ""
184                 synonyms = idb.geneIDSynonyms(gID)
185                 if len(synonyms) >0:
186                     for entry in synonyms:
187                         gene_name += ","    
188                         gene_name += entry
189                 else:
190                     gene_name = " "
191
192             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
193         except:
194             print "locus ID %s could not be added" % locID
195             pass
196
197     print "adding %d go entries" % len(goArray)
198     hsGenome.addGoInfoBatch(goArray)
199
200
201 def createDBFile(db):
202     hsGenome = Genome("hsapiens",  dbFile=db)
203     hsGenome.createGeneDB(db)
204
205
206 def createDBindices(db):
207     hsGenome = Genome("hsapiens", dbFile=db)
208     hsGenome.createIndices()
209
210
211 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
212     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
213     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
214     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
215     # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
216     chromDict = {"1": "%s/chr1.fa" % downloadDir,
217                  "2": "%s/chr2.fa" % downloadDir,
218                  "3": "%s/chr3.fa" % downloadDir,
219                  "4": "%s/chr4.fa" % downloadDir,
220                  "5": "%s/chr5.fa" % downloadDir,
221                  "6": "%s/chr6.fa" % downloadDir,
222                  "7": "%s/chr7.fa" % downloadDir,
223                  "8": "%s/chr8.fa" % downloadDir,
224                  "9": "%s/chr9.fa" % downloadDir,
225                  "10": "%s/chr10.fa" % downloadDir,
226                  "11": "%s/chr11.fa" % downloadDir,
227                  "12": "%s/chr12.fa" % downloadDir,
228                  "13": "%s/chr13.fa" % downloadDir,
229                  "14": "%s/chr14.fa" % downloadDir,
230                  "15": "%s/chr15.fa" % downloadDir,
231                  "16": "%s/chr16.fa" % downloadDir,
232                  "17": "%s/chr17.fa" % downloadDir,
233                  "18": "%s/chr18.fa" % downloadDir,
234                  "19": "%s/chr19.fa" % downloadDir,
235                  "20": "%s/chr20.fa" % downloadDir,
236                  "21": "%s/chr21.fa" % downloadDir,
237                  "22": "%s/chr22.fa" % downloadDir,
238                  "X": "%s/chrX.fa" % downloadDir,
239                  "Y": "%s/chrY.fa" % downloadDir
240     }
241     
242     print "Creating database %s" % db
243     createDBFile(db)
244     
245     print "Adding gene entries"
246     loadGeneEntries(db, genePath, chromDict)
247     
248     print "Adding gene features"
249     loadGeneFeatures(db, genePath, chromDict)
250     
251     print "Adding gene annotations"
252     loadGeneAnnotations(db)
253     
254     print "Adding gene ontology"
255     loadGeneOntology(db, goPath, goDefPath)
256     
257     for chromID in chromDict.keys():
258         print "Loading chromosome %s" % chromID
259         loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
260     
261     print "Creating Indices"
262     createDBindices(db)
263     
264     print "Finished creating database %s" % db