first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / hsapiens.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Homo sapiens
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT")
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
42
43
44 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
46     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
47     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
48     # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
49     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
50                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
51                  "21", "22", "X", "Y"
52     ]
53
54     print "Creating database %s" % db
55     createDBFile(db)
56
57     print "Adding gene entries"
58     loadGeneEntries(db, genePath, chromList)
59
60     print "Adding gene features"
61     loadGeneFeatures(db, genePath, chromList)
62
63     print "Adding gene annotations"
64     loadGeneAnnotations(db)
65
66     print "Adding gene ontology"
67     loadGeneOntology(db, goPath, goDefPath)
68
69     for chromID in chromList:
70         print "Loading chromosome %s" % chromID
71         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
72         loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
73
74     print "Creating Indices"
75     createDBindices(db)
76
77     print "Finished creating database %s" % db
78
79
80 def createDBFile(db):
81     hsGenome = Genome("hsapiens",  dbFile=db)
82     hsGenome.createGeneDB(db)
83
84
85 def loadGeneEntries(db, gFile, cDict):
86     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
87     geneEntries = []
88     hsGenome = Genome("hsapiens", dbFile=db)
89     geneFile = open(gFile, "r")
90     for line in geneFile:
91         cols = line.split("\t")
92         if cols[11] != "GENE":
93             continue
94
95         if cols[12] == "Celera":
96             continue
97
98         chrom = cols[1].strip()
99         if chrom not in cDict:
100             continue
101
102         name = cols[10].split(":")
103         gid = name[1]
104         start = int(cols[2])
105         stop = int(cols[3])
106         sense = cols[4]
107         if sense == "+":
108             sense = "F"
109         else:
110             sense = "R"
111
112         geneID = ("hsapiens", gid)
113         gidVersion = 1
114         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
115
116     print "Adding %d gene entries" % len(geneEntries)
117     hsGenome.addGeneEntryBatch(geneEntries)
118
119
120 def loadGeneFeatures(db, gFile, cDict):
121     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
122     """
123     featureEntries = []
124     hsGenome = Genome("hsapiens", dbFile=db)
125     featureFile = open(gFile, "r")
126     for line in featureFile:
127         cols = line.split("\t")
128         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
129             continue
130
131         if cols[12] == "Celera":
132             continue
133
134         chrom = cols[1].strip()
135         if chrom not in cDict:
136             continue
137
138         fType = cols[11]
139         name = cols[10].split(":")
140         gid = name[1]
141         start = int(cols[2])
142         stop = int(cols[3])
143         sense = cols[4]
144         if sense == "+":
145             sense = "F"
146         else:
147             sense = "R"
148
149         geneID = ("hsapiens", gid)
150         gidVersion = 1
151         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
152
153     print "Adding %d feature entries" % len(featureEntries)
154     hsGenome.addFeatureEntryBatch(featureEntries)
155
156
157 def loadGeneAnnotations(db):
158     geneAnnotations = []
159     idb = geneinfoDB()
160     hsGenome = Genome("hsapiens", dbFile=db)
161     gidList = hsGenome.allGIDs()
162     for locID in gidList:
163         gID = ("hsapiens", locID)
164         geneDescArray = idb.getDescription(gID)
165         geneDesc = ""
166         for entry in geneDescArray:
167             geneDesc += ","
168             geneDesc += entry.strip()
169
170         if len(geneDescArray) > 0:
171             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
172
173     print "Adding %d annotations" % len(geneAnnotations)
174     hsGenome.addAnnotationBatch(geneAnnotations)
175
176
177 def loadGeneOntology(db, goPath, goDefPath):
178     hsGenome = Genome("hsapiens", dbFile=db)
179     goDefFile = open(goDefPath, "r")
180     goFile = open(goPath, "r")
181     idb = geneinfoDB()
182     goDefs = {}
183     goArray = []
184     for goDefEntry in goDefFile:
185         if goDefEntry[0] != "!":
186             cols = goDefEntry.split("\t")
187             goDefs[cols[0]] = (cols[1], cols[2].strip())
188
189     prevGID = ""
190     index = 0
191     for entry in goFile:
192         try:
193             fields = entry.split("\t")
194             if fields[0] != "9606":
195                 continue
196
197             index += 1
198             if index % 1000 == 0:
199                 print "adding 1000 go entries"
200                 hsGenome.addGoInfoBatch(goArray)
201                 goArray = []
202
203             locID = fields[1].strip()
204             gID = ("hsapiens", locID)
205             if prevGID != gID:
206                 prevGID = gID
207                 gene_name = ""
208                 synonyms = idb.geneIDSynonyms(gID)
209                 if len(synonyms) >0:
210                     for entry in synonyms:
211                         gene_name += ","
212                         gene_name += entry
213                 else:
214                     gene_name = " "
215
216             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
217         except:
218             print "locus ID %s could not be added" % locID
219             pass
220
221     print "adding %d go entries" % len(goArray)
222     hsGenome.addGoInfoBatch(goArray)
223
224
225 def loadChromosome(db, chromID, chromPath, chromOut):
226     seqArray = []
227     hsGenome = Genome("hsapiens", dbFile=db)
228     inFile = open(chromPath, "r")
229     line = inFile.readline()
230     for line in inFile:
231         seqArray.append(line.strip())
232
233     seq = string.join(seqArray, "")
234     seqLen = len(seq)
235     if seqLen < 1:
236         print "Problems reading sequence from file"
237     print "writing to file %s" % chromOut
238     outFile = open("%s%s" % (cisRoot, chromOut), "w")
239     outFile.write(seq)
240     outFile.close()
241     hsGenome.addChromosomeEntry(chromID, chromOut, "file")
242
243
244 def createDBindices(db):
245     hsGenome = Genome("hsapiens", dbFile=db)
246     hsGenome.createIndices()