erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / mmusculus.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Mus musculus
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT") 
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
42
43
44 def loadChromosome(db, chromID, chromPath, chromOut):
45     seqArray = []
46     mmGenome = Genome("mmusculus", dbFile=db)
47     inFile = open(chromPath, "r")
48     line = inFile.readline()
49     for line in inFile:
50         seqArray.append(line.strip())
51
52     seq = string.join(seqArray, "")
53     seqLen = len(seq)
54     if seqLen < 1:
55         print "Problems reading sequence from file"
56
57     print "writing to file %s" % chromOut
58     outFile = open("%s%s" % (cisRoot, chromOut), "w")
59     outFile.write(seq)
60     outFile.close()
61     mmGenome.addChromosomeEntry(chromID, chromOut, "file")
62
63
64 def loadGeneEntries(db, gFile, cDict):
65     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
66     """
67     geneEntries = []
68     alreadySeen = []
69     mmGenome = Genome("mmusculus", dbFile=db)
70     geneFile = open(gFile, "r")
71     for line in geneFile:
72         cols = line.split("\t")
73         if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
74             continue
75
76         chrom = cols[1].strip()
77         if chrom not in cDict:
78             continue
79
80         name = cols[10].split(":")
81         gid = name[1]
82         if gid == "" or gid in alreadySeen:
83             continue
84
85         alreadySeen.append(gid)
86         start = int(cols[2]) - 1
87         stop = int(cols[3]) - 1
88         sense = cols[4]
89         if sense == "+":
90             sense = "F"
91         else:
92             sense = "R"
93
94         geneID = ("mmusculus", gid)
95         gidVersion = 1
96         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
97
98     print "Adding %d gene entries" % len(geneEntries)
99     mmGenome.addGeneEntryBatch(geneEntries)
100
101
102 def loadGeneFeatures(db, gFile, cDict):
103     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
104     """
105     featureEntries = []
106     mmGenome = Genome("mmusculus", dbFile=db)
107     featureFile = open(gFile, "r")
108     for line in featureFile:
109         cols = line.split("\t")
110         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
111             continue
112
113         chrom = cols[1].strip()
114         if chrom not in cDict:
115             continue
116
117         fType = cols[11]
118         name = cols[10].split(":")
119         gid = name[1]
120         if gid == "":
121             continue
122
123         start = int(cols[2]) - 1
124         stop = int(cols[3]) - 1
125         sense = cols[4]
126         if sense == "+":
127             sense = "F"
128         else:
129             sense = "R"
130
131         geneID = ("mmusculus", gid)
132         gidVersion = 1
133         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
134
135     print "Adding %d feature entries" % len(featureEntries)
136     mmGenome.addFeatureEntryBatch(featureEntries)
137
138
139 def loadGeneAnnotations(db):
140     geneAnnotations = []
141     idb = geneinfoDB()
142     mmGenome = Genome("mmusculus", dbFile=db)
143     gidList = mmGenome.allGIDs()
144     for locID in gidList:
145         gID = ("mmusculus", locID)
146         geneDescArray = idb.getDescription(gID)
147         geneDesc = ""
148         for entry in geneDescArray:
149             geneDesc += ","
150             geneDesc += entry.strip()
151
152         if len(geneDescArray) > 0:
153             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
154
155     print "Adding %d annotations" % len(geneAnnotations)
156     mmGenome.addAnnotationBatch(geneAnnotations)
157
158
159 def loadGeneOntology(db, goPath, goDefPath):
160     mmGenome = Genome("mmusculus", dbFile=db)
161     goDefFile = open(goDefPath, "r")
162     goFile = open(goPath, "r")
163     idb = geneinfoDB()
164     goDefs = {}
165     goArray = []
166     for goDefEntry in goDefFile:
167         if goDefEntry[0] != "!":
168             cols = goDefEntry.split("\t")
169             goDefs[cols[0]] = (cols[1], cols[2].strip())
170
171     goEntries = goFile.readlines()
172     prevGID = ""
173     for entry in goEntries:
174         try:
175             fields = entry.split("\t")
176             if fields[0] != "10116":
177                 continue
178
179             locID = fields[1].strip()
180             gID = ("mmusculus", locID)
181             if prevGID != gID:
182                 prevGID = gID
183                 gene_name = ""
184                 synonyms = idb.geneIDSynonyms(gID)
185                 if len(synonyms) >0:
186                     for entry in synonyms:
187                         gene_name += ","    
188                         gene_name += entry
189                 else:
190                     gene_name = " "
191
192             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
193         except:
194             print "locus ID %s could not be added" % locID
195             pass
196
197     print "adding %d go entries" % len(goArray)
198     mmGenome.addGoInfoBatch(goArray)
199
200
201 def createDBFile(db):
202     mmGenome = Genome("mmusculus",  dbFile=db)
203     mmGenome.createGeneDB(db)
204
205
206 def createDBindices(db):
207     mmGenome = Genome("mmusculus", dbFile=db)
208     mmGenome.createIndices()
209
210
211 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
212     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
213     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
214     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
215     # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
216     # but ignoring all random chromosomes
217     chromDict = {"1": "%s/chr1.fa" % downloadDir,
218                  "2": "%s/chr2.fa" % downloadDir,
219                  "3": "%s/chr3.fa" % downloadDir,
220                  "4": "%s/chr4.fa" % downloadDir,
221                  "5": "%s/chr5.fa" % downloadDir,
222                  "6": "%s/chr6.fa" % downloadDir,
223                  "7": "%s/chr7.fa" % downloadDir,
224                  "8": "%s/chr8.fa" % downloadDir,
225                  "9": "%s/chr9.fa" % downloadDir,
226                  "10": "%s/chr10.fa" % downloadDir,
227                  "11": "%s/chr11.fa" % downloadDir,
228                  "12": "%s/chr12.fa" % downloadDir,
229                  "13": "%s/chr13.fa" % downloadDir,
230                  "14": "%s/chr14.fa" % downloadDir,
231                  "15": "%s/chr15.fa" % downloadDir,
232                  "16": "%s/chr16.fa" % downloadDir,
233                  "17": "%s/chr17.fa" % downloadDir,
234                  "18": "%s/chr18.fa" % downloadDir,
235                  "19": "%s/chr19.fa" % downloadDir,
236                  "X": "%s/chrX.fa" % downloadDir,
237                  "Y": "%s/chrY.fa" % downloadDir,
238                  "M": "%s/chrM.fa" % downloadDir
239     }
240
241     print "Creating database %s" % db
242     createDBFile(db)
243
244     print "Adding gene entries"
245     loadGeneEntries(db, genePath, chromDict)
246
247     print "Adding gene features"
248     loadGeneFeatures(db, genePath, chromDict)
249
250     print "Adding gene annotations"
251     loadGeneAnnotations(db)
252
253     print "Adding gene ontology"
254     loadGeneOntology(db, goPath, goDefPath)
255
256     for chromID in chromDict.keys():
257         print "Loading chromosome %s" % chromID
258         loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
259
260     print "Creating Indices"
261     createDBindices(db)
262
263     print "Finished creating database %s" % db