first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / mmusculus.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Mus musculus
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT")
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
42
43
44 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
46     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
47     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
48     # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
49     # but ignoring all random chromosomes
50     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
51                  "11", "12", "13", "14", "15", "16", "17", "18", "19",
52                  "X", "Y", "M"
53     ]
54
55     print "Creating database %s" % db
56     createDBFile(db)
57
58     print "Adding gene entries"
59     loadGeneEntries(db, genePath, chromList)
60
61     print "Adding gene features"
62     loadGeneFeatures(db, genePath, chromList)
63
64     print "Adding gene annotations"
65     loadGeneAnnotations(db)
66
67     print "Adding gene ontology"
68     loadGeneOntology(db, goPath, goDefPath)
69
70     for chromID in chromList:
71         print "Loading chromosome %s" % chromID
72         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
73         loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
74
75     print "Creating Indices"
76     createDBindices(db)
77
78     print "Finished creating database %s" % db
79
80
81 def createDBFile(db):
82     mmGenome = Genome("mmusculus",  dbFile=db)
83     mmGenome.createGeneDB(db)
84
85
86 def loadGeneEntries(db, gFile, cDict):
87     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
88
89     geneEntries = []
90     alreadySeen = []
91     mmGenome = Genome("mmusculus", dbFile=db)
92     geneFile = open(gFile, "r")
93     for line in geneFile:
94         cols = line.split("\t")
95         if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
96             continue
97
98         chrom = cols[1].strip()
99         if chrom not in cDict:
100             continue
101
102         name = cols[10].split(":")
103         gid = name[1]
104         if gid == "" or gid in alreadySeen:
105             continue
106
107         alreadySeen.append(gid)
108         start = int(cols[2]) - 1
109         stop = int(cols[3]) - 1
110         sense = cols[4]
111         if sense == "+":
112             sense = "F"
113         else:
114             sense = "R"
115
116         geneID = ("mmusculus", gid)
117         gidVersion = 1
118         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
119
120     print "Adding %d gene entries" % len(geneEntries)
121     mmGenome.addGeneEntryBatch(geneEntries)
122
123
124 def loadGeneFeatures(db, gFile, cDict):
125     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
126     """
127     featureEntries = []
128     mmGenome = Genome("mmusculus", dbFile=db)
129     featureFile = open(gFile, "r")
130     for line in featureFile:
131         cols = line.split("\t")
132         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
133             continue
134
135         chrom = cols[1].strip()
136         if chrom not in cDict:
137             continue
138
139         fType = cols[11]
140         name = cols[10].split(":")
141         gid = name[1]
142         if gid == "":
143             continue
144
145         start = int(cols[2]) - 1
146         stop = int(cols[3]) - 1
147         sense = cols[4]
148         if sense == "+":
149             sense = "F"
150         else:
151             sense = "R"
152
153         geneID = ("mmusculus", gid)
154         gidVersion = 1
155         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
156
157     print "Adding %d feature entries" % len(featureEntries)
158     mmGenome.addFeatureEntryBatch(featureEntries)
159
160
161 def loadGeneAnnotations(db):
162     geneAnnotations = []
163     idb = geneinfoDB()
164     mmGenome = Genome("mmusculus", dbFile=db)
165     gidList = mmGenome.allGIDs()
166     for locID in gidList:
167         gID = ("mmusculus", locID)
168         geneDescArray = idb.getDescription(gID)
169         geneDesc = ""
170         for entry in geneDescArray:
171             geneDesc += ","
172             geneDesc += entry.strip()
173
174         if len(geneDescArray) > 0:
175             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
176
177     print "Adding %d annotations" % len(geneAnnotations)
178     mmGenome.addAnnotationBatch(geneAnnotations)
179
180
181 def loadGeneOntology(db, goPath, goDefPath):
182     mmGenome = Genome("mmusculus", dbFile=db)
183     goDefFile = open(goDefPath, "r")
184     goFile = open(goPath, "r")
185     idb = geneinfoDB()
186     goDefs = {}
187     goArray = []
188     for goDefEntry in goDefFile:
189         if goDefEntry[0] != "!":
190             cols = goDefEntry.split("\t")
191             goDefs[cols[0]] = (cols[1], cols[2].strip())
192
193     goEntries = goFile.readlines()
194     prevGID = ""
195     for entry in goEntries:
196         try:
197             fields = entry.split("\t")
198             if fields[0] != "10116":
199                 continue
200
201             locID = fields[1].strip()
202             gID = ("mmusculus", locID)
203             if prevGID != gID:
204                 prevGID = gID
205                 gene_name = ""
206                 synonyms = idb.geneIDSynonyms(gID)
207                 if len(synonyms) >0:
208                     for entry in synonyms:
209                         gene_name += ","
210                         gene_name += entry
211                 else:
212                     gene_name = " "
213
214             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
215         except:
216             print "locus ID %s could not be added" % locID
217             pass
218
219     print "adding %d go entries" % len(goArray)
220     mmGenome.addGoInfoBatch(goArray)
221
222
223 def loadChromosome(db, chromID, chromPath, chromOut):
224     seqArray = []
225     mmGenome = Genome("mmusculus", dbFile=db)
226     inFile = open(chromPath, "r")
227     line = inFile.readline()
228     for line in inFile:
229         seqArray.append(line.strip())
230
231     seq = string.join(seqArray, "")
232     seqLen = len(seq)
233     if seqLen < 1:
234         print "Problems reading sequence from file"
235
236     print "writing to file %s" % chromOut
237     outFile = open("%s%s" % (cisRoot, chromOut), "w")
238     outFile.write(seq)
239     outFile.close()
240     mmGenome.addChromosomeEntry(chromID, chromOut, "file")
241
242
243 def createDBindices(db):
244     mmGenome = Genome("mmusculus", dbFile=db)
245     mmGenome.createIndices()