first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / rnorvegicus.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Ratus Norvegicus
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT")
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
42
43
44 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45     genePath = "%s/seq_gene.md" % downloadDir
46     goDefPath = "%s/GO.terms_and_ids" % downloadDir
47     goPath = "%s/gene2go" % downloadDir
48
49     print "Creating database %s" % db
50     createDBFile(db)
51
52     print "Adding gene entries"
53     loadGeneEntries(db, genePath)
54
55     print "Adding gene features"
56     loadGeneFeatures(db, genePath)
57
58     print "Adding gene annotations"
59     loadGeneAnnotations(db)
60
61     print "Adding gene ontology"
62     loadGeneOntology(db, goPath, goDefPath)
63
64     # ignoring all random chromosomes
65     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
66                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
67                  "Un", "X", "M"
68     ]
69     for chromID in chromList:
70         print "Loading chromosome %s" % chromID
71         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
72         loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
73
74     print "Creating Indices"
75     createDBindices(db)
76
77     print "Finished creating database %s" % db
78
79
80 def createDBFile(db):
81     rnGenome = Genome("rnorvegicus",  dbFile=db)
82     rnGenome.createGeneDB(db)
83
84
85 def loadGeneEntries(db, gFile):
86     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
87
88     geneEntries = []
89     rnGenome = Genome("rnorvegicus", dbFile=db)
90     geneFile = open(gFile, "r")
91     for line in geneFile:
92         cols = line.split("\t")
93         if cols[11] != "GENE":
94             continue
95
96         if cols[12] == "Celera":
97             continue
98
99         name = cols[10].split(":")
100         gid = name[1]
101         start = int(cols[2]) - 1
102         stop = int(cols[3]) - 1
103         sense = cols[4]
104         chrom = cols[1].strip()
105         if sense == "+":
106             sense = "F"
107         else:
108             sense = "R"
109
110         geneID = ("rnorvegicus", gid)
111         gidVersion = 1
112         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
113
114     print "Adding %d gene entries" % len(geneEntries)
115     rnGenome.addGeneEntryBatch(geneEntries)
116
117
118 def loadGeneFeatures(db, gFile):
119     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
120     """
121     featureEntries = []
122     rnGenome = Genome("rnorvegicus", dbFile=db)
123     featureFile = open(gFile, "r")
124     for line in featureFile:
125         cols = line.split("\t")
126         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
127             continue
128
129         if cols[12] == "Celera":
130             continue
131
132         fType = cols[11]
133         name = cols[10].split(":")
134         gid = name[1]
135         start = int(cols[2]) - 1
136         stop = int(cols[3]) - 1
137         sense = cols[4]
138         chrom = cols[1].strip()
139         if sense == "+":
140             sense = "F"
141         else:
142             sense = "R"
143
144         geneID = ("rnorvegicus", gid)
145         gidVersion = 1
146         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
147
148     print "Adding %d feature entries" % len(featureEntries)
149     rnGenome.addFeatureEntryBatch(featureEntries)
150
151
152 def loadGeneAnnotations(db):
153     geneAnnotations = []
154     idb = geneinfoDB()
155     rnGenome = Genome("rnorvegicus", dbFile=db)
156     gidList = rnGenome.allGIDs()
157     for locID in gidList:
158         gID = ("rnorvegicus", locID)
159         geneDescArray = idb.getDescription(gID)
160         geneDesc = ""
161         for entry in geneDescArray:
162             geneDesc += ","
163             geneDesc += entry.strip()
164
165         if len(geneDescArray) > 0:
166             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
167
168     print "Adding %d annotations" % len(geneAnnotations)
169     rnGenome.addAnnotationBatch(geneAnnotations)
170
171
172 def loadGeneOntology(db, goPath, goDefPath):
173     rnGenome = Genome("rnorvegicus", dbFile=db)
174     goDefFile = open(goDefPath, "r")
175     goFile = open(goPath, "r")
176     idb = geneinfoDB()
177     goDefs = {}
178     goArray = []
179     for goDefEntry in goDefFile:
180         if goDefEntry[0] != "!":
181             cols = goDefEntry.split("\t")
182             goDefs[cols[0]] = (cols[1], cols[2].strip())
183
184     goEntries = goFile.readlines()
185     prevGID = ''
186     for entry in goEntries:
187         try:
188             fields = entry.split("\t")
189             if fields[0] != "10090":
190                 continue
191
192             locID = fields[1].strip()
193             gID = ("rnorvegicus", locID)
194             if prevGID != gID:
195                 prevGID = gID
196                 gene_name = ""
197                 synonyms = idb.geneIDSynonyms(gID)
198                 if len(synonyms) >0:
199                     for entry in synonyms:
200                         gene_name += ","
201                         gene_name += entry
202                 else:
203                     gene_name = " "
204
205             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
206         except:
207             print "locus ID %s could not be added" % locID
208             pass
209
210     print "adding %d go entries" % len(goArray)
211     rnGenome.addGoInfoBatch(goArray)
212
213
214 def loadChromosome(db, chromID, chromPath, chromOut):
215     seqArray = []
216     rnGenome = Genome("rnorvegicus", dbFile=db)
217     inFile = open(chromPath, "r")
218     line = inFile.readline()
219     for line in inFile:
220         seqArray.append(line.strip())
221
222     seq = string.join(seqArray, "")
223     seqLen = len(seq)
224     if seqLen < 1:
225         print "Problems reading sequence from file"
226
227     print "writing to file %s" % chromOut
228     outFile = open("%s%s" % (cisRoot, chromOut), "w")
229     outFile.write(seq)
230     outFile.close()
231     rnGenome.addChromosomeEntry(chromID, chromOut, "file")
232
233
234 def createDBindices(db):
235     rnGenome = Genome("rnorvegicus", dbFile=db)
236     rnGenome.createIndices()