first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / dmelanogaster.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Drosophila melanogaster
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot
41
42 version = {"A": "1",
43            "B": "2",
44            "C": "3",
45            "D": "4",
46            "E": "5",
47            "F": "6",
48            "G": "7",
49            "H": "8",
50            "I": "9",
51            "J": "10",
52            "K": "11",
53            "L": "12",
54            "M": "13",
55            "N": "14",
56            "O": "15",
57            "P": "16",
58            "Q": "17",
59            "R": "18",
60            "S": "19",
61            "T": "20",
62            "U": "21",
63            "V": "22",
64            "W": "23",
65            "X": "24",
66            "Y": "25",
67            "Z": "26"
68 }
69
70
71 def buildDmelanogasterDB(db=geneDB):
72     """ genes and annotations are from UCSC. GO association file is from geneontology.org.
73     """
74     genePath = "%s/download/flyBaseGene.txt" % cisRoot
75     annotPath = "%s/download/gene_info" % cisRoot
76     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
77     goPath = "%s/download/gene2go" % cisRoot
78
79     print "Creating database %s" % db
80     createDBFile(db)
81
82     print "Adding gene entries"
83     loadGeneEntries(db, genePath)
84
85     print "Adding gene features"
86     loadGeneFeatures(db, genePath)
87
88     print "Adding gene annotations"
89     loadGeneAnnotations(db, annotPath)
90
91     print "Adding gene ontology"
92     loadGeneOntology(db, goPath, goDefPath, annotPath)
93
94     chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet",
95                  "4", "X", "XHet", "YHet", "U", "UExtra", "M"
96     ]
97     for chromID in chromList:
98         print "Loading chromosome %s" % chromID
99         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
100         loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID)
101
102     print "Creating Indices"
103     createDBindices(db)
104
105     print "Finished creating database %s" % db
106
107
108 def createDBFile(db):
109     dmGenome = Genome("dmelanogaster", dbFile=db)
110     dmGenome.createGeneDB(db)
111
112
113 def loadGeneEntries(db, gFile):
114     geneEntries = []
115     dmGenome = Genome("dmelanogaster", dbFile=db)
116     geneFile = open(gFile, "r")
117
118     for line in geneFile:
119         cols = line.split("\t")
120         name = cols[1].split("-R")
121         gid = name[0]
122         start = int(cols[4])
123         stop = int(cols[5])
124         sense = cols[3]
125         chrom = cols[2][3:]
126         if sense == "-":
127             sense = "R"
128         else:
129             sense = "F"
130
131         geneID = ("dmelanogaster", gid)
132         try:
133             gidVersion = version[name[1]]
134         except:
135             continue
136
137         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
138
139     print "Adding %d gene entries" % len(geneEntries)
140     dmGenome.addGeneEntryBatch(geneEntries)
141
142
143 def loadGeneFeatures(db, gfile):
144     geneFile = open(gfile, "r")
145     senseArray = {"+": "F",
146                   "-": "R",
147                   ".": "F"
148     }
149
150     insertArray = []
151     for geneLine in geneFile:
152         geneFields = geneLine.split("\t")
153         exonNum = int(geneFields[8])
154         exonStarts = geneFields[9].split(",")
155         exonStops = geneFields[10].split(",")
156         chrom = geneFields[2][3:]
157         sense = senseArray[geneFields[3]]
158         gstop = int(geneFields[7]) - 1
159         gstart = int(geneFields[6]) - 1
160         name = geneFields[1].split("-R")
161         geneID = ("dmelanogaster", name[0])
162         try:
163             gidVersion = version[name[1]]
164         except:
165             continue
166
167         for index in range(exonNum):
168             estart = int(exonStarts[index]) - 1
169             estop = int(exonStops[index]) - 1
170             if estart >= gstart and estop <= gstop:
171                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
172             elif estop <= gstart:
173                 if sense == "F":
174                     fType = "5UTR"
175                 else:
176                     fType = "3UTR"
177
178                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
179             elif estart >= gstop:
180                 if sense == "F":
181                     fType = "3UTR"
182                 else:
183                     fType = "5UTR"
184
185                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
186             elif estart <= gstop and estart > gstart:
187                 if sense == "F":
188                     fType = "3UTR"
189                 else:
190                     fType = "5UTR"
191
192                 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
193                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
194             elif estart < gstart and estop <= gstop:
195                 if sense == "F":
196                     fType = "5UTR"
197                 else:
198                     fType = "3UTR"
199
200                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
201                 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
202             else:
203                 if sense == "F":
204                     fType1 = "5UTR"
205                     fType2 = "3UTR"
206                 else:
207                     fType1 = "3UTR"
208                     fType2 = "5UTR"
209
210                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
211                 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
212                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
213
214     geneFile.close()
215     dmGenome = Genome("dmelanogaster", dbFile=db)
216     print "Adding %d features" % len(insertArray)
217     dmGenome.addFeatureEntryBatch(insertArray)
218
219
220 def loadGeneAnnotations(db, annotPath):
221     geneAnnotations = []
222     annotFile = open(annotPath, "r")
223     dmGenome = Genome("dmelanogaster", dbFile=db)
224     for line in annotFile:
225         try:
226             cols = line.split("\t")
227             if cols[0] != "7227":
228                 continue
229
230             locID = cols[3]
231             if "Dmel_" in locID:
232                 locID = locID[5:]
233
234             geneDesc = cols[4]
235             if len(locID) > 0:
236                 geneAnnotations.append((("dmelanogaster", locID), string.replace(geneDesc.strip(), "'", "p")))
237         except:
238             pass
239
240     print "Adding %d annotations" % len(geneAnnotations)
241     dmGenome.addAnnotationBatch(geneAnnotations)
242
243
244 def loadGeneOntology(db, goPath, goDefPath, annotPath):
245     dmGenome = Genome("dmelanogaster", dbFile=db)
246     goDefFile = open(goDefPath, "r")
247     goFile = open(goPath, "r")
248     annotFile = open(annotPath, "r")
249     annotEntries = annotFile.readlines()
250     annotFile.close()
251     goDefEntries = goDefFile.readlines()
252     goDefs = {}
253     locus = {}
254     goArray = []
255     for goDefEntry in goDefEntries:
256         if goDefEntry[0] != "!":
257             cols = goDefEntry.split("\t")
258             goDefs[cols[0]] = (cols[1], cols[2].strip())
259
260     goEntries = goFile.readlines()
261     for annotEntry in annotEntries:
262         try:
263             cols = annotEntry.split("\t")
264             if cols[0] != "7227":
265                 continue
266
267             locID = cols[3].strip()
268             geneName = cols[1].strip()
269             if len(locID) > 0:
270                 locus[geneName] = locID
271         except:
272             pass
273
274     for entry in goEntries:
275         if entry[0] == "!":
276             continue
277
278         if entry[:4] != "7227":
279             continue
280
281         try:
282             fields = entry.split("\t")
283             geneName = fields[1].strip()
284             locID = locus[geneName]
285             if "Dmel_" in locID:
286                 locID = locID[5:]
287
288             GOID = fields[2]
289             goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", string.replace(goDefs[GOID][0], "'", "p"), goDefs[GOID][1], ""))
290         except:
291             pass
292
293     print "adding %d go entries" % len(goArray)
294     dmGenome.addGoInfoBatch(goArray)
295
296
297 def loadChromosome(db, chromID, chromPath, chromOut):
298     seqArray = []
299     dmGenome = Genome("dmelanogaster", dbFile=db)
300     inFile = open(chromPath, "r")
301     line = inFile.readline()
302     for line in inFile:
303         seqArray.append(line.strip())
304
305     seq = string.join(seqArray, "")
306     seqLen = len(seq)
307     if seqLen < 1:
308         print "Problems reading sequence from file"
309
310     print "writing to file %s" % chromOut
311     outFile = open("%s%s" % (cisRoot, chromOut), "w")
312     outFile.write(seq)
313     outFile.close()
314     dmGenome.addChromosomeEntry(chromID, chromOut, "file")
315
316
317 def createDBindices(db):
318     dmGenome = Genome("dmelanogaster", dbFile=db)
319     dmGenome.createIndices()