first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / xtropicalis.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Xenopus tropicalis
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
41
42
43 def buildXtropicalisDB(db=geneDB):
44     genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
45     chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
46     chromoOutPath = "/X_tropicalis/"
47
48     print "Creating database %s" % db
49     createDBFile(db)
50
51     print "Adding gene entries"
52     loadGeneEntries(db, genePath)
53
54     print "Adding gene features"
55     loadGeneFeatures(db, genePath)
56
57     print "Loading sequences"
58     loadChromosome(db, chromoPath, chromoOutPath)
59
60     print "Creating Indices"
61     createDBindices(db)
62
63     print "Finished creating database %s" % db
64
65
66 def createDBFile(db):
67     xtGenome = Genome("xtropicalis",  dbFile=db)
68     xtGenome.createGeneDB(db)
69
70
71 def loadGeneEntries(db, gFile):
72     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
73
74     geneEntries = []
75     xtGenome = Genome("xtropicalis", dbFile=db)
76     geneFile = open(gFile, "r")
77     for line in geneFile:
78         cols = line.split("\t")
79         gid = cols[0]
80         start = int(cols[5])
81         stop = int(cols[6])
82         sense = cols[2]
83         chrom = cols[1]
84         if sense == "+":
85             sense = "F"
86         else:
87             sense = "R"
88
89         geneID = ("xtropicalis", gid)
90         gidVersion = 1
91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
92
93     print "Adding %d gene entries" % len(geneEntries)
94     xtGenome.addGeneEntryBatch(geneEntries)
95
96
97 def loadGeneAnnotations(db, annotPath):
98     geneAnnotations = []
99     annotFile = open(annotPath, "r")
100     xtGenome = Genome("xtropicalis", dbFile=db)
101     for line in annotFile:
102         try:
103             cols = line.split("\t")
104             locID = cols[0]
105             geneDesc = cols[6]
106             if len(locID) > 0:
107                 geneAnnotations.append((("xtropicalis", locID), string.replace(geneDesc.strip(), "'", "p")))
108         except:
109             pass
110
111     print "Adding %d annotations" % len(geneAnnotations)
112     xtGenome.addAnnotationBatch(geneAnnotations)
113
114
115 def loadGeneFeatures(db, gfile):
116     geneFile = open(gfile, "r")
117     senseArray = {"+": "F",
118                   "-": "R",
119                   ".": "F"
120     }
121
122     seenArray = []
123     insertArray = []
124     for geneLine in geneFile:
125         geneFields = geneLine.split("\t")
126         exonNum = int(geneFields[7])
127         exonStarts = geneFields[8].split(",")
128         exonStops = geneFields[9].split(",")
129         chrom = geneFields[1]
130         sense = senseArray[geneFields[2]]
131         gstop = int(geneFields[6]) - 1
132         gstart = int(geneFields[5]) - 1
133         geneid = geneFields[0]
134         try:
135             geneID = ("xtropicalis", geneid)
136         except:
137             continue
138
139         gidVersion = "1"
140         if geneID in seenArray:
141             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
142         else:
143             seenArray.append(geneID)
144
145         for index in range(exonNum):
146             estart = int(exonStarts[index]) - 1
147             estop = int(exonStops[index]) - 1
148             if estart >= gstart and estop <= gstop:
149                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
150             elif estop <= gstart:
151                 if sense == "F":
152                     fType = "5UTR"
153                 else:
154                     fType = "3UTR"
155
156                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
157             elif estart >= gstop:
158                 if sense == "F":
159                     fType = "3UTR"
160                 else:
161                     fType = "5UTR"
162
163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164             elif estart <= gstop:
165                 if sense == "F":
166                     fType = "3UTR"
167                 else:
168                     fType = "5UTR"
169
170                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
171                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
172             else:
173                 if sense == "F":
174                     fType = "5UTR"
175                 else:
176                     fType = "3UTR"
177
178                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
179                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
180
181     geneFile.close()
182     xtGenome = Genome("xtropicalis", dbFile=db)
183     print "Adding %d features" % len(insertArray)
184     xtGenome.addFeatureEntryBatch(insertArray)
185
186
187 def loadGeneOntology(db, goPath, goDefPath, annotPath):
188     xtGenome = Genome("xtropicalis", dbFile=db)
189     goDefFile = open(goDefPath, "r")
190     goFile = open(goPath, "r")
191     annotFile = open(annotPath, "r")  
192     annotEntries = annotFile.readlines()
193     annotFile.close()
194     goDefEntries = goDefFile.readlines()
195     goDefs = {}
196     locus = {}
197     goArray = []
198     for goDefEntry in goDefEntries:
199         if goDefEntry[0] != "!":
200             cols = goDefEntry.split("\t")
201             goDefs[cols[0]] = (cols[1], cols[2].strip())
202
203     goEntries = goFile.readlines()
204     for annotEntry in annotEntries:
205         try:
206             cols = annotEntry.split("\t")
207             locID = cols[0]
208             geneName = cols[1]
209             geneDesc = cols[6]
210             mimID = ""
211             if len(locID) > 0:
212                 locus[locID] = (geneName, geneDesc, mimID)
213         except:
214             pass
215
216     for entry in goEntries:
217         try:
218             fields = entry.split("\t")
219             locID = fields[0].strip()
220             (gene_name, gene_desc, mimID) = locus[locID]
221             goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
222         except:
223             pass
224
225     print "adding %d go entries" % len(goArray)
226     xtGenome.addGoInfoBatch(goArray)
227
228
229 def loadChromosome(db, chromPath, chromOutPath):
230     seqArray = []
231     seqLen = 0
232     xtGenome = Genome("xtropicalis", dbFile=db)
233     inFile = open(chromPath, "r")
234     header = inFile.readline()
235     while header != "":
236         seqArray = []
237         seqLen = 0
238         chromID = header.strip()[1:]
239         currentLine = inFile.readline()
240         while currentLine != "" and currentLine[0] != ">":
241             lineSeq = currentLine.strip()
242             seqLen += len(lineSeq)
243             seqArray.append(lineSeq)
244             currentLine = inFile.readline()
245
246         seq = string.join(seqArray, "")
247         if seqLen < 500000:
248             print "Added contig %s to database" % chromID
249             xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
250             xtGenome.addChromosomeEntry(chromID, chromID, "db")
251         else:
252             outFileName = "%s%s.bin" % (chromOutPath, chromID)
253             outFile = open("%s%s" % (cisRoot, outFileName), "w")
254             outFile.write(seq)
255             outFile.close()
256             print "Added contig file %s to database" % outFileName
257             xtGenome.addChromosomeEntry(chromID, outFileName, "file")
258
259         header = currentLine
260
261     inFile.close()
262
263
264 def createDBindices(db):
265     xtGenome = Genome("xtropicalis", dbFile=db)
266     xtGenome.createIndices()