erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / xtropicalis.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Xenopus tropicalis
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromPath, chromOutPath):
44     seqArray = []
45     seqLen = 0
46     xtGenome = Genome("xtropicalis", dbFile=db)
47     inFile = open(chromPath, "r")
48     header = inFile.readline()
49     while header != "":
50         seqArray = []
51         seqLen = 0
52         chromID = header.strip()[1:]
53         currentLine = inFile.readline()
54         while currentLine != "" and currentLine[0] != ">":
55             lineSeq = currentLine.strip()
56             seqLen += len(lineSeq)
57             seqArray.append(lineSeq)
58             currentLine = inFile.readline()
59
60         seq = string.join(seqArray, "")
61         if seqLen < 500000:
62             print "Added contig %s to database" % chromID
63             xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
64             xtGenome.addChromosomeEntry(chromID, chromID, "db")
65         else:
66             outFileName = "%s%s.bin" % (chromOutPath, chromID)
67             outFile = open("%s%s" % (cisRoot, outFileName), "w")
68             outFile.write(seq)
69             outFile.close()
70             print "Added contig file %s to database" % outFileName
71             xtGenome.addChromosomeEntry(chromID, outFileName, "file")
72
73         header = currentLine
74
75     inFile.close()
76
77
78 def loadGeneEntries(db, gFile):
79     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
80     """
81     geneEntries = []
82     xtGenome = Genome("xtropicalis", dbFile=db)
83     geneFile = open(gFile, "r")
84     for line in geneFile:
85         cols = line.split("\t")
86         gid = cols[0]
87         start = int(cols[5])
88         stop = int(cols[6])
89         sense = cols[2]
90         chrom = cols[1]
91         if sense == "+":
92             sense = "F"
93         else:
94             sense = "R"
95
96         geneID = ("xtropicalis", gid)
97         gidVersion = 1
98         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
99
100     print "Adding %d gene entries" % len(geneEntries)
101     xtGenome.addGeneEntryBatch(geneEntries)
102
103
104 def loadGeneAnnotations(db, annotPath):
105     geneAnnotations = []
106     annotFile = open(annotPath, "r")
107     xtGenome = Genome("xtropicalis", dbFile=db)
108     for line in annotFile:
109         try:
110             cols = line.split("\t")
111             locID = cols[0]
112             geneDesc = cols[6]
113             if len(locID) > 0:
114                 geneAnnotations.append((("xtropicalis", locID), string.replace(geneDesc.strip(), "'", "p")))
115         except:
116             pass
117
118     print "Adding %d annotations" % len(geneAnnotations)
119     xtGenome.addAnnotationBatch(geneAnnotations)
120
121
122 def loadGeneFeatures(db, gfile):
123     geneFile = open(gfile, "r")
124     senseArray = {"+": "F",
125                   "-": "R",
126                   ".": "F"
127     }
128
129     seenArray = []
130     insertArray = []
131     for geneLine in geneFile:
132         geneFields = geneLine.split("\t")
133         exonNum = int(geneFields[7])
134         exonStarts = geneFields[8].split(",")
135         exonStops = geneFields[9].split(",")
136         chrom = geneFields[1]
137         sense = senseArray[geneFields[2]]
138         gstop = int(geneFields[6]) - 1
139         gstart = int(geneFields[5]) - 1
140         geneid = geneFields[0]
141         try:
142             geneID = ("xtropicalis", geneid)
143         except:
144             continue
145
146         gidVersion = "1"
147         if geneID in seenArray:
148             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
149         else:
150             seenArray.append(geneID)
151
152         for index in range(exonNum):
153             estart = int(exonStarts[index]) - 1
154             estop = int(exonStops[index]) - 1
155             if estart >= gstart and estop <= gstop:
156                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
157             elif estop <= gstart:
158                 if sense == "F":
159                     fType = "5UTR"
160                 else:
161                     fType = "3UTR"
162
163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164             elif estart >= gstop:
165                 if sense == "F":
166                     fType = "3UTR"
167                 else:
168                     fType = "5UTR"
169
170                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
171             elif estart <= gstop:
172                 if sense == "F":
173                     fType = "3UTR"
174                 else:
175                     fType = "5UTR"
176
177                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
178                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
179             else:
180                 if sense == "F":
181                     fType = "5UTR"
182                 else:
183                     fType = "3UTR"
184
185                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
186                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
187
188     geneFile.close()
189     xtGenome = Genome("xtropicalis", dbFile=db)
190     print "Adding %d features" % len(insertArray)
191     xtGenome.addFeatureEntryBatch(insertArray)
192
193
194 def loadGeneOntology(db, goPath, goDefPath, annotPath):
195     xtGenome = Genome("xtropicalis", dbFile=db)
196     goDefFile = open(goDefPath, "r")
197     goFile = open(goPath, "r")
198     annotFile = open(annotPath, "r")  
199     annotEntries = annotFile.readlines()
200     annotFile.close()
201     goDefEntries = goDefFile.readlines()
202     goDefs = {}
203     locus = {}
204     goArray = []
205     for goDefEntry in goDefEntries:
206         if goDefEntry[0] != "!":
207             cols = goDefEntry.split("\t")
208             goDefs[cols[0]] = (cols[1], cols[2].strip())
209
210     goEntries = goFile.readlines()
211     for annotEntry in annotEntries:
212         try:
213             cols = annotEntry.split("\t")
214             locID = cols[0]
215             geneName = cols[1]
216             geneDesc = cols[6]
217             mimID = ""
218             if len(locID) > 0:
219                 locus[locID] = (geneName, geneDesc, mimID)
220         except:
221             pass
222
223     for entry in goEntries:
224         try:
225             fields = entry.split("\t")
226             locID = fields[0].strip()
227             (gene_name, gene_desc, mimID) = locus[locID]
228             goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
229         except:
230             pass
231
232     print "adding %d go entries" % len(goArray)
233     xtGenome.addGoInfoBatch(goArray)
234
235
236 def createDBFile(db):
237     xtGenome = Genome("xtropicalis",  dbFile=db)
238     xtGenome.createGeneDB(db)
239
240
241 def createDBindices(db):
242     xtGenome = Genome("xtropicalis", dbFile=db)
243     xtGenome.createIndices()
244
245
246 def buildXtropicalisDB(db=geneDB):
247     genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
248     chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
249     chromoOutPath = "/X_tropicalis/"
250
251     print "Creating database %s" % db
252     createDBFile(db)
253
254     print "Adding gene entries"
255     loadGeneEntries(db, genePath)
256
257     print "Adding gene features"
258     loadGeneFeatures(db, genePath)
259
260     print "Loading sequences" 
261     loadChromosome(db, chromoPath, chromoOutPath)
262
263     print "Creating Indices"
264     createDBindices(db)
265
266     print "Finished creating database %s" % db