erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / mdomestica.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Monodelphis domestica
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromPath, chromOutPath):
44     seqArray = []
45     seqLen = 0
46     mdGenome = Genome("mdomestica", dbFile=db)
47     inFile = open(chromPath, "r")
48     header = inFile.readline()
49     while header != "":
50         seqArray = []
51         seqLen = 0
52         chromID = header.strip()[1:]
53         currentLine = inFile.readline()
54         while currentLine != "" and currentLine[0] != ">":
55             lineSeq = currentLine.strip()
56             seqLen += len(lineSeq)
57             seqArray.append(lineSeq)
58             currentLine = inFile.readline()
59
60         seq = string.join(seqArray, "")
61         if seqLen < 500000:
62             print "Added contig %s to database" % chromID
63             mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
64             mdGenome.addChromosomeEntry(chromID, chromID, "db")
65         else:
66             outFileName = "%s%s.bin" % (chromOutPath, chromID)
67             outFile = open("%s%s" % (cisRoot, outFileName), "w")
68             outFile.write(seq)
69             outFile.close()
70             print "Added contig file %s to database" % outFileName
71             mdGenome.addChromosomeEntry(chromID, outFileName, "file")
72
73         header = currentLine
74
75     inFile.close()
76
77
78 def loadGeneEntries(db, gFile):
79     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
80     """
81     geneEntries = []
82     mdGenome = Genome("mdomestica", dbFile=db)
83     geneFile = open(gFile, "r")
84     for line in geneFile:
85         cols = line.split("\t")
86         gid = cols[0]
87         start = int(cols[5])
88         stop = int(cols[6])
89         sense = cols[2]
90         chrom = cols[1]
91         if sense == "+":
92             sense = "F"
93         else:
94             sense = "R"
95
96         geneID = ("mdomestica", gid)
97         gidVersion = 1
98         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
99
100     print "Adding %d gene entries" % len(geneEntries)
101     mdGenome.addGeneEntryBatch(geneEntries)
102
103
104 def loadGeneAnnotations(db, annotPath):
105     geneAnnotations = []
106     annotFile = open(annotPath, "r")  
107     mdGenome = Genome("mdomestica", dbFile=db)
108     for line in annotFile:
109         try:
110             cols = line.split("\t")
111             locID = cols[0]
112             geneDesc = cols[6]
113             if len(locID) > 0:
114                 geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p")))
115         except:
116             pass
117
118     print "Adding %d annotations" % len(geneAnnotations)
119     mdGenome.addAnnotationBatch(geneAnnotations)
120
121
122 def loadGeneFeatures(db, gfile):
123     geneFile = open(gfile, "r")
124     senseArray = {"+": "F",
125                   "-": "R",
126                   ".": "F"
127     }
128
129     seenArray = []
130     insertArray = []
131     for geneLine in geneFile:
132         geneFields = geneLine.split("\t")
133         exonNum = int(geneFields[7])
134         exonStarts = geneFields[8].split(",")
135         exonStops = geneFields[9].split(",")
136         chrom = geneFields[1]
137         sense = senseArray[geneFields[2]]
138         gstop = int(geneFields[6]) - 1
139         gstart = int(geneFields[5]) - 1
140         geneid = geneFields[0]
141         try:
142             geneID = ("mdomestica", geneid)
143         except:
144             continue
145
146         gidVersion = "1"
147         if geneID in seenArray:
148             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
149         else:
150             seenArray.append(geneID)
151
152         for index in range(exonNum):
153             estart = int(exonStarts[index]) - 1
154             estop = int(exonStops[index]) - 1
155             if estart >= gstart and estop <= gstop:
156                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
157             elif estop <= gstart:
158                 if sense == "F":
159                     fType = "5UTR"
160                 else:
161                     fType = "3UTR"
162
163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164             elif estart >= gstop:
165                 if sense == "F":
166                     fType = "3UTR"
167                 else:
168                     fType = "5UTR"
169
170                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
171             elif estart <= gstop:
172                 if sense == "F":
173                     fType = "3UTR"
174                 else:
175                     fType = "5UTR"
176
177                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
178                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
179             else:
180                 if sense == "F":
181                     fType = "5UTR"
182                 else:
183                     fType = "3UTR"
184
185                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
186                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
187
188     geneFile.close()
189     mdGenome = Genome("mdomestica", dbFile=db)
190     print "Adding %d features" % len(insertArray)
191     mdGenome.addFeatureEntryBatch(insertArray)
192
193
194 def createDBFile(db):
195     mdGenome = Genome("mdomestica",  dbFile=db)
196     mdGenome.createGeneDB(db)
197
198
199 def createDBindices(db):
200     mdGenome = Genome("mdomestica", dbFile=db)
201     mdGenome.createIndices()
202
203
204 def buildMdomesticaDB(db=geneDB):
205     genePath = "%s/download/mondom/genscan.txt" % cisRoot
206     chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
207     chromoOutPath = "/M_domestica/"
208
209     print "Creating database %s" % db
210     createDBFile(db)
211
212     print "Adding gene entries"
213     loadGeneEntries(db, genePath)
214
215     print "Adding gene features"
216     loadGeneFeatures(db, genePath)
217
218     print "Loading chromosomes"
219     loadChromosome(db, chromoPath, chromoOutPath)
220
221     print "Creating Indices"
222     createDBindices(db)
223
224     print "Finished creating database %s" % db