first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / mdomestica.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Monodelphis domestica
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
41
42
43 def buildMdomesticaDB(db=geneDB):
44     genePath = "%s/download/mondom/genscan.txt" % cisRoot
45     chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
46     chromoOutPath = "/M_domestica/"
47
48     print "Creating database %s" % db
49     createDBFile(db)
50
51     print "Adding gene entries"
52     loadGeneEntries(db, genePath)
53
54     print "Adding gene features"
55     loadGeneFeatures(db, genePath)
56
57     print "Loading chromosomes"
58     loadChromosome(db, chromoPath, chromoOutPath)
59
60     print "Creating Indices"
61     createDBindices(db)
62
63     print "Finished creating database %s" % db
64
65
66 def createDBFile(db):
67     mdGenome = Genome("mdomestica",  dbFile=db)
68     mdGenome.createGeneDB(db)
69
70
71 def loadGeneEntries(db, gFile):
72     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
73
74     geneEntries = []
75     mdGenome = Genome("mdomestica", dbFile=db)
76     geneFile = open(gFile, "r")
77     for line in geneFile:
78         cols = line.split("\t")
79         gid = cols[0]
80         start = int(cols[5])
81         stop = int(cols[6])
82         sense = cols[2]
83         chrom = cols[1]
84         if sense == "+":
85             sense = "F"
86         else:
87             sense = "R"
88
89         geneID = ("mdomestica", gid)
90         gidVersion = 1
91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
92
93     print "Adding %d gene entries" % len(geneEntries)
94     mdGenome.addGeneEntryBatch(geneEntries)
95
96
97 def loadGeneAnnotations(db, annotPath):
98     geneAnnotations = []
99     annotFile = open(annotPath, "r")
100     mdGenome = Genome("mdomestica", dbFile=db)
101     for line in annotFile:
102         try:
103             cols = line.split("\t")
104             locID = cols[0]
105             geneDesc = cols[6]
106             if len(locID) > 0:
107                 geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p")))
108         except:
109             pass
110
111     print "Adding %d annotations" % len(geneAnnotations)
112     mdGenome.addAnnotationBatch(geneAnnotations)
113
114
115 def loadGeneFeatures(db, gfile):
116     geneFile = open(gfile, "r")
117     senseArray = {"+": "F",
118                   "-": "R",
119                   ".": "F"
120     }
121
122     seenArray = []
123     insertArray = []
124     for geneLine in geneFile:
125         geneFields = geneLine.split("\t")
126         exonNum = int(geneFields[7])
127         exonStarts = geneFields[8].split(",")
128         exonStops = geneFields[9].split(",")
129         chrom = geneFields[1]
130         sense = senseArray[geneFields[2]]
131         gstop = int(geneFields[6]) - 1
132         gstart = int(geneFields[5]) - 1
133         geneid = geneFields[0]
134         try:
135             geneID = ("mdomestica", geneid)
136         except:
137             continue
138
139         gidVersion = "1"
140         if geneID in seenArray:
141             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
142         else:
143             seenArray.append(geneID)
144
145         for index in range(exonNum):
146             estart = int(exonStarts[index]) - 1
147             estop = int(exonStops[index]) - 1
148             if estart >= gstart and estop <= gstop:
149                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
150             elif estop <= gstart:
151                 if sense == "F":
152                     fType = "5UTR"
153                 else:
154                     fType = "3UTR"
155
156                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
157             elif estart >= gstop:
158                 if sense == "F":
159                     fType = "3UTR"
160                 else:
161                     fType = "5UTR"
162
163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
164             elif estart <= gstop:
165                 if sense == "F":
166                     fType = "3UTR"
167                 else:
168                     fType = "5UTR"
169
170                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
171                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
172             else:
173                 if sense == "F":
174                     fType = "5UTR"
175                 else:
176                     fType = "3UTR"
177
178                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
179                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
180
181     geneFile.close()
182     mdGenome = Genome("mdomestica", dbFile=db)
183     print "Adding %d features" % len(insertArray)
184     mdGenome.addFeatureEntryBatch(insertArray)
185
186
187 def loadChromosome(db, chromPath, chromOutPath):
188     seqArray = []
189     seqLen = 0
190     mdGenome = Genome("mdomestica", dbFile=db)
191     inFile = open(chromPath, "r")
192     header = inFile.readline()
193     while header != "":
194         seqArray = []
195         seqLen = 0
196         chromID = header.strip()[1:]
197         currentLine = inFile.readline()
198         while currentLine != "" and currentLine[0] != ">":
199             lineSeq = currentLine.strip()
200             seqLen += len(lineSeq)
201             seqArray.append(lineSeq)
202             currentLine = inFile.readline()
203
204         seq = string.join(seqArray, "")
205         if seqLen < 500000:
206             print "Added contig %s to database" % chromID
207             mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
208             mdGenome.addChromosomeEntry(chromID, chromID, "db")
209         else:
210             outFileName = "%s%s.bin" % (chromOutPath, chromID)
211             outFile = open("%s%s" % (cisRoot, outFileName), "w")
212             outFile.write(seq)
213             outFile.close()
214             print "Added contig file %s to database" % outFileName
215             mdGenome.addChromosomeEntry(chromID, outFileName, "file")
216
217         header = currentLine
218
219     inFile.close()
220
221
222 def createDBindices(db):
223     mdGenome = Genome("mdomestica", dbFile=db)
224     mdGenome.createIndices()