first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / drerio.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Danio Rerio
31 import string, os
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT")
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
42
43
44 def buildDrerioDB(db=geneDB):
45     """ genes and annotations are from UCSC (dr3).
46     """
47     #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
48     chromoPath = "%s/download/dr3" % cisRoot
49     chromoOutPath = "/D_rerio/"
50     print "Creating database %s" % db
51     createDBFile(db)
52
53     #print "Adding gene entries"
54     #loadGeneEntries(db, genePath)
55
56     #print "Adding gene features"
57     #loadGeneFeatures(db, genePath)
58
59     print "Loading chromosomes"
60     loadChromosome(db, chromoPath, chromoOutPath)
61
62     print "Creating Indices"
63     createDBindices(db)
64
65     print "Finished creating database %s" % db
66
67
68 def createDBFile(db):
69     drGenome = Genome("drerio",  dbFile=db)
70     drGenome.createGeneDB(db)
71
72
73 def loadGeneEntries(db, gFile):
74     geneEntries = []
75     seenGIDs = []
76     drGenome = Genome("drerio", dbFile=db)
77     geneFile = open(gFile, "r")
78     idb = geneinfoDB()
79     for line in geneFile:
80         cols = line.split("\t")
81         gid = cols[0]
82         try:
83             tempID = idb.getGeneID("drerio", gid)
84             if not len(tempID):
85                 print "could not find %s" % gid
86                 continue
87
88             geneInfo = idb.getGeneInfo(tempID)
89             gid = geneInfo[1]
90         except:
91             continue
92
93         if gid == "":
94             continue
95         
96         start = int(cols[6])
97         stop = int(cols[7])
98         sense = cols[3]
99         chrom = cols[2]
100         if sense == "-":
101             sense = "R"
102         else:
103             sense = "F"
104
105         geneID = ("drerio", gid)
106         if geneID in seenGIDs:
107             gidVersion = "2"
108         else:
109             gidVersion = "1"
110             seenGIDs.append(geneID)
111
112         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
113
114     print "Adding %d gene entries" % len(geneEntries)
115     drGenome.addGeneEntryBatch(geneEntries)
116
117
118 def loadGeneFeatures(db, gfile):
119     geneFile = open(gfile, "r")
120     idb = geneinfoDB()
121     seenGIDs = []
122     senseArray = {"+": "F",
123                   "-": "R",
124                   ".": "F"
125     }
126
127     insertArray = []
128     for geneLine in geneFile:
129         geneFields = geneLine.split("\t")
130         exonNum = int(geneFields[8])
131         exonStarts = geneFields[9].split(",")
132         exonStops = geneFields[10].split(",")
133         chrom = geneFields[2]
134         sense = senseArray[geneFields[3]]
135         gstart = int(geneFields[6]) - 1
136         gstop = int(geneFields[7]) - 1
137         gid = geneFields[0]
138         try:
139             tempID = idb.getGeneID("drerio", gid)
140             if not len(tempID):
141                 print "could not find %s" % gid
142                 continue
143
144             geneInfo = idb.getGeneInfo(tempID)
145             gid = geneInfo[1]
146         except:
147             continue
148
149         if gid == "":
150             continue
151
152         geneID = ("drerio", gid)
153         if geneID in seenGIDs:
154             gidVersion = "2"
155         else:
156             gidVersion = "1"
157
158         for index in range(exonNum):
159             estart = int(exonStarts[index]) - 1
160             estop = int(exonStops[index]) - 1
161             if estart >= gstart and estop <= gstop:
162                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
163             elif estop <= gstart:
164                 if sense == "F":
165                     fType = "5UTR"
166                 else:
167                     fType = "3UTR"
168
169                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
170             elif estart >= gstop:
171                 if sense == "F":
172                     fType = "3UTR"
173                 else:
174                     fType = "5UTR"
175
176                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
177             elif estart <= gstop and estart > gstart:
178                 if sense == "F":
179                     fType = "3UTR"
180                 else:
181                     fType = "5UTR"
182
183                 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
184                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
185             elif estart < gstart and estop <= gstop:
186                 if sense == "F":
187                     fType = "5UTR"
188                 else:
189                     fType = "3UTR"
190
191                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
192                 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
193             else:
194                 if sense == "F":
195                     fType1 = "5UTR"
196                     fType2 = "3UTR"
197                 else:
198                     fType1 = "3UTR"
199                     fType2 = "5UTR"
200
201                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
202                 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
203                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
204
205     geneFile.close()
206     drGenome = Genome("drerio", dbFile=db)
207     print "Adding %d features" % len(insertArray)
208     drGenome.addFeatureEntryBatch(insertArray)
209
210
211 def loadChromosome(db, chromPath, chromOutPath):
212     seqArray = []
213     seqLen = 0
214     drGenome = Genome("drerio", dbFile=db)
215     files = os.listdir(chromPath)
216     for filename in files:
217         inFile = open("%s/%s" % (chromPath, filename), "r")
218         header = inFile.readline()
219         while header != "":
220             seqArray = []
221             seqLen = 0
222             chromID = header.strip()[1:]
223             currentLine = inFile.readline()
224             while currentLine != "" and currentLine[0] != ">":
225                 lineSeq = currentLine.strip()
226                 seqLen += len(lineSeq)
227                 seqArray.append(lineSeq)
228                 currentLine = inFile.readline()
229
230             seq = string.join(seqArray, "")
231             if seqLen < 250000:
232                 print "Added contig %s to database" % chromID
233                 drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
234                 drGenome.addChromosomeEntry(chromID, chromID, "db")
235             else:
236                 outFileName = "%s%s.bin" % (chromOutPath, chromID)
237                 outFile = open("%s%s" % (cisRoot, outFileName), "w")
238                 outFile.write(seq)
239                 outFile.close()
240                 print "Added contig file %s to database" % outFileName
241                 drGenome.addChromosomeEntry(chromID, outFileName, "file")
242
243             header = currentLine
244
245         inFile.close()
246
247
248 def createDBindices(db):
249     drGenome = Genome("drerio", dbFile=db)
250     drGenome.createIndices()