erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / drerio.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Danio Rerio
31 import string, os
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT") 
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB = "%s/D_rerio/drerio.genedb" % cisRoot
42
43
44 def loadChromosome(db, chromPath, chromOutPath):
45     seqArray = []
46     seqLen = 0
47     drGenome = Genome("drerio", dbFile=db)
48     files = os.listdir(chromPath)
49     for filename in files:
50         inFile = open("%s/%s" % (chromPath, filename), "r")
51         header = inFile.readline()
52         while header != "":
53             seqArray = []
54             seqLen = 0
55             chromID = header.strip()[1:]
56             currentLine = inFile.readline()
57             while currentLine != "" and currentLine[0] != ">":
58                 lineSeq = currentLine.strip()
59                 seqLen += len(lineSeq)
60                 seqArray.append(lineSeq)
61                 currentLine = inFile.readline()
62
63             seq = string.join(seqArray, "")
64             if seqLen < 250000:
65                 print "Added contig %s to database" % chromID
66                 drGenome.addSequence(("drerio", chromID), seq, "chromosome", str(seqLen))
67                 drGenome.addChromosomeEntry(chromID, chromID, "db")
68             else:
69                 outFileName = "%s%s.bin" % (chromOutPath, chromID)
70                 outFile = open("%s%s" % (cisRoot, outFileName), "w")
71                 outFile.write(seq)
72                 outFile.close()
73                 print "Added contig file %s to database" % outFileName
74                 drGenome.addChromosomeEntry(chromID, outFileName, "file")
75
76             header = currentLine
77
78         inFile.close()
79
80
81 def loadGeneEntries(db, gFile):
82     geneEntries = []
83     seenGIDs = []
84     drGenome = Genome("drerio", dbFile=db)
85     geneFile = open(gFile, "r")
86     idb = geneinfoDB()
87     for line in geneFile:
88         cols = line.split("\t")
89         gid = cols[0]
90         try:
91             tempID = idb.getGeneID("drerio", gid)
92             if not len(tempID):
93                 print "could not find %s" % gid
94                 continue
95
96             geneInfo = idb.getGeneInfo(tempID)
97             gid = geneInfo[1]
98         except:
99             continue
100
101         if gid == "":
102             continue
103         
104         start = int(cols[6])
105         stop = int(cols[7])
106         sense = cols[3]
107         chrom = cols[2]
108         if sense == "-":
109             sense = "R"
110         else:
111             sense = "F"
112
113         geneID = ("drerio", gid)
114         if geneID in seenGIDs:
115             gidVersion = "2"
116         else:
117             gidVersion = "1"
118             seenGIDs.append(geneID)
119
120         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
121
122     print "Adding %d gene entries" % len(geneEntries)
123     drGenome.addGeneEntryBatch(geneEntries)
124
125
126 def loadGeneFeatures(db, gfile):
127     geneFile = open(gfile, "r")
128     idb = geneinfoDB()
129     seenGIDs = []
130     senseArray = {"+": "F",
131                   "-": "R",
132                   ".": "F"
133     }
134
135     insertArray = []
136     for geneLine in geneFile:
137         geneFields = geneLine.split("\t")
138         exonNum = int(geneFields[8])
139         exonStarts = geneFields[9].split(",")
140         exonStops = geneFields[10].split(",")
141         chrom = geneFields[2]
142         sense = senseArray[geneFields[3]]
143         gstart = int(geneFields[6]) - 1
144         gstop = int(geneFields[7]) - 1
145         gid = geneFields[0]
146         try:
147             tempID = idb.getGeneID("drerio", gid)
148             if not len(tempID):
149                 print "could not find %s" % gid
150                 continue
151
152             geneInfo = idb.getGeneInfo(tempID)
153             gid = geneInfo[1]
154         except:
155             continue
156
157         if gid == "":
158             continue
159
160         geneID = ("drerio", gid)
161         if geneID in seenGIDs:
162             gidVersion = "2"
163         else:
164             gidVersion = "1"
165
166         for index in range(exonNum):
167             estart = int(exonStarts[index]) - 1
168             estop = int(exonStops[index]) - 1
169             if estart >= gstart and estop <= gstop:
170                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
171             elif estop <= gstart:
172                 if sense == "F":
173                     fType = "5UTR"
174                 else:
175                     fType = "3UTR"
176
177                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
178             elif estart >= gstop:
179                 if sense == "F":
180                     fType = "3UTR"
181                 else:
182                     fType = "5UTR"
183
184                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
185             elif estart <= gstop and estart > gstart:
186                 if sense == "F":
187                     fType = "3UTR"
188                 else:
189                     fType = "5UTR"
190
191                 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
192                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
193             elif estart < gstart and estop <= gstop:
194                 if sense == "F":
195                     fType = "5UTR"
196                 else:
197                     fType = "3UTR"
198
199                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
200                 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
201             else:
202                 if sense == "F":
203                     fType1 = "5UTR"
204                     fType2 = "3UTR"
205                 else:
206                     fType1 = "3UTR"
207                     fType2 = "5UTR"
208
209                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
210                 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
211                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
212
213     geneFile.close()
214     drGenome = Genome("drerio", dbFile=db)
215     print "Adding %d features" % len(insertArray)
216     drGenome.addFeatureEntryBatch(insertArray)
217
218
219 def createDBFile(db):
220     drGenome = Genome("drerio",  dbFile=db)
221     drGenome.createGeneDB(db)
222
223
224 def createDBindices(db):
225     drGenome = Genome("drerio", dbFile=db)
226     drGenome.createIndices()
227
228
229 def buildDrerioDB(db=geneDB):
230     """ genes and annotations are from UCSC (dr3). 
231     """
232     #genePath = "%s/download/xenoRefFlat.txt" % cisRoot
233     chromoPath = "%s/download/dr3" % cisRoot
234     chromoOutPath = "/D_rerio/"
235     print "Creating database %s" % db
236     createDBFile(db)
237
238     #print "Adding gene entries"
239     #loadGeneEntries(db, genePath)
240
241     #print "Adding gene features"
242     #loadGeneFeatures(db, genePath)
243
244     print "Loading chromosomes"
245     loadChromosome(db, chromoPath, chromoOutPath)
246
247     print "Creating Indices"
248     createDBindices(db)
249
250     print "Finished creating database %s" % db