erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / cbrenneri.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis remanei
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromPath, chromOutPath):
44     seqArray = []
45     seqLen = 0
46     cbGenome = Genome("cbrenneri", dbFile=db)
47     inFile = open(chromPath, "r")
48     header = inFile.readline()
49     while header != "":
50         seqArray = []
51         seqLen = 0
52         chromHeader = header.strip()[1:].split()
53         chromID = chromHeader[0]
54         currentLine = inFile.readline()
55
56         while currentLine != "" and currentLine[0] != ">":
57             lineSeq = currentLine.strip()
58             seqLen += len(lineSeq)
59             seqArray.append(lineSeq)
60             currentLine = inFile.readline()
61
62         seq = string.join(seqArray, "")
63         if seqLen < 100000:
64             print "Added contig %s to database" % chromID
65             cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
66             cbGenome.addChromosomeEntry(chromID, chromID, "db")
67         else:
68             outFileName = "%s%s.bin" % (chromOutPath, chromID)
69             outFile = open( "%s%s" % (cisRoot, outFileName), "w")
70             outFile.write(seq)
71             outFile.close()
72             print "Added contig file %s to database" % outFileName
73             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
74
75         header = currentLine
76
77     inFile.close()
78
79
80 def loadGeneEntries(db, gffFile):
81     cbGenome = Genome("cbrenneri", dbFile=db)
82     geneFile = open(gffFile, "r")
83     geneStart = {}
84     geneStop = {}
85     geneChrom = {}
86     geneSense = {}
87     geneEntries = []
88     for line in geneFile:
89         if line[0] == "#":
90             continue
91
92         if line[0] == "\n":
93             continue
94
95         field = line[:-1].split("\t")
96         if field[2] != "stop_codon" and field[2] != "start_codon":
97             continue
98
99         idfield = field[8].split('"')
100         gid = idfield[1]
101         geneID = ("cbrenneri", gid)
102         sense = field[6]
103         geneChrom[geneID] = field[0].strip()
104         if sense == "+":
105             geneSense[geneID] = "F"
106         else:
107             geneSense[geneID] = "R"
108
109         if field[2] == "start_codon":
110             if sense == "+":
111                 geneStart[geneID] = int(field[3])
112             else:
113                 geneStart[geneID] = int(field[4])
114         else:
115             if sense == "+":
116                 geneStop[geneID] = int(field[3])
117             else:
118                 geneStop[geneID] = int(field[4])
119
120     for geneID in geneStart:
121         if geneID not in geneStop:
122             print "geneID %s not in geneStop - skipping" % str(geneID)
123             continue
124
125         geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
126
127     print "Adding %d gene entries" % len(geneEntries)
128     cbGenome.addGeneEntryBatch(geneEntries)
129
130
131 def loadFeatureEntries(db, gffFile):
132     cbGenome = Genome("cbrenneri", dbFile=db)
133     featureFile = open(gffFile, "r")
134     featureEntries = []
135     for line in featureFile:
136         if line[0] == "#":
137             continue
138
139         if line[0] == "\n":
140             continue
141
142         field = line.split("\t")
143         if field[2].strip() != "CDS":
144             continue
145
146         gidrev = field[8].split('"')
147         gid = gidrev[1]
148         geneID = ("cbrenneri", gid)
149         gidVersion = 1
150
151         start = int(field[3])
152         stop = int(field[4])
153
154         sense = field[6]
155         chrom = field[0].strip()
156         if sense == "+":
157             sense = "F"
158         else:
159             sense = "R"
160
161         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
162
163     print "Adding %d feature entries" % len(featureEntries)
164     cbGenome.addFeatureEntryBatch(featureEntries)
165
166
167 def createDBFile(db):
168     cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
169     cbGenome.createGeneDB(db)
170
171
172 def createDBindices(db):
173     cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
174     cbGenome.createIndices()
175
176
177 def buildCbrenneriDB(db=geneDB):
178     gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
179     chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
180     chromoOutPath = "/C_brenneri/"
181
182     print "Creating database %s" % db
183     createDBFile(db)
184
185     print "Adding gene entries"
186     loadGeneEntries(db, gffPath)
187
188     print "Adding feature entries"
189     loadFeatureEntries(db, gffPath)
190
191     print "Loading genomic sequence" 
192     loadChromosome(db, chromoPath, chromoOutPath)
193
194     print "Creating Indices"
195     createDBindices(db)
196
197     print "Finished creating database %s" % db