erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / cbriggsae.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis briggsae
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromPath, chromOutPath):
44     seqArray = []
45     seqLen = 0
46     cbGenome = Genome("cbriggsae", dbFile=db)
47     inFile = open(chromPath, "r")
48     header = inFile.readline()
49     while header != "":
50         seqArray = []
51         seqLen = 0
52         chromID = header.strip()[1:]
53         currentLine = inFile.readline()
54         while currentLine != "" and currentLine[0] != ">":
55             lineSeq = currentLine.strip()
56             seqLen += len(lineSeq)
57             seqArray.append(lineSeq)
58             currentLine = inFile.readline()
59
60         seq = string.join(seqArray, "")
61         if seqLen < 900000:
62             print "Added contig %s to database" % chromID
63             cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
64             cbGenome.addChromosomeEntry(chromID, chromID, "db")
65         else:
66             outFileName = "%s%s.bin" % (chromOutPath, chromID)
67             outFile = open("%s%s" % (cisRoot, outFileName), "w")
68             outFile.write(seq)
69             outFile.close()
70             print "Added contig file %s to database" % outFileName
71             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
72
73         header = currentLine
74
75     inFile.close()
76
77
78 def loadGeneEntries(db, gffFile):
79     cbGenome = Genome("cbriggsae", dbFile=db)
80     geneFile = open(gffFile, "r")
81     geneEntries = []
82     geneAnnotations = []
83     for line in geneFile:
84         field = line[:-1].split("\t")
85         if field[1] != "hybrid":
86             continue
87
88         if field[2] != "CDS":
89             continue
90
91         annot = field[8].split('"')
92         gid = annot[1]
93         geneID = ("cbriggsae", gid)
94         annotation = string.join(annot[3:], "")
95         gidVersion = 1
96         try:
97             gidVersion = giddots[2]
98         except:
99             pass
100
101         start = int(field[3]) - 1
102         stop = int(field[4]) - 1
103         sense = field[6]
104         chrom = field[0].strip()
105         if sense == "+":
106             sense = "F"
107         else:
108             sense = "R"
109
110         if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
111             geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
112         if (geneID, annotation) not in geneAnnotations:
113             geneAnnotations.append((geneID, annotation))
114
115     print "Adding %d gene entries" % len(geneEntries)
116     cbGenome.addGeneEntryBatch(geneEntries)
117
118     print "Adding %d annotations" % len(geneAnnotations)
119     cbGenome.addAnnotationBatch(geneAnnotations)
120
121
122 def loadFeatureEntries(db, gffFile):
123     cbGenome = Genome("cbriggsae", dbFile=db)
124     featureFile = open(gffFile, "r")
125     featureEntries = []
126     seenFeatures = {}
127     featureTranslation = {"coding_exon": "CDS",
128                           "three_prime_UTR": "3UTR", 
129                           "five_prime_UTR": "5UTR"
130     }
131
132     for line in featureFile:
133         field = line.split("\t")
134         if field[1] != "hybrid":
135             continue
136
137         if field[2].strip() not in featureTranslation:
138             continue
139
140         featureType = featureTranslation[field[2].strip()]
141         gidrev = field[8].split('"')
142         gid = gidrev[1]
143         geneID = ("cbriggsae", gid)
144         gidVersion = 1
145         start = int(field[3]) - 1
146         stop = int(field[4]) - 1
147         sense = field[6]
148         chrom = field[0].strip()
149         if sense == "+":
150             sense = "F"
151         else:
152             sense = "R"
153
154         if geneID not in seenFeatures:
155             seenFeatures[geneID] = []
156         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
157             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
158             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
159
160     print "Adding %d feature entries" % len(featureEntries)
161     cbGenome.addFeatureEntryBatch(featureEntries)
162
163
164 def createDBFile(db):
165     cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
166     cbGenome.createGeneDB(db)
167
168
169 def createDBindices(db):
170     cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
171     cbGenome.createIndices()
172
173
174 def buildCbriggsaeDB(db=geneDB):
175     gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
176     chromoPath = "%s/download/briggsae_25.fa" % cisRoot
177     chromoOutPath = "/C_briggsae/"
178
179     print "Creating database %s" % db
180     createDBFile(db)
181
182     print "Adding gene entries"
183     loadGeneEntries(db, gffPath)
184
185     print "Adding feature entries"
186     loadFeatureEntries(db, gffPath)
187
188     print "Loading genomic sequence" 
189     loadChromosome(db, chromoPath, chromoOutPath)
190
191     print "Creating Indices"
192     createDBindices(db)
193
194     print "Finished creating database %s" % db