erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / cremanei.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis remanei
31 import string, os
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
41
42
43 def loadChromosomes(db, inPath, chromOutPath):
44     crGenome = Genome("cremanei", dbFile=db)
45     scontigList = os.listdir(inPath)
46     for scontig in scontigList:
47         seq = ''
48         seqArray = []
49         seqLen = 0
50         inFile = open("%s/%s" % (inPath, scontig), "r")
51         index = 0
52         header = inFile.readline()
53         chromID = header.strip()[1:]
54         while header != "":
55             seqArray = []
56             seqLen = 0
57             currentLine = inFile.readline()
58             while currentLine != "" and currentLine[0] != ">":
59                 lineSeq = currentLine.strip()
60                 seqLen += len(lineSeq)
61                 seqArray.append(lineSeq)
62                 currentLine = inFile.readline()
63
64             seq = string.join(seqArray, "")
65             if seqLen < 100000:
66                 print "Added contig %s to database" % chromID
67                 crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
68                 crGenome.addChromosomeEntry(chromID, chromID, "db")
69             else:
70                 outFileName = "%s%s.bin" % (chromOutPath, chromID)
71                 outFile = open("%s%s" % (cisRoot, outFileName), "w")
72                 outFile.write(seq)
73                 outFile.close()
74                 print "Added contig file %s to database" % outFileName
75                 crGenome.addChromosomeEntry(chromID, outFileName, "file")
76
77             index += 1
78             header = currentLine
79
80         inFile.close()
81
82
83 def loadGeneEntries(db, gffFile):
84     crGenome = Genome("cremanei", dbFile=db)
85     geneFile = open(gffFile, "r")
86     geneStart = {}
87     geneStop = {}
88     geneChrom = {}
89     geneSense = {}
90     geneEntries = []
91     for line in geneFile:
92         if line[0] == "#":
93             continue
94
95         if line[0] == "\n":
96             continue
97
98         field = line[:-1].split("\t")
99         if field[2] != "CDS":
100             continue
101
102         idfield = field[8].split('"')
103         gid = idfield[1]
104         geneID = ("cremanei", gid)
105         geneStart[geneID] = int(field[3]) - 1
106         geneStop[geneID] = int(field[4]) - 1
107         sense = field[6]
108         geneChrom[geneID] = field[0].strip()
109         if sense == "+":
110             geneSense[geneID] = "F"
111         else:
112             geneSense[geneID] = "R"
113
114     for geneID in geneStart:
115         if geneID not in geneStop:
116             print "geneID %s not in geneStop - skipping" % str(geneID)
117             continue
118         geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
119
120     print "Adding %d gene entries" % len(geneEntries)
121     crGenome.addGeneEntryBatch(geneEntries)
122
123
124 def loadFeatureEntries(db, gffFile):
125     crGenome = Genome("cremanei", dbFile=db)
126     featureFile = open(gffFile, "r")
127     featureEntries = []
128     for line in featureFile:
129         if line[0] == "#":
130             continue
131
132         if line[0] == "\n":
133             continue
134
135         field = line.split("\t")
136         if field[2].strip() != "coding_exon":
137             continue
138
139         gidrev = field[8].split('"')
140         gid = gidrev[1]
141         geneID = ("cremanei", gid)
142         gidVersion = 1
143         start = int(field[3]) - 1
144         stop = int(field[4]) - 1
145         sense = field[6]
146         chrom = field[0].strip()
147         if sense == "+":
148             sense = "F"
149         else:
150             sense = "R"
151
152         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
153
154     print "Adding %d feature entries" % len(featureEntries)
155     crGenome.addFeatureEntryBatch(featureEntries)
156
157
158 def createDBFile(db):
159     crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
160     crGenome.createGeneDB(db)
161
162
163 def createDBindices(db):
164     crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
165     crGenome.createIndices()
166
167
168 def buildCremaneiDB(db=geneDB):
169     gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
170     chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
171     chromoOutPath = "/C_remanei/"
172
173     print "Creating database %s" % db
174     createDBFile(db)
175
176     print "Adding gene entries"
177     loadGeneEntries(db, gffPath)
178
179     print "Adding feature entries"
180     loadFeatureEntries(db, gffPath)
181
182     print "Loading genomic sequence" 
183     loadChromosomes(db, chromoPath, chromoOutPath)
184
185     print "Creating Indices"
186     createDBindices(db)
187
188     print "Finished creating database %s" % db