first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / cbriggsae.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis briggsae
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
41
42
43 def buildCbriggsaeDB(db=geneDB):
44     gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
45     chromoPath = "%s/download/briggsae_25.fa" % cisRoot
46     chromoOutPath = "/C_briggsae/"
47
48     print "Creating database %s" % db
49     createDBFile(db)
50
51     print "Adding gene entries"
52     loadGeneEntries(db, gffPath)
53
54     print "Adding feature entries"
55     loadFeatureEntries(db, gffPath)
56
57     print "Loading genomic sequence"
58     loadChromosome(db, chromoPath, chromoOutPath)
59
60     print "Creating Indices"
61     createDBindices(db)
62
63     print "Finished creating database %s" % db
64
65
66 def createDBFile(db):
67     cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
68     cbGenome.createGeneDB(db)
69
70
71 def loadGeneEntries(db, gffFile):
72     cbGenome = Genome("cbriggsae", dbFile=db)
73     geneFile = open(gffFile, "r")
74     geneEntries = []
75     geneAnnotations = []
76     for line in geneFile:
77         field = line[:-1].split("\t")
78         if field[1] != "hybrid":
79             continue
80
81         if field[2] != "CDS":
82             continue
83
84         annot = field[8].split('"')
85         gid = annot[1]
86         geneID = ("cbriggsae", gid)
87         annotation = string.join(annot[3:], "")
88         gidVersion = 1
89         try:
90             gidVersion = giddots[2]
91         except:
92             pass
93
94         start = int(field[3]) - 1
95         stop = int(field[4]) - 1
96         sense = field[6]
97         chrom = field[0].strip()
98         if sense == "+":
99             sense = "F"
100         else:
101             sense = "R"
102
103         if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
104             geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
105         if (geneID, annotation) not in geneAnnotations:
106             geneAnnotations.append((geneID, annotation))
107
108     print "Adding %d gene entries" % len(geneEntries)
109     cbGenome.addGeneEntryBatch(geneEntries)
110
111     print "Adding %d annotations" % len(geneAnnotations)
112     cbGenome.addAnnotationBatch(geneAnnotations)
113
114
115 def loadFeatureEntries(db, gffFile):
116     cbGenome = Genome("cbriggsae", dbFile=db)
117     featureFile = open(gffFile, "r")
118     featureEntries = []
119     seenFeatures = {}
120     featureTranslation = {"coding_exon": "CDS",
121                           "three_prime_UTR": "3UTR",
122                           "five_prime_UTR": "5UTR"
123     }
124
125     for line in featureFile:
126         field = line.split("\t")
127         if field[1] != "hybrid":
128             continue
129
130         if field[2].strip() not in featureTranslation:
131             continue
132
133         featureType = featureTranslation[field[2].strip()]
134         gidrev = field[8].split('"')
135         gid = gidrev[1]
136         geneID = ("cbriggsae", gid)
137         gidVersion = 1
138         start = int(field[3]) - 1
139         stop = int(field[4]) - 1
140         sense = field[6]
141         chrom = field[0].strip()
142         if sense == "+":
143             sense = "F"
144         else:
145             sense = "R"
146
147         if geneID not in seenFeatures:
148             seenFeatures[geneID] = []
149         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
150             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
151             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
152
153     print "Adding %d feature entries" % len(featureEntries)
154     cbGenome.addFeatureEntryBatch(featureEntries)
155
156
157 def loadChromosome(db, chromPath, chromOutPath):
158     seqArray = []
159     seqLen = 0
160     cbGenome = Genome("cbriggsae", dbFile=db)
161     inFile = open(chromPath, "r")
162     header = inFile.readline()
163     while header != "":
164         seqArray = []
165         seqLen = 0
166         chromID = header.strip()[1:]
167         currentLine = inFile.readline()
168         while currentLine != "" and currentLine[0] != ">":
169             lineSeq = currentLine.strip()
170             seqLen += len(lineSeq)
171             seqArray.append(lineSeq)
172             currentLine = inFile.readline()
173
174         seq = string.join(seqArray, "")
175         if seqLen < 900000:
176             print "Added contig %s to database" % chromID
177             cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
178             cbGenome.addChromosomeEntry(chromID, chromID, "db")
179         else:
180             outFileName = "%s%s.bin" % (chromOutPath, chromID)
181             outFile = open("%s%s" % (cisRoot, outFileName), "w")
182             outFile.write(seq)
183             outFile.close()
184             print "Added contig file %s to database" % outFileName
185             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
186
187         header = currentLine
188
189     inFile.close()
190
191
192 def createDBindices(db):
193     cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
194     cbGenome.createIndices()