first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / scerevisiae.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Saccharomyces cerevisiae
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
41
42
43 def buildScerevisiaeDB(db=geneDB):
44     genePath = "%s/download/SGD_features.tab" % cisRoot
45     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
46     goPath = "%s/download/gene_association.sgd" % cisRoot
47
48     print "Creating database %s" % db
49     createDBFile(db)
50
51     print "Adding gene entries"
52     loadGeneEntries(db, genePath)
53
54     print "Adding gene annotations"
55     loadGeneAnnotations(db, genePath)
56
57     print "Adding gene ontology"
58     loadGeneOntology(db, goPath, goDefPath)
59
60     for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
61         print "Loading chromosome %s" % chromID
62         chromPath = "%s/download/chr%s.fsa" % (cisRoot, chromID)
63         loadChromosome(db, chromID, chromPath, "/S_cerevisiae/chr%s.bin" % chromID)
64
65     print "Creating Indices"
66     createDBindices(db)
67
68     print "Finished creating database %s" % db
69
70
71 def createDBFile(db):
72     scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
73     scGenome.createGeneDB(db)
74
75
76 def loadGeneEntries(db, gFile):
77     geneEntries = []
78     geneFeatures = []
79     scGenome = Genome("scerevisiae", dbFile=db)
80     geneFile = open(gFile, "r")
81     for line in geneFile:
82         field = line.split("\t")
83         if field[1] != "ORF":
84             continue
85
86         orfName = field[3].strip()
87         sense = field[11]
88         chrom = field[8].strip()
89         if sense == "W":
90             sense = "F"
91             try:
92                 start = int(field[9].strip()) - 1
93                 stop  = int(field[10].strip()) - 1
94             except:
95                 start = 0
96                 stop = 0
97         else:
98             sense = "R"
99             try:
100                 start = int(field[10].strip()) - 1
101                 stop  = int(field[9].strip()) - 1
102             except:
103                 start = 0
104                 stop = 0
105
106         geneID = ("scerevisiae", orfName)
107         gidVersion = 1
108         geneEntries.append((geneID, chrom, start, stop, sense, "chromosomal_feature", gidVersion))
109         geneFeatures.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
110
111     print "loading %d gene entries" % len(geneEntries)
112     scGenome.addGeneEntryBatch(geneEntries)
113     print "loading %d gene features" % len(geneFeatures)
114     scGenome.addFeatureEntryBatch(geneFeatures)
115
116
117 def loadGeneAnnotations(db, annotPath):
118     geneAnnotations = []
119     annotFile = open(annotPath, "r")
120     lines = annotFile.readlines()
121     annotFile.close()
122     scGenome = Genome("scerevisiae", dbFile=db)
123     for line in lines:
124         field = line.split("\t")
125         if field[1] != "ORF":
126             continue
127
128         try:
129             orfName = field[6].strip()
130             description =  field[15].strip()
131             geneAnnotations.append((("scerevisiae", orfName), string.replace(description, "'", "p")))
132         except:
133             pass
134
135     print "Adding %d annotations" % len(geneAnnotations)
136     scGenome.addAnnotationBatch(geneAnnotations)
137
138
139 def loadGeneOntology(db, goPath, goDefPath):
140     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
141     goDefFile = open(goDefPath, "r")
142     goFile = open(goPath, "r")
143     goDefEntries = goDefFile.readlines()
144     goDefs = {}
145     for goDefEntry in goDefEntries:
146         if goDefEntry[0] != "!":
147             cols = goDefEntry.split("\t")
148             goDefs[cols[0]] = (cols[1], cols[2].strip())
149
150     goEntries = goFile.readlines()
151     goArray = []
152     for line in goEntries:
153         if line[0] == "!":
154             continue
155
156         fields = line.split("\t")
157         genes = fields[10].split("|")
158         gID = genes[0]
159         GOID = fields[4]
160         objType = fields[11]
161         objNameArray = fields[10].split("|")
162         objName = objNameArray[0]
163         isNot = fields[3]
164         try:
165             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
166         except:
167             print "Could not translate %s" % (GOID)
168             GOterm = ""
169
170         evidence = fields[6]
171         goArray.append((("scerevisiae", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[1]))
172
173     scGenome.addGoInfoBatch(goArray)
174
175
176 def loadChromosome(db, chromID, chromPath, chromOut):
177     seqArray = []
178     scGenome = Genome("scerevisiae", dbFile=db)
179     inFile = open(chromPath, "r")
180     line = inFile.readline()
181     for line in inFile:
182         seqArray.append(line.strip())
183
184     seq = string.join(seqArray, "")
185     seqLen = len(seq)
186     if seqLen < 1:
187         print "Problems reading sequence from file"
188
189     print "writing to file %s" % chromOut
190     outFile = open("%s%s" % (cisRoot, chromOut), "w")
191     outFile.write(seq)
192     outFile.close()
193     seq = ""
194     print "calling scGenome()"
195     scGenome.addChromosomeEntry(chromID, chromOut, "file")
196
197
198 def createDBindices(db):
199     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
200     scGenome.createIndices()