cistematic/genomes/scerevisiae.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Saccharomyces cerevisiae
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
  41
  42
  43 def buildScerevisiaeDB(db=geneDB):
  44     genePath = "%s/download/SGD_features.tab" % cisRoot
  45     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
  46     goPath = "%s/download/gene_association.sgd" % cisRoot
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, genePath)
  53
  54     print "Adding gene annotations"
  55     loadGeneAnnotations(db, genePath)
  56
  57     print "Adding gene ontology"
  58     loadGeneOntology(db, goPath, goDefPath)
  59
  60     for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
  61         print "Loading chromosome %s" % chromID
  62         chromPath = "%s/download/chr%s.fsa" % (cisRoot, chromID)
  63         loadChromosome(db, chromID, chromPath, "/S_cerevisiae/chr%s.bin" % chromID)
  64
  65     print "Creating Indices"
  66     createDBindices(db)
  67
  68     print "Finished creating database %s" % db
  69
  70
  71 def createDBFile(db):
  72     scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
  73     scGenome.createGeneDB(db)
  74
  75
  76 def loadGeneEntries(db, gFile):
  77     geneEntries = []
  78     geneFeatures = []
  79     scGenome = Genome("scerevisiae", dbFile=db)
  80     geneFile = open(gFile, "r")
  81     for line in geneFile:
  82         field = line.split("\t")
  83         if field[1] != "ORF":
  84             continue
  85
  86         orfName = field[3].strip()
  87         sense = field[11]
  88         chrom = field[8].strip()
  89         if sense == "W":
  90             sense = "F"
  91             try:
  92                 start = int(field[9].strip()) - 1
  93                 stop  = int(field[10].strip()) - 1
  94             except:
  95                 start = 0
  96                 stop = 0
  97         else:
  98             sense = "R"
  99             try:
 100                 start = int(field[10].strip()) - 1
 101                 stop  = int(field[9].strip()) - 1
 102             except:
 103                 start = 0
 104                 stop = 0
 105
 106         geneID = ("scerevisiae", orfName)
 107         gidVersion = 1
 108         geneEntries.append((geneID, chrom, start, stop, sense, "chromosomal_feature", gidVersion))
 109         geneFeatures.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
 110
 111     print "loading %d gene entries" % len(geneEntries)
 112     scGenome.addGeneEntryBatch(geneEntries)
 113     print "loading %d gene features" % len(geneFeatures)
 114     scGenome.addFeatureEntryBatch(geneFeatures)
 115
 116
 117 def loadGeneAnnotations(db, annotPath):
 118     geneAnnotations = []
 119     annotFile = open(annotPath, "r")
 120     lines = annotFile.readlines()
 121     annotFile.close()
 122     scGenome = Genome("scerevisiae", dbFile=db)
 123     for line in lines:
 124         field = line.split("\t")
 125         if field[1] != "ORF":
 126             continue
 127
 128         try:
 129             orfName = field[6].strip()
 130             description =  field[15].strip()
 131             geneAnnotations.append((("scerevisiae", orfName), string.replace(description, "'", "p")))
 132         except:
 133             pass
 134
 135     print "Adding %d annotations" % len(geneAnnotations)
 136     scGenome.addAnnotationBatch(geneAnnotations)
 137
 138
 139 def loadGeneOntology(db, goPath, goDefPath):
 140     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
 141     goDefFile = open(goDefPath, "r")
 142     goFile = open(goPath, "r")
 143     goDefEntries = goDefFile.readlines()
 144     goDefs = {}
 145     for goDefEntry in goDefEntries:
 146         if goDefEntry[0] != "!":
 147             cols = goDefEntry.split("\t")
 148             goDefs[cols[0]] = (cols[1], cols[2].strip())
 149
 150     goEntries = goFile.readlines()
 151     goArray = []
 152     for line in goEntries:
 153         if line[0] == "!":
 154             continue
 155
 156         fields = line.split("\t")
 157         genes = fields[10].split("|")
 158         gID = genes[0]
 159         GOID = fields[4]
 160         objType = fields[11]
 161         objNameArray = fields[10].split("|")
 162         objName = objNameArray[0]
 163         isNot = fields[3]
 164         try:
 165             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
 166         except:
 167             print "Could not translate %s" % (GOID)
 168             GOterm = ""
 169
 170         evidence = fields[6]
 171         goArray.append((("scerevisiae", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[1]))
 172
 173     scGenome.addGoInfoBatch(goArray)
 174
 175
 176 def loadChromosome(db, chromID, chromPath, chromOut):
 177     seqArray = []
 178     scGenome = Genome("scerevisiae", dbFile=db)
 179     inFile = open(chromPath, "r")
 180     line = inFile.readline()
 181     for line in inFile:
 182         seqArray.append(line.strip())
 183
 184     seq = string.join(seqArray, "")
 185     seqLen = len(seq)
 186     if seqLen < 1:
 187         print "Problems reading sequence from file"
 188
 189     print "writing to file %s" % chromOut
 190     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 191     outFile.write(seq)
 192     outFile.close()
 193     seq = ""
 194     print "calling scGenome()"
 195     scGenome.addChromosomeEntry(chromID, chromOut, "file")
 196
 197
 198 def createDBindices(db):
 199     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
 200     scGenome.createIndices()