cistematic/genomes/celegans.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis elegans
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
  41
  42
  43 def loadChromosome(db, chromID, chromPath, chromOut):
  44     seqArray = []
  45     ceGenome = Genome("celegans", dbFile=db)
  46     inFile = open(chromPath, "r")
  47     line = inFile.readline()
  48     for line in inFile:
  49         seqArray.append(line.strip())
  50
  51     seq = string.join(seqArray, "")
  52     seqLen = len(seq)
  53     if seqLen < 1:
  54         print "Problems reading sequence from file"
  55
  56     print "writing to file %s" % chromOut
  57     outFile = open("%s%s" % (cisRoot, chromOut), "w")
  58     outFile.write(seq)
  59     outFile.close()
  60     ceGenome.addChromosomeEntry(chromID, chromOut, "file")
  61
  62
  63 def loadGeneEntries(db, gffFile):
  64     ceGenome = Genome("celegans", dbFile=db)
  65     geneFile = open(gffFile, "r")
  66     geneEntries = []
  67     for line in geneFile:
  68         if line[0] == "#":
  69             continue
  70
  71         field = line.split("\t")
  72         if field[1] != "Coding_transcript" and field[1] != "miRNA":
  73             continue
  74
  75         if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
  76             continue
  77
  78         gidrev = field[8].split('"')
  79         giddots = gidrev[1].split(".")
  80         # we are ignoring gene models!!!!
  81         if giddots[1][-1] in string.letters:
  82             gidGene = giddots[1][:-1]
  83             gidLetter = giddots[1][-1]
  84         else:
  85             gidGene = giddots[1]
  86             gidLetter = "a"
  87         gid = "%s.%s" % (giddots[0], gidGene)
  88         geneID = ("celegans", gid)
  89         gidVersion = 1
  90         if gidLetter != "a":
  91             try:
  92                 gidVersion = ord(gidLetter.lower()) - 96
  93             except:
  94                 print "problem processing %s - skipping" % gidrev[1]
  95                 continue
  96
  97         start = int(field[3]) - 1
  98         stop = int(field[4]) - 1
  99         sense = field[6]
 100         chrom = field[0].strip()
 101         if sense == "+":
 102             sense = "F"
 103         else:
 104             sense = "R"
 105
 106         geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
 107
 108     print "Adding %d gene entries" % len(geneEntries)
 109     ceGenome.addGeneEntryBatch(geneEntries)
 110
 111
 112 def loadFeatureEntries(db, gffFile):
 113     ceGenome = Genome("celegans", dbFile=db)
 114     featureFile = open(gffFile, "r")
 115     featureEntries = []
 116     seenFeatures = {}
 117     featureTranslation = {"coding_exon": "CDS",
 118                           "three_prime_UTR": "3UTR",
 119                           "five_prime_UTR": "5UTR"
 120     }
 121
 122     for line in featureFile:
 123         if line[0] == "#":
 124             continue
 125
 126         field = line.split("\t")
 127         if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
 128             continue
 129
 130         if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
 131             continue
 132
 133         if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
 134             featureType = "CDS" # identifying these as CDS will force their masking later on
 135         else:
 136             featureType = featureTranslation[field[2].strip()]
 137
 138         gidrev = field[8].split('"')
 139         giddots = gidrev[1].split(".")
 140         # we are ignoring gene models!!!!
 141         if giddots[1][-1] in string.letters:
 142             gidGene = giddots[1][:-1]
 143             gidLetter = giddots[1][-1]
 144         else:
 145             gidGene = giddots[1]
 146             gidLetter = "a"
 147
 148         gid = "%s.%s" % (giddots[0], gidGene)
 149         geneID = ("celegans", gid)
 150         gidVersion = 1
 151         if gidLetter != "a":
 152             try:
 153                 gidVersion = ord(gidLetter.lower()) - 96
 154             except:
 155                 print "problem processing %s - skipping" % gidrev[1]
 156                 continue
 157
 158         start = int(field[3]) - 1
 159         stop = int(field[4]) - 1
 160         sense = field[6]
 161         chrom = field[0].strip()
 162         if sense == "+":
 163             sense = "F"
 164         else:
 165             sense = "R"
 166
 167         if geneID not in seenFeatures:
 168             seenFeatures[geneID] = []
 169
 170         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
 171             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
 172             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
 173
 174     print "Adding %d feature entries" % len(featureEntries)
 175     ceGenome.addFeatureEntryBatch(featureEntries)
 176
 177
 178 def loadGeneAnnotations(db, geneIDPath):
 179     geneAnnotations = []
 180     geneIDFile = open(geneIDPath, "r")
 181     lines = geneIDFile.readlines()
 182     geneIDFile.close()
 183     ceGenome = Genome("celegans", dbFile=db)
 184     for line in lines:
 185         field = line.split(",")
 186         try:
 187             gid = field[2].strip()
 188             geneID = "%s\t%s" % (field[0], field[1])
 189             geneAnnotations.append((("celegans", gid), geneID))
 190         except:
 191             pass
 192
 193     print "Adding %d annotations" % len(geneAnnotations)
 194     ceGenome.addAnnotationBatch(geneAnnotations)
 195
 196
 197 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
 198     ceGenome = Genome("celegans", version="WS200", dbFile=db)
 199     geneIDFile = open(geneIDPath, "r")
 200     goDefFile = open(goDefPath, "r")
 201     goFile = open(goPath, "r")
 202     lines = geneIDFile.readlines()
 203     geneIDFile.close()
 204     geneIDmap = {}
 205     seenGO = {}
 206     for line in lines:
 207         field = line.split(",")
 208         # ugly C elegans hack - map both fields to gid, since either might be
 209         # used by GO !
 210         if len(field[2].strip()) > 1:
 211             geneIDmap[field[1]] =  field[2].strip()
 212
 213     goDefEntries = goDefFile.readlines()
 214     goDefs = {}
 215     for goDefEntry in goDefEntries:
 216         if goDefEntry[0] != "!":
 217             cols = goDefEntry.split("\t")
 218             goDefs[cols[0]] = (cols[1], cols[2].strip())
 219
 220     goEntries = goFile.readlines()
 221     goArray = []
 222     for line in goEntries:
 223         if line[0] == "!":
 224             continue
 225
 226         fields = line.split("\t")
 227         name = fields[2]
 228         if name in geneIDmap:
 229             name = geneIDmap[name]
 230
 231         if name[-1] == "a":
 232             name = name[:-1]
 233
 234         GOIDarray = fields[4].split(" ")
 235         GOID = GOIDarray[0]
 236         objType = fields[8]
 237         objName = fields[10].split("|")
 238         gID = name
 239         isNot = fields[3]
 240         if len(objName) > 1:
 241             name = "%s|%s" % (name.strip(), fields[10])
 242
 243         try:
 244             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
 245         except:
 246             print "could no map %s - using GOID only" % GOID
 247             GOterm = ""
 248
 249         evidence = fields[9]
 250         if gID not in seenGO:
 251             seenGO[gID] = []
 252
 253         if GOID not in seenGO[gID]:
 254             seenGO[gID].append(GOID)
 255             goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
 256
 257     print "Adding %d GO entries" % len(goArray)
 258     ceGenome.addGoInfoBatch(goArray)
 259
 260
 261 def createDBFile(db):
 262     ceGenome = Genome("celegans", version="WS200",  dbFile=db)
 263     ceGenome.createGeneDB(db)
 264
 265
 266 def createDBindices(db):
 267     ceGenome = Genome("celegans", version="WS200", dbFile=db)
 268     ceGenome.createIndices()
 269
 270
 271 def buildCelegansDB(db=geneDB, downloadRoot=""):
 272     if downloadRoot == "":
 273         downloadRoot = "%s/download/" % cisRoot
 274
 275     geneIDPath = "%sgeneIDs.WS200" % downloadRoot
 276     goDefPath = "%sGO.terms_and_ids" % downloadRoot
 277     goPath = "%sgene_association.wb" % downloadRoot
 278
 279     # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
 280     chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
 281                "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
 282                "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
 283                "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
 284                "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
 285                "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
 286     }
 287
 288     # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
 289     gffPath = "%selegansWS200.gff" % downloadRoot
 290
 291     print "Creating database %s" % db
 292     createDBFile(db)
 293
 294     print "Adding gene entries"
 295     loadGeneEntries(db, gffPath)
 296
 297     print "Adding feature entries"
 298     loadFeatureEntries(db, gffPath)
 299
 300     print "Adding gene annotations"
 301     loadGeneAnnotations(db, geneIDPath)
 302
 303     print "Adding gene ontology"
 304     loadGeneOntology(db, goPath, goDefPath, geneIDPath)
 305
 306     for chromID in chromos:
 307         print "Loading chromosome %s" % chromID
 308         loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
 309
 310     print "Creating Indices"
 311     createDBindices(db)
 312
 313     print "Finished creating database %s" % db