cistematic/genomes/celegans.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis elegans
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
  41
  42
  43 def buildCelegansDB(db=geneDB, downloadRoot=""):
  44     if downloadRoot == "":
  45         downloadRoot = "%s/download/" % cisRoot
  46
  47     geneIDPath = "%sgeneIDs.WS200" % downloadRoot
  48     goDefPath = "%sGO.terms_and_ids" % downloadRoot
  49     goPath = "%sgene_association.wb" % downloadRoot
  50
  51     # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
  52     gffPath = "%selegansWS200.gff" % downloadRoot
  53
  54     print "Creating database %s" % db
  55     createDBFile(db)
  56
  57     print "Adding gene entries"
  58     loadGeneEntries(db, gffPath)
  59
  60     print "Adding feature entries"
  61     loadFeatureEntries(db, gffPath)
  62
  63     print "Adding gene annotations"
  64     loadGeneAnnotations(db, geneIDPath)
  65
  66     print "Adding gene ontology"
  67     loadGeneOntology(db, goPath, goDefPath, geneIDPath)
  68
  69     # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
  70     for chromID in ["I", "II", "III", "IV", "V", "X"]:
  71         print "Loading chromosome %s" % chromID
  72         chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
  73         loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
  74
  75     print "Creating Indices"
  76     createDBindices(db)
  77
  78     print "Finished creating database %s" % db
  79
  80
  81 def createDBFile(db):
  82     ceGenome = Genome("celegans", version="WS200",  dbFile=db)
  83     ceGenome.createGeneDB(db)
  84
  85
  86 def loadGeneEntries(db, gffFile):
  87     ceGenome = Genome("celegans", dbFile=db)
  88     geneFile = open(gffFile, "r")
  89     geneEntries = []
  90     for line in geneFile:
  91         if line[0] == "#":
  92             continue
  93
  94         field = line.split("\t")
  95         if field[1] != "Coding_transcript" and field[1] != "miRNA":
  96             continue
  97
  98         if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
  99             continue
 100
 101         gidrev = field[8].split('"')
 102         giddots = gidrev[1].split(".")
 103         # we are ignoring gene models!!!!
 104         if giddots[1][-1] in string.letters:
 105             gidGene = giddots[1][:-1]
 106             gidLetter = giddots[1][-1]
 107         else:
 108             gidGene = giddots[1]
 109             gidLetter = "a"
 110
 111         gid = "%s.%s" % (giddots[0], gidGene)
 112         geneID = ("celegans", gid)
 113         gidVersion = 1
 114         if gidLetter != "a":
 115             try:
 116                 gidVersion = ord(gidLetter.lower()) - 96
 117             except:
 118                 print "problem processing %s - skipping" % gidrev[1]
 119                 continue
 120
 121         start = int(field[3]) - 1
 122         stop = int(field[4]) - 1
 123         sense = field[6]
 124         chrom = field[0].strip()
 125         if sense == "+":
 126             sense = "F"
 127         else:
 128             sense = "R"
 129
 130         geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
 131
 132     print "Adding %d gene entries" % len(geneEntries)
 133     ceGenome.addGeneEntryBatch(geneEntries)
 134
 135
 136 def loadFeatureEntries(db, gffFile):
 137     ceGenome = Genome("celegans", dbFile=db)
 138     featureFile = open(gffFile, "r")
 139     featureEntries = []
 140     seenFeatures = {}
 141     featureTranslation = {"coding_exon": "CDS",
 142                           "three_prime_UTR": "3UTR",
 143                           "five_prime_UTR": "5UTR"
 144     }
 145
 146     for line in featureFile:
 147         if line[0] == "#":
 148             continue
 149
 150         field = line.split("\t")
 151         if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
 152             continue
 153
 154         if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
 155             continue
 156
 157         if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
 158             featureType = "CDS" # identifying these as CDS will force their masking later on
 159         else:
 160             featureType = featureTranslation[field[2].strip()]
 161
 162         gidrev = field[8].split('"')
 163         giddots = gidrev[1].split(".")
 164         # we are ignoring gene models!!!!
 165         if giddots[1][-1] in string.letters:
 166             gidGene = giddots[1][:-1]
 167             gidLetter = giddots[1][-1]
 168         else:
 169             gidGene = giddots[1]
 170             gidLetter = "a"
 171
 172         gid = "%s.%s" % (giddots[0], gidGene)
 173         geneID = ("celegans", gid)
 174         gidVersion = 1
 175         if gidLetter != "a":
 176             try:
 177                 gidVersion = ord(gidLetter.lower()) - 96
 178             except:
 179                 print "problem processing %s - skipping" % gidrev[1]
 180                 continue
 181
 182         start = int(field[3]) - 1
 183         stop = int(field[4]) - 1
 184         sense = field[6]
 185         chrom = field[0].strip()
 186         if sense == "+":
 187             sense = "F"
 188         else:
 189             sense = "R"
 190
 191         if geneID not in seenFeatures:
 192             seenFeatures[geneID] = []
 193
 194         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
 195             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
 196             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
 197
 198     print "Adding %d feature entries" % len(featureEntries)
 199     ceGenome.addFeatureEntryBatch(featureEntries)
 200
 201
 202 def loadGeneAnnotations(db, geneIDPath):
 203     geneAnnotations = []
 204     geneIDFile = open(geneIDPath, "r")
 205     lines = geneIDFile.readlines()
 206     geneIDFile.close()
 207     ceGenome = Genome("celegans", dbFile=db)
 208     for line in lines:
 209         field = line.split(",")
 210         try:
 211             gid = field[2].strip()
 212             geneID = "%s\t%s" % (field[0], field[1])
 213             geneAnnotations.append((("celegans", gid), geneID))
 214         except:
 215             pass
 216
 217     print "Adding %d annotations" % len(geneAnnotations)
 218     ceGenome.addAnnotationBatch(geneAnnotations)
 219
 220
 221 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
 222     ceGenome = Genome("celegans", version="WS200", dbFile=db)
 223     geneIDFile = open(geneIDPath, "r")
 224     goDefFile = open(goDefPath, "r")
 225     goFile = open(goPath, "r")
 226     lines = geneIDFile.readlines()
 227     geneIDFile.close()
 228     geneIDmap = {}
 229     seenGO = {}
 230     for line in lines:
 231         field = line.split(",")
 232         # ugly C elegans hack - map both fields to gid, since either might be
 233         # used by GO !
 234         if len(field[2].strip()) > 1:
 235             geneIDmap[field[1]] =  field[2].strip()
 236
 237     goDefEntries = goDefFile.readlines()
 238     goDefs = {}
 239     for goDefEntry in goDefEntries:
 240         if goDefEntry[0] != "!":
 241             cols = goDefEntry.split("\t")
 242             goDefs[cols[0]] = (cols[1], cols[2].strip())
 243
 244     goEntries = goFile.readlines()
 245     goArray = []
 246     for line in goEntries:
 247         if line[0] == "!":
 248             continue
 249
 250         fields = line.split("\t")
 251         name = fields[2]
 252         if name in geneIDmap:
 253             name = geneIDmap[name]
 254
 255         if name[-1] == "a":
 256             name = name[:-1]
 257
 258         GOIDarray = fields[4].split(" ")
 259         GOID = GOIDarray[0]
 260         objType = fields[8]
 261         objName = fields[10].split("|")
 262         gID = name
 263         isNot = fields[3]
 264         if len(objName) > 1:
 265             name = "%s|%s" % (name.strip(), fields[10])
 266
 267         try:
 268             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
 269         except:
 270             print "could no map %s - using GOID only" % GOID
 271             GOterm = ""
 272
 273         evidence = fields[9]
 274         if gID not in seenGO:
 275             seenGO[gID] = []
 276
 277         if GOID not in seenGO[gID]:
 278             seenGO[gID].append(GOID)
 279             goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
 280
 281     print "Adding %d GO entries" % len(goArray)
 282     ceGenome.addGoInfoBatch(goArray)
 283
 284
 285 def loadChromosome(db, chromID, chromPath, chromOut):
 286     seqArray = []
 287     ceGenome = Genome("celegans", dbFile=db)
 288     inFile = open(chromPath, "r")
 289     line = inFile.readline()
 290     for line in inFile:
 291         seqArray.append(line.strip())
 292
 293     seq = string.join(seqArray, "")
 294     seqLen = len(seq)
 295     if seqLen < 1:
 296         print "Problems reading sequence from file"
 297
 298     print "writing to file %s" % chromOut
 299     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 300     outFile.write(seq)
 301     outFile.close()
 302     ceGenome.addChromosomeEntry(chromID, chromOut, "file")
 303
 304
 305 def createDBindices(db):
 306     ceGenome = Genome("celegans", version="WS200", dbFile=db)
 307     ceGenome.createIndices()