cistematic/genomes/athaliana.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Arabidopsis thaliana
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
  41
  42 chromSize = {"1": 30432563,
  43              "2": 19705359,
  44              "3": 23470805,
  45              "4": 18585042,
  46              "5": 26992738
  47 }
  48
  49 background = [0.3180185, 0.1819815, 0.1819815, 0.3180185]
  50 genomeSize = 119186497
  51
  52
  53 def decodeGFF3(cols):
  54     fType = cols[2]
  55     chrom = cols[0][3:]
  56     start = int(cols[3])
  57     stop = int(cols[4])
  58     sense = cols[6]
  59     if sense == "+":
  60         sense = "F"
  61     else:
  62         sense = "R"
  63
  64     other = cols[-1]
  65     otherList = other.split(";")
  66     otherDict = {}
  67     for otherItem in otherList:
  68         try:
  69             (name, value) = otherItem.split("=")
  70         except:
  71             continue
  72
  73         otherDict[name] = value
  74         if name == "Name":
  75             gid = value.strip()
  76
  77         if name == "Parent":
  78             if "," in value:
  79                 value = value.split(",")[0]
  80
  81             gid = value.strip()
  82
  83     return (fType, gid, chrom, start, stop, sense, otherDict)
  84
  85
  86 def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
  87     genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
  88     annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
  89     goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
  90
  91     print "Creating database %s" % db
  92     createDBFile(db)
  93
  94     print "Adding gene entries"
  95     loadGeneEntries(db, genePath)
  96
  97     print "Adding feature entries"
  98     loadFeatureEntries(db, genePath)
  99
 100     print "Adding gene annotations"
 101     loadGeneAnnotations(db, annotPath)
 102
 103     print "Adding gene ontology"
 104     loadGeneOntology(db, goPath)
 105
 106     for chromID in ["1", "2", "3", "4", "5", "C", "M"]:
 107         print "Loading chromosome %s" % chromID
 108         chromPath = "%s/chr%s.fas" % (downloadDir, chromID)
 109         loadChromosome(db, chromID, chromPath,  "/A_thaliana/chr%s.bin" % chromID)
 110
 111     print "Creating Indices"
 112     createDBindices(db)
 113
 114     print "Finished creating database %s" % db
 115
 116
 117 def createDBFile(db):
 118     atGenome = Genome("athaliana", dbFile=db)
 119     atGenome.createGeneDB(db)
 120
 121
 122 def loadGeneEntries(db, gFile):
 123     geneEntries = []
 124     atGenome = Genome("athaliana", dbFile=db)
 125     geneFile = open(gFile, "r")
 126     for line in geneFile:
 127         if line[0] == "#" or len(line) < 10:
 128             continue
 129
 130         fields = line.strip().split("\t")
 131         if fields[2] != "gene":
 132             continue
 133
 134         (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
 135         geneID = ("athaliana", gid)
 136         version = 1
 137         geneEntries.append((geneID, chrom, start, stop, sense, "gene", version))
 138
 139     print "inserting %d gene entries" % len(geneEntries)
 140     atGenome.addGeneEntryBatch(geneEntries)
 141
 142
 143 def loadFeatureEntries(db, gFile):
 144     featureEntries = []
 145     trackedGenes = []
 146     atGenome = Genome("athaliana", dbFile=db)
 147     featureTranslation = {"CDS": "CDS",
 148                           "three_prime_UTR": "3UTR",
 149                           "five_prime_UTR": "5UTR",
 150                           "miRNA": "5UTR",
 151                           "exon": "5UTR"
 152     }
 153
 154     geneFile = open(gFile, "r")
 155     for line in geneFile:
 156         fields = line.split("\t")
 157         (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
 158         if fType in ["ncRNA"]:
 159             (locus, rev) = gid.split(".")
 160             if gid not in trackedGenes:
 161                 trackedGenes.append(locus)
 162
 163     geneFile = open(gFile, "r")
 164     for line in geneFile:
 165         if line[0] == "c" or len(line) < 10:
 166             continue
 167
 168         fields = line.split("\t")
 169         (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
 170         locusField = gid.split('.')
 171         try:
 172             (locus, rev) = locusField
 173             rev = rev.strip()
 174         except:
 175             locus = gid
 176             rev = 1
 177
 178         if fType not in featureTranslation:
 179             continue
 180
 181         elif fType == "exon" and locus not in trackedGenes:
 182             continue
 183
 184         geneID = ("athaliana", locus)
 185         featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType]))
 186
 187     print "inserted %d feature entries" % len(featureEntries)
 188     atGenome.addFeatureEntryBatch(featureEntries)
 189
 190
 191 def loadGeneAnnotations(db, annotPath):
 192     geneAnnotations = []
 193     annotFile = open(annotPath, "r")
 194     annotFile.readline()
 195     lines = annotFile.readlines()
 196     annotFile.close()
 197
 198     atGenome = Genome("athaliana", dbFile=db)
 199     for line in lines:
 200         field = line.split("\t")
 201         try:
 202             orfName = field[0].strip()
 203             if "." in orfName:
 204                 (locus, rev) = orfName.split(".")
 205                 orfName = locus
 206
 207             description = field[2].strip()
 208             geneAnnotations.append((("athaliana", orfName), string.replace(description, "'", "p")))
 209         except:
 210             pass
 211
 212     print "Adding %d annotations" % len(geneAnnotations)
 213     atGenome.addAnnotationBatch(geneAnnotations)
 214
 215
 216 def loadGeneOntology(db, goPath):
 217     atGenome = Genome("athaliana", dbFile=db)
 218     goFile = open(goPath, "r")
 219     goEntries = goFile.readlines()
 220     goArray = []
 221     for line in goEntries:
 222         fields = line.split("\t")
 223         gID = fields[0]
 224         GOID = fields[4]
 225         objType = string.replace(fields[3], "'", "p")
 226         objName = string.replace(fields[2], "'", "p")
 227         isNot = ""
 228         GOterm  = fields[7]
 229         evidence = fields[8]
 230         goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9]))
 231
 232     atGenome.addGoInfoBatch(goArray)
 233
 234
 235 def loadChromosome(db, chromID, chromPath, chromOut):
 236     seqArray = []
 237     atGenome = Genome("athaliana", dbFile=db)
 238
 239     inFile = open(chromPath, "r")
 240     line = inFile.readline()
 241     for line in inFile:
 242         seqArray.append(line.strip())
 243
 244     seq = string.join(seqArray,"")
 245     seqLen = len(seq)
 246     if seqLen < 1:
 247         print "Problems reading sequence from file"
 248
 249     print "writing to file %s" % (chromOut)
 250     outFile = open(cisRoot + chromOut, "w")
 251     outFile.write(seq)
 252     outFile.close()
 253     seq = ""
 254
 255     atGenome.addChromosomeEntry(chromID, chromOut, "file")
 256     # Add alternative chromID - should be A-O and 01-09
 257     atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
 258
 259
 260 def createDBindices(db):
 261     atGenome = Genome("athaliana", dbFile=db)
 262     atGenome.createIndices()