cistematic/genomes/cbriggsae.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis briggsae
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
  41
  42
  43 def buildCbriggsaeDB(db=geneDB):
  44     gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
  45     chromoPath = "%s/download/briggsae_25.fa" % cisRoot
  46     chromoOutPath = "/C_briggsae/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, gffPath)
  53
  54     print "Adding feature entries"
  55     loadFeatureEntries(db, gffPath)
  56
  57     print "Loading genomic sequence"
  58     loadChromosome(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
  68     cbGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gffFile):
  72     cbGenome = Genome("cbriggsae", dbFile=db)
  73     geneFile = open(gffFile, "r")
  74     geneEntries = []
  75     geneAnnotations = []
  76     for line in geneFile:
  77         field = line[:-1].split("\t")
  78         if field[1] != "hybrid":
  79             continue
  80
  81         if field[2] != "CDS":
  82             continue
  83
  84         annot = field[8].split('"')
  85         gid = annot[1]
  86         geneID = ("cbriggsae", gid)
  87         annotation = string.join(annot[3:], "")
  88         gidVersion = 1
  89         try:
  90             gidVersion = giddots[2]
  91         except:
  92             pass
  93
  94         start = int(field[3]) - 1
  95         stop = int(field[4]) - 1
  96         sense = field[6]
  97         chrom = field[0].strip()
  98         if sense == "+":
  99             sense = "F"
 100         else:
 101             sense = "R"
 102
 103         if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
 104             geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
 105         if (geneID, annotation) not in geneAnnotations:
 106             geneAnnotations.append((geneID, annotation))
 107
 108     print "Adding %d gene entries" % len(geneEntries)
 109     cbGenome.addGeneEntryBatch(geneEntries)
 110
 111     print "Adding %d annotations" % len(geneAnnotations)
 112     cbGenome.addAnnotationBatch(geneAnnotations)
 113
 114
 115 def loadFeatureEntries(db, gffFile):
 116     cbGenome = Genome("cbriggsae", dbFile=db)
 117     featureFile = open(gffFile, "r")
 118     featureEntries = []
 119     seenFeatures = {}
 120     featureTranslation = {"coding_exon": "CDS",
 121                           "three_prime_UTR": "3UTR",
 122                           "five_prime_UTR": "5UTR"
 123     }
 124
 125     for line in featureFile:
 126         field = line.split("\t")
 127         if field[1] != "hybrid":
 128             continue
 129
 130         if field[2].strip() not in featureTranslation:
 131             continue
 132
 133         featureType = featureTranslation[field[2].strip()]
 134         gidrev = field[8].split('"')
 135         gid = gidrev[1]
 136         geneID = ("cbriggsae", gid)
 137         gidVersion = 1
 138         start = int(field[3]) - 1
 139         stop = int(field[4]) - 1
 140         sense = field[6]
 141         chrom = field[0].strip()
 142         if sense == "+":
 143             sense = "F"
 144         else:
 145             sense = "R"
 146
 147         if geneID not in seenFeatures:
 148             seenFeatures[geneID] = []
 149         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
 150             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
 151             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
 152
 153     print "Adding %d feature entries" % len(featureEntries)
 154     cbGenome.addFeatureEntryBatch(featureEntries)
 155
 156
 157 def loadChromosome(db, chromPath, chromOutPath):
 158     seqArray = []
 159     seqLen = 0
 160     cbGenome = Genome("cbriggsae", dbFile=db)
 161     inFile = open(chromPath, "r")
 162     header = inFile.readline()
 163     while header != "":
 164         seqArray = []
 165         seqLen = 0
 166         chromID = header.strip()[1:]
 167         currentLine = inFile.readline()
 168         while currentLine != "" and currentLine[0] != ">":
 169             lineSeq = currentLine.strip()
 170             seqLen += len(lineSeq)
 171             seqArray.append(lineSeq)
 172             currentLine = inFile.readline()
 173
 174         seq = string.join(seqArray, "")
 175         if seqLen < 900000:
 176             print "Added contig %s to database" % chromID
 177             cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
 178             cbGenome.addChromosomeEntry(chromID, chromID, "db")
 179         else:
 180             outFileName = "%s%s.bin" % (chromOutPath, chromID)
 181             outFile = open("%s%s" % (cisRoot, outFileName), "w")
 182             outFile.write(seq)
 183             outFile.close()
 184             print "Added contig file %s to database" % outFileName
 185             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
 186
 187         header = currentLine
 188
 189     inFile.close()
 190
 191
 192 def createDBindices(db):
 193     cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
 194     cbGenome.createIndices()