cistematic/genomes/cbriggsae.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis briggsae
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_briggsae/cbriggsae.genedb" % cisRoot
  41
  42
  43 def loadChromosome(db, chromPath, chromOutPath):
  44     seqArray = []
  45     seqLen = 0
  46     cbGenome = Genome("cbriggsae", dbFile=db)
  47     inFile = open(chromPath, "r")
  48     header = inFile.readline()
  49     while header != "":
  50         seqArray = []
  51         seqLen = 0
  52         chromID = header.strip()[1:]
  53         currentLine = inFile.readline()
  54         while currentLine != "" and currentLine[0] != ">":
  55             lineSeq = currentLine.strip()
  56             seqLen += len(lineSeq)
  57             seqArray.append(lineSeq)
  58             currentLine = inFile.readline()
  59
  60         seq = string.join(seqArray, "")
  61         if seqLen < 900000:
  62             print "Added contig %s to database" % chromID
  63             cbGenome.addSequence(("cbriggsae", chromID), seq, "chromosome", str(seqLen))
  64             cbGenome.addChromosomeEntry(chromID, chromID, "db")
  65         else:
  66             outFileName = "%s%s.bin" % (chromOutPath, chromID)
  67             outFile = open("%s%s" % (cisRoot, outFileName), "w")
  68             outFile.write(seq)
  69             outFile.close()
  70             print "Added contig file %s to database" % outFileName
  71             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
  72
  73         header = currentLine
  74
  75     inFile.close()
  76
  77
  78 def loadGeneEntries(db, gffFile):
  79     cbGenome = Genome("cbriggsae", dbFile=db)
  80     geneFile = open(gffFile, "r")
  81     geneEntries = []
  82     geneAnnotations = []
  83     for line in geneFile:
  84         field = line[:-1].split("\t")
  85         if field[1] != "hybrid":
  86             continue
  87
  88         if field[2] != "CDS":
  89             continue
  90
  91         annot = field[8].split('"')
  92         gid = annot[1]
  93         geneID = ("cbriggsae", gid)
  94         annotation = string.join(annot[3:], "")
  95         gidVersion = 1
  96         try:
  97             gidVersion = giddots[2]
  98         except:
  99             pass
 100
 101         start = int(field[3]) - 1
 102         stop = int(field[4]) - 1
 103         sense = field[6]
 104         chrom = field[0].strip()
 105         if sense == "+":
 106             sense = "F"
 107         else:
 108             sense = "R"
 109
 110         if (geneID, chrom, start, stop, sense, "CDS", gidVersion) not in geneEntries:
 111             geneEntries.append((geneID, chrom, start, stop, sense, "CDS", gidVersion))
 112         if (geneID, annotation) not in geneAnnotations:
 113             geneAnnotations.append((geneID, annotation))
 114
 115     print "Adding %d gene entries" % len(geneEntries)
 116     cbGenome.addGeneEntryBatch(geneEntries)
 117
 118     print "Adding %d annotations" % len(geneAnnotations)
 119     cbGenome.addAnnotationBatch(geneAnnotations)
 120
 121
 122 def loadFeatureEntries(db, gffFile):
 123     cbGenome = Genome("cbriggsae", dbFile=db)
 124     featureFile = open(gffFile, "r")
 125     featureEntries = []
 126     seenFeatures = {}
 127     featureTranslation = {"coding_exon": "CDS",
 128                           "three_prime_UTR": "3UTR",
 129                           "five_prime_UTR": "5UTR"
 130     }
 131
 132     for line in featureFile:
 133         field = line.split("\t")
 134         if field[1] != "hybrid":
 135             continue
 136
 137         if field[2].strip() not in featureTranslation:
 138             continue
 139
 140         featureType = featureTranslation[field[2].strip()]
 141         gidrev = field[8].split('"')
 142         gid = gidrev[1]
 143         geneID = ("cbriggsae", gid)
 144         gidVersion = 1
 145         start = int(field[3]) - 1
 146         stop = int(field[4]) - 1
 147         sense = field[6]
 148         chrom = field[0].strip()
 149         if sense == "+":
 150             sense = "F"
 151         else:
 152             sense = "R"
 153
 154         if geneID not in seenFeatures:
 155             seenFeatures[geneID] = []
 156         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
 157             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
 158             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
 159
 160     print "Adding %d feature entries" % len(featureEntries)
 161     cbGenome.addFeatureEntryBatch(featureEntries)
 162
 163
 164 def createDBFile(db):
 165     cbGenome = Genome("cbriggsae", version="CB25",  dbFile=db)
 166     cbGenome.createGeneDB(db)
 167
 168
 169 def createDBindices(db):
 170     cbGenome = Genome("cbriggsae", version="CB25", dbFile=db)
 171     cbGenome.createIndices()
 172
 173
 174 def buildCbriggsaeDB(db=geneDB):
 175     gffPath = "%s/download/briggsae_25.WS132.gff" % cisRoot
 176     chromoPath = "%s/download/briggsae_25.fa" % cisRoot
 177     chromoOutPath = "/C_briggsae/"
 178
 179     print "Creating database %s" % db
 180     createDBFile(db)
 181
 182     print "Adding gene entries"
 183     loadGeneEntries(db, gffPath)
 184
 185     print "Adding feature entries"
 186     loadFeatureEntries(db, gffPath)
 187
 188     print "Loading genomic sequence"
 189     loadChromosome(db, chromoPath, chromoOutPath)
 190
 191     print "Creating Indices"
 192     createDBindices(db)
 193
 194     print "Finished creating database %s" % db