cistematic/genomes/cbrenneri.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis remanei
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_brenneri/cbrenneri.genedb" % cisRoot
  41
  42
  43 def buildCbrenneriDB(db=geneDB):
  44     gffPath = "%s/download/PB2801_2007feb09.gff" % cisRoot # using EMS special version
  45     chromoPath = "%s/download/PB2801_supercontigs.fa" % cisRoot
  46     chromoOutPath = "/C_brenneri/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, gffPath)
  53
  54     print "Adding feature entries"
  55     loadFeatureEntries(db, gffPath)
  56
  57     print "Loading genomic sequence"
  58     loadChromosome(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     cbGenome = Genome("cbrenneri", version="PB2801_001",  dbFile=db)
  68     cbGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gffFile):
  72     cbGenome = Genome("cbrenneri", dbFile=db)
  73     geneFile = open(gffFile, "r")
  74     geneStart = {}
  75     geneStop = {}
  76     geneChrom = {}
  77     geneSense = {}
  78     geneEntries = []
  79     for line in geneFile:
  80         if line[0] == "#":
  81             continue
  82
  83         if line[0] == "\n":
  84             continue
  85
  86         field = line[:-1].split("\t")
  87         if field[2] != "stop_codon" and field[2] != "start_codon":
  88             continue
  89
  90         idfield = field[8].split('"')
  91         gid = idfield[1]
  92         geneID = ("cbrenneri", gid)
  93         sense = field[6]
  94         geneChrom[geneID] = field[0].strip()
  95         if sense == "+":
  96             geneSense[geneID] = "F"
  97         else:
  98             geneSense[geneID] = "R"
  99
 100         if field[2] == "start_codon":
 101             if sense == "+":
 102                 geneStart[geneID] = int(field[3])
 103             else:
 104                 geneStart[geneID] = int(field[4])
 105         else:
 106             if sense == "+":
 107                 geneStop[geneID] = int(field[3])
 108             else:
 109                 geneStop[geneID] = int(field[4])
 110
 111     for geneID in geneStart:
 112         if geneID not in geneStop:
 113             print "geneID %s not in geneStop - skipping" % str(geneID)
 114             continue
 115
 116         geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
 117
 118     print "Adding %d gene entries" % len(geneEntries)
 119     cbGenome.addGeneEntryBatch(geneEntries)
 120
 121
 122 def loadFeatureEntries(db, gffFile):
 123     cbGenome = Genome("cbrenneri", dbFile=db)
 124     featureFile = open(gffFile, "r")
 125     featureEntries = []
 126     for line in featureFile:
 127         if line[0] == "#":
 128             continue
 129
 130         if line[0] == "\n":
 131             continue
 132
 133         field = line.split("\t")
 134         if field[2].strip() != "CDS":
 135             continue
 136
 137         gidrev = field[8].split('"')
 138         gid = gidrev[1]
 139         geneID = ("cbrenneri", gid)
 140         gidVersion = 1
 141
 142         start = int(field[3])
 143         stop = int(field[4])
 144
 145         sense = field[6]
 146         chrom = field[0].strip()
 147         if sense == "+":
 148             sense = "F"
 149         else:
 150             sense = "R"
 151
 152         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
 153
 154     print "Adding %d feature entries" % len(featureEntries)
 155     cbGenome.addFeatureEntryBatch(featureEntries)
 156
 157
 158 def loadChromosome(db, chromPath, chromOutPath):
 159     seqArray = []
 160     seqLen = 0
 161     cbGenome = Genome("cbrenneri", dbFile=db)
 162     inFile = open(chromPath, "r")
 163     header = inFile.readline()
 164     while header != "":
 165         seqArray = []
 166         seqLen = 0
 167         chromHeader = header.strip()[1:].split()
 168         chromID = chromHeader[0]
 169         currentLine = inFile.readline()
 170
 171         while currentLine != "" and currentLine[0] != ">":
 172             lineSeq = currentLine.strip()
 173             seqLen += len(lineSeq)
 174             seqArray.append(lineSeq)
 175             currentLine = inFile.readline()
 176
 177         seq = string.join(seqArray, "")
 178         if seqLen < 100000:
 179             print "Added contig %s to database" % chromID
 180             cbGenome.addSequence(("cbrenneri", chromID), seq, "chromosome", str(seqLen))
 181             cbGenome.addChromosomeEntry(chromID, chromID, "db")
 182         else:
 183             outFileName = "%s%s.bin" % (chromOutPath, chromID)
 184             outFile = open( "%s%s" % (cisRoot, outFileName), "w")
 185             outFile.write(seq)
 186             outFile.close()
 187             print "Added contig file %s to database" % outFileName
 188             cbGenome.addChromosomeEntry(chromID, outFileName, "file")
 189
 190         header = currentLine
 191
 192     inFile.close()
 193
 194
 195 def createDBindices(db):
 196     cbGenome = Genome("cbrenneri", version="PB2801_001", dbFile=db)
 197     cbGenome.createIndices()