cistematic/genomes/cremanei.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Caenorhaditis remanei
  31 import string, os
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_remanei/cremanei.genedb" % cisRoot
  41
  42
  43 def buildCremaneiDB(db=geneDB):
  44     gffPath = "%s/download/cr01_wu_merged_gff" % cisRoot # using 20050824 version
  45     chromoPath = "%s/download/sctg_masked_seqs/seqs" % cisRoot
  46     chromoOutPath = "/C_remanei/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, gffPath)
  53
  54     print "Adding feature entries"
  55     loadFeatureEntries(db, gffPath)
  56
  57     print "Loading genomic sequence"
  58     loadChromosomes(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     crGenome = Genome("cremanei", version="CR20050824",  dbFile=db)
  68     crGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gffFile):
  72     crGenome = Genome("cremanei", dbFile=db)
  73     geneFile = open(gffFile, "r")
  74     geneStart = {}
  75     geneStop = {}
  76     geneChrom = {}
  77     geneSense = {}
  78     geneEntries = []
  79     for line in geneFile:
  80         if line[0] == "#":
  81             continue
  82
  83         if line[0] == "\n":
  84             continue
  85
  86         field = line[:-1].split("\t")
  87         if field[2] != "CDS":
  88             continue
  89
  90         idfield = field[8].split('"')
  91         gid = idfield[1]
  92         geneID = ("cremanei", gid)
  93         geneStart[geneID] = int(field[3]) - 1
  94         geneStop[geneID] = int(field[4]) - 1
  95         sense = field[6]
  96         geneChrom[geneID] = field[0].strip()
  97         if sense == "+":
  98             geneSense[geneID] = "F"
  99         else:
 100             geneSense[geneID] = "R"
 101
 102     for geneID in geneStart:
 103         if geneID not in geneStop:
 104             print "geneID %s not in geneStop - skipping" % str(geneID)
 105             continue
 106         geneEntries.append((geneID, geneChrom[geneID], geneStart[geneID], geneStop[geneID], geneSense[geneID], "CDS", 1))
 107
 108     print "Adding %d gene entries" % len(geneEntries)
 109     crGenome.addGeneEntryBatch(geneEntries)
 110
 111
 112 def loadFeatureEntries(db, gffFile):
 113     crGenome = Genome("cremanei", dbFile=db)
 114     featureFile = open(gffFile, "r")
 115     featureEntries = []
 116     for line in featureFile:
 117         if line[0] == "#":
 118             continue
 119
 120         if line[0] == "\n":
 121             continue
 122
 123         field = line.split("\t")
 124         if field[2].strip() != "coding_exon":
 125             continue
 126
 127         gidrev = field[8].split('"')
 128         gid = gidrev[1]
 129         geneID = ("cremanei", gid)
 130         gidVersion = 1
 131         start = int(field[3]) - 1
 132         stop = int(field[4]) - 1
 133         sense = field[6]
 134         chrom = field[0].strip()
 135         if sense == "+":
 136             sense = "F"
 137         else:
 138             sense = "R"
 139
 140         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
 141
 142     print "Adding %d feature entries" % len(featureEntries)
 143     crGenome.addFeatureEntryBatch(featureEntries)
 144
 145
 146 def loadChromosomes(db, inPath, chromOutPath):
 147     crGenome = Genome("cremanei", dbFile=db)
 148     scontigList = os.listdir(inPath)
 149     for scontig in scontigList:
 150         seq = ''
 151         seqArray = []
 152         seqLen = 0
 153         inFile = open("%s/%s" % (inPath, scontig), "r")
 154         index = 0
 155         header = inFile.readline()
 156         chromID = header.strip()[1:]
 157         while header != "":
 158             seqArray = []
 159             seqLen = 0
 160             currentLine = inFile.readline()
 161             while currentLine != "" and currentLine[0] != ">":
 162                 lineSeq = currentLine.strip()
 163                 seqLen += len(lineSeq)
 164                 seqArray.append(lineSeq)
 165                 currentLine = inFile.readline()
 166
 167             seq = string.join(seqArray, "")
 168             if seqLen < 100000:
 169                 print "Added contig %s to database" % chromID
 170                 crGenome.addSequence(("cremanei", chromID), seq, "chromosome", str(seqLen))
 171                 crGenome.addChromosomeEntry(chromID, chromID, "db")
 172             else:
 173                 outFileName = "%s%s.bin" % (chromOutPath, chromID)
 174                 outFile = open("%s%s" % (cisRoot, outFileName), "w")
 175                 outFile.write(seq)
 176                 outFile.close()
 177                 print "Added contig file %s to database" % outFileName
 178                 crGenome.addChromosomeEntry(chromID, outFileName, "file")
 179
 180             index += 1
 181             header = currentLine
 182
 183         inFile.close()
 184
 185
 186 def createDBindices(db):
 187     crGenome = Genome("cremanei", version="CR20050824", dbFile=db)
 188     crGenome.createIndices()