cistematic/genomes/xtropicalis.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Xenopus tropicalis
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/X_tropicalis/xtropicalis.genedb" % cisRoot
  41
  42
  43 def buildXtropicalisDB(db=geneDB):
  44     genePath = "%s/download/xt1/jgiFilteredModels.txt" % cisRoot
  45     chromoPath = "%s/download/xt1/xenTro1.softmask2.fa" % cisRoot
  46     chromoOutPath = "/X_tropicalis/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, genePath)
  53
  54     print "Adding gene features"
  55     loadGeneFeatures(db, genePath)
  56
  57     print "Loading sequences"
  58     loadChromosome(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     xtGenome = Genome("xtropicalis",  dbFile=db)
  68     xtGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gFile):
  72     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  73
  74     geneEntries = []
  75     xtGenome = Genome("xtropicalis", dbFile=db)
  76     geneFile = open(gFile, "r")
  77     for line in geneFile:
  78         cols = line.split("\t")
  79         gid = cols[0]
  80         start = int(cols[5])
  81         stop = int(cols[6])
  82         sense = cols[2]
  83         chrom = cols[1]
  84         if sense == "+":
  85             sense = "F"
  86         else:
  87             sense = "R"
  88
  89         geneID = ("xtropicalis", gid)
  90         gidVersion = 1
  91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  92
  93     print "Adding %d gene entries" % len(geneEntries)
  94     xtGenome.addGeneEntryBatch(geneEntries)
  95
  96
  97 def loadGeneAnnotations(db, annotPath):
  98     geneAnnotations = []
  99     annotFile = open(annotPath, "r")
 100     xtGenome = Genome("xtropicalis", dbFile=db)
 101     for line in annotFile:
 102         try:
 103             cols = line.split("\t")
 104             locID = cols[0]
 105             geneDesc = cols[6]
 106             if len(locID) > 0:
 107                 geneAnnotations.append((("xtropicalis", locID), string.replace(geneDesc.strip(), "'", "p")))
 108         except:
 109             pass
 110
 111     print "Adding %d annotations" % len(geneAnnotations)
 112     xtGenome.addAnnotationBatch(geneAnnotations)
 113
 114
 115 def loadGeneFeatures(db, gfile):
 116     geneFile = open(gfile, "r")
 117     senseArray = {"+": "F",
 118                   "-": "R",
 119                   ".": "F"
 120     }
 121
 122     seenArray = []
 123     insertArray = []
 124     for geneLine in geneFile:
 125         geneFields = geneLine.split("\t")
 126         exonNum = int(geneFields[7])
 127         exonStarts = geneFields[8].split(",")
 128         exonStops = geneFields[9].split(",")
 129         chrom = geneFields[1]
 130         sense = senseArray[geneFields[2]]
 131         gstop = int(geneFields[6]) - 1
 132         gstart = int(geneFields[5]) - 1
 133         geneid = geneFields[0]
 134         try:
 135             geneID = ("xtropicalis", geneid)
 136         except:
 137             continue
 138
 139         gidVersion = "1"
 140         if geneID in seenArray:
 141             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
 142         else:
 143             seenArray.append(geneID)
 144
 145         for index in range(exonNum):
 146             estart = int(exonStarts[index]) - 1
 147             estop = int(exonStops[index]) - 1
 148             if estart >= gstart and estop <= gstop:
 149                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
 150             elif estop <= gstart:
 151                 if sense == "F":
 152                     fType = "5UTR"
 153                 else:
 154                     fType = "3UTR"
 155
 156                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 157             elif estart >= gstop:
 158                 if sense == "F":
 159                     fType = "3UTR"
 160                 else:
 161                     fType = "5UTR"
 162
 163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 164             elif estart <= gstop:
 165                 if sense == "F":
 166                     fType = "3UTR"
 167                 else:
 168                     fType = "5UTR"
 169
 170                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
 171                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
 172             else:
 173                 if sense == "F":
 174                     fType = "5UTR"
 175                 else:
 176                     fType = "3UTR"
 177
 178                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
 179                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
 180
 181     geneFile.close()
 182     xtGenome = Genome("xtropicalis", dbFile=db)
 183     print "Adding %d features" % len(insertArray)
 184     xtGenome.addFeatureEntryBatch(insertArray)
 185
 186
 187 def loadGeneOntology(db, goPath, goDefPath, annotPath):
 188     xtGenome = Genome("xtropicalis", dbFile=db)
 189     goDefFile = open(goDefPath, "r")
 190     goFile = open(goPath, "r")
 191     annotFile = open(annotPath, "r")
 192     annotEntries = annotFile.readlines()
 193     annotFile.close()
 194     goDefEntries = goDefFile.readlines()
 195     goDefs = {}
 196     locus = {}
 197     goArray = []
 198     for goDefEntry in goDefEntries:
 199         if goDefEntry[0] != "!":
 200             cols = goDefEntry.split("\t")
 201             goDefs[cols[0]] = (cols[1], cols[2].strip())
 202
 203     goEntries = goFile.readlines()
 204     for annotEntry in annotEntries:
 205         try:
 206             cols = annotEntry.split("\t")
 207             locID = cols[0]
 208             geneName = cols[1]
 209             geneDesc = cols[6]
 210             mimID = ""
 211             if len(locID) > 0:
 212                 locus[locID] = (geneName, geneDesc, mimID)
 213         except:
 214             pass
 215
 216     for entry in goEntries:
 217         try:
 218             fields = entry.split("\t")
 219             locID = fields[0].strip()
 220             (gene_name, gene_desc, mimID) = locus[locID]
 221             goArray.append((("xtropicalis", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
 222         except:
 223             pass
 224
 225     print "adding %d go entries" % len(goArray)
 226     xtGenome.addGoInfoBatch(goArray)
 227
 228
 229 def loadChromosome(db, chromPath, chromOutPath):
 230     seqArray = []
 231     seqLen = 0
 232     xtGenome = Genome("xtropicalis", dbFile=db)
 233     inFile = open(chromPath, "r")
 234     header = inFile.readline()
 235     while header != "":
 236         seqArray = []
 237         seqLen = 0
 238         chromID = header.strip()[1:]
 239         currentLine = inFile.readline()
 240         while currentLine != "" and currentLine[0] != ">":
 241             lineSeq = currentLine.strip()
 242             seqLen += len(lineSeq)
 243             seqArray.append(lineSeq)
 244             currentLine = inFile.readline()
 245
 246         seq = string.join(seqArray, "")
 247         if seqLen < 500000:
 248             print "Added contig %s to database" % chromID
 249             xtGenome.addSequence(("xtropicalis", chromID), seq, "chromosome", str(seqLen))
 250             xtGenome.addChromosomeEntry(chromID, chromID, "db")
 251         else:
 252             outFileName = "%s%s.bin" % (chromOutPath, chromID)
 253             outFile = open("%s%s" % (cisRoot, outFileName), "w")
 254             outFile.write(seq)
 255             outFile.close()
 256             print "Added contig file %s to database" % outFileName
 257             xtGenome.addChromosomeEntry(chromID, outFileName, "file")
 258
 259         header = currentLine
 260
 261     inFile.close()
 262
 263
 264 def createDBindices(db):
 265     xtGenome = Genome("xtropicalis", dbFile=db)
 266     xtGenome.createIndices()