cistematic/genomes/btaurus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Bos Taurus
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/B_taurus/btaurus.genedb" % cisRoot
  41
  42
  43 def buildBtaurusDB(db=geneDB):
  44     genePath = "%s/download/bt2/genscan.txt" % cisRoot
  45     chromoPath = "%s/download/bt2/bosTau2.softmask2.fa" % cisRoot
  46     chromoOutPath = "/B_taurus/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, genePath)
  53
  54     print "Adding gene features"
  55     loadGeneFeatures(db, genePath)
  56
  57     print "Loading sequences"
  58     loadChromosome(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     btGenome = Genome("btaurus",  dbFile=db)
  68     btGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gFile):
  72     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  73
  74     geneEntries = []
  75     btGenome = Genome("btaurus", dbFile=db)
  76     geneFile = open(gFile, "r")
  77
  78     for line in geneFile:
  79         cols = line.split("\t")
  80         gid = cols[0]
  81         start = int(cols[5])
  82         stop = int(cols[6])
  83         sense = cols[2]
  84         chrom = cols[1]
  85         if sense == "+":
  86             sense = "F"
  87         else:
  88             sense = "R"
  89
  90         geneID = ("btaurus", gid)
  91         gidVersion = 1
  92         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  93
  94     print "Adding %d gene entries" % len(geneEntries)
  95     btGenome.addGeneEntryBatch(geneEntries)
  96
  97
  98 def loadGeneFeatures(db, gfile):
  99     geneFile = open(gfile, "r")
 100     senseArray = {"+": "F",
 101                   "-": "R",
 102                   ".": "F"
 103     }
 104
 105     seenArray = []
 106     insertArray = []
 107     for geneLine in geneFile:
 108         geneFields = geneLine.split("\t")
 109         exonNum = int(geneFields[7])
 110         exonStarts = geneFields[8].split(",")
 111         exonStops = geneFields[9].split(",")
 112         chrom = geneFields[1]
 113         sense = senseArray[geneFields[2]]
 114         gstop = int(geneFields[6]) - 1
 115         gstart = int(geneFields[5]) - 1
 116         geneid = geneFields[0]
 117         try:
 118             geneID = ("btaurus", geneid)
 119         except:
 120             continue
 121
 122         gidVersion = "1"
 123         if geneID in seenArray:
 124             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
 125         else:
 126             seenArray.append(geneID)
 127
 128         for index in range(exonNum):
 129             estart = int(exonStarts[index]) - 1
 130             estop = int(exonStops[index]) - 1
 131             if estart >= gstart and estop <= gstop:
 132                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
 133             elif estop <= gstart:
 134                 if sense == "F":
 135                     fType = "5UTR"
 136                 else:
 137                     fType = "3UTR"
 138
 139                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 140             elif estart >= gstop:
 141                 if sense == "F":
 142                     fType = "3UTR"
 143                 else:
 144                     fType = "5UTR"
 145
 146                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 147             elif estart <= gstop:
 148                 if sense == "F":
 149                     fType = "3UTR"
 150                 else:
 151                     fType = "5UTR"
 152
 153                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
 154                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
 155             else:
 156                 if sense == "F":
 157                     fType = "5UTR"
 158                 else:
 159                     fType = "3UTR"
 160
 161                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
 162                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
 163
 164     geneFile.close()
 165
 166     btGenome = Genome("btaurus", dbFile=db)
 167     print 'Adding %d features' % len(insertArray)
 168     btGenome.addFeatureEntryBatch(insertArray)
 169
 170
 171 def loadGeneAnnotations(db, annotPath):
 172     geneAnnotations = []
 173     annotFile = open(annotPath, "r")
 174     btGenome = Genome("btaurus", dbFile=db)
 175     for line in annotFile:
 176         try:
 177             cols = line.split("\t")
 178             locID = cols[0]
 179             geneDesc = cols[6]
 180             if len(locID) > 0:
 181                 geneAnnotations.append((("btaurus", locID), string.replace(geneDesc.strip(), "'", "p")))
 182         except:
 183             pass
 184
 185     print "Adding %d annotations" % len(geneAnnotations)
 186     btGenome.addAnnotationBatch(geneAnnotations)
 187
 188
 189 def loadGeneOntology(db, goPath, goDefPath, annotPath):
 190     btGenome = Genome("btaurus", dbFile=db)
 191     goDefFile = open(goDefPath, "r")
 192     goFile = open(goPath, "r")
 193     annotFile = open(annotPath, "r")
 194     annotEntries = annotFile.readlines()
 195     annotFile.close()
 196     goDefEntries = goDefFile.readlines()
 197     goDefs = {}
 198     locus = {}
 199     goArray = []
 200
 201     for goDefEntry in goDefEntries:
 202         if goDefEntry[0] != "!":
 203             cols = goDefEntry.split("\t")
 204             goDefs[cols[0]] = (cols[1], cols[2].strip())
 205
 206     goEntries = goFile.readlines()
 207     for annotEntry in annotEntries:
 208         try:
 209             cols = annotEntry.split("\t")
 210             locID = cols[0]
 211             geneName = cols[1]
 212             geneDesc = cols[6]
 213             mimID = ""
 214             if len(locID) > 0:
 215                 locus[locID] = (geneName, geneDesc, mimID)
 216         except:
 217             pass
 218
 219     for entry in goEntries:
 220         try:
 221             fields = entry.split("\t")
 222             locID = fields[0].strip()
 223             (gene_name, gene_desc, mimID) = locus[locID]
 224             goArray.append((("btaurus", locID), fields[1], "", gene_name, "", string.replace(goDefs[fields[1]][0], "'", "p"), goDefs[fields[1]][1], mimID))
 225         except:
 226             #print "locus ID %s could not be added" % locID
 227             pass
 228
 229     print "adding %d go entries" % len(goArray)
 230     btGenome.addGoInfoBatch(goArray)
 231
 232
 233 def loadChromosome(db, chromPath, chromOutPath):
 234     seqArray = []
 235     seqLen = 0
 236     btGenome = Genome("btaurus", dbFile=db)
 237     inFile = open(chromPath, "r")
 238     header = inFile.readline()
 239     while header != "":
 240         seqArray = []
 241         seqLen = 0
 242         chromID = header.strip()[1:]
 243         currentLine = inFile.readline()
 244
 245         while currentLine != "" and currentLine[0] != ">":
 246             lineSeq = currentLine.strip()
 247             seqLen += len(lineSeq)
 248             seqArray.append(lineSeq)
 249             currentLine = inFile.readline()
 250
 251         seq = string.join(seqArray, "")
 252         if seqLen < 500000:
 253             print "Added contig %s to database" % chromID
 254             btGenome.addSequence(("btaurus", chromID), seq, "chromosome", str(seqLen))
 255             btGenome.addChromosomeEntry(chromID, chromID, "db")
 256         else:
 257             outFileName = "%s%s.bin" % (chromOutPath, chromID)
 258             outFile = open("%s%s" % (cisRoot, outFileName), "w")
 259             outFile.write(seq)
 260             outFile.close()
 261             print "Added contig file %s to database" % outFileName
 262             btGenome.addChromosomeEntry(chromID, outFileName, "file")
 263
 264         header = currentLine
 265
 266     inFile.close()
 267
 268
 269 def createDBindices(db):
 270     btGenome = Genome("btaurus", dbFile=db)
 271     btGenome.createIndices()