cistematic/genomes/mdomestica.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Monodelphis domestica
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot
  41
  42
  43 def buildMdomesticaDB(db=geneDB):
  44     genePath = "%s/download/mondom/genscan.txt" % cisRoot
  45     chromoPath = "%s/download/mondom/softMask.fa" % cisRoot
  46     chromoOutPath = "/M_domestica/"
  47
  48     print "Creating database %s" % db
  49     createDBFile(db)
  50
  51     print "Adding gene entries"
  52     loadGeneEntries(db, genePath)
  53
  54     print "Adding gene features"
  55     loadGeneFeatures(db, genePath)
  56
  57     print "Loading chromosomes"
  58     loadChromosome(db, chromoPath, chromoOutPath)
  59
  60     print "Creating Indices"
  61     createDBindices(db)
  62
  63     print "Finished creating database %s" % db
  64
  65
  66 def createDBFile(db):
  67     mdGenome = Genome("mdomestica",  dbFile=db)
  68     mdGenome.createGeneDB(db)
  69
  70
  71 def loadGeneEntries(db, gFile):
  72     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  73
  74     geneEntries = []
  75     mdGenome = Genome("mdomestica", dbFile=db)
  76     geneFile = open(gFile, "r")
  77     for line in geneFile:
  78         cols = line.split("\t")
  79         gid = cols[0]
  80         start = int(cols[5])
  81         stop = int(cols[6])
  82         sense = cols[2]
  83         chrom = cols[1]
  84         if sense == "+":
  85             sense = "F"
  86         else:
  87             sense = "R"
  88
  89         geneID = ("mdomestica", gid)
  90         gidVersion = 1
  91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  92
  93     print "Adding %d gene entries" % len(geneEntries)
  94     mdGenome.addGeneEntryBatch(geneEntries)
  95
  96
  97 def loadGeneAnnotations(db, annotPath):
  98     geneAnnotations = []
  99     annotFile = open(annotPath, "r")
 100     mdGenome = Genome("mdomestica", dbFile=db)
 101     for line in annotFile:
 102         try:
 103             cols = line.split("\t")
 104             locID = cols[0]
 105             geneDesc = cols[6]
 106             if len(locID) > 0:
 107                 geneAnnotations.append((("mdomestica", locID), string.replace(geneDesc.strip(), "'", "p")))
 108         except:
 109             pass
 110
 111     print "Adding %d annotations" % len(geneAnnotations)
 112     mdGenome.addAnnotationBatch(geneAnnotations)
 113
 114
 115 def loadGeneFeatures(db, gfile):
 116     geneFile = open(gfile, "r")
 117     senseArray = {"+": "F",
 118                   "-": "R",
 119                   ".": "F"
 120     }
 121
 122     seenArray = []
 123     insertArray = []
 124     for geneLine in geneFile:
 125         geneFields = geneLine.split("\t")
 126         exonNum = int(geneFields[7])
 127         exonStarts = geneFields[8].split(",")
 128         exonStops = geneFields[9].split(",")
 129         chrom = geneFields[1]
 130         sense = senseArray[geneFields[2]]
 131         gstop = int(geneFields[6]) - 1
 132         gstart = int(geneFields[5]) - 1
 133         geneid = geneFields[0]
 134         try:
 135             geneID = ("mdomestica", geneid)
 136         except:
 137             continue
 138
 139         gidVersion = "1"
 140         if geneID in seenArray:
 141             gidVersion = "2" # doesn't deal with more than 2 refseq's for the same locus, yet.
 142         else:
 143             seenArray.append(geneID)
 144
 145         for index in range(exonNum):
 146             estart = int(exonStarts[index]) - 1
 147             estop = int(exonStops[index]) - 1
 148             if estart >= gstart and estop <= gstop:
 149                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
 150             elif estop <= gstart:
 151                 if sense == "F":
 152                     fType = "5UTR"
 153                 else:
 154                     fType = "3UTR"
 155
 156                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 157             elif estart >= gstop:
 158                 if sense == "F":
 159                     fType = "3UTR"
 160                 else:
 161                     fType = "5UTR"
 162
 163                 insertArray.append((geneID, 1, chrom, estart, estop, sense, fType))
 164             elif estart <= gstop:
 165                 if sense == "F":
 166                     fType = "3UTR"
 167                 else:
 168                     fType = "5UTR"
 169
 170                 insertArray.append((geneID, 1, chrom, estart, gstop, sense, "CDS"))
 171                 insertArray.append((geneID, 1, chrom, gstop + 1, estop, sense, fType))
 172             else:
 173                 if sense == "F":
 174                     fType = "5UTR"
 175                 else:
 176                     fType = "3UTR"
 177
 178                 insertArray.append((geneID, 1, chrom, gstart, estop, sense, "CDS"))
 179                 insertArray.append((geneID, 1, chrom, estart, gstart - 1, sense, fType))
 180
 181     geneFile.close()
 182     mdGenome = Genome("mdomestica", dbFile=db)
 183     print "Adding %d features" % len(insertArray)
 184     mdGenome.addFeatureEntryBatch(insertArray)
 185
 186
 187 def loadChromosome(db, chromPath, chromOutPath):
 188     seqArray = []
 189     seqLen = 0
 190     mdGenome = Genome("mdomestica", dbFile=db)
 191     inFile = open(chromPath, "r")
 192     header = inFile.readline()
 193     while header != "":
 194         seqArray = []
 195         seqLen = 0
 196         chromID = header.strip()[1:]
 197         currentLine = inFile.readline()
 198         while currentLine != "" and currentLine[0] != ">":
 199             lineSeq = currentLine.strip()
 200             seqLen += len(lineSeq)
 201             seqArray.append(lineSeq)
 202             currentLine = inFile.readline()
 203
 204         seq = string.join(seqArray, "")
 205         if seqLen < 500000:
 206             print "Added contig %s to database" % chromID
 207             mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen))
 208             mdGenome.addChromosomeEntry(chromID, chromID, "db")
 209         else:
 210             outFileName = "%s%s.bin" % (chromOutPath, chromID)
 211             outFile = open("%s%s" % (cisRoot, outFileName), "w")
 212             outFile.write(seq)
 213             outFile.close()
 214             print "Added contig file %s to database" % outFileName
 215             mdGenome.addChromosomeEntry(chromID, outFileName, "file")
 216
 217         header = currentLine
 218
 219     inFile.close()
 220
 221
 222 def createDBindices(db):
 223     mdGenome = Genome("mdomestica", dbFile=db)
 224     mdGenome.createIndices()