cistematic/genomes/ecaballus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Equus Caballus
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/E_caballus/ecaballus.genedb" % cisRoot
  41
  42
  43 def buildHorseDB(db=geneDB):
  44     genePath = "%s/download/seq_gene.md" % cisRoot
  45
  46     print "Creating database %s" % db
  47     createDBFile(db)
  48
  49     print "Adding gene entries"
  50     loadGeneEntries(db, genePath)
  51
  52     print "Adding gene features"
  53     loadGeneFeatures(db, genePath)
  54
  55     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  56                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
  57                  "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
  58                  "31", "M", "X", "Un"
  59     ]
  60     for chromID in chromList:
  61         print "Loading chromosome %s" % chromID
  62         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
  63         loadChromosome(db, chromID, chromPath, "/E_caballus/chromo%s.bin" % chromID)
  64
  65     print "Creating Indices"
  66     createDBindices(db)
  67
  68     print "Finished creating database %s" % db
  69
  70
  71 def createDBFile(db):
  72     ecGenome = Genome("ecaballus",  dbFile=db)
  73     ecGenome.createGeneDB(db)
  74
  75
  76 def loadGeneEntries(db, gFile):
  77     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  78
  79     geneEntries = []
  80     alreadySeen = []
  81     ecGenome = Genome("ecaballus", dbFile=db)
  82     geneFile = open(gFile, "r")
  83     geneFile.readline()
  84     for line in geneFile:
  85         cols = line.split("\t")
  86         if cols[11].strip() != "GENE":
  87             continue
  88
  89         name = cols[10].split(":")
  90         gid = name[1]
  91         if gid == "" or gid in alreadySeen:
  92             continue
  93
  94         alreadySeen.append(gid)
  95         start = int(cols[2]) - 1
  96         stop = int(cols[3]) - 1
  97         sense = cols[4]
  98         chrom = cols[1].strip()
  99         if sense == "+":
 100             sense = "F"
 101         else:
 102             sense = "R"
 103
 104         geneID = ("ecaballus", gid)
 105         gidVersion = 1
 106         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 107
 108     print "Adding %d gene entries" % len(geneEntries)
 109     ecGenome.addGeneEntryBatch(geneEntries)
 110
 111
 112 def loadGeneFeatures(db, gFile):
 113     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 114     """
 115     featureEntries = []
 116     ecGenome = Genome("ecaballus", dbFile=db)
 117     featureFile = open(gFile, "r")
 118     featureFile.readline()
 119     for line in featureFile:
 120         cols = line.split("\t")
 121         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
 122             continue
 123
 124         fType = cols[11]
 125         name = cols[10].split(":")
 126         gid = name[1]
 127         if gid == "":
 128             continue
 129
 130         start = int(cols[2]) - 1
 131         stop = int(cols[3]) - 1
 132         sense = cols[4]
 133         chrom = cols[1].strip()
 134         if sense == "+":
 135             sense = "F"
 136         else:
 137             sense = "R"
 138
 139         geneID = ("ecaballus", gid)
 140         gidVersion = 1
 141         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 142
 143     print "Adding %d feature entries" % len(featureEntries)
 144     ecGenome.addFeatureEntryBatch(featureEntries)
 145
 146
 147 def loadChromosome(db, chromID, chromPath, chromOut):
 148     seqArray = []
 149     ecGenome = Genome("ecaballus", dbFile=db)
 150     inFile = open(chromPath, "r")
 151     line = inFile.readline()
 152     for line in inFile:
 153         seqArray.append(line.strip())
 154
 155     seq = string.join(seqArray, "")
 156     seqLen = len(seq)
 157     if seqLen < 1:
 158         print "Problems reading sequence from file"
 159
 160     print "writing to file %s" % chromOut
 161     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 162     outFile.write(seq)
 163     outFile.close()
 164     ecGenome.addChromosomeEntry(chromID, chromOut, "file")
 165
 166
 167 def createDBindices(db):
 168     ecGenome = Genome("ecaballus", dbFile=db)
 169     ecGenome.createIndices()