cistematic/genomes/ggallus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Gallus gallus
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB = "%s/G_gallus/ggallus.genedb" % cisRoot
  42
  43
  44 def buildChickenDB(db=geneDB):
  45     genePath = "%s/download/seq_gene.md" % cisRoot
  46     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
  47     goPath = "%s/download/gene2go" % cisRoot # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
  48
  49     print "Creating database %s" % db
  50     createDBFile(db)
  51
  52     print "Adding gene entries"
  53     loadGeneEntries(db, genePath)
  54
  55     #print "Adding gene annotations"
  56     #loadGeneAnnotations(db, annotPath)
  57
  58     print "Adding gene features"
  59     loadGeneFeatures(db, genePath)
  60
  61     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  62                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
  63                  "21", "22", "23", "24", "25", "26", "27", "28",
  64                  "32", "W", "Z", "M", "E22C19W28_E50C23", "E64",
  65                  "1_random", "2_random", "4_random", "5_random", "6_random",
  66                  "7_random", "8_random", "10_random", "11_random", "12_random",
  67                  "13_random", "16_random", "17_random", "18_random", "20_random",
  68                  "22_random", "25_random", "28_random", "Un_random", "W_random",
  69                  "E64_random", "Z_random", "E22C19W28_E50C23_random"
  70     ]
  71     for chromID in chromList:
  72         print "Loading chromosome %s" % chromID
  73         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
  74         loadChromosome(db, chromID, chromPath, "/G_gallus/chromo%s.bin" % chromID)
  75
  76     print "Adding gene ontology"
  77     loadGeneOntology(db, goPath, goDefPath)
  78
  79     print "Creating Indices"
  80     createDBindices(db)
  81
  82     print "Finished creating database %s" % db
  83
  84
  85 def createDBFile(db):
  86     ggGenome = Genome("ggallus",  dbFile=db)
  87     ggGenome.createGeneDB(db)
  88
  89
  90 def loadGeneEntries(db, gFile):
  91     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  92
  93     geneEntries = []
  94     alreadySeen = []
  95     ggGenome = Genome("ggallus", dbFile=db)
  96     geneFile = open(gFile, "r")
  97     geneFile.readline()
  98     for line in geneFile:
  99         if "|" in line:
 100             continue
 101
 102         cols = line.split("\t")
 103         if cols[11].strip() != "GENE":
 104             continue
 105
 106         name = cols[10].split(":")
 107         gid = name[1]
 108         if gid == "" or gid in alreadySeen:
 109             continue
 110
 111         alreadySeen.append(gid)
 112         start = int(cols[2]) - 1
 113         stop = int(cols[3]) - 1
 114         sense = cols[4]
 115         chrom = cols[1].strip()
 116         if sense == "+":
 117             sense = "F"
 118         else:
 119             sense = "R"
 120
 121         geneID = ("ggallus", gid)
 122         gidVersion = 1
 123         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 124
 125     print "Adding %d gene entries" % len(geneEntries)
 126     ggGenome.addGeneEntryBatch(geneEntries)
 127
 128
 129 def loadGeneFeatures(db, gFile):
 130     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 131     """
 132     featureEntries = []
 133     ggGenome = Genome("ggallus", dbFile=db)
 134     featureFile = open(gFile, "r")
 135     featureFile.readline()
 136     for line in featureFile:
 137         if "|" in line:
 138             continue
 139
 140         cols = line.split("\t")
 141         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
 142             continue
 143
 144         fType = cols[11]
 145         name = cols[10].split(":")
 146         gid = name[1]
 147         if gid == "":
 148             continue
 149
 150         start = int(cols[2]) - 1
 151         stop = int(cols[3]) - 1
 152         sense = cols[4]
 153         chrom = cols[1].strip()
 154         if sense == "+":
 155             sense = "F"
 156         else:
 157             sense = "R"
 158
 159         geneID = ("ggallus", gid)
 160         gidVersion = 1
 161         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 162
 163     print "Adding %d feature entries" % len(featureEntries)
 164     ggGenome.addFeatureEntryBatch(featureEntries)
 165
 166
 167 def loadChromosome(db, chromID, chromPath, chromOut):
 168     seqArray = []
 169     ggGenome = Genome("ggallus", dbFile=db)
 170     inFile = open(chromPath, "r")
 171     line = inFile.readline()
 172     for line in inFile:
 173         seqArray.append(line.strip())
 174
 175     seq = string.join(seqArray, "")
 176     seqLen = len(seq)
 177     if seqLen < 1:
 178         print "Problems reading sequence from file"
 179
 180     print "writing to file %s" % chromOut
 181     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 182     outFile.write(seq)
 183     outFile.close()
 184     ggGenome.addChromosomeEntry(chromID, chromOut, "file")
 185
 186
 187 def loadGeneOntology(db, goPath, goDefPath):
 188     ggGenome = Genome("ggallus", dbFile=db)
 189     goDefFile = open(goDefPath, "r")
 190     goFile = open(goPath, "r")
 191     idb = geneinfoDB()
 192     goDefs = {}
 193     goArray = []
 194     for goDefEntry in goDefFile:
 195         if goDefEntry[0] != "!":
 196             cols = goDefEntry.split("\t")
 197             goDefs[cols[0]] = (cols[1], cols[2].strip())
 198
 199     goEntries = goFile.readlines()
 200     prevGID = ""
 201     for entry in goEntries:
 202         try:
 203             fields = entry.split("\t")
 204             if fields[0] != "9031":
 205                 continue
 206
 207             locID = fields[1].strip()
 208             gID = ("ggallus", locID)
 209             if prevGID != gID:
 210                 prevGID = gID
 211                 gene_name = ""
 212                 synonyms = idb.geneIDSynonyms(gID)
 213                 if len(synonyms) >0:
 214                     gene_name = string.join(synonyms, ",")
 215
 216             goArray.append((gID, fields[2], "", gene_name, "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 217         except:
 218             print "locus ID %s could not be added" % locID
 219             pass
 220
 221     print "adding %d go entries" % len(goArray)
 222     ggGenome.addGoInfoBatch(goArray)
 223
 224
 225 def createDBindices(db):
 226     ggGenome = Genome("ggallus", dbFile=db)
 227     ggGenome.createIndices()