cistematic/genomes/rnorvegicus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Ratus Norvegicus
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
  42
  43
  44 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
  45     genePath = "%s/seq_gene.md" % downloadDir
  46     goDefPath = "%s/GO.terms_and_ids" % downloadDir
  47     goPath = "%s/gene2go" % downloadDir
  48
  49     print "Creating database %s" % db
  50     createDBFile(db)
  51
  52     print "Adding gene entries"
  53     loadGeneEntries(db, genePath)
  54
  55     print "Adding gene features"
  56     loadGeneFeatures(db, genePath)
  57
  58     print "Adding gene annotations"
  59     loadGeneAnnotations(db)
  60
  61     print "Adding gene ontology"
  62     loadGeneOntology(db, goPath, goDefPath)
  63
  64     # ignoring all random chromosomes
  65     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  66                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
  67                  "Un", "X", "M"
  68     ]
  69     for chromID in chromList:
  70         print "Loading chromosome %s" % chromID
  71         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
  72         loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
  73
  74     print "Creating Indices"
  75     createDBindices(db)
  76
  77     print "Finished creating database %s" % db
  78
  79
  80 def createDBFile(db):
  81     rnGenome = Genome("rnorvegicus",  dbFile=db)
  82     rnGenome.createGeneDB(db)
  83
  84
  85 def loadGeneEntries(db, gFile):
  86     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  87
  88     geneEntries = []
  89     rnGenome = Genome("rnorvegicus", dbFile=db)
  90     geneFile = open(gFile, "r")
  91     for line in geneFile:
  92         cols = line.split("\t")
  93         if cols[11] != "GENE":
  94             continue
  95
  96         if cols[12] == "Celera":
  97             continue
  98
  99         name = cols[10].split(":")
 100         gid = name[1]
 101         start = int(cols[2]) - 1
 102         stop = int(cols[3]) - 1
 103         sense = cols[4]
 104         chrom = cols[1].strip()
 105         if sense == "+":
 106             sense = "F"
 107         else:
 108             sense = "R"
 109
 110         geneID = ("rnorvegicus", gid)
 111         gidVersion = 1
 112         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 113
 114     print "Adding %d gene entries" % len(geneEntries)
 115     rnGenome.addGeneEntryBatch(geneEntries)
 116
 117
 118 def loadGeneFeatures(db, gFile):
 119     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 120     """
 121     featureEntries = []
 122     rnGenome = Genome("rnorvegicus", dbFile=db)
 123     featureFile = open(gFile, "r")
 124     for line in featureFile:
 125         cols = line.split("\t")
 126         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
 127             continue
 128
 129         if cols[12] == "Celera":
 130             continue
 131
 132         fType = cols[11]
 133         name = cols[10].split(":")
 134         gid = name[1]
 135         start = int(cols[2]) - 1
 136         stop = int(cols[3]) - 1
 137         sense = cols[4]
 138         chrom = cols[1].strip()
 139         if sense == "+":
 140             sense = "F"
 141         else:
 142             sense = "R"
 143
 144         geneID = ("rnorvegicus", gid)
 145         gidVersion = 1
 146         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 147
 148     print "Adding %d feature entries" % len(featureEntries)
 149     rnGenome.addFeatureEntryBatch(featureEntries)
 150
 151
 152 def loadGeneAnnotations(db):
 153     geneAnnotations = []
 154     idb = geneinfoDB()
 155     rnGenome = Genome("rnorvegicus", dbFile=db)
 156     gidList = rnGenome.allGIDs()
 157     for locID in gidList:
 158         gID = ("rnorvegicus", locID)
 159         geneDescArray = idb.getDescription(gID)
 160         geneDesc = ""
 161         for entry in geneDescArray:
 162             geneDesc += ","
 163             geneDesc += entry.strip()
 164
 165         if len(geneDescArray) > 0:
 166             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 167
 168     print "Adding %d annotations" % len(geneAnnotations)
 169     rnGenome.addAnnotationBatch(geneAnnotations)
 170
 171
 172 def loadGeneOntology(db, goPath, goDefPath):
 173     rnGenome = Genome("rnorvegicus", dbFile=db)
 174     goDefFile = open(goDefPath, "r")
 175     goFile = open(goPath, "r")
 176     idb = geneinfoDB()
 177     goDefs = {}
 178     goArray = []
 179     for goDefEntry in goDefFile:
 180         if goDefEntry[0] != "!":
 181             cols = goDefEntry.split("\t")
 182             goDefs[cols[0]] = (cols[1], cols[2].strip())
 183
 184     goEntries = goFile.readlines()
 185     prevGID = ''
 186     for entry in goEntries:
 187         try:
 188             fields = entry.split("\t")
 189             if fields[0] != "10090":
 190                 continue
 191
 192             locID = fields[1].strip()
 193             gID = ("rnorvegicus", locID)
 194             if prevGID != gID:
 195                 prevGID = gID
 196                 gene_name = ""
 197                 synonyms = idb.geneIDSynonyms(gID)
 198                 if len(synonyms) >0:
 199                     for entry in synonyms:
 200                         gene_name += ","
 201                         gene_name += entry
 202                 else:
 203                     gene_name = " "
 204
 205             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 206         except:
 207             print "locus ID %s could not be added" % locID
 208             pass
 209
 210     print "adding %d go entries" % len(goArray)
 211     rnGenome.addGoInfoBatch(goArray)
 212
 213
 214 def loadChromosome(db, chromID, chromPath, chromOut):
 215     seqArray = []
 216     rnGenome = Genome("rnorvegicus", dbFile=db)
 217     inFile = open(chromPath, "r")
 218     line = inFile.readline()
 219     for line in inFile:
 220         seqArray.append(line.strip())
 221
 222     seq = string.join(seqArray, "")
 223     seqLen = len(seq)
 224     if seqLen < 1:
 225         print "Problems reading sequence from file"
 226
 227     print "writing to file %s" % chromOut
 228     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 229     outFile.write(seq)
 230     outFile.close()
 231     rnGenome.addChromosomeEntry(chromID, chromOut, "file")
 232
 233
 234 def createDBindices(db):
 235     rnGenome = Genome("rnorvegicus", dbFile=db)
 236     rnGenome.createIndices()