cistematic/genomes/rnorvegicus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Ratus Norvegicus
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
  42
  43
  44 def loadChromosome(db, chromID, chromPath, chromOut):
  45     seqArray = []
  46     rnGenome = Genome("rnorvegicus", dbFile=db)
  47     inFile = open(chromPath, "r")
  48     line = inFile.readline()
  49     for line in inFile:
  50         seqArray.append(line.strip())
  51
  52     seq = string.join(seqArray, "")
  53     seqLen = len(seq)
  54     if seqLen < 1:
  55         print "Problems reading sequence from file"
  56
  57     print "writing to file %s" % chromOut
  58     outFile = open("%s%s" % (cisRoot, chromOut), "w")
  59     outFile.write(seq)
  60     outFile.close()
  61     rnGenome.addChromosomeEntry(chromID, chromOut, "file")
  62
  63
  64 def loadGeneEntries(db, gFile):
  65     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  66     """
  67     geneEntries = []
  68     rnGenome = Genome("rnorvegicus", dbFile=db)
  69     geneFile = open(gFile, "r")
  70     for line in geneFile:
  71         cols = line.split("\t")
  72         if cols[11] != "GENE":
  73             continue
  74
  75         if cols[12] == "Celera":
  76             continue
  77
  78         name = cols[10].split(":")
  79         gid = name[1]
  80         start = int(cols[2]) - 1
  81         stop = int(cols[3]) - 1
  82         sense = cols[4]
  83         chrom = cols[1].strip()
  84         if sense == "+":
  85             sense = "F"
  86         else:
  87             sense = "R"
  88
  89         geneID = ("rnorvegicus", gid)
  90         gidVersion = 1
  91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  92
  93     print "Adding %d gene entries" % len(geneEntries)
  94     rnGenome.addGeneEntryBatch(geneEntries)
  95
  96
  97 def loadGeneFeatures(db, gFile):
  98     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
  99     """
 100     featureEntries = []
 101     rnGenome = Genome("rnorvegicus", dbFile=db)
 102     featureFile = open(gFile, "r")
 103     for line in featureFile:
 104         cols = line.split("\t")
 105         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
 106             continue
 107
 108         if cols[12] == "Celera":
 109             continue
 110
 111         fType = cols[11]
 112         name = cols[10].split(":")
 113         gid = name[1]
 114         start = int(cols[2]) - 1
 115         stop = int(cols[3]) - 1
 116         sense = cols[4]
 117         chrom = cols[1].strip()
 118         if sense == "+":
 119             sense = "F"
 120         else:
 121             sense = "R"
 122
 123         geneID = ("rnorvegicus", gid)
 124         gidVersion = 1
 125         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 126
 127     print "Adding %d feature entries" % len(featureEntries)
 128     rnGenome.addFeatureEntryBatch(featureEntries)
 129
 130
 131 def loadGeneAnnotations(db):
 132     geneAnnotations = []
 133     idb = geneinfoDB()
 134     rnGenome = Genome("rnorvegicus", dbFile=db)
 135     gidList = rnGenome.allGIDs()
 136     for locID in gidList:
 137         gID = ("rnorvegicus", locID)
 138         geneDescArray = idb.getDescription(gID)
 139         geneDesc = ""
 140         for entry in geneDescArray:
 141             geneDesc += ","
 142             geneDesc += entry.strip()
 143
 144         if len(geneDescArray) > 0:
 145             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 146
 147     print "Adding %d annotations" % len(geneAnnotations)
 148     rnGenome.addAnnotationBatch(geneAnnotations)
 149
 150
 151 def loadGeneOntology(db, goPath, goDefPath):
 152     rnGenome = Genome("rnorvegicus", dbFile=db)
 153     goDefFile = open(goDefPath, "r")
 154     goFile = open(goPath, "r")
 155     idb = geneinfoDB()
 156     goDefs = {}
 157     goArray = []
 158     for goDefEntry in goDefFile:
 159         if goDefEntry[0] != "!":
 160             cols = goDefEntry.split("\t")
 161             goDefs[cols[0]] = (cols[1], cols[2].strip())
 162
 163     goEntries = goFile.readlines()
 164     prevGID = ''
 165     for entry in goEntries:
 166         try:
 167             fields = entry.split("\t")
 168             if fields[0] != "10090":
 169                 continue
 170
 171             locID = fields[1].strip()
 172             gID = ("rnorvegicus", locID)
 173             if prevGID != gID:
 174                 prevGID = gID
 175                 gene_name = ""
 176                 synonyms = idb.geneIDSynonyms(gID)
 177                 if len(synonyms) >0:
 178                     for entry in synonyms:
 179                         gene_name += ","
 180                         gene_name += entry
 181                 else:
 182                     gene_name = " "
 183
 184             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 185         except:
 186             print "locus ID %s could not be added" % locID
 187             pass
 188
 189     print "adding %d go entries" % len(goArray)
 190     rnGenome.addGoInfoBatch(goArray)
 191
 192
 193 def createDBFile(db):
 194     rnGenome = Genome("rnorvegicus",  dbFile=db)
 195     rnGenome.createGeneDB(db)
 196
 197
 198 def createDBindices(db):
 199     rnGenome = Genome("rnorvegicus", dbFile=db)
 200     rnGenome.createIndices()
 201
 202
 203 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
 204     genePath = "%s/seq_gene.md" % downloadDir
 205     goDefPath = "%s/GO.terms_and_ids" % downloadDir
 206     goPath = "%s/gene2go" % downloadDir
 207     # ignoring all random chromosomes
 208     chromos = {"1": "%s/chr1.fa" % downloadDir,
 209                "2": "%s/chr2.fa" % downloadDir,
 210                "3": "%s/chr3.fa" % downloadDir,
 211                "4": "%s/chr4.fa" % downloadDir,
 212                "5": "%s/chr5.fa" % downloadDir,
 213                "6": "%s/chr6.fa" % downloadDir,
 214                "7": "%s/chr7.fa" % downloadDir,
 215                "8": "%s/chr8.fa" % downloadDir,
 216                "9": "%s/chr9.fa" % downloadDir,
 217                "10": "%s/chr10.fa" % downloadDir,
 218                "11": "%s/chr11.fa" % downloadDir,
 219                "12": "%s/chr12.fa" % downloadDir,
 220                "13": "%s/chr13.fa" % downloadDir,
 221                "14": "%s/chr14.fa" % downloadDir,
 222                "15": "%s/chr15.fa" % downloadDir,
 223                "16": "%s/chr16.fa" % downloadDir,
 224                "17": "%s/chr17.fa" % downloadDir,
 225                "18": "%s/chr18.fa" % downloadDir,
 226                "19": "%s/chr19.fa" % downloadDir,
 227                "Un": "%s/chrUn.fa" % downloadDir,
 228                "X": "%s/chrX.fa" % downloadDir,
 229                "20": "%s/chr20.fa" % downloadDir,
 230                "M": "%s/chrM.fa" % downloadDir
 231     }
 232
 233     print "Creating database %s" % db
 234     createDBFile(db)
 235
 236     print "Adding gene entries"
 237     loadGeneEntries(db, genePath)
 238
 239     print "Adding gene features"
 240     loadGeneFeatures(db, genePath)
 241
 242     print "Adding gene annotations"
 243     loadGeneAnnotations(db)
 244
 245     print "Adding gene ontology"
 246     loadGeneOntology(db, goPath, goDefPath)
 247
 248     for chromID in chromos.keys():
 249         print "Loading chromosome %s" % chromID
 250         loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
 251
 252     print "Creating Indices"
 253     createDBindices(db)
 254
 255     print "Finished creating database %s" % db