cistematic/genomes/mmusculus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Mus musculus
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
  42
  43
  44 def loadChromosome(db, chromID, chromPath, chromOut):
  45     seqArray = []
  46     mmGenome = Genome("mmusculus", dbFile=db)
  47     inFile = open(chromPath, "r")
  48     line = inFile.readline()
  49     for line in inFile:
  50         seqArray.append(line.strip())
  51
  52     seq = string.join(seqArray, "")
  53     seqLen = len(seq)
  54     if seqLen < 1:
  55         print "Problems reading sequence from file"
  56
  57     print "writing to file %s" % chromOut
  58     outFile = open("%s%s" % (cisRoot, chromOut), "w")
  59     outFile.write(seq)
  60     outFile.close()
  61     mmGenome.addChromosomeEntry(chromID, chromOut, "file")
  62
  63
  64 def loadGeneEntries(db, gFile, cDict):
  65     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  66     """
  67     geneEntries = []
  68     alreadySeen = []
  69     mmGenome = Genome("mmusculus", dbFile=db)
  70     geneFile = open(gFile, "r")
  71     for line in geneFile:
  72         cols = line.split("\t")
  73         if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
  74             continue
  75
  76         chrom = cols[1].strip()
  77         if chrom not in cDict:
  78             continue
  79
  80         name = cols[10].split(":")
  81         gid = name[1]
  82         if gid == "" or gid in alreadySeen:
  83             continue
  84
  85         alreadySeen.append(gid)
  86         start = int(cols[2]) - 1
  87         stop = int(cols[3]) - 1
  88         sense = cols[4]
  89         if sense == "+":
  90             sense = "F"
  91         else:
  92             sense = "R"
  93
  94         geneID = ("mmusculus", gid)
  95         gidVersion = 1
  96         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  97
  98     print "Adding %d gene entries" % len(geneEntries)
  99     mmGenome.addGeneEntryBatch(geneEntries)
 100
 101
 102 def loadGeneFeatures(db, gFile, cDict):
 103     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 104     """
 105     featureEntries = []
 106     mmGenome = Genome("mmusculus", dbFile=db)
 107     featureFile = open(gFile, "r")
 108     for line in featureFile:
 109         cols = line.split("\t")
 110         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
 111             continue
 112
 113         chrom = cols[1].strip()
 114         if chrom not in cDict:
 115             continue
 116
 117         fType = cols[11]
 118         name = cols[10].split(":")
 119         gid = name[1]
 120         if gid == "":
 121             continue
 122
 123         start = int(cols[2]) - 1
 124         stop = int(cols[3]) - 1
 125         sense = cols[4]
 126         if sense == "+":
 127             sense = "F"
 128         else:
 129             sense = "R"
 130
 131         geneID = ("mmusculus", gid)
 132         gidVersion = 1
 133         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 134
 135     print "Adding %d feature entries" % len(featureEntries)
 136     mmGenome.addFeatureEntryBatch(featureEntries)
 137
 138
 139 def loadGeneAnnotations(db):
 140     geneAnnotations = []
 141     idb = geneinfoDB()
 142     mmGenome = Genome("mmusculus", dbFile=db)
 143     gidList = mmGenome.allGIDs()
 144     for locID in gidList:
 145         gID = ("mmusculus", locID)
 146         geneDescArray = idb.getDescription(gID)
 147         geneDesc = ""
 148         for entry in geneDescArray:
 149             geneDesc += ","
 150             geneDesc += entry.strip()
 151
 152         if len(geneDescArray) > 0:
 153             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 154
 155     print "Adding %d annotations" % len(geneAnnotations)
 156     mmGenome.addAnnotationBatch(geneAnnotations)
 157
 158
 159 def loadGeneOntology(db, goPath, goDefPath):
 160     mmGenome = Genome("mmusculus", dbFile=db)
 161     goDefFile = open(goDefPath, "r")
 162     goFile = open(goPath, "r")
 163     idb = geneinfoDB()
 164     goDefs = {}
 165     goArray = []
 166     for goDefEntry in goDefFile:
 167         if goDefEntry[0] != "!":
 168             cols = goDefEntry.split("\t")
 169             goDefs[cols[0]] = (cols[1], cols[2].strip())
 170
 171     goEntries = goFile.readlines()
 172     prevGID = ""
 173     for entry in goEntries:
 174         try:
 175             fields = entry.split("\t")
 176             if fields[0] != "10116":
 177                 continue
 178
 179             locID = fields[1].strip()
 180             gID = ("mmusculus", locID)
 181             if prevGID != gID:
 182                 prevGID = gID
 183                 gene_name = ""
 184                 synonyms = idb.geneIDSynonyms(gID)
 185                 if len(synonyms) >0:
 186                     for entry in synonyms:
 187                         gene_name += ","
 188                         gene_name += entry
 189                 else:
 190                     gene_name = " "
 191
 192             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 193         except:
 194             print "locus ID %s could not be added" % locID
 195             pass
 196
 197     print "adding %d go entries" % len(goArray)
 198     mmGenome.addGoInfoBatch(goArray)
 199
 200
 201 def createDBFile(db):
 202     mmGenome = Genome("mmusculus",  dbFile=db)
 203     mmGenome.createGeneDB(db)
 204
 205
 206 def createDBindices(db):
 207     mmGenome = Genome("mmusculus", dbFile=db)
 208     mmGenome.createIndices()
 209
 210
 211 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
 212     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
 213     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
 214     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
 215     # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
 216     # but ignoring all random chromosomes
 217     chromDict = {"1": "%s/chr1.fa" % downloadDir,
 218                  "2": "%s/chr2.fa" % downloadDir,
 219                  "3": "%s/chr3.fa" % downloadDir,
 220                  "4": "%s/chr4.fa" % downloadDir,
 221                  "5": "%s/chr5.fa" % downloadDir,
 222                  "6": "%s/chr6.fa" % downloadDir,
 223                  "7": "%s/chr7.fa" % downloadDir,
 224                  "8": "%s/chr8.fa" % downloadDir,
 225                  "9": "%s/chr9.fa" % downloadDir,
 226                  "10": "%s/chr10.fa" % downloadDir,
 227                  "11": "%s/chr11.fa" % downloadDir,
 228                  "12": "%s/chr12.fa" % downloadDir,
 229                  "13": "%s/chr13.fa" % downloadDir,
 230                  "14": "%s/chr14.fa" % downloadDir,
 231                  "15": "%s/chr15.fa" % downloadDir,
 232                  "16": "%s/chr16.fa" % downloadDir,
 233                  "17": "%s/chr17.fa" % downloadDir,
 234                  "18": "%s/chr18.fa" % downloadDir,
 235                  "19": "%s/chr19.fa" % downloadDir,
 236                  "X": "%s/chrX.fa" % downloadDir,
 237                  "Y": "%s/chrY.fa" % downloadDir,
 238                  "M": "%s/chrM.fa" % downloadDir
 239     }
 240
 241     print "Creating database %s" % db
 242     createDBFile(db)
 243
 244     print "Adding gene entries"
 245     loadGeneEntries(db, genePath, chromDict)
 246
 247     print "Adding gene features"
 248     loadGeneFeatures(db, genePath, chromDict)
 249
 250     print "Adding gene annotations"
 251     loadGeneAnnotations(db)
 252
 253     print "Adding gene ontology"
 254     loadGeneOntology(db, goPath, goDefPath)
 255
 256     for chromID in chromDict.keys():
 257         print "Loading chromosome %s" % chromID
 258         loadChromosome(db, chromID, chromDict[chromID], "/M_musculus/chromo%s.bin" % chromID)
 259
 260     print "Creating Indices"
 261     createDBindices(db)
 262
 263     print "Finished creating database %s" % db