cistematic/genomes/mmusculus.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Mus musculus
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB  = "%s/M_musculus/mmusculus.genedb" % cisRoot
  42
  43
  44 def buildMmusculusDB(db=geneDB, downloadDir="%s/download" % cisRoot):
  45     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/M_musculus/mapview/seq_gene.md
  46     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/pub/go/doc/GO.terms_and_ids
  47     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/DATA/gene2go.gz
  48     # chromosomes are from ftp://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes
  49     # but ignoring all random chromosomes
  50     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  51                  "11", "12", "13", "14", "15", "16", "17", "18", "19",
  52                  "X", "Y", "M"
  53     ]
  54
  55     print "Creating database %s" % db
  56     createDBFile(db)
  57
  58     print "Adding gene entries"
  59     loadGeneEntries(db, genePath, chromList)
  60
  61     print "Adding gene features"
  62     loadGeneFeatures(db, genePath, chromList)
  63
  64     print "Adding gene annotations"
  65     loadGeneAnnotations(db)
  66
  67     print "Adding gene ontology"
  68     loadGeneOntology(db, goPath, goDefPath)
  69
  70     for chromID in chromList:
  71         print "Loading chromosome %s" % chromID
  72         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
  73         loadChromosome(db, chromID, chromPath, "/M_musculus/chromo%s.bin" % chromID)
  74
  75     print "Creating Indices"
  76     createDBindices(db)
  77
  78     print "Finished creating database %s" % db
  79
  80
  81 def createDBFile(db):
  82     mmGenome = Genome("mmusculus",  dbFile=db)
  83     mmGenome.createGeneDB(db)
  84
  85
  86 def loadGeneEntries(db, gFile, cDict):
  87     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  88
  89     geneEntries = []
  90     alreadySeen = []
  91     mmGenome = Genome("mmusculus", dbFile=db)
  92     geneFile = open(gFile, "r")
  93     for line in geneFile:
  94         cols = line.split("\t")
  95         if cols[11].strip() != "GENE" or cols[12] != "C57BL/6J":
  96             continue
  97
  98         chrom = cols[1].strip()
  99         if chrom not in cDict:
 100             continue
 101
 102         name = cols[10].split(":")
 103         gid = name[1]
 104         if gid == "" or gid in alreadySeen:
 105             continue
 106
 107         alreadySeen.append(gid)
 108         start = int(cols[2]) - 1
 109         stop = int(cols[3]) - 1
 110         sense = cols[4]
 111         if sense == "+":
 112             sense = "F"
 113         else:
 114             sense = "R"
 115
 116         geneID = ("mmusculus", gid)
 117         gidVersion = 1
 118         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 119
 120     print "Adding %d gene entries" % len(geneEntries)
 121     mmGenome.addGeneEntryBatch(geneEntries)
 122
 123
 124 def loadGeneFeatures(db, gFile, cDict):
 125     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 126     """
 127     featureEntries = []
 128     mmGenome = Genome("mmusculus", dbFile=db)
 129     featureFile = open(gFile, "r")
 130     for line in featureFile:
 131         cols = line.split("\t")
 132         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"] or cols[12] != "C57BL/6J":
 133             continue
 134
 135         chrom = cols[1].strip()
 136         if chrom not in cDict:
 137             continue
 138
 139         fType = cols[11]
 140         name = cols[10].split(":")
 141         gid = name[1]
 142         if gid == "":
 143             continue
 144
 145         start = int(cols[2]) - 1
 146         stop = int(cols[3]) - 1
 147         sense = cols[4]
 148         if sense == "+":
 149             sense = "F"
 150         else:
 151             sense = "R"
 152
 153         geneID = ("mmusculus", gid)
 154         gidVersion = 1
 155         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 156
 157     print "Adding %d feature entries" % len(featureEntries)
 158     mmGenome.addFeatureEntryBatch(featureEntries)
 159
 160
 161 def loadGeneAnnotations(db):
 162     geneAnnotations = []
 163     idb = geneinfoDB()
 164     mmGenome = Genome("mmusculus", dbFile=db)
 165     gidList = mmGenome.allGIDs()
 166     for locID in gidList:
 167         gID = ("mmusculus", locID)
 168         geneDescArray = idb.getDescription(gID)
 169         geneDesc = ""
 170         for entry in geneDescArray:
 171             geneDesc += ","
 172             geneDesc += entry.strip()
 173
 174         if len(geneDescArray) > 0:
 175             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 176
 177     print "Adding %d annotations" % len(geneAnnotations)
 178     mmGenome.addAnnotationBatch(geneAnnotations)
 179
 180
 181 def loadGeneOntology(db, goPath, goDefPath):
 182     mmGenome = Genome("mmusculus", dbFile=db)
 183     goDefFile = open(goDefPath, "r")
 184     goFile = open(goPath, "r")
 185     idb = geneinfoDB()
 186     goDefs = {}
 187     goArray = []
 188     for goDefEntry in goDefFile:
 189         if goDefEntry[0] != "!":
 190             cols = goDefEntry.split("\t")
 191             goDefs[cols[0]] = (cols[1], cols[2].strip())
 192
 193     goEntries = goFile.readlines()
 194     prevGID = ""
 195     for entry in goEntries:
 196         try:
 197             fields = entry.split("\t")
 198             if fields[0] != "10116":
 199                 continue
 200
 201             locID = fields[1].strip()
 202             gID = ("mmusculus", locID)
 203             if prevGID != gID:
 204                 prevGID = gID
 205                 gene_name = ""
 206                 synonyms = idb.geneIDSynonyms(gID)
 207                 if len(synonyms) >0:
 208                     for entry in synonyms:
 209                         gene_name += ","
 210                         gene_name += entry
 211                 else:
 212                     gene_name = " "
 213
 214             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 215         except:
 216             print "locus ID %s could not be added" % locID
 217             pass
 218
 219     print "adding %d go entries" % len(goArray)
 220     mmGenome.addGoInfoBatch(goArray)
 221
 222
 223 def loadChromosome(db, chromID, chromPath, chromOut):
 224     seqArray = []
 225     mmGenome = Genome("mmusculus", dbFile=db)
 226     inFile = open(chromPath, "r")
 227     line = inFile.readline()
 228     for line in inFile:
 229         seqArray.append(line.strip())
 230
 231     seq = string.join(seqArray, "")
 232     seqLen = len(seq)
 233     if seqLen < 1:
 234         print "Problems reading sequence from file"
 235
 236     print "writing to file %s" % chromOut
 237     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 238     outFile.write(seq)
 239     outFile.close()
 240     mmGenome.addChromosomeEntry(chromID, chromOut, "file")
 241
 242
 243 def createDBindices(db):
 244     mmGenome = Genome("mmusculus", dbFile=db)
 245     mmGenome.createIndices()