cistematic/genomes/hsapiens.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Homo sapiens
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
  42
  43
  44 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
  45     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
  46     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
  47     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
  48     # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
  49     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  50                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
  51                  "21", "22", "X", "Y"
  52     ]
  53
  54     print "Creating database %s" % db
  55     createDBFile(db)
  56
  57     print "Adding gene entries"
  58     loadGeneEntries(db, genePath, chromList)
  59
  60     print "Adding gene features"
  61     loadGeneFeatures(db, genePath, chromList)
  62
  63     print "Adding gene annotations"
  64     loadGeneAnnotations(db)
  65
  66     print "Adding gene ontology"
  67     loadGeneOntology(db, goPath, goDefPath)
  68
  69     for chromID in chromList:
  70         print "Loading chromosome %s" % chromID
  71         chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
  72         loadChromosome(db, chromID, chromPath, "/H_sapiens/chromo%s.bin" % chromID)
  73
  74     print "Creating Indices"
  75     createDBindices(db)
  76
  77     print "Finished creating database %s" % db
  78
  79
  80 def createDBFile(db):
  81     hsGenome = Genome("hsapiens",  dbFile=db)
  82     hsGenome.createGeneDB(db)
  83
  84
  85 def loadGeneEntries(db, gFile, cDict):
  86     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  87     geneEntries = []
  88     hsGenome = Genome("hsapiens", dbFile=db)
  89     geneFile = open(gFile, "r")
  90     for line in geneFile:
  91         cols = line.split("\t")
  92         if cols[11] != "GENE":
  93             continue
  94
  95         if cols[12] == "Celera":
  96             continue
  97
  98         chrom = cols[1].strip()
  99         if chrom not in cDict:
 100             continue
 101
 102         name = cols[10].split(":")
 103         gid = name[1]
 104         start = int(cols[2])
 105         stop = int(cols[3])
 106         sense = cols[4]
 107         if sense == "+":
 108             sense = "F"
 109         else:
 110             sense = "R"
 111
 112         geneID = ("hsapiens", gid)
 113         gidVersion = 1
 114         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 115
 116     print "Adding %d gene entries" % len(geneEntries)
 117     hsGenome.addGeneEntryBatch(geneEntries)
 118
 119
 120 def loadGeneFeatures(db, gFile, cDict):
 121     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 122     """
 123     featureEntries = []
 124     hsGenome = Genome("hsapiens", dbFile=db)
 125     featureFile = open(gFile, "r")
 126     for line in featureFile:
 127         cols = line.split("\t")
 128         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
 129             continue
 130
 131         if cols[12] == "Celera":
 132             continue
 133
 134         chrom = cols[1].strip()
 135         if chrom not in cDict:
 136             continue
 137
 138         fType = cols[11]
 139         name = cols[10].split(":")
 140         gid = name[1]
 141         start = int(cols[2])
 142         stop = int(cols[3])
 143         sense = cols[4]
 144         if sense == "+":
 145             sense = "F"
 146         else:
 147             sense = "R"
 148
 149         geneID = ("hsapiens", gid)
 150         gidVersion = 1
 151         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 152
 153     print "Adding %d feature entries" % len(featureEntries)
 154     hsGenome.addFeatureEntryBatch(featureEntries)
 155
 156
 157 def loadGeneAnnotations(db):
 158     geneAnnotations = []
 159     idb = geneinfoDB()
 160     hsGenome = Genome("hsapiens", dbFile=db)
 161     gidList = hsGenome.allGIDs()
 162     for locID in gidList:
 163         gID = ("hsapiens", locID)
 164         geneDescArray = idb.getDescription(gID)
 165         geneDesc = ""
 166         for entry in geneDescArray:
 167             geneDesc += ","
 168             geneDesc += entry.strip()
 169
 170         if len(geneDescArray) > 0:
 171             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 172
 173     print "Adding %d annotations" % len(geneAnnotations)
 174     hsGenome.addAnnotationBatch(geneAnnotations)
 175
 176
 177 def loadGeneOntology(db, goPath, goDefPath):
 178     hsGenome = Genome("hsapiens", dbFile=db)
 179     goDefFile = open(goDefPath, "r")
 180     goFile = open(goPath, "r")
 181     idb = geneinfoDB()
 182     goDefs = {}
 183     goArray = []
 184     for goDefEntry in goDefFile:
 185         if goDefEntry[0] != "!":
 186             cols = goDefEntry.split("\t")
 187             goDefs[cols[0]] = (cols[1], cols[2].strip())
 188
 189     prevGID = ""
 190     index = 0
 191     for entry in goFile:
 192         try:
 193             fields = entry.split("\t")
 194             if fields[0] != "9606":
 195                 continue
 196
 197             index += 1
 198             if index % 1000 == 0:
 199                 print "adding 1000 go entries"
 200                 hsGenome.addGoInfoBatch(goArray)
 201                 goArray = []
 202
 203             locID = fields[1].strip()
 204             gID = ("hsapiens", locID)
 205             if prevGID != gID:
 206                 prevGID = gID
 207                 gene_name = ""
 208                 synonyms = idb.geneIDSynonyms(gID)
 209                 if len(synonyms) >0:
 210                     for entry in synonyms:
 211                         gene_name += ","
 212                         gene_name += entry
 213                 else:
 214                     gene_name = " "
 215
 216             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 217         except:
 218             print "locus ID %s could not be added" % locID
 219             pass
 220
 221     print "adding %d go entries" % len(goArray)
 222     hsGenome.addGoInfoBatch(goArray)
 223
 224
 225 def loadChromosome(db, chromID, chromPath, chromOut):
 226     seqArray = []
 227     hsGenome = Genome("hsapiens", dbFile=db)
 228     inFile = open(chromPath, "r")
 229     line = inFile.readline()
 230     for line in inFile:
 231         seqArray.append(line.strip())
 232
 233     seq = string.join(seqArray, "")
 234     seqLen = len(seq)
 235     if seqLen < 1:
 236         print "Problems reading sequence from file"
 237     print "writing to file %s" % chromOut
 238     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 239     outFile.write(seq)
 240     outFile.close()
 241     hsGenome.addChromosomeEntry(chromID, chromOut, "file")
 242
 243
 244 def createDBindices(db):
 245     hsGenome = Genome("hsapiens", dbFile=db)
 246     hsGenome.createIndices()