cistematic/genomes/hsapiens.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Homo sapiens
  31 import string
  32 from cistematic.genomes import Genome
  33 from cistematic.core.geneinfo import geneinfoDB
  34 from os import environ
  35
  36 if environ.get("CISTEMATIC_ROOT"):
  37     cisRoot = environ.get("CISTEMATIC_ROOT")
  38 else:
  39     cisRoot = "/proj/genome"
  40
  41 geneDB = "%s/H_sapiens/hsapiens.genedb" % cisRoot
  42
  43 def loadChromosome(db, chromID, chromPath, chromOut):
  44     seqArray = []
  45     hsGenome = Genome("hsapiens", dbFile=db)
  46     inFile = open(chromPath, "r")
  47     line = inFile.readline()
  48     for line in inFile:
  49         seqArray.append(line.strip())
  50
  51     seq = string.join(seqArray, "")
  52     seqLen = len(seq)
  53     if seqLen < 1:
  54         print "Problems reading sequence from file"
  55     print "writing to file %s" % chromOut
  56     outFile = open("%s%s" % (cisRoot, chromOut), "w")
  57     outFile.write(seq)
  58     outFile.close()
  59     hsGenome.addChromosomeEntry(chromID, chromOut, "file")
  60
  61
  62 def loadGeneEntries(db, gFile, cDict):
  63     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  64     """
  65     geneEntries = []
  66     hsGenome = Genome("hsapiens", dbFile=db)
  67     geneFile = open(gFile, "r")
  68     for line in geneFile:
  69         cols = line.split("\t")
  70         if cols[11] != "GENE":
  71             continue
  72
  73         if cols[12] == "Celera":
  74             continue
  75
  76         chrom = cols[1].strip()
  77         if chrom not in cDict:
  78             continue
  79
  80         name = cols[10].split(":")
  81         gid = name[1]
  82         start = int(cols[2])
  83         stop = int(cols[3])
  84         sense = cols[4]
  85         if sense == "+":
  86             sense = "F"
  87         else:
  88             sense = "R"
  89
  90         geneID = ("hsapiens", gid)
  91         gidVersion = 1
  92         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
  93
  94     print "Adding %d gene entries" % len(geneEntries)
  95     hsGenome.addGeneEntryBatch(geneEntries)
  96
  97
  98 def loadGeneFeatures(db, gFile, cDict):
  99     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 100     """
 101     featureEntries = []
 102     hsGenome = Genome("hsapiens", dbFile=db)
 103     featureFile = open(gFile, "r")
 104     for line in featureFile:
 105         cols = line.split("\t")
 106         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
 107             continue
 108         if cols[12] == "Celera":
 109             continue
 110         chrom = cols[1].strip()
 111         if chrom not in cDict:
 112             continue
 113
 114         fType = cols[11]
 115         name = cols[10].split(":")
 116         gid = name[1]
 117         start = int(cols[2])
 118         stop = int(cols[3])
 119         sense = cols[4]
 120         if sense == "+":
 121             sense = "F"
 122         else:
 123             sense = "R"
 124
 125         geneID = ("hsapiens", gid)
 126         gidVersion = 1
 127         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 128
 129     print "Adding %d feature entries" % len(featureEntries)
 130     hsGenome.addFeatureEntryBatch(featureEntries)
 131
 132
 133 def loadGeneAnnotations(db):
 134     geneAnnotations = []
 135     idb = geneinfoDB()
 136     hsGenome = Genome("hsapiens", dbFile=db)
 137     gidList = hsGenome.allGIDs()
 138     for locID in gidList:
 139         gID = ("hsapiens", locID)
 140         geneDescArray = idb.getDescription(gID)
 141         geneDesc = ""
 142         for entry in geneDescArray:
 143             geneDesc += ","
 144             geneDesc += entry.strip()
 145
 146         if len(geneDescArray) > 0:
 147             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
 148
 149     print "Adding %d annotations" % len(geneAnnotations)
 150     hsGenome.addAnnotationBatch(geneAnnotations)
 151
 152
 153 def loadGeneOntology(db, goPath, goDefPath):
 154     hsGenome = Genome("hsapiens", dbFile=db)
 155     goDefFile = open(goDefPath, "r")
 156     goFile = open(goPath, "r")
 157     idb = geneinfoDB()
 158     goDefs = {}
 159     goArray = []
 160     for goDefEntry in goDefFile:
 161         if goDefEntry[0] != "!":
 162             cols = goDefEntry.split("\t")
 163             goDefs[cols[0]] = (cols[1], cols[2].strip())
 164
 165     prevGID = ""
 166     index = 0
 167     for entry in goFile:
 168         try:
 169             fields = entry.split("\t")
 170             if fields[0] != "9606":
 171                 continue
 172
 173             index += 1
 174             if index % 1000 == 0:
 175                 print "adding 1000 go entries"
 176                 hsGenome.addGoInfoBatch(goArray)
 177                 goArray = []
 178
 179             locID = fields[1].strip()
 180             gID = ("hsapiens", locID)
 181             if prevGID != gID:
 182                 prevGID = gID
 183                 gene_name = ""
 184                 synonyms = idb.geneIDSynonyms(gID)
 185                 if len(synonyms) >0:
 186                     for entry in synonyms:
 187                         gene_name += ","
 188                         gene_name += entry
 189                 else:
 190                     gene_name = " "
 191
 192             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 193         except:
 194             print "locus ID %s could not be added" % locID
 195             pass
 196
 197     print "adding %d go entries" % len(goArray)
 198     hsGenome.addGoInfoBatch(goArray)
 199
 200
 201 def createDBFile(db):
 202     hsGenome = Genome("hsapiens",  dbFile=db)
 203     hsGenome.createGeneDB(db)
 204
 205
 206 def createDBindices(db):
 207     hsGenome = Genome("hsapiens", dbFile=db)
 208     hsGenome.createIndices()
 209
 210
 211 def buildHsapiensDB(db=geneDB, downloadDir="%s/download" % cisRoot):
 212     genePath = "%s/seq_gene.md" % downloadDir # ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/mapview/seq_gene.md.gz
 213     goDefPath = "%s/GO.terms_and_ids" % downloadDir # ftp://ftp.geneontology.org/go/doc/GO.terms_and_ids
 214     goPath = "%s/gene2go" % downloadDir # ftp://ftp.ncbi.nih.gov/gene/gene2go.gz
 215     # chromosomes are from UCSC - will ignore all the alternative haplotypes, chrUn, and random chromosomes
 216     chromDict = {"1": "%s/chr1.fa" % downloadDir,
 217                  "2": "%s/chr2.fa" % downloadDir,
 218                  "3": "%s/chr3.fa" % downloadDir,
 219                  "4": "%s/chr4.fa" % downloadDir,
 220                  "5": "%s/chr5.fa" % downloadDir,
 221                  "6": "%s/chr6.fa" % downloadDir,
 222                  "7": "%s/chr7.fa" % downloadDir,
 223                  "8": "%s/chr8.fa" % downloadDir,
 224                  "9": "%s/chr9.fa" % downloadDir,
 225                  "10": "%s/chr10.fa" % downloadDir,
 226                  "11": "%s/chr11.fa" % downloadDir,
 227                  "12": "%s/chr12.fa" % downloadDir,
 228                  "13": "%s/chr13.fa" % downloadDir,
 229                  "14": "%s/chr14.fa" % downloadDir,
 230                  "15": "%s/chr15.fa" % downloadDir,
 231                  "16": "%s/chr16.fa" % downloadDir,
 232                  "17": "%s/chr17.fa" % downloadDir,
 233                  "18": "%s/chr18.fa" % downloadDir,
 234                  "19": "%s/chr19.fa" % downloadDir,
 235                  "20": "%s/chr20.fa" % downloadDir,
 236                  "21": "%s/chr21.fa" % downloadDir,
 237                  "22": "%s/chr22.fa" % downloadDir,
 238                  "X": "%s/chrX.fa" % downloadDir,
 239                  "Y": "%s/chrY.fa" % downloadDir
 240     }
 241
 242     print "Creating database %s" % db
 243     createDBFile(db)
 244
 245     print "Adding gene entries"
 246     loadGeneEntries(db, genePath, chromDict)
 247
 248     print "Adding gene features"
 249     loadGeneFeatures(db, genePath, chromDict)
 250
 251     print "Adding gene annotations"
 252     loadGeneAnnotations(db)
 253
 254     print "Adding gene ontology"
 255     loadGeneOntology(db, goPath, goDefPath)
 256
 257     for chromID in chromDict.keys():
 258         print "Loading chromosome %s" % chromID
 259         loadChromosome(db, chromID, chromDict[chromID], "/H_sapiens/chromo%s.bin" % chromID)
 260
 261     print "Creating Indices"
 262     createDBindices(db)
 263
 264     print "Finished creating database %s" % db