cistematic/genomes/cfamiliaris.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-13 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # data for Canis familiaris
  31 import string
  32 from cistematic.genomes import Genome
  33 from os import environ
  34
  35 if environ.get("CISTEMATIC_ROOT"):
  36     cisRoot = environ.get("CISTEMATIC_ROOT")
  37 else:
  38     cisRoot = "/proj/genome"
  39
  40 geneDB = "%s/C_familiaris/cfamiliaris.genedb" % cisRoot
  41
  42
  43 def buildDogDB(db=geneDB):
  44     genePath = "%s/download/seq_gene.md" % cisRoot
  45     print "Creating database %s" % db
  46     createDBFile(db)
  47
  48     print "Adding gene entries"
  49     loadGeneEntries(db, genePath)
  50
  51     print "Adding gene features"
  52     loadGeneFeatures(db, genePath)
  53
  54     chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  55                  "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
  56                  "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
  57                  "31", "32", "33", "34", "35", "36", "37", "38", "X", "Un"
  58     ]
  59     for chromID in chromList:
  60         print "Loading chromosome %s" % chromID
  61         chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
  62         loadChromosome(db, chromID, chromPath, "/C_familiaris/chromo%s.bin" % chromID)
  63
  64     print "Creating Indices"
  65     createDBindices(db)
  66
  67     print "Finished creating database %s" % db
  68
  69
  70 def createDBFile(db):
  71     cfGenome = Genome("cfamiliaris",  dbFile=db)
  72     cfGenome.createGeneDB(db)
  73
  74
  75 def loadGeneEntries(db, gFile):
  76     #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
  77
  78     geneEntries = []
  79     alreadySeen = []
  80     cfGenome = Genome("cfamiliaris", dbFile=db)
  81     geneFile = open(gFile, "r")
  82     geneFile.readline()
  83     for line in geneFile:
  84         cols = line.split("\t")
  85         if cols[11].strip() != "GENE":
  86             continue
  87
  88         name = cols[10].split(":")
  89         gid = name[1]
  90         if gid == "" or gid in alreadySeen:
  91             continue
  92
  93         alreadySeen.append(gid)
  94         start = int(cols[2]) - 1
  95         stop = int(cols[3]) - 1
  96         sense = cols[4]
  97         chrom = cols[1].strip()
  98         if sense == "+":
  99             sense = "F"
 100         else:
 101             sense = "R"
 102
 103         geneID = ("cfamiliaris", gid)
 104         gidVersion = 1
 105         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
 106
 107     print "Adding %d gene entries" % len(geneEntries)
 108     cfGenome.addGeneEntryBatch(geneEntries)
 109
 110
 111 def loadGeneFeatures(db, gFile):
 112     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
 113     """
 114     featureEntries = []
 115     cfGenome = Genome("cfamiliaris", dbFile=db)
 116     featureFile = open(gFile, "r")
 117     featureFile.readline()
 118     for line in featureFile:
 119         cols = line.split("\t")
 120         if cols[11].strip() not in ["CDS", "UTR", "PSEUDO"]:
 121             continue
 122
 123         fType = cols[11]
 124         name = cols[10].split(":")
 125         gid = name[1]
 126         if gid == "":
 127             continue
 128
 129         start = int(cols[2]) - 1
 130         stop = int(cols[3]) - 1
 131         sense = cols[4]
 132         chrom = cols[1].strip()
 133         if sense == "+":
 134             sense = "F"
 135         else:
 136             sense = "R"
 137
 138         geneID = ("cfamiliaris", gid)
 139         gidVersion = 1
 140         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
 141
 142     print "Adding %d feature entries" % len(featureEntries)
 143     cfGenome.addFeatureEntryBatch(featureEntries)
 144
 145
 146 def loadGeneAnnotations(db, annotPath):
 147     geneAnnotations = []
 148     annotFile = open(annotPath, "r")
 149     cfGenome = Genome("cfamiliaris", dbFile=db)
 150     for line in annotFile:
 151         try:
 152             cols = line.split("\t")
 153             locID = cols[0]
 154             geneDesc = cols[6]
 155             if len(locID) > 0:
 156                 geneAnnotations.append((("cfamiliaris", locID), string.replace(geneDesc.strip(), "'", "p")))
 157         except:
 158             pass
 159
 160     print "Adding %d annotations" % len(geneAnnotations)
 161     cfGenome.addAnnotationBatch(geneAnnotations)
 162
 163
 164 def loadGeneOntology(db, goPath, goDefPath):
 165     cfGenome = Genome("cfamiliaris", dbFile=db)
 166     goDefFile = open(goDefPath, "r")
 167     goFile = open(goPath, "r")
 168     idb = geneinfoDB()
 169     goDefs = {}
 170     goArray = []
 171     for goDefEntry in goDefFile:
 172         if goDefEntry[0] != "!":
 173             cols = goDefEntry.split("\t")
 174             goDefs[cols[0]] = (cols[1], cols[2].strip())
 175
 176     goEntries = goFile.readlines()
 177     prevGID = ''
 178     for entry in goEntries:
 179         try:
 180             fields = entry.split("\t")
 181             if fields[0] != "9615":
 182                 continue
 183
 184             locID = fields[1].strip()
 185             gID = ("cfamiliaris", locID)
 186             if prevGID != gID:
 187                 prevGID = gID
 188                 gene_name = ""
 189                 synonyms = idb.geneIDSynonyms(gID)
 190                 if len(synonyms) >0:
 191                     for entry in synonyms:
 192                         gene_name += ","
 193                         gene_name += entry
 194                 else:
 195                     gene_name = " "
 196
 197             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
 198         except:
 199             print "locus ID %s could not be added" % locID
 200             pass
 201
 202     print "adding %d go entries" % len(goArray)
 203     cfGenome.addGoInfoBatch(goArray)
 204
 205
 206 def loadChromosome(db, chromID, chromPath, chromOut):
 207     seqArray = []
 208     cfGenome = Genome("cfamiliaris", dbFile=db)
 209     inFile = open(chromPath, "r")
 210     line = inFile.readline()
 211     for line in inFile:
 212         seqArray.append(line.strip())
 213
 214     seq = string.join(seqArray, "")
 215     seqLen = len(seq)
 216     if seqLen < 1:
 217         print "Problems reading sequence from file"
 218
 219     print "writing to file %s" % chromOut
 220     outFile = open("%s%s" % (cisRoot, chromOut), "w")
 221     outFile.write(seq)
 222     outFile.close()
 223     cfGenome.addChromosomeEntry(chromID, chromOut, "file")
 224
 225
 226 def createDBindices(db):
 227     cfGenome = Genome("cfamiliaris", dbFile=db)
 228     cfGenome.createIndices()