cistematic/genomes/__init__.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 try:
  31     from pysqlite2 import dbapi2 as sqlite
  32 except:
  33     from sqlite3 import dbapi2 as sqlite
  34
  35 import string
  36 from mmap import *
  37 from os import environ
  38
  39 if environ.get("CISTEMATIC_ROOT"):
  40     cisRoot = environ.get("CISTEMATIC_ROOT")
  41 else:
  42     cisRoot = "/proj/genome"
  43
  44 __all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri",
  45            "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus",
  46            "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis",
  47            "btaurus", "drerio", "ecaballus"
  48 ]
  49
  50 supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens",
  51                     "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae",
  52                     "cbrenneri", "cremanei", "dmelanogaster", "mdomestica",
  53                     "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus"
  54 ]
  55
  56 geneDB  = {}
  57 chromDict  = {}
  58 chromRoot  = {}
  59 chromGeneEntries = {}
  60 checkGene = {}
  61 geneInfo   = {}
  62 getAllGenes = {}
  63 annotInfo = {}
  64 goInfo = {}
  65 allAnnotInfo = {}
  66 allGOInfo = {}
  67
  68 def compNT(nt):
  69     """ returns the complementary basepair to base nt
  70     """
  71     compDict = {"A": "T", "T": "A",
  72                 "G": "C", "C": "G",
  73                 "S": "S",
  74                 "W": "W",
  75                 "R": "Y", "Y": "R",
  76                 "M": "K", "K": "M",
  77                 "H": "D", "D": "H",
  78                 "B": "V", "V": "B",
  79                 "N": "N",
  80                 "a": "t", "t": "a",
  81                 "g": "c", "c": "g",
  82                 "n": "n",
  83                 "z": "z"
  84     }
  85
  86     return compDict.get(nt, "N")
  87
  88
  89 def complement(sequence, length=0):
  90     """ returns the complement of the sequence.
  91     """
  92     newSeq = ""
  93     seqLength = len(sequence)
  94     if length == seqLength:
  95         seqList = list(sequence)
  96         seqList.reverse()
  97         return "".join(map(compNT, seqList))
  98
  99     for index in range(seqLength - 1, seqLength - length - 1, -1):
 100         try:
 101             newSeq += compNT(sequence[index])
 102         except:
 103             newSeq += "N"
 104
 105     return newSeq
 106
 107
 108 class Genome:
 109     genome = ""
 110     chromosome = ""
 111     version = ""
 112     dbFile = ""
 113     supported = False
 114     oldStyle = False
 115     customAnnotations = False
 116
 117
 118     def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False):
 119         self.genome = genome
 120         if chrom != "":
 121             self.chromosome = chrom
 122
 123         if version != "":
 124             self.version = version
 125
 126         if dbFile != "":
 127             self.dbFile = dbFile
 128
 129         if genome in supportedGenomes and dbFile == "":
 130             self.dbFile = geneDB[genome]
 131             self.supported = True
 132
 133         if inRAM:
 134             self.memoryBacked = True
 135             self.memoryDB = sqlite.connect(":memory:")
 136             self.createGeneDB(inMemory=True)
 137             sql = self.memoryDB.cursor()
 138             try:
 139                 sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
 140                 sql.execute("ATTACH '%s' as diskdb" % self.dbFile)
 141                 for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]:
 142                     sql.execute("insert into %s select * from diskdb.%s" % (table, table))
 143
 144                 sql.execute("DETACH diskdb")
 145             except:
 146                 if self.dbFile != "":
 147                     print "could not import %s" % self.dbFile
 148
 149             sql.close()
 150             self.createIndices(inMemory=True)
 151         else:
 152             self.memoryBacked = False
 153             self.memoryDB = ""
 154
 155
 156     def setGeneDB(self, dbFile):
 157         self.dbFile = dbFile
 158         self.supported = False
 159
 160
 161     def setChromosome(self, chrom):
 162         self.chromosome = chrom
 163
 164
 165     def checkGene(self, geneID):
 166         """ returns True if the geneID matches an entry in the genome database.
 167         """
 168         (genome, gID) = geneID
 169         if genome != self.genome:
 170             return False
 171
 172         try:
 173             stmt = "SELECT chromosome  from gene_entries where name = '%s' " % gID
 174             res = self.queryDB(stmt)
 175             if len(res) > 0:
 176                 return True
 177         except:
 178             pass
 179
 180         return False
 181
 182
 183     def geneInfo(self, geneID, version="1"):
 184         (genome, gID) = geneID
 185         result = ""
 186         if genome != self.genome:
 187             return False
 188
 189         try:
 190             stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version)
 191             res = self.queryDB(stmt)
 192             if len(res) > 0:
 193                 chrom = res[0]
 194                 start = int(res[1])
 195                 stop = int(res[2])
 196                 if start > stop:
 197                     temp = stop
 198                     stop = start
 199                     start = temp
 200
 201                 length = int(res[3])
 202                 orientation = res[4]
 203                 result = (chrom, start, stop, length, orientation)
 204         except:
 205             pass
 206
 207         return result
 208
 209
 210     def getallGeneInfo(self, name="", chrom="", version=""):
 211         resultDict = {}
 212         chromList = []
 213         stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start "
 214         res = self.queryDB(stmt, fetchall=True)
 215         for entry in res:
 216             (name, chromosome, start, stop, orientation) = entry
 217             if name not in resultDict:
 218                 resultDict[name] = []
 219
 220             if chromosome not in chromList:
 221                 resultDict[chromosome] = []
 222                 chromList.append(chromosome)
 223
 224             resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation))
 225
 226         return resultDict
 227
 228
 229     def leftGeneDistance(self, geneID, radius=50000, version="1"):
 230         result = radius
 231         res = self.geneInfo(geneID, version)
 232         if res != "":
 233             (chrom, start, stop, length, orientation) = res
 234             if start > stop:
 235                 temp = stop
 236                 stop = start
 237                 start = temp
 238
 239             stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start)
 240             res = self.queryDB(stmt, fetchall=True)
 241             for entry in res:
 242                 rstart = int(entry[1])
 243                 rstop = int(entry[2])
 244                 if rstart > rstop:
 245                     temp = rstop
 246                     rstop = rstart
 247                     rstart = temp
 248
 249                 thelength = start - rstart
 250                 if (start - rstop) > 0 and (start - rstop) < thelength:
 251                     thelength = start - rstop
 252
 253                 if thelength > 0 and thelength < result:
 254                     result = thelength
 255
 256         return result
 257
 258     def rightGeneDistance(self, geneID, radius=50000, version="1"):
 259         result = radius
 260         res = self.geneInfo(geneID, version)
 261         if res != "":
 262             (chrom, start, stop, length, orientation) = res
 263             if start > stop:
 264                 temp = stop
 265                 stop = start
 266                 start = temp
 267
 268             stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius)
 269             res = self.queryDB(stmt, fetchall=True)
 270             for entry in res:
 271                 rstart = int(entry[1])
 272                 rstop = int(entry[2])
 273                 if rstart > rstop:
 274                     temp = rstop
 275                     rstop = rstart
 276                     rstart = temp
 277
 278                 thelength = rstop - stop
 279                 if (rstart - stop) > 0 and (rstart - stop) < thelength:
 280                     thelength = rstart - stop
 281
 282                 if thelength > 0 and thelength < result:
 283                     result = thelength
 284
 285         return result
 286
 287
 288     def annotInfo(self, geneID):
 289         (genome, gID) = geneID
 290         result = []
 291         if genome != self.genome:
 292             return False
 293
 294         stmt = "SELECT description from gene_annotation where name = '%s'" % gID
 295         res = self.queryDB(stmt, fetchall=True)
 296         if len(res) > 0:
 297             for entry in res:
 298                 result.append(entry[0])
 299
 300         return result
 301
 302
 303     def goInfo(self, geneID):
 304         (genome, gID) = geneID
 305         result = []
 306         if genome != self.genome:
 307             return False
 308
 309         stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID
 310         res = self.queryDB(stmt, fetchall=True)
 311         if len(res) > 0:
 312             for entry in res:
 313                 result.append(string.join(entry,"\t"))
 314
 315         return result
 316
 317
 318     def chromInfo(self, chrom=""):
 319         if chrom == "" and self.chromosome != "":
 320             chrom = self.chromosome
 321
 322         stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom
 323         res = self.queryDB(stmt)
 324         result = "%s\t%s" % (res[0], res[1])
 325
 326         return result
 327
 328
 329     def allGIDs(self):
 330         """ returns [ list of all orf names]
 331         """
 332         result = []
 333         stmt = "SELECT distinct name from gene_entries"
 334         res = self.queryDB(stmt, fetchall=True)
 335         for entry in res:
 336             result.append(entry[0])
 337
 338         return result
 339
 340
 341     def allGIDsbyGOID(self, GOID):
 342         """ returns [ list of all orf names] that match a particular GOID
 343         """
 344         result = []
 345         stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
 346         res = self.queryDB(stmt, fetchall=True)
 347         for entry in res:
 348             result.append(entry[0])
 349
 350         return result
 351
 352
 353     def allGOIDs(self):
 354         """ returns the list of GOID's in the genome
 355         """
 356         result = []
 357         stmt = "SELECT distinct GOID from gene_ontology"
 358         res = self.queryDB(stmt, fetchall=True)
 359         for entry in res:
 360             result.append(entry[0])
 361
 362         return result
 363
 364
 365     def allGOterms(self):
 366         """ returns the list of GOID's and their associated GO term in the genome
 367         """
 368         result = {}
 369         stmt = "SELECT distinct GOID, GOterm from gene_ontology"
 370         res = self.queryDB(stmt, fetchall=True)
 371         for entry in res:
 372             result[str(entry[0])] = str(entry[1])
 373
 374         return result
 375
 376
 377     def getGOIDCount(self, GOID):
 378         """ returns the match count for a particular GOID
 379         """
 380         stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
 381         res = self.queryDB(stmt, fetchall=True)
 382
 383         return len(res)
 384
 385
 386     def allAnnotInfo(self):
 387         result = {}
 388         stmt = "SELECT name, description from gene_annotation"
 389         res = self.queryDB(stmt, fetchall=True)
 390
 391         for entry in res:
 392             geneID = (self.genome, entry[0])
 393             if geneID not in result:
 394                 result[geneID] = []
 395
 396             result[(self.genome,entry[0])].append(entry[1])
 397
 398         return result
 399
 400
 401     def allGoInfo(self):
 402         result = {}
 403         stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name "
 404         res = self.queryDB(stmt, fetchall=True)
 405         for entry in res:
 406             geneID = (self.genome, entry[0])
 407             if geneID not in result:
 408                 result[geneID] = []
 409
 410             result[(self.genome, entry[0])].append(string.join(entry[1:], "\t"))
 411
 412         return result
 413
 414
 415     def allChromNames(self, partition=1, slice=0):
 416         result = []
 417         stmt = "SELECT distinct name from chromosomes"
 418         res = self.queryDB(stmt, fetchall=True)
 419         reslen = len(res)
 420         for index in range(reslen):
 421             if (index + slice) % partition == 0:
 422                 entry = res[index]
 423                 result.append(entry[0])
 424
 425         return result
 426
 427
 428     def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"):
 429         (chrom, start, stop, length, orientation) = self.geneInfo(gID, version)
 430         seq = self.sequence(chrom, start, length, maskCDS, maskLower)
 431         if orientation == "R":
 432             seq = complement(seq, length)
 433
 434         return seq
 435
 436
 437     def getGeneFeatures(self, gID, type="", version="1"):
 438         results = []
 439         featureClause = ""
 440         (genome, geneid) = gID
 441         if len(type) > 0:
 442             featureClause = ' and type = "%s" ' % type
 443
 444         stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s"  %s order by start ' % (geneid, str(version), featureClause)
 445         res = self.queryDB(stmt, fetchall=True)
 446         for entry in res:
 447             (type, chromosome, start, stop, orientation) = entry
 448             results.append((type, chromosome, start, stop, orientation))
 449
 450         return results
 451
 452
 453     def getallGeneFeatures(self):
 454         resultDict = {}
 455         stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start "
 456         res = self.queryDB(stmt, fetchall=True)
 457         for entry in res:
 458             (name, type, chromosome, start, stop, orientation) = entry
 459             if name not in resultDict:
 460                 resultDict[name] = []
 461
 462             resultDict[name].append((type, chromosome, start, stop, orientation))
 463
 464         return resultDict
 465
 466
 467     def getFeatureTypes(self, ftype=''):
 468         """ Returns the distinct feature types available in the sequence_features
 469             tables. Can optionally limit by feature type; the wild-card % can be
 470             used to search feature substrings.
 471         """
 472         results = []
 473         whereClause = ""
 474         useLike = False
 475         if "%" in ftype:
 476             useLike = True
 477
 478         if len(ftype) > 0 and not useLike:
 479             whereClause = 'where type = "%s" ' % ftype
 480         elif len(ftype) > 0:
 481             whereClause = 'where type LIKE "%s" ' % ftype
 482
 483         stmt = "select distinct type from sequence_features %s" % whereClause
 484         res = self.queryDB(stmt, fetchall=True)
 485         for entry in res:
 486             results.append(entry[0])
 487
 488         return results
 489
 490     def getFeatures(self, ftype, name="", chrom="", version =""):
 491         """ Get features stored in sequence_features that match a feature type,
 492             optionally restricted by name/value, chromosome, or version. Will
 493             search for substrings when ftype and/or name are given with a % to
 494             indicate the location of the wildcard. Returns a dictionary of features
 495             with chromosomes as the keys.
 496         """
 497         results = {}
 498         chromList = []
 499         nameClause = ""
 500         chromClause = ""
 501         versionClause = ""
 502         useLike = "="
 503         if "%" in ftype:
 504             useLike = "LIKE"
 505
 506         if len(name) > 0:
 507             if "%" in name:
 508                 nameLike = "LIKE"
 509             else:
 510                 nameLike = "="
 511
 512             nameClause = ' and name %s "%s" ' % (nameLike, name)
 513
 514         if len(chrom) > 0:
 515             chromClause = ' and chromosome = "%s" ' % chrom
 516
 517         if len(version) > 0:
 518             versionClause = ' and version = "%s" ' % version
 519
 520         stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause)
 521         res = self.queryDB(stmt, fetchall=True)
 522         for entry in res:
 523             (name, version, chromosome, start, stop, orientation, atype) = entry
 524             if chromosome not in chromList:
 525                 results[chromosome] = []
 526                 chromList.append(chromosome)
 527
 528             results[chromosome].append((name, version, chromosome, start, stop, orientation, atype))
 529
 530         return results
 531
 532
 533     def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""):
 534         """ Return features that are on a particular stretch of the genome. Can optionally
 535             limit by feature type; the wild-card % can  be used to search feature substrings.
 536         """
 537         results = []
 538         featureClause = ""
 539         qstop = qstart + qlength
 540         useLike = False
 541         if "%" in ftype:
 542             useLike = True
 543
 544         if len(ftype) > 0 and not useLike:
 545             featureClause = ' and type = "%s" ' % ftype
 546         elif len(ftype) > 0:
 547             featureClause = ' and type LIKE "%s" ' % ftype
 548
 549         #select all features that encompass our start/stop
 550         stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause)
 551         res = self.queryDB(stmt, fetchall=True)
 552         for entry in res:
 553             (chromosome, start, stop, orientation, name, version, atype) = entry
 554             results.append((name, version, chromosome, start, stop, orientation, atype))
 555
 556         # select all features that have a "stop" between start and stop that aren't yet in the dataset
 557         stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
 558         res = self.queryDB(stmt, fetchall=True)
 559         for entry in res:
 560             (chromosome, start, stop, orientation, name, version, atype) = entry
 561             if (name, version, chromosome, start, stop, orientation, atype) not in results:
 562                 results.append((name, version, chromosome, start, stop, orientation, atype))
 563
 564         # select all features on chromosome that have a "start" between start and stop
 565         stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
 566         res = self.queryDB(stmt, fetchall=True)
 567         for entry in res:
 568             (chromosome, start, stop, orientation, name, version, atype) = entry
 569             if (name, version, chromosome, start, stop, orientation, atype) not in results:
 570                 results.append((name, version, chromosome, start, stop, orientation, atype))
 571
 572         return results
 573
 574
 575     def getGenesIntersecting(self, chrom, qstart, qlength):
 576         """ Return features that are on a ptarticular stretch of the genome. Can optionally
 577             limit by feature type; the wild-card % can  be used to search feature substrings.
 578         """
 579         results = []
 580         qstop = qstart + qlength
 581         atype = "model"
 582         #select all features that encompass our start/stop
 583         stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop)
 584         res = self.queryDB(stmt, fetchall=True)
 585         for entry in res:
 586             (chromosome, start, stop, orientation, name, version) = entry
 587             results.append((name, version, chromosome, start, stop, orientation, atype))
 588
 589         # select all features that have a "stop" between start and stop that aren't yet in the dataset
 590         stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop)
 591         res = self.queryDB(stmt, fetchall=True)
 592         for entry in res:
 593             (chromosome, start, stop, orientation, name, version) = entry
 594             if (name, version, chromosome, start, stop, orientation) not in results:
 595                 results.append((name, version, chromosome, start, stop, orientation, atype))
 596
 597         # select all features on chromosome that have a "start" between start and stop
 598         stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop)
 599         res = self.queryDB(stmt, fetchall=True)
 600         for entry in res:
 601             (chromosome, start, stop, orientation, name, version) = entry
 602             if (name, version, chromosome, start, stop, orientation) not in results:
 603                 results.append((name, version, chromosome, start, stop, orientation, atype))
 604
 605         return results
 606
 607
 608     def sequence(self, chrom, start, length, maskCDS=False, maskLower=False):
 609         seq = ""
 610         if chrom == "" and self.chromosome != "":
 611             chrom = self.chromosome
 612
 613         stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
 614         res = self.queryDB(stmt)
 615         seqName = res[0]
 616         seqType = res[1]
 617         if seqType == "db":
 618             stmt = "select sequence from sequences where name = '%s' " % seqName
 619             res = self.queryDB(stmt)
 620             seq = res[0][start: start + length]
 621             res = ""
 622         else:
 623             chromFile = open("%s%s" % (cisRoot, seqName), "r")
 624             mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ)
 625             mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ)
 626             mymap.seek(start)
 627             seq = mymap.read(length)
 628             chromFile.close()
 629
 630         if maskCDS:
 631             stop = start + length - 1
 632             seqArray = list(seq)
 633             features = self.getFeaturesIntersecting(chrom, start, length, "CDS")
 634             for entry in features:
 635                 (name, geneID, chromosome, fstart, fstop, forientation, type) = entry
 636                 if fstart < start:
 637                     fstart = start
 638
 639                 if fstop > stop:
 640                     fstop = stop
 641
 642                 nstart = fstart - start
 643                 nstop = fstop - start + 1
 644                 for pos in range(nstop - nstart):
 645                     seqArray[nstart + pos] = "N"
 646
 647             seq = string.join(seqArray, "")
 648
 649         if maskLower:
 650             seqArray = list(seq)
 651             for index in range(len(seqArray)):
 652                 if seqArray[index] in ["a", "c" , "g", "t"]:
 653                     seqArray[index] = "N"
 654
 655             seq = string.join(seqArray, "")
 656
 657         return seq
 658
 659
 660     def getChromosomeSequence(self, chrom=""):
 661         seq = ""
 662         if chrom == "" and self.chromosome != "":
 663             chrom = self.chromosome
 664
 665         stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
 666         res = self.queryDB(stmt)
 667         if res == None:
 668             print "Could not find chromosome %s" % chrom
 669             return ''
 670
 671         seqName = res[0]
 672         seqType = res[1]
 673         if seqType == "db":
 674             res = self.queryDB('select sequence from sequences where name = "%s"' % chrom)
 675             seq = res[0]
 676             res = ""
 677         else:
 678             chromFile = open("%s%s" % (cisRoot, seqName), "r")
 679             seq = chromFile.readline()
 680             chromFile.close()
 681
 682         return seq
 683
 684
 685     def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1):
 686         result = []
 687         if chrom == "" and self.chromosome != "":
 688             chrom = self.chromosome
 689
 690         stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom
 691         res = self.queryDB(stmt, fetchall=True)
 692         if lowerbound > 0 and upperbound > 0:
 693             for entry in res:
 694                 start = int(entry[0])
 695                 stop = int(entry[1])
 696                 if stop < start:
 697                     start, stop = stop, start
 698
 699                 if stop < lowerbound or start > upperbound:
 700                     continue
 701
 702                 result.append((start, stop, entry[2], (self.genome, entry[3])))
 703         else:
 704             for entry in res:
 705                 start = int(entry[0])
 706                 stop = int(entry[1])
 707                 if stop < start:
 708                     start, stop = stop, start
 709
 710                 result.append((start, stop, entry[2], (self.genome, entry[3])))
 711
 712         return result
 713
 714
 715     def createGeneDB(self, dbFile="", inMemory=False):
 716         if len(dbFile) > 0:
 717             self.dbFile = dbFile
 718
 719         tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar",
 720                      "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar",
 721                      "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar",
 722                      "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar",
 723                      "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar",
 724                      "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar"
 725         }
 726
 727         for table in tableDict.keys():
 728             stmt = "create table %s(%s)" % (table, tableDict[table])
 729             self.writeDB(stmt, useMemory=inMemory)
 730
 731
 732     def createIndices(self, inMemory=False):
 733         indexDict = {"nameIndex1": ("gene_entries", "name"),
 734                      "nameIndex2": ("gene_annotation", "name"),
 735                      "nameIndex3": ("gene_ontology", "name"),
 736                      "goidIndex": ("gene_ontology", "GOID"),
 737                      "nameIndex4": ("sequences", "name"),
 738                      "nameIndex5": ("chromosomes", "name"),
 739                      "geneIDIndex": ("sequence_features", "name, type"),
 740                      "posIndex": ("sequence_features", "chromosome, start, stop, type"),
 741                      "typeIndex": ("sequence_features", "type")
 742         }
 743
 744         for indexName in indexDict.keys():
 745             (tableName, fields) = indexDict[indexName]
 746             stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields)
 747             self.writeDB(stmt, useMemory=inMemory)
 748
 749
 750     def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0):
 751         (genome, gID) = geneID
 752         length = str(abs(int(start) - int(stop)) + 1)
 753         stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature)
 754         self.writeDB(stmt)
 755
 756
 757     def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""):
 758         (genome, gID) = geneID
 759         length = str(abs(int(start) - int(stop)) + 1)
 760         stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type)
 761         self.writeDB(stmt)
 762
 763
 764     def addAnnotation(self, geneID, description):
 765         (genome, gID) = geneID
 766         stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description)
 767         self.writeDB(stmt)
 768
 769
 770     def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""):
 771         (genome, gID) = geneID
 772         stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s',  '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)
 773         self.writeDB(stmt)
 774
 775
 776     def addSequence(self, geneID, seq, seqType, seqLen="-1"):
 777         (genome, gID) = geneID
 778         if int(seqLen) < 0:
 779             seqLen = str(len(seq))
 780
 781         stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq)
 782         self.writeDB(stmt)
 783
 784
 785     def addChromosomeEntry(self, chromo, seqName, storageType):
 786         stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType)
 787         self.writeDB(stmt)
 788
 789
 790     def addSequenceBatch(self, entryArray, inMemory=False):
 791         stmtArray = []
 792         stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)"
 793         for entry in entryArray:
 794             (geneID, seq, seqType, seqLen) = entry
 795             (genome, gID) = geneID
 796             if int(seqLen) < 0:
 797                 seqLen = str(len(seq))
 798
 799             stmtArray.append((gID, str(seqLen), seqType, seq))
 800
 801         self.writeBatchDB(stmt, stmtArray)
 802
 803
 804     def addChromosomeEntryBatch(self, entryArray, inMemory=False):
 805         stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, ?, ?, ?)"
 806         self.writeBatchDB(stmt, entryArray, useMemory=inMemory)
 807
 808
 809     def addGeneEntryBatch(self, entryArray, inMemory=False):
 810         stmtArray = []
 811         stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
 812         for entry in entryArray:
 813             (geneID, chrom, start, stop, orientation, feature, version) = entry
 814             (genome, gID) = geneID
 815             length = str(abs(int(start) - int(stop)) + 1)
 816             stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature))
 817
 818         self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
 819
 820
 821     def addFeatureEntryBatch(self, entryArray, inMemory=False):
 822         stmtArray = []
 823         stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
 824         for entry in entryArray:
 825             (geneID, version, chrom, start, stop, orientation, type) = entry
 826             (genome, gID) = geneID
 827             length = str(abs(int(start) - int(stop)) + 1)
 828             stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type))
 829
 830         self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
 831
 832
 833     def addAnnotationBatch(self, entryArray, inMemory=False):
 834         stmtArray = []
 835         stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) "
 836         for entry in entryArray:
 837             (geneID, description) = entry
 838             (genome, gID) = geneID
 839             stmtArray.append((gID, description))
 840
 841         self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
 842
 843
 844     def addGoInfoBatch(self, entryArray, inMemory=False):
 845         stmtArray = []
 846         stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?,  ?, ?, ?, ?, ?, ?) "
 847         for entry in entryArray:
 848             (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry
 849             (genome, gID) = geneID
 850             stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other))
 851
 852         self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
 853
 854
 855     def extendFeatures(self, featureFile, fileType="cistematic", replace=False):
 856         geneEntryList = []
 857         geneFeatureList = []
 858         currentGene = ""
 859         gstart = -1
 860         gstop = -1
 861         gchrom = ""
 862         gsense = ""
 863         ftype = ""
 864         senseArray = {"+": "F",
 865                       "-": "R",
 866                       ".": "F"
 867         }
 868
 869         featfile = open(featureFile)
 870         if fileType == "cistematic":
 871             for line in featfile:
 872                 if line[0] == "#":
 873                     continue
 874
 875                 fields = line.strip().split()
 876                 if currentGene == "":
 877                     currentGene = fields[0]
 878                     gchrom = fields[2]
 879                     gstart = int(fields[3])
 880                     gsense = senseArray[fields[5]]
 881
 882                 if fields[0] != currentGene:
 883                     geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1"))
 884                     currentGene = fields[0]
 885                     gchrom = fields[2]
 886                     gstart = int(fields[3])
 887                     gsense = senseArray[fields[5]]
 888
 889                 lstart = int(fields[3])
 890                 gstop = int(fields[4])
 891                 ftype = fields[6]
 892                 geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype))
 893         elif fileType == "UCSC":
 894             for line in featfile:
 895                 if line[0] == "#":
 896                     continue
 897
 898                 geneFields = line.split("\t")
 899                 exonNum = int(geneFields[8])
 900                 exonStarts = geneFields[9].split(",")
 901                 exonStops = geneFields[10].split(",")
 902                 gchrom = geneFields[2][3:]
 903                 gsense = senseArray[geneFields[3]]
 904                 gstop = int(geneFields[7]) - 1
 905                 gstart = int(geneFields[6]) - 1
 906                 geneID = ("generic", geneFields[1])
 907                 for index in range(exonNum):
 908                     estart = int(exonStarts[index]) - 1
 909                     estop = int(exonStops[index]) - 1
 910                     if estart >= gstart and estop <= gstop:
 911                         geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS"))
 912                     elif estop <= gstart:
 913                         if gsense == "F":
 914                             fType = "5UTR"
 915                         else:
 916                             fType = "3UTR"
 917
 918                         geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
 919                     elif estart >= gstop:
 920                         if gsense == "F":
 921                             fType = "3UTR"
 922                         else:
 923                             fType = "5UTR"
 924
 925                         geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
 926                     elif estart <= gstop and estart > gstart:
 927                         if gsense == 'F':
 928                             fType = '3UTR'
 929                         else:
 930                             fType = '5UTR'
 931
 932                         geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS"))
 933                         geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType))
 934                     elif estart < gstart and estop <= gstop:
 935                         if gsense == "F":
 936                             fType = "5UTR"
 937                         else:
 938                             fType = "3UTR"
 939
 940                         geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType))
 941                         geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS"))
 942                     else:
 943                         if gsense == "F":
 944                             fType1 = "5UTR"
 945                             fType2 = "3UTR"
 946                         else:
 947                             fType1 = "3UTR"
 948                             fType2 = "5UTR"
 949
 950                         geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1))
 951                         geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS"))
 952                         geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2))
 953         else:
 954             print "%s format not supported yet"
 955             featfile.close()
 956             return
 957
 958         featfile.close()
 959         if replace==True:
 960             self.queryDB("delete from sequence_features", useMemory=True)
 961             self.queryDB("delete from gene_entries", useMemory=True)
 962
 963         self.addGeneEntryBatch(geneEntryList, inMemory=True)
 964         self.addFeatureEntryBatch(geneFeatureList, inMemory=True)
 965
 966
 967     def queryDB(self, stmt, fetchall=False, useMemory=False):
 968         if useMemory or self.memoryBacked:
 969             db = self.memoryDB
 970         else:
 971             db = sqlite.connect(self.dbFile, timeout = 60)
 972
 973         sql = db.cursor()
 974         sql.execute(stmt)
 975         if fetchall:
 976             results = sql.fetchall()
 977         else:
 978             results = sql.fetchone()
 979
 980         sql.close()
 981         if not (useMemory or self.memoryBacked):
 982             db.close()
 983
 984         return results
 985
 986
 987     def writeDB(self, stmt, useMemory=False):
 988         if useMemory:
 989             db = self.memoryDB
 990         else:
 991             db = sqlite.connect(self.dbFile, timeout = 60)
 992
 993         sql = db.cursor()
 994         sql.execute(stmt)
 995         db.commit()
 996         sql.close()
 997         if not useMemory:
 998             db.close()
 999
1000
1001     def writeBatchDB(self, stmt, stmtArray, useMemory=False):
1002         if useMemory:
1003             db = self.memoryDB
1004         else:
1005             db = sqlite.connect(self.dbFile, timeout = 60)
1006
1007         sql = db.cursor()
1008         try:
1009             sql.executemany(stmt, stmtArray)
1010         except:
1011             print "writeBatchDB: problem with %s" % stmt
1012
1013         db.commit()
1014         sql.close()
1015         if not useMemory:
1016             db.close()
1017
1018
1019 def processSql(sqlFile, keyFields={}):
1020     """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the
1021         name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column
1022         name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the
1023         same field name will be used. For example, one possible keyFields dict would be:
1024         keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
1025     """
1026     fields = {}
1027     infile = open(sqlFile)
1028     line = ""
1029     while "CREATE TABLE" not in line:
1030         line = infile.readline()
1031         continue
1032
1033     tabFields = line.split()
1034     tabName = tabFields[2]
1035     if keyFields == {}:
1036         fields["chrom"] = 1
1037         fields["start"] = 2
1038         fields["stop"] = 3
1039         fields["name"] = 4
1040     else:
1041         index = 0
1042         for line in infile:
1043             lineFields = line.split()
1044             if lineFields[0] in keyFields.keys():
1045                 if keyFields[lineFields[0]] == "":
1046                     outfield = lineFields[0]
1047                 else:
1048                     outfield = keyFields[lineFields[0]]
1049
1050                 fields[outfield] = index
1051
1052             index += 1
1053
1054     infile.close()
1055
1056     return (tabName, fields)
1057
1058
1059 def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"):
1060     """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature
1061         type in sequence_features, the dataFields in the format returned by processSql(), and a version.
1062         If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
1063         rather than use addFeatureEntryBatch() to record the added features.
1064
1065         Note that numberic values are overloaded into the name field using @ as a delimiter.
1066     """
1067     records = []
1068     if dataFields == {}:
1069         dataFields["chrom"] = 1
1070         dataFields["start"] = 2
1071         dataFields["stop"] = 3
1072         dataFields["name"] = 4
1073
1074     infile = open(dataFile)
1075     for line in infile:
1076         fields = line.strip().split("\t")
1077         if "name" in dataFields:
1078             recName = fields[dataFields["name"]]
1079         else:
1080             recName = typeName
1081
1082         if "value" in dataFields:
1083             recName += "@" + fields[dataFields["value"]]
1084
1085         start = int(fields[dataFields["start"]])
1086         stop = int(fields[dataFields["stop"]])
1087         chrom = fields[dataFields["chrom"]][3:]
1088         chrom.replace("_random", "rand")
1089         orientation = "F"
1090         if "orientation" in dataFields:
1091             if fields[dataFields["orientation"]] == "-":
1092                 orientation = "R"
1093
1094         if "version" in dataFields:
1095             version = fields[dataFields["version"]]
1096
1097         records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper()))
1098
1099     if genomeObject != "":
1100         genomeObject.addFeatureEntryBatch(records)
1101     else:
1102         print records
1103
1104
1105 # Voodoo to get recursive imports working
1106 for genome in supportedGenomes:
1107     importString = "import %s" % genome
1108     exec importString
1109     geneDB[genome]  = eval("%s.geneDB" % genome)