# erange / ReadDataset.py
1 import string
2 import tempfile
3 import shutil
4 import os
5 import re
6 import sys
7 import pysam
8 from array import array
9 from commoncode import getReverseComplement
10
11 currentRDSVersion = "3.0"
12
13
14 class ReadDatasetError(Exception):
15     pass
16
17
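# ReadCounter is used as a pysam fetch() callback: every aligned read in the
# fetched region is passed to __call__(), which tallies it as a unique read,
# a multiread (added with weight 1/multiplicity), or a spliced read based on
# its CIGAR operations.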
18 class ReadCounter():
19     uniqReadCount = 0
20     multiReadCount = 0.0
21     spliceReadCount = 0
22
    def __init__(self, multiReadIDCount=None, sense="both"):
        # default to None rather than a shared mutable dict
        self.multiReadIDCount = multiReadIDCount if multiReadIDCount is not None else {}
        self.sense = sense
26
27
28     def __call__(self, read):
29         multiplicity = self.getReadMultiplicity(read)
30         if multiplicity > 1:
31             self.multiReadCount += float(1.0/multiplicity)
32         elif isSpliceEntry(read.cigar):
33             self.spliceReadCount += 1
34         else:
35             self.uniqReadCount += 1
36
37
38     def getReadMultiplicity(self, read):
39         pairReadSuffix = getPairedReadNumberSuffix(read)
40         readID = "%s%s" % (read.qname, pairReadSuffix)
41         try:
42             multiplicity = self.multiReadIDCount[readID]
43         except KeyError:
44             multiplicity = 1
45
46         return multiplicity
47
48
49     def isMulti(self, read):
50         pairReadSuffix = getPairedReadNumberSuffix(read)
51         readID = "%s%s" % (read.qname, pairReadSuffix)
52         return readID in self.multiReadIDCount
53
54
55
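# MaxCoordFinder is another fetch() callback; it records the largest start
# coordinate seen for unique, multi and spliced reads on the fetched reference.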
56 class MaxCoordFinder(ReadCounter):
57     maxUniqueStart = 0
58     maxMultiStart = 0
59     maxSpliceStart = 0
60
    def __call__(self, read):
        if self.isMulti(read):
            self.maxMultiStart = max(self.maxMultiStart, read.pos)
        elif isSpliceEntry(read.cigar):
            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
        else:
            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
68
69
70
71 class BamReadDataset():
72     """ Class for storing reads from experiments. Assumes that custom scripts
73     will translate incoming data into a format that can be inserted into the
74     class using the insert* methods. Default class subtype ('DNA') includes
75     tables for unique and multireads, whereas 'RNA' subtype also includes a
76     splices table.
77     """
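    # A minimal usage sketch (hypothetical file name "reads.bam"; assumes an
    # indexed BAM whose reference names carry a "chr" prefix):
    #
    #   rds = BamReadDataset("reads.bam", initialize=True, datasetType="RNA", verbose=True)
    #   total = rds.getCounts(chrom="chr1", rmin=0, rmax=100000, multi=True, splices=True)
    #   reads = rds.getReadsDict(chrom="chr1", start=0, stop=100000, withWeight=True)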
78
79
80     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
81                  cache=False, reportCount=True):
82         """ creates an rds datafile if initialize is set to true, otherwise
83         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
84         """
85
86         if initialize and datasetType not in ["DNA", "RNA"]:
87             raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
88
89         self.dataType = ""
90         self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
91         self.bamfile = pysam.Samfile(datafile, "rb")
92         self.readsize = None
        self.cachedTotalWeight = None  # cached by totalReadWeight(); named so it does not shadow the method
94
95         self.metadata = {"dataType": datasetType}
96
97         if initialize:
98             self.dataType = datasetType
99         else:
100             self.dataType = self.metadata["dataType"]
101
102         try:
103             self.rdsVersion = self.metadata["rdsVersion"]
104         except KeyError:
105             self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
106
107         self.fullReadCounts = self.getFullReadCount()
108         if verbose:
109             self.printRDSInfo(datafile, reportCount, initialize)
110
111
112     def __len__(self):
        """ return the number of reads in the bam file.
            Note: unlike the original RDS implementation, which returned the total
            read weight, this returns the raw read count.
        """
116         count = 0
117         references = self.bamfile.references
118         for reference in references:
119             count += self.bamfile.count(reference)
120
121         return count
122
123
124     def getMultiReadIDCounts(self, samFileName, fileMode):
125         try:
126             samfile = pysam.Samfile(samFileName, fileMode)
127         except ValueError:
128             print "samfile index not found"
129             sys.exit(1)
130
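        # Build a dict keyed by read name (plus a "/1" or "/2" suffix for paired
        # ends) -> number of alignments, e.g. {"HWI-EAS229:1:2:3:4/1": 3.0}
        # (illustrative name); reads that align exactly once are removed below.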
131         readIDCounts = {}
132         for read in samfile.fetch(until_eof=True):
133             pairReadSuffix = getPairedReadNumberSuffix(read)
134             readName = "%s%s" % (read.qname, pairReadSuffix)
135             try:
136                 readIDCounts[readName] += 1.0
137             except KeyError:
138                 readIDCounts[readName] = 1.0
139
140         for readID in readIDCounts.keys():
141             if int(readIDCounts[readID]) == 1:
142                 del readIDCounts[readID]
143
144         return readIDCounts
145
146
    def totalReadWeight(self):
        """ return the total weight of usable reads in the bam file.
        """
        if self.cachedTotalWeight is None:
            total = self.fullReadCounts["uniq"]
            total += self.fullReadCounts["multi"]

            if self.dataType == "RNA":
                total += self.fullReadCounts["splice"]

            self.cachedTotalWeight = int(total)

        return self.cachedTotalWeight
160
161
162     def __del__(self):
163         """ cleanup copy in local cache, if present.
164         """
165         pass
166
167
168     def printRDSInfo(self, datafile, reportCount, initialize):
169         if initialize:
170             print "INITIALIZED dataset %s" % datafile
171         else:
172             print "dataset %s" % datafile
173
174         print "metadata:"
175         pnameList = self.metadata.keys()
176         pnameList.sort()
177         for pname in pnameList:
178             print "\t" + pname + "\t" + str(self.metadata[pname])
179
180         if reportCount and not initialize:
181             self.printReadCounts()
182
183         print "default cache size is %d pages" % self.getDefaultCacheSize()
184
185
186     def printReadCounts(self):
187         ucount = self.fullReadCounts["uniq"]
188         mcount = self.fullReadCounts["multi"]
189         if self.dataType == "DNA":
190             print "\n%d unique reads and %d multireads" % (ucount, mcount)
191         elif self.dataType == "RNA":
192             scount = self.fullReadCounts["splice"]
193             print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
194
195
196     def cacheDB(self, filename):
197         pass
198
199
200     def saveCacheDB(self, filename):
201         pass
202
203
204     def uncacheDB(self):
205         pass
206
207
208     def attachDB(self, filename, dbName):
209         pass
210
211
212     def detachDB(self, dbName):
213         pass
214
215
216     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
217         pass
218
219
220     def getTables(self, dbName=""):
221         pass
222
223
224     def getSqlCursor(self):
225         pass
226
227
228     def getMemCursor(self):
229         pass
230
231
232     def getFileCursor(self):
233         pass
234
235
236     def initializeTables(self, dbConnection, cache=100000):
237         pass
238
239
240     def getMetadata(self, valueName=""):        
241         return self.metadata
242
243
244     def getReadSize(self):
        """ This follows the same model as the original, which assumes that all
            reads have the same read size and that this is reflected by the size
            of the first read.
        """
249         if self.readsize is None:
250             bamFileIterator = self.bamfile.fetch(until_eof=True)
251             for read in bamFileIterator:
252                 self.calculateReadsizeFromCigar(read.cigar)
253                 break
254
255         return self.readsize
256
257
258     def calculateReadsizeFromCigar(self, cigar):
259         take = (0, 1) # CIGAR operation (M/match, I/insertion)
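        # e.g. (illustrative): a spliced 50M200N25M alignment has cigar
        # [(0, 50), (3, 200), (0, 25)] and yields readsize 75, since only the
        # M (0) and I (1) operations count toward the read length.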
260         self.readsize = sum([length for op,length in cigar if op in take])
261
262
263     def getDefaultCacheSize(self):
264         """ returns 0 as cache is going to be deprecated
265         """
266         return 0
267
268
269     def getChromosomes(self, fullChrom=True):
270         """ returns a sorted list of distinct chromosomes in bam file reference list.
271         """
272         if fullChrom:
273             results = list(self.getFullReferenceNames())
274         else:
275             results = list(self.getShortReferenceNames())
276
277         results.sort()
278
279         return results
280
281
282     def getFullReferenceNames(self):
283         """ returns a set of bam file reference names.
284         """
285         return set(self.bamfile.references)
286
287
288     def getShortReferenceNames(self):
289         """ returns a set of bam file reference names after removing first 3 characters.
290         """
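        # e.g. "chr1" -> "1"; names that strip to an empty string (such as "MT")
        # are skipped.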
291         results = set()
292         references = self.bamfile.references
293         for chromosome in references:
294             shortName = chromosome[3:]
295             if len(shortName.strip()) > 0:
296                 results.add(shortName)
297
298         return results
299
300
301     def getMaxCoordinate(self, chrom, doUniqs=True,
302                          doMulti=False, doSplices=False):
303         """ returns the maximum coordinate for reads on a given chromosome.
304         """
305         maxCoord = 0
306         coordFinder = MaxCoordFinder()
307         self.bamfile.fetch(reference=chrom, callback=coordFinder)
308
309         if doUniqs:
310             maxCoord = coordFinder.maxUniqueStart
311
312         if doSplices:
313             maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
314
315         if doMulti:
316             maxCoord = max(coordFinder.maxMultiStart, maxCoord)
317
318         return maxCoord
319
320
321     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
322                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
323                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
324                      readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
325                      flagLike=False, strand='', combine5p=False):
326         """ First pass of rewrite
327             1) Leave as much in place as possible
328             2) For now treat multis just like uniqs
329         """
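        # Returned structure (illustrative): a dict keyed by chromosome name (or
        # by read name when readIDDict=True), each value a list of row dicts, e.g.
        #   {"chr1": [{"start": 10353, "sense": "+", "weight": 1.0}, ...]}
        # with optional "stop", "mismatch", "readID", "chrom" and "pairID" keys
        # depending on the with* flags.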
330         """
331         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
332         if findallOptimize:
333             selectQuery = "select start, sense, sum(weight)"
334         else:
335             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
336
337         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
338         if doUniqs:
339             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
340             if doMulti:
341                 stmt.append("UNION ALL")
342                 stmt.append(selectQuery)
343                 stmt.append("from multi")
344                 stmt.append(whereQuery)
345                 stmt.append(groupQuery)
346         else:
347             stmt = [selectQuery, "from multi", whereQuery]
348
349         if combine5p:
350             if findallOptimize:
351                 selectQuery = "select start, sense, weight, chrom"
352
353             if doUniqs:
354                 subSelect = [selectQuery, "from uniqs", whereQuery]
355                 if doMulti:
356                     subSelect.append("union all")
357                     subSelect.append(selectQuery)
358                     subSelect.append("from multi")
359                     subSelect.append(whereQuery)
360             else:
361                 subSelect = [selectQuery, "from multi", whereQuery]
362
363             sqlStmt = string.join(subSelect)
364             if findallOptimize:
365                 selectQuery = "select start, sense, sum(weight)"
366
367             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
368                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
369
370         if findallOptimize:
371             if self.memBacked:
372                 self.memcon.row_factory = None
373             else:
374                 self.dbcon.row_factory = None
375
376             stmt.append("order by start")
377         elif readIDDict:
378             stmt.append("order by readID, start")
379         else:
380             stmt.append("order by chrom, start")
381
382         sql = self.getSqlCursor()
383         sqlQuery = string.join(stmt)
384         sql.execute(sqlQuery)
385         """
386
387         # Edits are starting here - trying to ignore the entire sql construct
388         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
389         resultsDict = {}
390         if findallOptimize and chrom != "":
391             readIDDict = False
392             bothEnds = False
393             noSense = False
394             withWeight = True
395             withFlag = False
396             withMismatch = False
397             withID = False
398             withChrom = False
399             withPairID = False
400             fullChrom = True
401             #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
402
403         pairID = 0
404         for read in bamFileIterator:
405             pairReadSuffix = getPairedReadNumberSuffix(read)
406             readID = "%s%s" % (read.qname, pairReadSuffix)
407             if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
408                 continue
409
410             sense = getReadSense(read.is_reverse)
411             if fullChrom:
412                 chrom = self.bamfile.getrname(read.rname)
413             else:
414                 chrom = self.bamfile.getrname(read.rname)[3:]
415
416             if readIDDict:
417                 dictKey = read.qname
418             else:
419                 dictKey = chrom
420
421             newrow = {"start": int(read.pos)}
422             if bothEnds:
423                 if self.readsize is None:
424                     self.calculateReadsizeFromCigar(read.cigar)
425
426                 newrow["stop"] = int(read.pos + self.readsize)
427
428             if not noSense:
429                 newrow["sense"] = sense
430
431             if withWeight:
432                 try:
433                     newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
434                 except KeyError:
435                     newrow["weight"] = 1.0
436
437             if withFlag:
438                 #newrow["flag"] = row["flag"]
439                 pass
440
441             if withMismatch:
442                 try:
443                     mismatchTag = read.opt("MD")
444                     mismatches = getMismatches(mismatchTag, read.seq, sense)
445                 except KeyError:
446                     mismatches = ""
447
448                 newrow["mismatch"] = mismatches
449
450             if withID:
451                 newrow["readID"] = readID
452
453             if withChrom:
454                 newrow["chrom"] = chrom
455
456             if withPairID:
457                 newrow["pairID"] = pairID
458
459             try:
460                 resultsDict[dictKey].append(newrow)
461             except KeyError:
462                 resultsDict[dictKey] = [newrow]
463
464         return resultsDict
465
466
467     def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
468         readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
469
470         if flag != "":
471             #TODO: need to pick a tag code for the erange flag - using ZF for now
472             # although it looks like the flag is really just a label on a region.
473             # since BAM can retrieve based on location if we store the regions then
474             # we can just retrieve it directly from the BAM and we do not need to
475             # flag the individual reads.  The location will be sufficient.
476             try:
477                 if flag != read.opt("ZF"):
478                     readDoesNotMeetCriteria = True
479             except KeyError:
480                 readDoesNotMeetCriteria = True
481
482
483         if hasMismatch:
484             try:
485                 mismatchTag = read.opt("MD")
486             except KeyError:
487                 readDoesNotMeetCriteria = True
488
489         if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
490             readDoesNotMeetCriteria = True
491
492         return readDoesNotMeetCriteria
493
494
495     def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
496         rejectRead = False
        if readID in self.multiReadIDCounts:
498             if self.multiReadIDCounts[readID] > threshold or not doMulti:
499                 rejectRead = True
500
501         return rejectRead
502
503
504     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
505                        flag="", withWeight=False, withFlag=False, withMismatch=False,
506                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
507                        splitRead=False, hasMismatch=False, flagLike=False, start=None,
508                        stop=None, strand=""):
509         """ First pass of rewrite
510             1) Leave as much in place as possible
511             2) For now treat multis just like regular splices
512         """
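        # Returned structure (illustrative):
        #   {"chr1": [{"startL": 1000, "stopL": 1050, "startR": 1250, "stopR": 1275,
        #              "sense": "+"}, ...]}
        # With splitRead=True each splice instead contributes two entries, one for
        # the left block and one for the right block.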
513         """
514         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
515         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
516         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
517         sql = self.getSqlCursor()
518
519         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
520         sql.execute(stmt)
521         """
522
523         # Edits are starting here - trying to ignore the entire sql construct
524         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
525         resultsDict = {}
526         pairID = 0
527         for read in bamFileIterator:
528             if not isSpliceEntry(read.cigar):
529                 continue
530
531             sense = getReadSense(read.is_reverse)
532             pairReadSuffix = getPairedReadNumberSuffix(read)
533             readID = "%s%s" % (read.qname, pairReadSuffix)
534             if fullChrom:
535                 chrom = self.bamfile.getrname(read.rname)
536             else:
537                 chrom = self.bamfile.getrname(read.rname)[3:]
538
539             if readIDDict:
540                 dictKey = read.qname
541             else:
542                 dictKey = chrom
543
544             if self.readsize is None:
545                 self.calculateReadsizeFromCigar(read.cigar)
546
547             startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
548             newrow = {"startL": int(startL)}
549             newrow["stopL"] = int(stopL)
550             newrow["startR"] = int(startR)
551             newrow["stopR"] = int(stopR)
552             if not noSense:
553                 newrow["sense"] = sense
554
555             if withWeight:
556                 newrow["weight"] = 1.0
557
558             if withFlag:
559                 #newrow["flag"] = row["flag"]
560                 pass
561
562             if withMismatch:
563                 try:
564                     mismatchTag = read.opt("MD")
565                     mismatches = getMismatches(mismatchTag, read.seq, sense)
566                 except KeyError:
567                     mismatches = ""
568
569                 newrow["mismatch"] = mismatches
570
571             if withID:
572                 newrow["readID"] = readID
573
574             if withChrom:
575                 newrow["chrom"] = chrom
576
577             if withPairID:
578                 newrow["pairID"] = pairID
579
580             if splitRead:
581                 leftDict = newrow.copy()
582                 del leftDict["startR"]
583                 del leftDict["stopR"]
584                 rightDict = newrow
585                 del rightDict["startL"]
586                 del rightDict["stopL"]
587                 try:
588                     resultsDict[dictKey].append(leftDict)
589                 except KeyError:
590                     resultsDict[dictKey] = [leftDict]
591
592                 resultsDict[dictKey].append(rightDict)
593             else:
594                 try:
595                     resultsDict[dictKey].append(newrow)
596                 except KeyError:
597                     resultsDict[dictKey] = [newrow]
598
599         return resultsDict
600
601
602     def getFullReadCount(self):
603         fullReadCount = {"uniq": 0,
604                          "multi": 0,
605                          "splice": 0
606         }
607
608         for chromosome in self.getChromosomes():
609             uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
610             fullReadCount["uniq"] += uniqCount
611             fullReadCount["multi"] += multiCount
612             fullReadCount["splice"] += spliceCount
613
614         return fullReadCount
615
616
617     def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
618                   splices=False, reportCombined=True, sense="both"):
619         """ return read counts for a given region.
620         """
621         ucount = 0
622         mcount = 0
623         scount = 0
624         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
625
626         if uniqs:
627             ucount = readCounter.uniqReadCount
628
629         if multi:
630             mcount = readCounter.multiReadCount
631
632         if splices:
633             scount = readCounter.spliceReadCount
634
635         if reportCombined:
636             total = ucount + mcount + scount
637             return total
638         else:
639             return (ucount, mcount, scount)
640
641
642     def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
643         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
644
645         return readCounter.spliceReadCount
646
647
648     def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
649         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
650
651         return readCounter.uniqReadCount
652
653
654     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
655         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
656
657         return readCounter.multiReadCount
658
659
660     def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
661         readCounter = ReadCounter(self.multiReadIDCounts, sense)
662         self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
663
664         return readCounter
665
666
667     #TODO: Primary
668     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
669         """ get readID's.
670         """
671         stmt = []
672         if uniqs:
673             stmt.append("select readID from uniqs")
674
675         if multi:
676             stmt.append("select readID from multi")
677
678         if splices:
679             stmt.append("select readID from splices")
680
681         if len(stmt) > 0:
682             selectPart = string.join(stmt, " union ")
683         else:
684             selectPart = ""
685
686         limitPart = ""
687         if limit > 0:
688             limitPart = "LIMIT %d" % limit
689
690         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
691         sql = self.getSqlCursor()
692         sql.execute(sqlQuery)
693         result = sql.fetchall()
694
695         if paired:
696             return [x[0].split("/")[0] for x in result]
697         else:
698             return [x[0] for x in result]
699
700
701     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
702         """ returns the uniq and spliced mismatches in a dictionary.
703         """
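        # Each snpDict entry is a per-chromosome list of records of the form
        # [readStart, changePosition, readBase, referenceBase] (approximate labels),
        # derived from the eland-style mismatch strings built by the module-level
        # getMismatches() helper.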
704         readlen = self.getReadSize()
705         if mischrom:
706             hitChromList = [mischrom]
707         else:
708             hitChromList = self.getChromosomes()
709             hitChromList.sort()
710
711         snpDict = {}
712         for chromosome in hitChromList:
713             if verbose:
714                 print "getting mismatches from chromosome %s" % (chromosome)
715
716             snpDict[chromosome] = []
717             if useSplices and self.dataType == "RNA":
718                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
719                 spliceIDList = spliceDict.keys()
720                 for spliceID in spliceIDList:
721                     spliceEntry = spliceDict[spliceID][0]
722                     startpos = spliceEntry["startL"]
723                     lefthalf = spliceEntry["stopL"]
724                     rightstart = spliceEntry["startR"]
725                     sense = spliceEntry["sense"]
726                     mismatches = spliceEntry["mismatch"]
727                     spMismatchList = mismatches.split(",")
728                     for mismatch in spMismatchList:
729                         if "N" in mismatch:
730                             continue
731
732                         change_len = len(mismatch)
733                         if sense == "+":
734                             change_from = mismatch[0]
735                             change_base = mismatch[change_len-1]
736                             change_pos = int(mismatch[1:change_len-1])
737                         elif sense == "-":
738                             change_from = getReverseComplement(mismatch[0])
739                             change_base = getReverseComplement(mismatch[change_len-1])
740                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
741
742                         firsthalf = int(lefthalf)-int(startpos)+1
743                         secondhalf = 0
744                         if int(change_pos) <= int(firsthalf):
745                             change_at = startpos + change_pos - 1
746                         else:
747                             secondhalf = change_pos - firsthalf
748                             change_at = rightstart + secondhalf
749
750                         snpDict[chromosome].append([startpos, change_at, change_base, change_from])
751
752             hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
753             if chromosome not in hitDict.keys():
754                 continue
755
756             for readEntry in hitDict[chromosome]:
757                 start = readEntry["start"]
758                 sense = readEntry["sense"]
759                 mismatches = readEntry["mismatch"]
760                 mismatchList = mismatches.split(",")
761                 for mismatch in mismatchList:
762                     if "N" in mismatch:
763                         continue
764
765                     change_len = len(mismatch)
766                     if sense == "+":
767                         change_from = mismatch[0]
768                         change_base = mismatch[change_len-1]
769                         change_pos = int(mismatch[1:change_len-1])
770                     elif sense == "-":
771                         change_from = getReverseComplement(mismatch[0])
772                         change_base = getReverseComplement(mismatch[change_len-1])
773                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
774
775                     change_at = start + change_pos - 1
776                     snpDict[chromosome].append([start, change_at, change_base, change_from])
777
778         return snpDict
779
780
781     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
782                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
783                         keepStrand="both", shiftValue=0):
784         """return a profile of the chromosome as an array of per-base read coverage....
785             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
786             Will also shift position of unique and multireads (but not splices) if shift is a natural number
787         """
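        # Example (hypothetical call): getChromProfile("chr1", cstart=0, cstop=5000)
        # returns an array("f") of per-base coverage scaled by 1/normalizationFactor;
        # when trackStrand is True, minus-strand coverage is subtracted instead of added.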
788         metadata = self.getMetadata()
        try:
            readlen = int(metadata["readsize"])
        except KeyError:
            # the BAM-backed metadata has no "readsize" entry, so fall back to
            # measuring the first read rather than using 0 (which would give zero coverage)
            readlen = self.getReadSize()
793
794         dataType = metadata["dataType"]
795         scale = 1. / normalizationFactor
796         shift = {}
797         shift["+"] = int(shiftValue)
798         shift["-"] = -1 * int(shiftValue)
799
        if cstop > 0:
            lastNT = cstop - cstart + readlen + shift["+"]
        else:
            lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
804
805         chromModel = array("f",[0.] * lastNT)
806         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
807         if cstart < 0:
808             cstart = 0
809
        for readEntry in hitDict.get(chromosome, []):
            hstart = readEntry["start"]
            sense = readEntry["sense"]
            weight = readEntry["weight"]
            hstart = hstart - cstart + shift[sense]
            for currentpos in range(hstart, hstart + readlen):
                try:
                    if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                        chromModel[currentpos] += scale * weight
                    elif sense == "-" and keepStrand != "plusOnly":
                        chromModel[currentpos] -= scale * weight
                except IndexError:
                    continue
823
824         del hitDict
825         if useSplices and dataType == "RNA":
826             if cstop > 0:
827                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
828             else:
829                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
830
831             if chromosome in spliceDict:
832                 for spliceEntry in spliceDict[chromosome]:
833                     Lstart = spliceEntry["startL"]
834                     Lstop = spliceEntry["stopL"]
835                     Rstart = spliceEntry["startR"]
836                     Rstop = spliceEntry["stopR"]
837                     rsense = spliceEntry["sense"]
838                     if (Rstop - cstart) < lastNT:
839                         for index in range(abs(Lstop - Lstart)):
840                             currentpos = Lstart - cstart + index
841                             # we only track unique splices
842                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
843                                 chromModel[currentpos] += scale
844                             elif rsense == "-" and keepStrand != "plusOnly":
845                                 chromModel[currentpos] -= scale
846
847                         for index in range(abs(Rstop - Rstart)):
848                             currentpos = Rstart - cstart + index
849                             # we only track unique splices
850                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
851                                 chromModel[currentpos] += scale
852                             elif rsense == "-" and keepStrand != "plusOnly":
853                                 chromModel[currentpos] -= scale
854
855             del spliceDict
856
857         return chromModel
858
859
860     def insertMetadata(self, valuesList):
861         for (pname, pvalue) in valuesList:
862             self.metadata[pname] = pvalue
863
864
865     def updateMetadata(self, pname, newValue, originalValue=""):
866         if originalValue != "":
867             if self.metadata[pname] == originalValue:
868                 self.metadata[pname] = newValue
869         else:
870             self.metadata[pname] = newValue
871
872
873     #TODO: Primary
874     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
875         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
876             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
877             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
878         """
879         restrict = ""
880         if sense != "both":
881             restrict = " and sense = ? "
882
883         if uniqs:
884             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
885
886         if multi:
887             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
888
889         if self.dataType == "RNA" and splices:
890             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
891             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
892
893         self.dbcon.commit()
894
895
896     #TODO: Primary
897     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
898         """ set the flag fields in the entire dataset.
899         """
900         if uniqs:
901             self.setFlagsInDB("uniqs", flag)
902
903         if multi:
904             self.setFlagsInDB("multi", flag)
905
906         if self.dataType == "RNA" and splices:
907             self.setFlagsInDB("splices", flag)
908
909         self.dbcon.commit()
910
911
912     #TODO: Secondary to setFlags()
913     def setFlagsInDB(self, table, flag):
914         """ set the flag field for every entry in a table.
915         """
916         self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
917
918
919     #TODO: Primary
920     def resetFlags(self, uniqs=True, multi=True, splices=True):
921         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
922         """
923         self.setFlags("", uniqs, multi, splices)
924
925
926     #TODO: Primary
927     def reweighMultireads(self, readList):
928         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
929
930
931     def setSynchronousPragma(self, value="ON"):
932         pass
933
934
935     def setDBcache(self, cache, default=False):
936         pass
937
938
939     def execute(self, statement, returnResults=False):
940         pass
941
942
943     def executeCommit(self, statement):
944         pass
945
946
947     def buildIndex(self, cache=100000):
948         pass
949
950
951     def dropIndex(self):
952         pass
953
954
955     def memSync(self, chrom="", index=False):
956         pass
957
958
959     def copyDBEntriesToMemory(self, dbName, whereClause=""):
960         pass
961
962
963     def copySpliceDBEntriesToMemory(self, whereClause=""):
964         pass
965
966
967 def getReadSense(reverse):
968     if reverse:
969         sense = "-"
970     else:
971         sense = "+"
972
973     return sense
974
975
976 def isSpliceEntry(cigarTupleList):
977     isSplice = False
978     for operation,length in cigarTupleList:
979         if operation == 3:
980             isSplice = True
981             break
982
983     return isSplice
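# e.g. isSpliceEntry([(0, 50), (3, 200), (0, 25)]) -> True, because CIGAR
# operation 3 (N, skipped region) marks a spliced alignment;
# isSpliceEntry([(0, 75)]) -> False.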
984
985
986 def getSpliceRightStart(start, cigarTupleList):
987     offset = 0
988
989     for operation,length in cigarTupleList:
990         if operation == 3:
991             stopL = int(start + offset)
992             startR = int(stopL + length)
993
994             return startR
995         else:
996             offset += length
997
998
999 def getSpliceBounds(start, readsize, cigarTupleList):
1000     offset = 0
1001     #TODO: check operations here - we want to make sure we are adding the correct ones
1002     for operation,length in cigarTupleList:
1003         if operation == 3:
1004             stopL = int(start + offset)
1005             startR = int(stopL + length)
1006             offset = 0
1007         else:
1008             offset += length
1009
1010     stopR = startR + offset
1011
1012     return start, startR, stopL, stopR
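# e.g. (illustrative): getSpliceBounds(1000, 75, [(0, 50), (3, 200), (0, 25)])
# returns (1000, 1250, 1050, 1275): the left block covers [1000, 1050), the
# 200 nt gap is skipped, and the right block covers [1250, 1275).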
1013
1014
1015 def getPairedReadNumberSuffix(read):
1016     readSuffix = ""
1017     if not isPairedRead(read):
1018         return ""
1019
1020     if read.is_read1:
1021         readSuffix = "/1"
1022     elif read.is_read2:
1023         readSuffix = "/2"
1024
1025     return readSuffix
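# e.g. read 1 of a proper pair -> "/1", read 2 -> "/2"; unpaired or
# non-proper-pair reads get an empty suffix.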
1026
1027
1028 def isPairedRead(read):
1029     return read.is_proper_pair and (read.is_read1 or read.is_read2)
1030
1031
1032 def getMismatches(mismatchTag, querySequence="", sense="+"):
1033     output = []
1034     deletionMarker = "^"
1035     position = 0
1036
1037     lengths = re.findall("\d+", mismatchTag)
1038     mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
1039
1040     for mismatchEntry in range(len(mismatchSequences)):
1041         mismatch = mismatchSequences[mismatchEntry]
1042         position = position + int(lengths[mismatchEntry])
1043         if string.find(mismatch, deletionMarker) == 0:
1044             continue
1045
1046         try:
1047             if querySequence:
1048                 genomicNucleotide = querySequence[position]
1049             else:
1050                 genomicNucleotide = "N"
1051
1052             if sense == "-":
1053                 mismatch = getReverseComplement(mismatch)
1054                 genomicNucleotide  = getReverseComplement(genomicNucleotide)
1055
1056             erange1BasedElandCompatiblePosition = int(position + 1)
1057             output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
1058             position += 1
1059         except IndexError:
1060             return ""
1061
1062     return string.join(output, ",")
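# e.g. (illustrative): an MD tag of "10A25" on a plus-strand read whose 11th base
# is "G" produces "A11G" - reference base, 1-based eland-style position, then the
# base observed in the read; deletion entries ("^...") are skipped.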