ReadDataset.py

   1 import string
   2 import tempfile
   3 import shutil
   4 import os
   5 import re
   6 import sys
   7 import pysam
   8 from array import array
   9 from commoncode import getReverseComplement, isSpliceEntry
  10
  11 currentRDSVersion = "3.0"
  12
  13 class ReadDatasetError(Exception):
  14     pass
  15
  16
  17 class ReadCounter():
  18     uniqReadCount = 0
  19     multiReadCount = 0.0
  20     spliceReadCount = 0
  21
  22     def __init__(self, multiReadIDCount={}, sense="both"):
  23         self.multiReadIDCount = multiReadIDCount
  24         self.sense = sense
  25
  26
  27     def __call__(self, read):
  28         multiplicity = self.getReadMultiplicity(read)
  29         if multiplicity > 1:
  30             self.multiReadCount += float(1.0/multiplicity)
  31         elif isSpliceEntry(read.cigar):
  32             self.spliceReadCount += 1
  33         else:
  34             self.uniqReadCount += 1
  35
  36
  37     def getReadMultiplicity(self, read):
  38         pairReadSuffix = getPairedReadNumberSuffix(read)
  39         readID = "%s%s" % (read.qname, pairReadSuffix)
  40         try:
  41             multiplicity = self.multiReadIDCount[readID]
  42         except KeyError:
  43             multiplicity = 1
  44
  45         return multiplicity
  46
  47
  48     def isMulti(self, read):
  49         pairReadSuffix = getPairedReadNumberSuffix(read)
  50         readID = "%s%s" % (read.qname, pairReadSuffix)
  51         return readID in self.multiReadIDCount
  52
  53
  54
  55 class MaxCoordFinder(ReadCounter):
  56     maxUniqueStart = 0
  57     maxMultiStart = 0
  58     maxSpliceStart = 0
  59
  60     def __call__(self, read):
  61         if self.isMulti(read):
  62             self.maxMultiStart = max(self.maxMultiStart, read.pos)
  63         elif isSpliceEntry(read.cigar):
  64             self.maxSpliceStart = max(self.maxMultiStart, getSpliceRightStart(read.pos, read.cigar))
  65         else:
  66             self.maxUniqueStart = max(self.maxMultiStart, read.pos)
  67
  68
  69
  70 class ReadDataset():
  71     """ Class for storing reads from experiments. Assumes that custom scripts
  72     will translate incoming data into a format that can be inserted into the
  73     class using the insert* methods. Default class subtype ('DNA') includes
  74     tables for unique and multireads, whereas 'RNA' subtype also includes a
  75     splices table.
  76     """
  77
  78
  79     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  80                  cache=False, reportCount=True):
  81         """ creates an rds datafile if initialize is set to true, otherwise
  82         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  83         """
  84
  85         if initialize and datasetType not in ["DNA", "RNA"]:
  86             raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  87
  88         self.dataType = ""
  89         self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
  90         self.bamfile = pysam.Samfile(datafile, "rb")
  91         self.readsize = None
  92         self.totalReadWeight = None
  93
  94         self.metadata = {"dataType": datasetType}
  95
  96         if initialize:
  97             self.dataType = datasetType
  98         else:
  99             self.dataType = self.metadata["dataType"]
 100
 101         try:
 102             self.rdsVersion = self.metadata["rdsVersion"]
 103         except KeyError:
 104             self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
 105
 106         self.fullReadCounts = self.getFullReadCount()
 107         if verbose:
 108             self.printRDSInfo(datafile, reportCount, initialize)
 109
 110
 111     def __len__(self):
 112         """ return the number of reads in the bam file
 113             This is not the same as the original as the original returned total weight.
 114         """
 115         count = 0
 116         references = self.bamfile.references
 117         for reference in references:
 118             count += self.bamfile.count(reference)
 119
 120         return count
 121
 122
 123     def getMultiReadIDCounts(self, samFileName, fileMode):
 124         try:
 125             samfile = pysam.Samfile(samFileName, fileMode)
 126         except ValueError:
 127             print "samfile index not found"
 128             sys.exit(1)
 129
 130         readIDCounts = {}
 131         for read in samfile.fetch(until_eof=True):
 132             pairReadSuffix = getPairedReadNumberSuffix(read)
 133             readName = "%s%s" % (read.qname, pairReadSuffix)
 134             try:
 135                 readIDCounts[readName] += 1.0
 136             except KeyError:
 137                 readIDCounts[readName] = 1.0
 138
 139         for readID in readIDCounts.keys():
 140             if int(readIDCounts[readID]) == 1:
 141                 del readIDCounts[readID]
 142
 143         return readIDCounts
 144
 145
 146     def totalReadWeight(self):
 147         """ return the total weight of usable reads in the bam file.
 148         """
 149         if self.totalReadWeight is None:
 150             total = self.fullReadCounts["uniq"]
 151             total += self.fullReadCounts["multi"]
 152
 153             if self.dataType == "RNA":
 154                 total += self.fullReadCounts["splice"]
 155
 156             self.totalReadWeight = int(total)
 157
 158         return self.totalReadWeight
 159
 160
 161     def __del__(self):
 162         """ cleanup copy in local cache, if present.
 163         """
 164         pass
 165
 166
 167     def printRDSInfo(self, datafile, reportCount, initialize):
 168         if initialize:
 169             print "INITIALIZED dataset %s" % datafile
 170         else:
 171             print "dataset %s" % datafile
 172
 173         print "metadata:"
 174         pnameList = self.metadata.keys()
 175         pnameList.sort()
 176         for pname in pnameList:
 177             print "\t" + pname + "\t" + str(self.metadata[pname])
 178
 179         if reportCount and not initialize:
 180             self.printReadCounts()
 181
 182         print "default cache size is %d pages" % self.getDefaultCacheSize()
 183
 184
 185     def printReadCounts(self):
 186         ucount = self.fullReadCounts["uniq"]
 187         mcount = self.fullReadCounts["multi"]
 188         if self.dataType == "DNA":
 189             print "\n%d unique reads and %d multireads" % (ucount, mcount)
 190         elif self.dataType == "RNA":
 191             scount = self.fullReadCounts["splice"]
 192             print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
 193
 194
 195     def cacheDB(self, filename):
 196         pass
 197
 198
 199     def saveCacheDB(self, filename):
 200         pass
 201
 202
 203     def uncacheDB(self):
 204         pass
 205
 206
 207     def attachDB(self, filename, dbName):
 208         pass
 209
 210
 211     def detachDB(self, dbName):
 212         pass
 213
 214
 215     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
 216         pass
 217
 218
 219     def getTables(self, dbName=""):
 220         pass
 221
 222
 223     def getSqlCursor(self):
 224         pass
 225
 226
 227     def getMemCursor(self):
 228         pass
 229
 230
 231     def getFileCursor(self):
 232         pass
 233
 234
 235     def initializeTables(self, dbConnection, cache=100000):
 236         pass
 237
 238
 239     def getMetadata(self, valueName=""):
 240         return self.metadata
 241
 242
 243     def getReadSize(self):
 244         """ This is following the same model as the original where it assumes that all
 245             read have the same readsize and that this is reflected by the size of the
 246             first read.
 247         """
 248         if self.readsize is None:
 249             bamFileIterator = self.bamfile.fetch(until_eof=True)
 250             for read in bamFileIterator:
 251                 self.calculateReadsizeFromCigar(read.cigar)
 252                 break
 253
 254         return self.readsize
 255
 256
 257     def calculateReadsizeFromCigar(self, cigar):
 258         take = (0, 1) # CIGAR operation (M/match, I/insertion)
 259         self.readsize = sum([length for op,length in cigar if op in take])
 260
 261
 262     def getDefaultCacheSize(self):
 263         """ returns 0 as cache is going to be deprecated
 264         """
 265         return 0
 266
 267
 268     def getChromosomes(self, fullChrom=True):
 269         """ returns a sorted list of distinct chromosomes in bam file reference list.
 270         """
 271         if fullChrom:
 272             results = list(self.getFullReferenceNames())
 273         else:
 274             results = list(self.getShortReferenceNames())
 275
 276         results.sort()
 277
 278         return results
 279
 280
 281     def getFullReferenceNames(self):
 282         """ returns a set of bam file reference names.
 283         """
 284         return set(self.bamfile.references)
 285
 286
 287     def getShortReferenceNames(self):
 288         """ returns a set of bam file reference names after removing first 3 characters.
 289         """
 290         results = set()
 291         references = self.bamfile.references
 292         for chromosome in references:
 293             shortName = chromosome[3:]
 294             if len(shortName.strip()) > 0:
 295                 results.add(shortName)
 296
 297         return results
 298
 299
 300     def getMaxCoordinate(self, chrom, doUniqs=True,
 301                          doMulti=False, doSplices=False):
 302         """ returns the maximum coordinate for reads on a given chromosome.
 303         """
 304         maxCoord = 0
 305         coordFinder = MaxCoordFinder()
 306         self.bamfile.fetch(reference=chrom, callback=coordFinder)
 307
 308         if doUniqs:
 309             maxCoord = coordFinder.maxUniqueStart
 310
 311         if doSplices:
 312             maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
 313
 314         if doMulti:
 315             maxCoord = max(coordFinder.maxMultiStart, maxCoord)
 316
 317         return maxCoord
 318
 319
 320     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 321                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 322                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 323                      readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
 324                      flagLike=False, strand='', combine5p=False):
 325         """ First pass of rewrite
 326             1) Leave as much in place as possible
 327             2) For now treat multis just like uniqs
 328         """
 329         """
 330         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
 331         if findallOptimize:
 332             selectQuery = "select start, sense, sum(weight)"
 333         else:
 334             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 335
 336         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
 337         if doUniqs:
 338             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 339             if doMulti:
 340                 stmt.append("UNION ALL")
 341                 stmt.append(selectQuery)
 342                 stmt.append("from multi")
 343                 stmt.append(whereQuery)
 344                 stmt.append(groupQuery)
 345         else:
 346             stmt = [selectQuery, "from multi", whereQuery]
 347
 348         if combine5p:
 349             if findallOptimize:
 350                 selectQuery = "select start, sense, weight, chrom"
 351
 352             if doUniqs:
 353                 subSelect = [selectQuery, "from uniqs", whereQuery]
 354                 if doMulti:
 355                     subSelect.append("union all")
 356                     subSelect.append(selectQuery)
 357                     subSelect.append("from multi")
 358                     subSelect.append(whereQuery)
 359             else:
 360                 subSelect = [selectQuery, "from multi", whereQuery]
 361
 362             sqlStmt = string.join(subSelect)
 363             if findallOptimize:
 364                 selectQuery = "select start, sense, sum(weight)"
 365
 366             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 367                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 368
 369         if findallOptimize:
 370             if self.memBacked:
 371                 self.memcon.row_factory = None
 372             else:
 373                 self.dbcon.row_factory = None
 374
 375             stmt.append("order by start")
 376         elif readIDDict:
 377             stmt.append("order by readID, start")
 378         else:
 379             stmt.append("order by chrom, start")
 380
 381         sql = self.getSqlCursor()
 382         sqlQuery = string.join(stmt)
 383         sql.execute(sqlQuery)
 384         """
 385
 386         # Edits are starting here - trying to ignore the entire sql construct
 387         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
 388         resultsDict = {}
 389         if findallOptimize and chrom != "":
 390             readIDDict = False
 391             bothEnds = False
 392             noSense = False
 393             withWeight = True
 394             withFlag = False
 395             withMismatch = False
 396             withID = False
 397             withChrom = False
 398             withPairID = False
 399             fullChrom = True
 400             #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
 401
 402         pairID = 0
 403         for read in bamFileIterator:
 404             pairReadSuffix = getPairedReadNumberSuffix(read)
 405             readID = "%s%s" % (read.qname, pairReadSuffix)
 406             if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
 407                 continue
 408
 409             sense = getReadSense(read.is_reverse)
 410             if fullChrom:
 411                 chrom = self.bamfile.getrname(read.rname)
 412             else:
 413                 chrom = self.bamfile.getrname(read.rname)[3:]
 414
 415             if readIDDict:
 416                 dictKey = read.qname
 417             else:
 418                 dictKey = chrom
 419
 420             newrow = {"start": int(read.pos)}
 421             if bothEnds:
 422                 if self.readsize is None:
 423                     self.calculateReadsizeFromCigar(read.cigar)
 424
 425                 newrow["stop"] = int(read.pos + self.readsize)
 426
 427             if not noSense:
 428                 newrow["sense"] = sense
 429
 430             if withWeight:
 431                 try:
 432                     newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
 433                 except KeyError:
 434                     newrow["weight"] = 1.0
 435
 436             if withFlag:
 437                 #newrow["flag"] = row["flag"]
 438                 pass
 439
 440             if withMismatch:
 441                 try:
 442                     mismatchTag = read.opt("MD")
 443                     mismatches = getMismatches(mismatchTag, read.seq, sense)
 444                 except KeyError:
 445                     mismatches = ""
 446
 447                 newrow["mismatch"] = mismatches
 448
 449             if withID:
 450                 newrow["readID"] = readID
 451
 452             if withChrom:
 453                 newrow["chrom"] = chrom
 454
 455             if withPairID:
 456                 newrow["pairID"] = pairReadSuffix[1:]
 457
 458             try:
 459                 resultsDict[dictKey].append(newrow)
 460             except KeyError:
 461                 resultsDict[dictKey] = [newrow]
 462
 463         return resultsDict
 464
 465
 466     def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
 467         readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
 468
 469         if flag != "":
 470             #TODO: need to pick a tag code for the erange flag - using ZF for now
 471             # although it looks like the flag is really just a label on a region.
 472             # since BAM can retrieve based on location if we store the regions then
 473             # we can just retrieve it directly from the BAM and we do not need to
 474             # flag the individual reads.  The location will be sufficient.
 475             try:
 476                 if flag != read.opt("ZF"):
 477                     readDoesNotMeetCriteria = True
 478             except KeyError:
 479                 readDoesNotMeetCriteria = True
 480
 481
 482         if hasMismatch:
 483             try:
 484                 mismatchTag = read.opt("MD")
 485             except KeyError:
 486                 readDoesNotMeetCriteria = True
 487
 488         if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
 489             readDoesNotMeetCriteria = True
 490
 491         return readDoesNotMeetCriteria
 492
 493
 494     def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
 495         rejectRead = False
 496         if readID in self.multiReadIDCounts.keys():
 497             if self.multiReadIDCounts[readID] > threshold or not doMulti:
 498                 rejectRead = True
 499
 500         return rejectRead
 501
 502
 503     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 504                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 505                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 506                        splitRead=False, hasMismatch=False, flagLike=False, start=None,
 507                        stop=None, strand=""):
 508         """ First pass of rewrite
 509             1) Leave as much in place as possible
 510             2) For now treat multis just like regular splices
 511         """
 512         """
 513         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
 514         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
 515         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
 516         sql = self.getSqlCursor()
 517
 518         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 519         sql.execute(stmt)
 520         """
 521
 522         # Edits are starting here - trying to ignore the entire sql construct
 523         bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
 524         resultsDict = {}
 525         pairID = 0
 526         for read in bamFileIterator:
 527             if not isSpliceEntry(read.cigar):
 528                 continue
 529
 530             sense = getReadSense(read.is_reverse)
 531             pairReadSuffix = getPairedReadNumberSuffix(read)
 532             readID = "%s%s" % (read.qname, pairReadSuffix)
 533             if fullChrom:
 534                 chrom = self.bamfile.getrname(read.rname)
 535             else:
 536                 chrom = self.bamfile.getrname(read.rname)[3:]
 537
 538             if readIDDict:
 539                 dictKey = read.qname
 540             else:
 541                 dictKey = chrom
 542
 543             if self.readsize is None:
 544                 self.calculateReadsizeFromCigar(read.cigar)
 545
 546             startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
 547             newrow = {"startL": int(startL)}
 548             newrow["stopL"] = int(stopL)
 549             newrow["startR"] = int(startR)
 550             newrow["stopR"] = int(stopR)
 551             if not noSense:
 552                 newrow["sense"] = sense
 553
 554             if withWeight:
 555                 newrow["weight"] = 1.0
 556
 557             if withFlag:
 558                 #newrow["flag"] = row["flag"]
 559                 pass
 560
 561             if withMismatch:
 562                 try:
 563                     mismatchTag = read.opt("MD")
 564                     mismatches = getMismatches(mismatchTag, read.seq, sense)
 565                 except KeyError:
 566                     mismatches = ""
 567
 568                 newrow["mismatch"] = mismatches
 569
 570             if withID:
 571                 newrow["readID"] = readID
 572
 573             if withChrom:
 574                 newrow["chrom"] = chrom
 575
 576             if withPairID:
 577                 newrow["pairID"] = pairID
 578
 579             if splitRead:
 580                 leftDict = newrow.copy()
 581                 del leftDict["startR"]
 582                 del leftDict["stopR"]
 583                 rightDict = newrow
 584                 del rightDict["startL"]
 585                 del rightDict["stopL"]
 586                 try:
 587                     resultsDict[dictKey].append(leftDict)
 588                 except KeyError:
 589                     resultsDict[dictKey] = [leftDict]
 590
 591                 resultsDict[dictKey].append(rightDict)
 592             else:
 593                 try:
 594                     resultsDict[dictKey].append(newrow)
 595                 except KeyError:
 596                     resultsDict[dictKey] = [newrow]
 597
 598         return resultsDict
 599
 600
 601     def getFullReadCount(self):
 602         fullReadCount = {"uniq": 0,
 603                          "multi": 0,
 604                          "splice": 0
 605         }
 606
 607         for chromosome in self.getChromosomes():
 608             uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
 609             fullReadCount["uniq"] += uniqCount
 610             fullReadCount["multi"] += multiCount
 611             fullReadCount["splice"] += spliceCount
 612
 613         return fullReadCount
 614
 615
 616     def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
 617                   splices=False, reportCombined=True, sense="both"):
 618         """ return read counts for a given region.
 619         """
 620         ucount = 0
 621         mcount = 0
 622         scount = 0
 623         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
 624
 625         if uniqs:
 626             ucount = readCounter.uniqReadCount
 627
 628         if multi:
 629             mcount = readCounter.multiReadCount
 630
 631         if splices:
 632             scount = readCounter.spliceReadCount
 633
 634         if reportCombined:
 635             total = ucount + mcount + scount
 636             return total
 637         else:
 638             return (ucount, mcount, scount)
 639
 640
 641     def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
 642         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
 643
 644         return readCounter.spliceReadCount
 645
 646
 647     def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
 648         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
 649
 650         return readCounter.uniqReadCount
 651
 652
 653     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 654         readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
 655
 656         return readCounter.multiReadCount
 657
 658
 659     def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
 660         readCounter = ReadCounter(self.multiReadIDCounts, sense)
 661         self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
 662
 663         return readCounter
 664
 665
 666     #TODO: Primary
 667     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 668         """ get readID's.
 669         """
 670         stmt = []
 671         if uniqs:
 672             stmt.append("select readID from uniqs")
 673
 674         if multi:
 675             stmt.append("select readID from multi")
 676
 677         if splices:
 678             stmt.append("select readID from splices")
 679
 680         if len(stmt) > 0:
 681             selectPart = string.join(stmt, " union ")
 682         else:
 683             selectPart = ""
 684
 685         limitPart = ""
 686         if limit > 0:
 687             limitPart = "LIMIT %d" % limit
 688
 689         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 690         sql = self.getSqlCursor()
 691         sql.execute(sqlQuery)
 692         result = sql.fetchall()
 693
 694         if paired:
 695             return [x[0].split("/")[0] for x in result]
 696         else:
 697             return [x[0] for x in result]
 698
 699
 700     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 701         """ returns the uniq and spliced mismatches in a dictionary.
 702         """
 703         readlen = self.getReadSize()
 704         if mischrom:
 705             hitChromList = [mischrom]
 706         else:
 707             hitChromList = self.getChromosomes()
 708             hitChromList.sort()
 709
 710         snpDict = {}
 711         for chromosome in hitChromList:
 712             if verbose:
 713                 print "getting mismatches from chromosome %s" % (chromosome)
 714
 715             snpDict[chromosome] = []
 716             if useSplices and self.dataType == "RNA":
 717                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
 718                 spliceIDList = spliceDict.keys()
 719                 for spliceID in spliceIDList:
 720                     spliceEntry = spliceDict[spliceID][0]
 721                     startpos = spliceEntry["startL"]
 722                     lefthalf = spliceEntry["stopL"]
 723                     rightstart = spliceEntry["startR"]
 724                     sense = spliceEntry["sense"]
 725                     mismatches = spliceEntry["mismatch"]
 726                     spMismatchList = mismatches.split(",")
 727                     for mismatch in spMismatchList:
 728                         if "N" in mismatch:
 729                             continue
 730
 731                         change_len = len(mismatch)
 732                         if sense == "+":
 733                             change_from = mismatch[0]
 734                             change_base = mismatch[change_len-1]
 735                             change_pos = int(mismatch[1:change_len-1])
 736                         elif sense == "-":
 737                             change_from = getReverseComplement(mismatch[0])
 738                             change_base = getReverseComplement(mismatch[change_len-1])
 739                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 740
 741                         firsthalf = int(lefthalf)-int(startpos)+1
 742                         secondhalf = 0
 743                         if int(change_pos) <= int(firsthalf):
 744                             change_at = startpos + change_pos - 1
 745                         else:
 746                             secondhalf = change_pos - firsthalf
 747                             change_at = rightstart + secondhalf
 748
 749                         snpDict[chromosome].append([startpos, change_at, change_base, change_from])
 750
 751             hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
 752             if chromosome not in hitDict.keys():
 753                 continue
 754
 755             for readEntry in hitDict[chromosome]:
 756                 start = readEntry["start"]
 757                 sense = readEntry["sense"]
 758                 mismatches = readEntry["mismatch"]
 759                 mismatchList = mismatches.split(",")
 760                 for mismatch in mismatchList:
 761                     if "N" in mismatch:
 762                         continue
 763
 764                     change_len = len(mismatch)
 765                     if sense == "+":
 766                         change_from = mismatch[0]
 767                         change_base = mismatch[change_len-1]
 768                         change_pos = int(mismatch[1:change_len-1])
 769                     elif sense == "-":
 770                         change_from = getReverseComplement(mismatch[0])
 771                         change_base = getReverseComplement(mismatch[change_len-1])
 772                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 773
 774                     change_at = start + change_pos - 1
 775                     snpDict[chromosome].append([start, change_at, change_base, change_from])
 776
 777         return snpDict
 778
 779
 780     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 781                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
 782                         keepStrand="both", shiftValue=0):
 783         """return a profile of the chromosome as an array of per-base read coverage....
 784             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 785             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 786         """
 787         metadata = self.getMetadata()
 788         try:
 789             readlen = int(metadata["readsize"])
 790         except KeyError:
 791             readlen = 0
 792
 793         dataType = metadata["dataType"]
 794         scale = 1. / normalizationFactor
 795         shift = {}
 796         shift["+"] = int(shiftValue)
 797         shift["-"] = -1 * int(shiftValue)
 798
 799         if cstop > 0:
 800             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 801         else:
 802             lastNT = cstop - cstart + readlen + shift["+"]
 803
 804         chromModel = array("f",[0.] * lastNT)
 805         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 806         if cstart < 0:
 807             cstart = 0
 808
 809         for readEntry in hitDict[chromosome]:
 810             hstart = readEntry["start"]
 811             sense =  readEntry ["sense"]
 812             weight = readEntry["weight"]
 813             hstart = hstart - cstart + shift[sense]
 814             for currentpos in range(hstart,hstart+readlen):
 815                 try:
 816                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 817                         chromModel[currentpos] += scale * weight
 818                     elif sense == "-" and keepStrand != "plusOnly":
 819                         chromModel[currentpos] -= scale * weight
 820                 except:
 821                     continue
 822
 823         del hitDict
 824         if useSplices and dataType == "RNA":
 825             if cstop > 0:
 826                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 827             else:
 828                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 829
 830             if chromosome in spliceDict:
 831                 for spliceEntry in spliceDict[chromosome]:
 832                     Lstart = spliceEntry["startL"]
 833                     Lstop = spliceEntry["stopL"]
 834                     Rstart = spliceEntry["startR"]
 835                     Rstop = spliceEntry["stopR"]
 836                     rsense = spliceEntry["sense"]
 837                     if (Rstop - cstart) < lastNT:
 838                         for index in range(abs(Lstop - Lstart)):
 839                             currentpos = Lstart - cstart + index
 840                             # we only track unique splices
 841                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 842                                 chromModel[currentpos] += scale
 843                             elif rsense == "-" and keepStrand != "plusOnly":
 844                                 chromModel[currentpos] -= scale
 845
 846                         for index in range(abs(Rstop - Rstart)):
 847                             currentpos = Rstart - cstart + index
 848                             # we only track unique splices
 849                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 850                                 chromModel[currentpos] += scale
 851                             elif rsense == "-" and keepStrand != "plusOnly":
 852                                 chromModel[currentpos] -= scale
 853
 854             del spliceDict
 855
 856         return chromModel
 857
 858
 859     def insertMetadata(self, valuesList):
 860         for (pname, pvalue) in valuesList:
 861             self.metadata[pname] = pvalue
 862
 863
 864     def updateMetadata(self, pname, newValue, originalValue=""):
 865         if originalValue != "":
 866             if self.metadata[pname] == originalValue:
 867                 self.metadata[pname] = newValue
 868         else:
 869             self.metadata[pname] = newValue
 870
 871
 872     #TODO: Primary
 873     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
 874         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
 875             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
 876             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
 877         """
 878         restrict = ""
 879         if sense != "both":
 880             restrict = " and sense = ? "
 881
 882         if uniqs:
 883             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
 884
 885         if multi:
 886             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
 887
 888         if self.dataType == "RNA" and splices:
 889             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
 890             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
 891
 892         self.dbcon.commit()
 893
 894
 895     #TODO: Primary
 896     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
 897         """ set the flag fields in the entire dataset.
 898         """
 899         if uniqs:
 900             self.setFlagsInDB("uniqs", flag)
 901
 902         if multi:
 903             self.setFlagsInDB("multi", flag)
 904
 905         if self.dataType == "RNA" and splices:
 906             self.setFlagsInDB("splices", flag)
 907
 908         self.dbcon.commit()
 909
 910
 911     #TODO: Secondary to setFlags()
 912     def setFlagsInDB(self, table, flag):
 913         """ set the flag field for every entry in a table.
 914         """
 915         self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
 916
 917
 918     #TODO: Primary
 919     def resetFlags(self, uniqs=True, multi=True, splices=True):
 920         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
 921         """
 922         self.setFlags("", uniqs, multi, splices)
 923
 924
 925     #TODO: Primary
 926     def reweighMultireads(self, readList):
 927         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
 928
 929
 930     def setSynchronousPragma(self, value="ON"):
 931         pass
 932
 933
 934     def setDBcache(self, cache, default=False):
 935         pass
 936
 937
 938     def execute(self, statement, returnResults=False):
 939         pass
 940
 941
 942     def executeCommit(self, statement):
 943         pass
 944
 945
 946     def buildIndex(self, cache=100000):
 947         pass
 948
 949
 950     def dropIndex(self):
 951         pass
 952
 953
 954     def memSync(self, chrom="", index=False):
 955         pass
 956
 957
 958     def copyDBEntriesToMemory(self, dbName, whereClause=""):
 959         pass
 960
 961
 962     def copySpliceDBEntriesToMemory(self, whereClause=""):
 963         pass
 964
 965
 966 def getReadSense(reverse):
 967     if reverse:
 968         sense = "-"
 969     else:
 970         sense = "+"
 971
 972     return sense
 973
 974
 975 def getSpliceRightStart(start, cigarTupleList):
 976     offset = 0
 977
 978     for operation,length in cigarTupleList:
 979         if operation == 3:
 980             stopL = int(start + offset)
 981             startR = int(stopL + length)
 982
 983             return startR
 984         else:
 985             offset += length
 986
 987
 988 def getSpliceBounds(start, readsize, cigarTupleList):
 989     offset = 0
 990     #TODO: check operations here - we want to make sure we are adding the correct ones
 991     for operation,length in cigarTupleList:
 992         if operation == 3:
 993             stopL = int(start + offset)
 994             startR = int(stopL + length)
 995             offset = 0
 996         else:
 997             offset += length
 998
 999     stopR = startR + offset
1000
1001     return start, startR, stopL, stopR
1002
1003
1004 def getPairedReadNumberSuffix(read):
1005     readSuffix = ""
1006     if not isPairedRead(read):
1007         return ""
1008
1009     if read.is_read1:
1010         readSuffix = "/1"
1011     elif read.is_read2:
1012         readSuffix = "/2"
1013
1014     return readSuffix
1015
1016
1017 def isPairedRead(read):
1018     return read.is_proper_pair and (read.is_read1 or read.is_read2)
1019
1020
1021 def getMismatches(mismatchTag, querySequence="", sense="+"):
1022     output = []
1023     deletionMarker = "^"
1024     position = 0
1025
1026     lengths = re.findall("\d+", mismatchTag)
1027     mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
1028
1029     for mismatchEntry in range(len(mismatchSequences)):
1030         mismatch = mismatchSequences[mismatchEntry]
1031         position = position + int(lengths[mismatchEntry])
1032         if string.find(mismatch, deletionMarker) == 0:
1033             continue
1034
1035         try:
1036             if querySequence:
1037                 genomicNucleotide = querySequence[position]
1038             else:
1039                 genomicNucleotide = "N"
1040
1041             if sense == "-":
1042                 mismatch = getReverseComplement(mismatch)
1043                 genomicNucleotide  = getReverseComplement(genomicNucleotide)
1044
1045             erange1BasedElandCompatiblePosition = int(position + 1)
1046             output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
1047             position += 1
1048         except IndexError:
1049             return ""
1050
1051     return string.join(output, ",")