convert standard analysis pipelines to use bam format natively
author	Sean Upchurch <sau@caltech.edu>	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
committer	Sean Upchurch <sau@caltech.edu>	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
27 files changed:
ReadDataset.py
Region.py
bamPreprocessing.py [new file with mode: 0644]
checkrmask.py
combineRPKMs.py
combinerds.py
commoncode.py
docs/README.chip-seq
docs/RNA-seq.analysisSteps.txt
docs/runRNAPairedAnalysis.sh
docs/runSNPAnalysis.sh
docs/runStandardAnalysis.sh
docs/runStrandedAnalysis.sh
findall.py
geneLocusBins.py
geneMrnaCounts.py
geneMrnaCountsWeighted.py
geneStartBins.py
getsplicefa.py
normalizeExpandedExonic.py
regionCounts.py
rnafarPairs.py
runRNAPairedAnalysis.py
runStandardAnalysis.py
runStrandedAnalysis.py
siteintersects.py
weighMultireads.py

index 9a795c8cde5c5ba6d257da963ebf4c32f7d6e42d..dffa76de9436f4379f42ba2175bec17fc1a8fa59 100644 (file)
@@ -6,11 +6,10 @@ import re
 import sys
 import pysam
 from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, isSpliceEntry
 
 currentRDSVersion = "3.0"
 
 
-
 class ReadDatasetError(Exception):
     pass
@@ -68,7 +67,7 @@ class MaxCoordFinder(ReadCounter):
 
 
 
-class BamReadDataset():
+class ReadDataset():
     """ Class for storing reads from experiments. Assumes that custom scripts
     will translate incoming data into a format that can be inserted into the
     class using the insert* methods. Default class subtype ('DNA') includes
@@ -454,7 +453,7 @@ class BamReadDataset():
                 newrow["chrom"] = chrom
 
             if withPairID:
-                newrow["pairID"] = pairID
+                newrow["pairID"] = pairReadSuffix[1:]
 
             try:
                 resultsDict[dictKey].append(newrow)
@@ -973,16 +972,6 @@ def getReadSense(reverse):
     return sense
 
 
-def isSpliceEntry(cigarTupleList):
-    isSplice = False
-    for operation,length in cigarTupleList:
-        if operation == 3:
-            isSplice = True
-            break
-
-    return isSplice
-
-
 def getSpliceRightStart(start, cigarTupleList):
     offset = 0
 
index 0d0a15b1cd91e7f3ceccee518f448ffccf2daad4..39ba811bc1d94100ed63516467e0a99fb2f637a0 100644 (file)
--- a/Region.py
+++ b/Region.py
@@ -6,7 +6,7 @@ class Region(object):
     """
 
 
     """
 
 
-    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0., foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
         self.label = label
         self.index = index
         self.chrom = chrom
         self.label = label
         self.index = index
         self.chrom = chrom
diff --git a/bamPreprocessing.py b/bamPreprocessing.py
new file mode 100644 (file)
index 0000000..dd5e861
--- /dev/null
@@ -0,0 +1,193 @@
+'''
+User specs:
+1. Checks if there are NH tags present
+2. If there aren't, it goes through the BAM file and calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written, but this time with the NH tags
+4. Then deletes the original file and replaces it with the temp file / renames the temp file to have the name of the original file
+5. Also writes the read counts into the header and checks for their presence; if they are present, use them, if not, replace
+the file with one that has that information in the same manner.  This saves having to count all the reads every time.
+The counts are several numbers - total reads, unique reads, multi reads, and reads mapping on the plus and minus strands.
+
+The only place to have the gene flag is to attach it to the individual reads.  We'll use the 'ZG' tag, and it is only
+populated when --markGID is given.  Each read is tagged with the feature label, or 'NM' if it is not in a feature.
+'''
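+# Example invocations (hypothetical file names; the input BAM must be indexed,
+# e.g. with 'samtools index sample.bam'):
+#
+#     python bamPreprocessing.py sample.bam sample.processed.bam
+#
+# or, to also tag reads with gene IDs from a cistematic genome:
+#
+#     python bamPreprocessing.py sample.bam sample.processed.bam --markGID --genome <genomeName>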
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+
+    usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+    if len(sys.argv) < 3:
+        print usage
+        print '       BAM file has to be indexed'
+        sys.exit(1)
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    BAM = args[0]
+    outputfilename = args[1]
+
+    samfile = pysam.Samfile(BAM, "rb")
+    readMultiplicityDict = getReadMultiplicity(samfile)
+
+    counts = getReadCounts(samfile, readMultiplicityDict)
+    outheader = buildHeader(samfile.header, counts)
+    if options.markGID:
+        genome = Genome(options.genomeName, inRAM=True)
+        if options.extendGenome != "":
+            genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+        print "getting gene features...."
+        featuresByChromDict = getFeaturesByChromDict(genome)    
+        outheader['CO'].append(options.genomeComment)
+
+    outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+    for alignedread in samfile.fetch(until_eof=True):
+        geneModelFlag = ""
+        if options.markGID:
+            chrom = samfile.getrname(alignedread.tid)
+            chromNum = chrom[3:]  # cistematic feature dicts are keyed without the "chr" prefix
+            geneModelFlag = "NM"
+            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
+                if start < alignedread.pos and stop > alignedread.pos:
+                    geneModelFlag = gid
+                    break
+
+        ID = getReadID(alignedread)
+        try:
+            alignedread.opt('NH')
+            # NH tag already present - write the alignment through unchanged
+            outfile.write(alignedread)
+        except KeyError:
+            scaleby = readMultiplicityDict[ID]
+            writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+    outfile.close()
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--genome", dest="genomeName")
+    parser.add_option("--genomeName", dest="genomeComment")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID=False)
+
+    return parser
+
+
+def getReadMultiplicity(samfile):
+    """ There is an assumption here that the set of all reads will not have a mixture of
+        reads bearing the NH tag and reads without.
+    """
+    readMultiplicityDict = {}
+    processedReads = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        try:
+            alignedread.opt('NH')
+            # NH tags are already present - no multiplicity map is needed
+            return {}
+        except KeyError:
+            ID = getReadID(alignedread)
+            try:
+                readMultiplicityDict[ID] += 1
+            except KeyError:
+                readMultiplicityDict[ID] = 1
+
+        processedReads += 1
+        if processedReads % 5000000 == 0:
+            print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+    return readMultiplicityDict
+
+
+def getReadCounts(samfile, readMultiplicityDict):
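+    """ Tally read counts by category for the output header comments.
+        Multireads contribute 1/NH to their totals, so the multi counts
+        are fractional and are truncated when formatted with %d below.
+    """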
+    uniques = 0
+    multis = 0.0
+    minusUnique = 0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    minusMulti = 0.0
+    readLength = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        if readLength == 0:
+            readLength = getReadLength(alignedread.cigar)
+
+        try:
+            readMultiplicity = alignedread.opt('NH')
+        except KeyError:
+            ID = getReadID(alignedread)
+            readMultiplicity = readMultiplicityDict[ID]
+
+        if readMultiplicity == 1:
+            if alignedread.is_reverse:
+                minusUnique += 1
+
+            if isSpliceEntry(alignedread.cigar):
+                uniqueSplice += 1
+            else:
+                uniques += 1
+        else:
+            if alignedread.is_reverse:
+                minusMulti += 1.0/readMultiplicity
+
+            if isSpliceEntry(alignedread.cigar):
+                multiSplice += 1.0/readMultiplicity
+            else:
+                multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+    plusUnique = uniques + uniqueSplice - minusUnique
+    plusMulti = multis + multiSplice - minusMulti
+    counts = ["Total\t%d" % totalReads,
+              "Unique\t%d" % uniques,
+              "Multis\t%d" % multis,
+              "UniqueSplices\t%d" % uniqueSplice,
+              "Multisplices\t%d" % multiSplice,
+              "PlusUnique\t%d" % plusUnique,
+              "PlusMulti\t%d" % plusMulti,
+              "MinusUnique\t%d" % minusUnique,
+              "MinusMulti\t%d" % minusMulti,
+              "ReadLength\t%d" % readLength
+    ]
+
+    return counts
+
+
+def buildHeader(templateheader, commentList):
+    header = templateheader
+    header["CO"] = commentList
+
+    return header
+
+
+def getReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getReadLength(cigar):
+    take = (0, 1) # CIGAR operation (M/match, I/insertion)
+
+    return sum([length for op,length in cigar if op in take])
+
+
+def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
+    newAlignedRead = originalRead
+    newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity)]
+    if geneModelFlag:
+        newAlignedRead.tags = newAlignedRead.tags + [("ZG", geneModelFlag)]
+
+    outfile.write(newAlignedRead)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
index 457e4c1f9ebedb7b4daf6137f9f8b850635afb28..8ae30ee958dd10fc0d7e598b7bb25fa4b4ed17c2 100755 (executable)
@@ -55,6 +55,27 @@ def makeParser(usage=""):
 
 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
 
+    if os.path.isfile(dbfile):
+        checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
+    else:
+        outfile = open(outFileName, "w")
+        goodfile = open(goodFileName, "w")
+        infile = open(filename)
+        print "No database - passing through"
+        if logfilename is not None:
+            writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+            writeLog(logfilename, versionString, "No database - passing through")
+
+        for line in infile:
+            outfile.write("%s\tNR\tNR\t0.00\n" % line)
+            goodfile.write(line)
+
+        outfile.close()
+        goodfile.close()
+
+
+def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+
     outfile = open(outFileName, "w")
     goodfile = open(goodFileName, "w")
     if startField < 0:
@@ -63,29 +84,14 @@ def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cacheP
     if cachePages < 250000:
         cachePages = 250000
 
-    doLog = False
     if logfilename is not None:
         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-        doLog = True
 
     infile = open(filename)
-    if os.path.isfile(dbfile):
-        db = sqlite.connect(dbfile)
-        sql = db.cursor()
-        sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
-        sql.execute("PRAGMA temp_store = MEMORY")
-    else:
-        print "No database - passing through"
-        if doLog:
-            writeLog(logfilename, versionString, "No database - passing through")
-
-        for line in infile:
-            outfile.write("%s\tNR\tNR\t0.00\n" % line)
-            goodfile.write(line)
-
-        outfile.close()
-        goodfile.close()
-        sys.exit(0)
+    db = sqlite.connect(dbfile)
+    sql = db.cursor()
+    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    sql.execute("PRAGMA temp_store = MEMORY")
 
     featureList = []
     featureDict = {}
index 88b57ab547edeea3f9be5387780796aadbe72bc2..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
@@ -63,7 +63,7 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.write(header)
 
     finalfile = open(finalfileName)
-    #TODO: QForAli - the output lines are driven by finalfile.  If there are genes in the first 2 that
+    #TODO: the output lines are driven by finalfile.  If there are genes in the first 2 that
     #      are not in the finalfile then they will be lost.
     for line in finalfile:
         fields = line.strip().split()
index 2695e6b512c677735fc61be2931108e947d0ab43..d6960c2783908d67db405315b797aab3db5a02e9 100755 (executable)
@@ -21,7 +21,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
@@ -98,9 +98,7 @@ def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, c
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
             dbColumns = "*"
-            if table == "uniqs":
-                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
-            elif table == "multi":
+            if table in ["uniqs", "multi"]:
                 dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
                 dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
index cd449620f0fc88304f4e8adceee0e96e37e1295d..0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55 100755 (executable)
@@ -10,8 +10,8 @@ from cistematic.genomes import Genome
 import Region
 
 commoncodeVersion = 5.6
-currentRDSversion = 2.0
 
 
+#TODO: This is a function dumping ground - needs to be reworked
 class ErangeError(Exception):
     pass
 
@@ -49,6 +49,78 @@ def writeLog(logFile, messenger, message):
     logfile.close()
 
 
+def getChromInfoList(chrominfo):
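+    """ Read a tab-delimited chromosome info file (one name<TAB>length entry
+        per line, e.g. a UCSC chrom.sizes file) and return a list of
+        (chrom, start, end) tuples with start fixed at 0.
+    """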
+    chromInfoList = []
+    linelist = open(chrominfo)
+    for line in linelist:
+        fields = line.strip().split('\t')
+        chr = fields[0]
+        start = 0
+        end = int(fields[1])
+        chromInfoList.append((chr,start,end))
+
+    linelist.close()
+
+    return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
+    isSplice = False
+    if cigarTupleList is not None:
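+        # CIGAR operation 3 ('N') is a skipped reference region, i.e. a splice junction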
+        for operation,length in cigarTupleList:
+            if operation == 3:
+                isSplice = True
+                break
+
+    return isSplice
+
+
+def addPairIDtoReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
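+    """ Return the value from a key<TAB>value comment line in the BAM header,
+        e.g. the read counts ("Total", "Unique", "ReadLength", ...) written
+        by bamPreprocessing.py.
+    """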
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
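+    """ Decide whether a read should be counted given the strand restriction
+        ('+' or '-'; anything else counts both strands) and whether unique
+        (NH == 1), multi (NH > 1), and spliced reads are wanted.
+    """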
+    countRead = True
+
+    if sense in ['-', '+'] and sense != getReadSense(read):
+        countRead = False
+    elif not countSplices and isSpliceEntry(read.cigar):
+        countRead = False
+    elif not countUniqs and read.opt('NH') == 1:
+        countRead = False
+    elif not countMulti and read.opt('NH') > 1:
+        countRead = False
+
+    return countRead
+
+
+# Cistematic functions
 def getGeneInfoDict(genome, cache=False):
     idb = geneinfoDB(cache=cache)
     if genome == "dmelanogaster":
@@ -73,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA
     return geneannotDict
 
 
+# Configuration File Support
 def getConfigParser(fileList=[]):
     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
     for filename in fileList:
@@ -129,6 +202,7 @@ def getAllConfigSectionOptions(parser, section):
     return setting
 
 
+# All the Other Stuff from before
 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
                      doMerge=True, keepPeak=False, returnTop=0):
@@ -192,7 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
     #      be merged with A but that will exit the loop and the merge with C will be missed.
-    #TODO: QForAli - Are we going to require sorted region files to have this even work?
+    #TODO: Are we going to require sorted region files to have this even work?
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -341,10 +415,6 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 
     # implementing a triangular smooth
     smoothArray = array("f", [0.] * length)
-    #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
-    # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
-    # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
-    # edge cases?
     for pos in range(2,length -2):
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
index 846a441a434f8e3cff4506108f27126855a2fb47..ea7b0a347661d59a40e838cd666156f9c9b8bfe4 100644 (file)
@@ -57,18 +57,24 @@ options are case sensitive and that they could well
 fail silently.
 
 
-3. MAKING THE NECESSARY INPUT (RDS) FILES
-
-You will want to first convert your read mappings to the 
-native ERANGE read store. Please see the file 
-README.build-rds for instructions on how to do this.
-
-Build an RDS file for both the ChIP, and if available and 
-appropriate, the control. Note that we *HIGHLY* recommend 
-the use of a matched control sample to account for some 
-of the general background artifacts that can be present 
-in ChIP-seq samples (e.g. DNAse hypersensitivity, 
-assembly collapse of some sattelite repeats, etc....). 
+3. MAKING THE NECESSARY INPUT FILES
+
+Erange uses BAM format files, but there are a couple of
+modifications that need to be made to the header and
+individual entries.  The python script bamPreprocessing.py
+will do the following:
+1. Count the reads by type and write these counts to the
+header as comments.
+2. Verify that every read has a value in the NH tag or add
+it if needed.
+3. Optionally annotate the reads with the geneID using
+the ZG tag (see the example below).
+
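+For example (a hypothetical invocation - file names are
+placeholders, and the input BAM must be indexed first,
+e.g. with samtools index):
+
+python $ERANGEPATH/bamPreprocessing.py chip.bam chip.processed.bam
+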
+Note that we *HIGHLY* recommend the use of a matched
+control sample to account for some of the general
+background artifacts that can be present in ChIP-seq
+samples (e.g. DNAse hypersensitivity, assembly collapse
+of some satellite repeats, etc....).
 
 
 4. WEIGHING MULTIREADS
index b0a1c0f09e270ad2cc47c9af05c84564b5325aaf..96ea795586a252968d1d214f53dfa4a9cdec46fc 100644 (file)
@@ -1,5 +1,5 @@
 # analysis steps for an ERANGE analysis of RNA-seq data
-# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.py
 
 # preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
 #              export PYTHONPATH=/my/path/to/cistematic
index cf75b6a5c9047ebb94332f26e8af234b68f3e382..bd8290127d305d6a184df30deeecffe815043a07 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runRNAPairedAnalysis.sh
 # ENRAGE
 #
index 103b576e73ac450c0af315645abfca41d7abbb40..51f2e4ccd8bc9ad166cd8f7a5609e6648413c929 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runSNPAnalysis.sh
 #
 # Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
@@ -59,4 +61,4 @@ python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt
 
 # make bed file for displaying the snps on UCSC genome browser
 python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
-fi
\ No newline at end of file
+fi
index aa5fe60588a30ceba6a65301db81b13ac4f6051f..830abea6ecec73a603597d0989e49763cc5f67bd 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStandardAnalysis.sh
 # ENRAGE
 #
@@ -88,4 +90,4 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi
 echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
 python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
 
-fi
\ No newline at end of file
+fi
index 44459019f601732e67f652eed96becc38d0d920d..ee9d1151d91dcb97e3c532a62afdf002563b120c 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStrandedAnalysis.sh
 # ENRAGE
 #
index 9dd85df733f86417debf9b060e53f59230110b76..f8907a66c2db6d3b36a4ac9a13872554c1f1d9ca 100755 (executable)
@@ -1,6 +1,6 @@
 """
 """
-    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
-           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+    usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+           [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
@@ -50,8 +50,8 @@ import math
 import string
 import optparse
 import operator
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
 import Region
 
 
@@ -65,7 +65,7 @@ class RegionDirectionError(Exception):
 class RegionFinder():
     def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
                  minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
-                 normalize=True, listPeak=False, reportshift=False, stringency=1.0):
+                 normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
 
         self.statistics = {"index": 0,
                            "total": 0,
@@ -77,8 +77,8 @@ class RegionFinder():
 
         self.regionLabel = label
         self.rnaSettings = False
-        self.controlRDSsize = 1
-        self.sampleRDSsize = 1
+        self.controlRDSsize = 1.0
+        self.sampleRDSsize = 1.0
         self.minRatio = minRatio
         self.minPeak = minPeak
         self.leftPlusRatio = leftPlusRatio
@@ -115,6 +115,10 @@ class RegionFinder():
         self.listPeak = listPeak
         self.reportshift = reportshift
         self.stringency = max(stringency, 1.0)
+        self.controlfile = controlfile
+        self.doControl = self.controlfile is not None
+        self.doPvalue = False
+        self.doRevBackground = doRevBackground
 
 
     def useRNASettings(self, readlen):
 
         self.maxSpacing = readlen
 
 
         self.maxSpacing = readlen
 
 
-    def getHeader(self, doPvalue):
+    def getHeader(self):
         if self.normalize:
             countType = "RPM"
         else:
@@ -142,33 +146,33 @@ class RegionFinder():
         if self.reportshift:
             headerFields.append("readShift")
 
-        if doPvalue:
+        if self.doPvalue:
             headerFields.append("pValue")
 
         return string.join(headerFields, "\t")
 
 
-    def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType):
+    def printSettings(self, ptype, useMulti, pValueType):
         print
-        self.printStatusMessages(doRevBackground, ptype, doControl, useMulti)
-        self.printOptionsSummary(useMulti, doCache, pValueType)
+        self.printStatusMessages(ptype, useMulti)
+        self.printOptionsSummary(useMulti, pValueType)
 
 
-    def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti):
+    def printStatusMessages(self, ptype, useMulti):
         if self.shiftValue == "learn":
             print "Will try to learn shift"
 
         if self.normalize:
             print "Normalizing to RPM"
 
-        if doRevBackground:
+        if self.doRevBackground:
             print "Swapping IP and background to calculate FDR"
 
         if ptype != "":
             if ptype in ["NONE", "SELF"]:
                 pass
             elif ptype == "BACK":
-                if doControl and doRevBackground:
+                if self.doControl and self.doRevBackground:
                     pass
                 else:
                     print "must have a control dataset and -revbackground for pValue type 'back'"
@@ -190,9 +194,9 @@ class RegionFinder():
             print "only analyzing reads on the minus strand"
 
 
-    def printOptionsSummary(self, useMulti, doCache, pValueType):
+    def printOptionsSummary(self, useMulti, pValueType):
 
-        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache)
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
         print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
         try:
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
@@ -200,18 +204,18 @@ class RegionFinder():
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
 
 
-    def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl):
+    def getAnalysisDescription(self, hitfile, useMulti, pValueType):
 
         description = ["#ERANGE %s" % versionString]
-        if doControl:
-            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        if self.doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
         else:
             description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
 
         if self.withFlag != "":
             description.append("#restrict to Flag = %s" % self.withFlag)
 
-        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache))
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
         description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
         try:
             description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
@@ -221,6 +225,31 @@ class RegionFinder():
         return string.join(description, "\n")
 
 
+    def getFooter(self, bestShift):
+        index = self.statistics["index"]
+        mIndex = self.statistics["mIndex"]
+        footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+        if self.doDirectionality:
+            footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+        if self.doRevBackground:
+            try:
+                percent = min(100. * (float(mIndex)/index), 100.)
+            except ZeroDivisionError:
+                percent = 0.
+
+            footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+        if self.shiftValue == "auto" and self.reportshift:
+            footerLines.append("#mode of shift values: %d" % bestShift)
+
+        if self.statistics["badRegionTrim"] > 0:
+            footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+        return string.join(footerLines, "\n")
+
+
     def updateControlStatistics(self, peak, sumAll, peakScore):
 
         plusRatio = float(peak.numPlus)/peak.numHits
@@ -281,11 +310,11 @@ def main(argv=None):
                                 minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
                                 doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
                                 withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
-                                reportshift=options.reportshift, stringency=options.stringency)
+                                reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+                                doRevBackground=options.doRevBackground)
 
     findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
-            options.cachePages, options.ptype, options.controlfile, options.doRevBackground,
-            options.useMulti, options.combine5p)
+            options.ptype, options.useMulti, options.combine5p)
 
 
 def makeParser():
@@ -363,51 +392,67 @@ def makeParser():
     return parser
 
 
-def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None,
-            ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+            ptype="", useMulti=True, combine5p=False):
 
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    doCache = cachePages is not None
-    controlRDS = None
-    doControl = controlfile is not None
-    if doControl:
+    controlBAM = None
+    if regionFinder.doControl:
         print "\ncontrol:" 
-        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
-        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+        controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+        regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
 
     print "\nsample:" 
-    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
-    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
-    pValueType = getPValueType(ptype, doControl, doRevBackground)
-    doPvalue = not pValueType == "none"
-    regionFinder.readlen = hitRDS.getReadSize()
+    sampleBAM = pysam.Samfile(hitfile, "rb")
+    regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+    pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+    regionFinder.doPvalue = not pValueType == "none"
+    regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
     if rnaSettings:
         regionFinder.useRNASettings(regionFinder.readlen)
 
-    regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType)
+    regionFinder.printSettings(ptype, useMulti, pValueType)
     outfile = open(outfilename, outputMode)
-    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl)
+    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
     shiftDict = {}
-    chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
-    for chromosome in chromosomeList:
-        #TODO: QForAli -Really? Use first chr shift value for all of them
+    chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+    for chromosome in chromList:
+        #TODO: Really? Use first chr shift value for all of them
+        maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
         if regionFinder.shiftValue == "learn":
-            learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
+            regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
 
-        allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
-        if doRevBackground:
-            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti)
-            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+        allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+        if regionFinder.doRevBackground:
+            maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+            backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
         else:
-            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header)
+            backregions = []
+            pValueType = "self"
+
+        writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
 
-    footer = getFooter(regionFinder, shiftDict, doRevBackground)
+    try:
+        bestShift = getBestShiftInDict(shiftDict)
+    except ValueError:
+        bestShift = 0
+
+    footer = regionFinder.getFooter(bestShift)
     print footer
     print >> outfile, footer
     outfile.close()
     writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
 
 
+def getHeaderComment(bamHeader, commentKey):
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
 def getPValueType(ptype, doControl, doRevBackground):
     pValueType = "self"
     if ptype in ["NONE", "SELF", "BACK"]:
@@ -424,52 +469,41 @@ def getPValueType(ptype, doControl, doRevBackground):
     return pValueType
 
 
-def openRDSFile(filename, cachePages=None, doCache=False):
-    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
-    if cachePages > rds.getDefaultCacheSize():
-        rds.setDBcache(cachePages)
-
-    return rds
-
-
-def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl):
-    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl)
-    header = regionFinder.getHeader(doPvalue)
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+    header = regionFinder.getHeader()
     print >> outfile, header
 
     return header
 
 
-def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False):
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        controlChromList = controlRDS.getChromosomes()
-        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+    if controlBAM is not None:
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
     else:
-        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
 
     return chromosomeList
 
 
-def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-                    outfile, useMulti, doControl, controlRDS, combine5p):
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+                    outfile, useMulti, controlBAM, combine5p):
 
     outregions = []
     allregions = []
     print "chromosome %s" % (chromosome)
     previousHit = - 1 * regionFinder.maxSpacing
     readStartPositions = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     uniqueReadCount = 0
     reads = []
-    numStarts = 0
-    badRegion = False
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+    numStartsInRegion = 0
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = readStartPositions[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -485,8 +519,8 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
             allregions.append(int(region.numReads))
             regionLength = lastReadPos - region.start
-            if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+            if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+                region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
                 if region.foldRatio >= regionFinder.minRatio:
                     # first pass, with absolute numbers
@@ -495,25 +529,24 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                         try:
                             lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
                         except IndexError:
-                            badRegion = True
+                            regionFinder.statistics["badRegionTrim"] += 1
                             continue
 
-                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+                        region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
                         if regionFinder.normalize:
                             peakScore /= regionFinder.sampleRDSsize
-                    except:
+                    except (IndexError, AttributeError, ZeroDivisionError):
                         continue
 
                     if regionFinder.listPeak:
-                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
+                        region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
 
                     if useMulti:
-                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+                        setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
                                                regionFinder.normalize, regionFinder.doTrim)
 
                     region.shift = peak.shift
@@ -530,43 +563,75 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                             regionFinder.statistics["failed"] += 1
 
             readStartPositions = []
-            totalWeight = 0
+            totalWeight = 0.0
             uniqueReadCount = 0
             reads = []
-            numStarts = 0
-            if badRegion:
-                badRegion = False
-                regionFinder.statistics["badRegionTrim"] += 1
+            numStartsInRegion = 0
 
         if pos not in readStartPositions:
-            numStarts += 1
+            numStartsInRegion += 1
 
         readStartPositions.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
         if weight == 1.0:
             uniqueReadCount += 1
 
-        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return allregions, outregions
 
 
-def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti):
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
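+    # Note: combine5p is accepted for interface compatibility with the old
+    # RDS reader but is not used in this BAM-based filter.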
+    if read.opt('NH') > 1 and not doMulti:
+        return True
+
+    if strand == "+" and read.is_reverse:
+        return True
+
+    if strand == "-" and not read.is_reverse:
+        return True
+        
+    return False
+
+
+def getMaxCoordinate(samfile, chr, doMulti=False):
+    maxCoord = 0
+    for alignedread in samfile.fetch(chr):
+        if alignedread.opt('NH') > 1:
+            if doMulti:
+                maxCoord = max(maxCoord, alignedread.pos)
+        else:
+            maxCoord = max(maxCoord, alignedread.pos)
+
+    return maxCoord
+
+
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
     #TODO: this is *almost* the same calculation - there are small yet important differences
     print "calculating background..."
     previousHit = - 1 * regionFinder.maxSpacing
     currentHitList = [-1]
-    currentTotalWeight = 0
+    currentTotalWeight = 0.0
     currentReadList = []
     backregions = []
     numStarts = 0
     badRegion = False
-    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in controlBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = currentHitList[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -578,7 +643,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
             regionLength = lastReadPos - region.start
             if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
             region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
             regionLength = lastReadPos - region.start
             if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                 if regionFinder.normalize:
                     numMock /= regionFinder.sampleRDSsize
 
                 if regionFinder.normalize:
                     numMock /= regionFinder.sampleRDSsize
 
@@ -595,13 +660,12 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                             badRegion = True
                             continue
 
                             badRegion = True
                             continue
 
-                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                         if regionFinder.normalize:
                             numMock /= regionFinder.sampleRDSsize
 
                         foldRatio = region.numReads / numMock
 
                         if regionFinder.normalize:
                             numMock /= regionFinder.sampleRDSsize
 
                         foldRatio = region.numReads / numMock
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
@@ -618,7 +682,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                         regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
             currentHitList = []
                         regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
             currentHitList = []
-            currentTotalWeight = 0
+            currentTotalWeight = 0.0
             currentReadList = []
             numStarts = 0
             if badRegion:
             currentReadList = []
             numStarts = 0
             if badRegion:
@@ -629,32 +693,31 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             numStarts += 1
 
         currentHitList.append(pos)
             numStarts += 1
 
         currentHitList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         currentTotalWeight += weight
         currentTotalWeight += weight
-        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return backregions
 
 
         previousHit = pos
 
     return backregions
 
 
-def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-               outfile, useMulti, doControl, controlRDS, combine5p):
-
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+               outfile, useMulti, controlBAM, combine5p):
 
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
     print "learning shift.... will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
     print "learning shift.... will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
@@ -663,22 +726,22 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
-                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
+                foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
-            totalWeight = 0
+            totalWeight = 0.0
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
         totalWeight += weight
-        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
@@ -689,12 +752,14 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
 
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
-    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
-    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % shiftValue
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
 
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
 
+    return shiftValue, shiftDict
+
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
@@ -705,7 +770,7 @@ def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringen
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
-    return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
+    return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
@@ -758,12 +823,12 @@ def peakEdgeLocated(peak, position, minSignalThresh):
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
-    if doControl:
-        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+    if regionFinder.doControl:
+        numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
         if regionFinder.normalize:
             numMock /= regionFinder.controlRDSsize
 
         if regionFinder.normalize:
             numMock /= regionFinder.controlRDSsize
 
@@ -774,6 +839,25 @@ def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regi
     return foldRatio
 
 
     return foldRatio
 
 
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
+    count = 0.0
+    for alignedread in bamfile.fetch(chr, start, end):
+        if alignedread.opt('NH') > 1:
+            if countMulti:
+                if isSpliceEntry(alignedread.cigar):
+                    if countSplices:
+                        count += 1.0/alignedread.opt('NH')
+                else:
+                    count += 1.0/alignedread.opt('NH')
+        elif uniqs:
+            if isSpliceEntry(alignedread.cigar):
+                if countSplices:
+                    count += 1.0
+            else:
+                count += 1.0
+
+    return count
+
 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
     peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
     try:
 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
     peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
     try:
@@ -819,9 +903,12 @@ def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRa
     return region
 
 
     return region
 
 
-def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
     if doTrim:
     if doTrim:
-        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+        sumMulti = 0.0
+        for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+            if alignedread.opt('NH') > 1:
+                sumMulti += 1.0/alignedread.opt('NH')
     else:
         sumMulti = currentTotalWeight - currentUniqueCount
 
     else:
         sumMulti = currentTotalWeight - currentUniqueCount
 
@@ -856,25 +943,33 @@ def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plus
             raise RegionDirectionError
 
 
             raise RegionDirectionError
 
 
-def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                                allregions, header):
-
-    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                           allregions, header, backregions=[], pValueType="self")
-
-
-def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
                            allregions, header, backregions=[], pValueType="none"):
 
     print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
                            allregions, header, backregions=[], pValueType="none"):
 
     print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
-    if doPvalue:
+    if regionFinder.doPvalue:
         if pValueType == "self":
             poissonmean = calculatePoissonMean(allregions)
         else:
             poissonmean = calculatePoissonMean(backregions)
 
     print header
         if pValueType == "self":
             poissonmean = calculatePoissonMean(allregions)
         else:
             poissonmean = calculatePoissonMean(backregions)
 
     print header
-    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict)
+    for region in outregions:
+        if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, regionFinder.reportshift)
+
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if regionFinder.doPvalue:
+            pValue = calculatePValue(int(region.numReads), poissonmean)
+            outline += "\t%1.2g" % pValue
+
+        print outline
+        print >> outfile, outline
 
 
 def calculatePoissonMean(dataList):
 
 
 def calculatePoissonMean(dataList):
@@ -890,29 +985,8 @@ def calculatePoissonMean(dataList):
     return poissonmean
 
 
     return poissonmean
 
 
-def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
-    for region in outregions:
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region.shift] += 1
-            except KeyError:
-                shiftDict[region.shift] = 1
-
-        outline = getRegionString(region, reportshift)
-
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if doPvalue:
-            sumAll = int(region.numReads)
-            pValue = calculatePValue(sumAll, poissonmean)
-            outline += "\t%1.2g" % pValue
-
-        print outline
-        print >> outfile, outline
-
-
 def calculatePValue(sum, poissonmean):
     pValue = math.exp(-poissonmean)
 def calculatePValue(sum, poissonmean):
     pValue = math.exp(-poissonmean)
-    #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
     for i in xrange(sum):
         pValue *= poissonmean
         pValue /= i+1
     for i in xrange(sum):
         pValue *= poissonmean
         pValue /= i+1
@@ -929,34 +1003,9 @@ def getRegionString(region, reportShift):
     return outline
 
 
     return outline
 
 
-def getFooter(regionFinder, shiftDict, doRevBackground):
-    index = regionFinder.statistics["index"]
-    mIndex = regionFinder.statistics["mIndex"]
-    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
-    if regionFinder.doDirectionality:
-        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
-
-    if doRevBackground:
-        try:
-            percent = min(100. * (float(mIndex)/index), 100.)
-        except ZeroDivisionError:
-            percent = 0.
-
-        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
-
-    if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
-        bestShift = getBestShiftInDict(shiftDict)
-        footerLines.append("#mode of shift values: %d" % bestShift)
-
-    if regionFinder.statistics["badRegionTrim"] > 0:
-        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
-
-    return string.join(footerLines, "\n")
-
-
 def getBestShiftInDict(shiftDict):
     return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
 if __name__ == "__main__":
 def getBestShiftInDict(shiftDict):
     return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 03cade292225b55513b300a0fca2d8f2c4f5d50a..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
@@ -154,7 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
-                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
index 7b4a2cc819c30976692d9721ddb363756ebdf70a..10ccc837dca3e6b36f3c7a14d94d3ebd4938ccd4 100755 (executable)
@@ -6,12 +6,11 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -33,7 +32,7 @@ def main(argv=None):
 
     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
 
     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+                   options.searchGID, options.countFeats, options.cachePages)
 
 
 def getParser(usage):
 
 
 def getParser(usage):
@@ -47,7 +46,6 @@ def getParser(usage):
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
@@ -60,17 +58,16 @@ def getParser(usage):
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    markGID = getConfigBoolOption(configParser, section, "markGID", False)
 
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
 
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
-                        countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+                        countFeats=countFeats, cachePages=cachePages)
 
     return parser
 
 
     return parser
 
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
-                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+                   searchGID=False, countFeats=False):
 
     if trackStrand:
         print "will track strandedness"
 
     if trackStrand:
         print "will track strandedness"
@@ -86,16 +83,6 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     else:
         replaceModels = False
 
     else:
         replaceModels = False
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        cachePages = 100000
-        doCache = False
-
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
@@ -111,24 +98,15 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     for gid in gidList:
         gidCount[gid] = 0
 
     for gid in gidList:
         gidCount[gid] = 0
 
-    chromList = hitRDS.getChromosomes(fullChrom=False)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
-    if markGID:
-        print "Flagging all reads as NM"
-        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    for chrom in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chrom in chromosomeList:
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
             seenFeaturesByChromDict[chrom] = set([])
 
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
             seenFeaturesByChromDict[chrom] = set([])
 
-        print "\nchr%s" % chrom
-        fullchrom = "chr%s" % chrom
-        regionList = []        
+        print "\nchr%s" % chrom      
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
@@ -137,34 +115,31 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                     if featureSense == "R":
                         checkSense = "-"
 
                     if featureSense == "R":
                         checkSense = "-"
 
-                    regionData = (gid, fullchrom, start, stop, checkSense)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                 else:
                 else:
-                    regionData = (gid, fullchrom, start, stop)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
                 gidCount[gid] += count
 
                 gidCount[gid] += count
-                if markGID:
-                    regionList.append(regionData)
-
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
-        if markGID:
-            print "marking GIDs"
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
-            print "finished marking"
-
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
-    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
-    if markGID and doCache:
-        hitRDS.saveCacheDB(hitfile)
+    writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+    count = 0.0
+    for alignedread in bamfile.fetch(chrom, start, stop):
+        if countThisRead(alignedread, uniqs, multi, splices, sense):
+            count += 1.0/alignedread.opt('NH')
+
+    return count
 
 
 def countFeatures(seenFeaturesByChromDict):
 
 
 def countFeatures(seenFeaturesByChromDict):
@@ -178,18 +153,15 @@ def countFeatures(seenFeaturesByChromDict):
     return count
 
 
     return count
 
 
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
-    for gid in gidList:
+    for gid in gidCount:
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
-        if gid in gidCount:
-            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
-        else:
-            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 
     outfile.close()
 
 
     outfile.close()
 
index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
 
         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
index d929d41a89a2da712b1ee953770e0b01bb2f96bb..3c5183730c288f9c61e29f41fb1df69c3e0f992f 100755 (executable)
@@ -1,8 +1,3 @@
-#
-#  geneStartBins.py
-#  ENRAGE
-#
-
 try:
     import psyco
     psyco.full()
 try:
     import psyco
     psyco.full()
@@ -10,11 +5,10 @@ except:
     pass
 
 import sys
     pass
 
 import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
 from cistematic.genomes import Genome
 
 from cistematic.genomes import Genome
 
-
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
@@ -24,7 +18,6 @@ if len(sys.argv) < 4:
 genome = sys.argv[1]
 hitfile =  sys.argv[2]
 outfilename = sys.argv[3]
 genome = sys.argv[1]
 hitfile =  sys.argv[2]
 outfilename = sys.argv[3]
-
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
@@ -42,23 +35,21 @@ if '-cache' in sys.argv:
 
 bins = 10
 standardMinThresh = standardMinDist / bins
 
 bins = 10
 standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = getHeaderComment(bamfile.header, "ReadLength")
 normalizationFactor = 1.0
 if normalize:
 normalizationFactor = 1.0
 if normalize:
-    totalCount = len(hitRDS)
+    totalCount = getHeaderComment(bamfile.header, "Total")
     normalizationFactor = totalCount / 1000000.
 
 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
     normalizationFactor = totalCount / 1000000.
 
 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
-
 outfile = open(outfilename,'w')
 outfile = open(outfilename,'w')
-
 gidList = hg.allGIDs()
 gidList.sort()
 gidList = hg.allGIDs()
 gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
@@ -78,7 +69,7 @@ for gid in gidList:
         if (start, stop) not in newfeatureList:
             newfeatureList.append((start, stop))
 
         if (start, stop) not in newfeatureList:
             newfeatureList.append((start, stop))
 
-    if chrom not in hitDict:
+    if chrom not in chromosomeList:
         continue
 
     newfeatureList.sort()
         continue
 
     newfeatureList.sort()
@@ -109,15 +100,16 @@ for gid in gidList:
 
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen
 
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen
+
     tagCount = 0
     if glen < standardMinDist / 2:
         continue
 
     binList = [0] * bins
     tagCount = 0
     if glen < standardMinDist / 2:
         continue
 
     binList = [0] * bins
-    for read in hitDict[chrom]:
-        tagStart = read["start"] - gstart
-        sense = read["sense"]
-        weight = read["weight"]
+    for alignedread in bamfile.fetch(chrom):
+        tagStart = alignedread.pos - gstart
+        sense = getReadSense(alignedread)
+        weight = 1.0/alignedread.opt('NH')
         if tagStart >= 2 * glen:
             break
 
         if tagStart >= 2 * glen:
             break
 
index 197800e09fab70bf2824bcf3e9caa1ecfb364ef1..819675f26f869895fc12b8058daf195f02b2d07b 100755 (executable)
@@ -20,6 +20,7 @@ def main(argv=None):
     delimiter = "|"
 
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
     delimiter = "|"
 
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+            \n\twhere ucscModels is the gene model file in same format as the UCSC Table Browser\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
index cf02cb3af4e26d987a971817057c9e790dae874d..e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d 100644 (file)
@@ -6,10 +6,10 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-import ReadDataset
+import pysam
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
 
 print "normalizeExpandedExonic: version 5.7"
 
 
 print "normalizeExpandedExonic: version 5.7"
 
@@ -18,7 +18,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
 
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -34,22 +34,19 @@ def main(argv=None):
     splicecountfile = args[3]
     outfile = args[4]
 
     splicecountfile = args[3]
     outfile = args[4]
 
-    candidateLines = []
+    candidatefilename = ""
     acceptedfilename = ""
     acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")    
+    uniqcount = getHeaderComment(bamfile.header, "Unique")
     
     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
     
     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+                            candidatefilename, acceptedfilename, options.fieldID,
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)
 
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)
 
@@ -77,102 +74,22 @@ def makeParser(usage=""):
 
 
 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
 
 
 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
+                            outfilename, candidatefilename="", acceptedfilename="",
                             fieldID=0, maxLength=1000000000., doCache=False,
                             extendGenome="", replaceModels=False):
                             fieldID=0, maxLength=1000000000., doCache=False,
                             extendGenome="", replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
+  
+    print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")
 
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")
 
-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
-
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
-
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
-
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
-
-    
-    print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
     outfile = open(outfilename, "w")
     outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
-    totalCount = (uniqcount + splicecount) / 1000000.
-    uniqScale = uniqcount / 1000000.
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
     for gid in gidList:
     for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
         featureList = []
         try:
             featureList = featuresDict[gid]
         featureList = []
         try:
             featureList = featuresDict[gid]
@@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
         elif geneLength > maxLength:
             geneLength = maxLength
 
         elif geneLength > maxLength:
             geneLength = maxLength
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                 if 10. * cratio < rpkm:
                     continue
 
                 if 10. * cratio < rpkm:
                     continue
 
-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
 
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 
-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
         if geneLength < 0.1:
             continue
 
         if geneLength < 0.1:
             continue
 
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 
@@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
     except:
         pass
 
     except:
         pass
 
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candiates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+
     if doCache:
         uncacheGeneDB(genome)
 
     if doCache:
         uncacheGeneDB(genome)
 
+    return featuresDict
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index ae005cb3146b686fd3934cd9a152c331900ebde4..e11508ea65e7a61614afb971f88fdb8f00824da7 100755 (executable)
@@ -12,8 +12,8 @@ except:
 import sys
 import string
 import optparse
 import sys
 import string
 import optparse
-from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
 
 versionString = "regionCounts: version 3.10"
 print versionString
 
 versionString = "regionCounts: version 3.10"
 print versionString
@@ -32,10 +32,12 @@ def main(argv=None):
         sys.exit(1)
 
     regionfilename = args[0]
         sys.exit(1)
 
     regionfilename = args[0]
-    hitfile =  args[1]
+    bamfilename =  args[1]
     outfilename = args[2]
 
     outfilename = args[2]
 
-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
@@ -89,7 +91,7 @@ def getParser(usage):
     return parser
 
 
-def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
                  useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
                  merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
                  cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
@@ -99,11 +101,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"
 
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"
 
-    if cachePages != -1:
-        doCache = True
-    else:
-        doCache = False
-
     normalize = True
     doRPKM = False
     if doRPKM == True:
@@ -119,24 +116,13 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     labeltoRegionDict = {}
     regionCount = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
-    totalCount = len(hitRDS)
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    totalCount = float(getHeaderComment(bamfile.header, "Total"))
     if normalize:
         normalizationFactor = totalCount / 1000000.
 
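# A worked example of the per-million scaling above (numbers hypothetical):
# with Total = 12,500,000 reads, normalizationFactor = 12.5, so a region
# holding 250 weighted reads is reported as 250 / 12.5 = 20.0 reads per
# million mapped.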
-    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
-
+    chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
     chromList.sort()
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("OFF")        
-
     for rchrom in regionDict:
         if forceRegion and rchrom not in chromList:
             print rchrom
@@ -146,7 +132,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
                 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
 
     for rchrom in chromList:
-        regionList = []
+        #regionList = []
         if rchrom not in regionDict:
             continue
 
@@ -156,11 +142,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         else:
             fullchrom = "chr%s" % rchrom
 
-        if usePeak:
-            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
-            rindex = 0
-            dictLen = len(readDict[fullchrom])
-
         for region in regionDict[rchrom]:
             label = region.label
             start = region.start
@@ -168,17 +149,12 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
             regionCount[label] = 0
             labelList.append(label)
             labeltoRegionDict[label] = (rchrom, start, stop)
-            regionList.append((label, fullchrom, start, stop))
+            #regionList.append((label, fullchrom, start, stop))
             if usePeak:
                 readList = []
-                for localIndex in xrange(rindex, dictLen):
-                    read = readDict[fullchrom][localIndex]
-                    if read["start"] < start:
-                        rindex += 1
-                    elif start <= read["start"] <= stop:
-                        readList.append(read)
-                    else:
-                        break
+                for alignedread in bamfile.fetch(fullchrom, start, stop):
+                    weight = 1.0/alignedread.opt('NH')
+                    readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
 
                 if len(readList) < 1:
                     continue
@@ -193,13 +169,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
 
                 regionCount[label] += topValue
             else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-        if flagRDS:
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("ON")    
+                regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
 
     if normalize:
         for label in regionCount:
@@ -235,10 +205,29 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         outfile.write("\n")
 
     outfile.close()
-    if doCache and flagRDS:
-        hitRDS.saveCacheDB(hitfile)
-
-    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+    writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
+
+
+def getRegionReadCounts(bamfile, chrom, start, end, doUniqs=True, doMulti=False, doSplices=False):
+    uniques = 0
+    multis = 0.0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+        readMultiplicity = alignedread.opt('NH')
+        if doSplices and isSpliceEntry(alignedread.cigar):
+            if readMultiplicity == 1 and doUniqs:
+                uniqueSplice += 1
+            elif doMulti:
+                multiSplice += 1.0/readMultiplicity
+        elif readMultiplicity == 1 and doUniqs:
+            uniques += 1
+        elif doMulti:
+            multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+
+    return totalReads
 
 
if __name__ == "__main__":
    main(sys.argv)
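A usage sketch for the new getRegionReadCounts helper; it assumes a coordinate-sorted, indexed BAM whose reads carry the NH tag (the file name and coordinates are placeholders):

import pysam
from regionCounts import getRegionReadCounts

bamfile = pysam.Samfile("sample.bam", "rb")    # fetch() requires sample.bam.bai
# unique reads count 1 each; each multiread contributes 1/NH
count = getRegionReadCounts(bamfile, "chr1", 100000, 101000, doUniqs=True, doMulti=True, doSplices=True)
print "chr1:100000-101000 -> %.2f weighted reads" % count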
index 4d70c4965ef77c2eea4f646e58526db83682ffa6..2c5aff43ced96fe650d5cc52562cb998033b4d04 100755 (executable)
@@ -1,9 +1,3 @@
-#
-#  RNAFARpairs.py
-#  ENRAGE
-#
-#  Created by Ali Mortazavi on 11/2/08.
-#
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
@@ -16,8 +10,8 @@ except:
 import sys
 import time
 import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
 
 
 def main(argv=None):
@@ -36,10 +30,12 @@ def main(argv=None):
 
     genome = args[0]
     goodfilename = args[1]
-    rdsfile = args[2]
+    bamfilename = args[2]
     outfilename = args[3]
 
-    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
 
 
 def makeParser(usage=""):
@@ -62,16 +58,26 @@ def makeParser(usage=""):
     return parser
 
 
-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+    """ map all candidate regions that have paired ends overlapping with known genes
+    """
+
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    regions = {}
+    for chromosome in chromosomeList:
+        regions[chromosome] = []
+
     goodDict = {}
     goodfile = open(goodfilename)
     for line in goodfile:
         fields = line.split()
-        goodDict[fields[0]] = line
+        # region lines are "label chrom start stop ..."; index each region by
+        # its own chromosome rather than a stale loop variable
+        label = fields[0]
+        chromosome = fields[1]
+        start = int(fields[2])
+        stop = int(fields[3])
+        goodDict[label] = line
+        if chromosome in regions:
+            regions[chromosome].append((start, stop, label))
 
     goodfile.close()
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
-    chromosomeList = RDS.getChromosomes()
     if doVerbose:
         print time.ctime()
 
@@ -83,23 +89,18 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     assigned = {}
     farConnected = {}
     for chromosome in chromosomeList:
-        if doNotProcessChromosome(chromosome):
-            continue
-
         print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+        # list.sort() sorts in place and returns None, so sort first, then take the list
+        regions[chromosome].sort()
+        regionList = regions[chromosome]
+        uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
         if doVerbose:
             print len(uniqDict), time.ctime()    
 
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                if processReads(readList[:2], maxDist):
-                    flags = (readList[0]["flag"], readList[1]["flag"])
-                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
-                    total += processed
-                    distinct += distinctPairs
+        total += pairCount
+        for readID, readList in uniqDict.items():
+            flags = (readList[0]["flag"], readList[1]["flag"])
+            processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+            total += processed
+            distinct += distinctPairs
 
     entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
     distinct += entriesWritten
@@ -109,15 +110,55 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     print time.ctime()
 
 
-def doNotProcessChromosome(chromosome):
-    return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+    """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+        Only returns unique non-spliced read pairs matching the criteria given in processReads().
+    """
+    start = 1
+    readDict = {}
+    for regionstart, regionstop, regionname in regions:
+        for alignedread in bamfile.fetch(chromosome, start, regionstop):
+            if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+                if alignedread.pos >= regionstart:
+                    flag = regionname
+                else:
+                    flag = alignedread.opt("ZG")
 
 
+                try:
+                    readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+                except KeyError:
+                    readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
 
 
-def processReads(reads, maxDist):
+        start = regionstop + 1
+
+    for alignedread in bamfile.fetch(chromosome, start):
+        if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+            flag = alignedread.opt("ZG")
+
+            try:
+                readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+            except KeyError:
+                readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+    pairCount = len(readDict)
+    # items() returns a list in Python 2, so entries can safely be deleted inside the loop
+    for readID, readList in readDict.items():
+        if len(readList) != 2:
+            del readDict[readID]
+            pairCount -= 1
+        elif not processReads(readList, maxDist):
+            del readDict[readID]
+
+    return readDict, pairCount
+
+
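# Notes on the flag convention above (inferred from this commit): a unique,
# non-spliced read that starts inside a sorted candidate region is flagged
# with that region's label, while every other read keeps the gene ID stored
# in its ZG tag by the BAM preprocessor ("NM" when it hit no gene model).
# One caveat: a read overlapping the boundary between two consecutive fetch
# windows can be returned twice, and its pair would then be discarded by the
# len(readList) != 2 filter.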
+def processReads(reads, maxDist=500000):
+    """ For a pair of readID's to be acceptable:
+            - flags must be different
+            - neither flag can be 'NM'
+            - the read starts have to be within maxDist
+    """
     process = False
-    start1 = reads[0]["start"]
-    start2 = reads[1]["start"]
-    dist = abs(start1 - start2)
+    dist = abs(reads[0]["start"] - reads[1]["start"])
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]
 
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]
 
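A quick sanity check of the pairing rule documented above (the data is made up):

reads = [{"start": 10000, "flag": "RNAFAR7"}, {"start": 12500, "flag": "84557"}]
print processReads(reads)    # True: flags differ, neither is "NM", starts are close
reads[1]["flag"] = "NM"
print processReads(reads)    # False: an unannotated mate is rejected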
@@ -128,6 +169,11 @@ def processReads(reads, maxDist):
 
 
 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """ Writes out the region information along with symbol and geneID for paired reads where one read
+        of the pair is in a known gene and the other is in a new region.  If both reads in the pair are
+        in new regions, the region is added to farConnected.  No action is taken if both reads in the
+        pair are in known genes.
+    """
     flag1, flag2 = flags
     total = 0
     distinct = 0
@@ -147,7 +193,7 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
         except KeyError:
             farConnected[geneID] = [farFlag]
     elif read1IsGood or read2IsGood:
-        total += 1
+        total = 1
         if read2IsGood:
             farFlag = flag2
             geneID = flag1
@@ -155,30 +201,35 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
             farFlag = flag1
             geneID = flag2
 
-        try:
-            if genome == "dmelanogaster":
-                symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
-            else:
-                symbol = geneInfoDict[geneID][0][0]
-        except (KeyError, IndexError):
-            try:
-                symbol = geneAnnotDict[(genome, geneID)][0]
-            except (KeyError, IndexError):
-                symbol = "LOC%s" % geneID
-
-        symbol = symbol.strip()
-        symbol = symbol.replace(" ","|")
-        symbol = symbol.replace("\t","|")
-
+        symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
         if farFlag not in assigned:
             assigned[farFlag] = (symbol, geneID)
             print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
             outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-            distinct += 1
+            distinct = 1
 
     return total, distinct
 
 
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+    try:
+        if genome == "dmelanogaster":
+            symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+        else:
+            symbol = geneInfoDict[geneID][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    symbol = symbol.strip()
+    symbol = symbol.replace(" ","|")
+    symbol = symbol.replace("\t","|")
+
+    return symbol
+
+
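The replace calls above keep symbols single-token for the whitespace-delimited output written by writeFarPairsToFile; for example (hypothetical annotation text):

symbol = " putative kinase\tisoform 2 "
symbol = symbol.strip().replace(" ", "|").replace("\t", "|")
print symbol    # putative|kinase|isoform|2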
 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
     total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
     writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
index ebc25a530af4166ea69b17a8228a6943b19960c6..0c01de47d6a4b0aa7a1f4476ad82c024da669809 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -50,84 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
     """ based on original script runRNAPairedAnalysis.sh
     """ based on original script runRNAPairedAnalysis.sh
-        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
-            where rdsprefix is the name of the rds file without the .rds extension
+        usage: runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+            where fileprefix is the name of the bam file without the .bam extension
             use "none" for the repeatmaskdb if you do not have one
     """
 
             use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
     writeLog(logfile, "runRNAPairedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
-    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    regionfilename = "%s.newregions.checked" % fileprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)
 
     # calculate the read densities
-    regionfilename = "%s.newregions.checked" % rdsprefix
-    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+    goodFileName = "%s.newregions.good" % fileprefix
+    regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that have paired ends overlapping with known genes
-    candidatefilename = "%s.candidates.txt" % rdsprefix
-    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+    candidatefilename = "%s.candidates.txt" % fileprefix
+    rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)
 
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    acceptfile = "%s.accepted.rpkm" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+    multicountfilename = "%s.multi.count" % fileprefix
+    acceptfile = "%s.accepted.rpkm" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
                            extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate final exonic read density
                            extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
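The scripts above lean on getHeaderComment to pull the preprocessing totals back out of the BAM header. That helper lives in commoncode.py and is not shown in this diff, so the following is only a plausible sketch of the convention, assuming "Key:Value" @CO comment lines written by bamPreprocessing.py:

def getHeaderComment(bamHeader, commentKey):
    # scan the @CO comment lines for the requested key
    for comment in bamHeader.get("CO", []):
        fields = comment.split(":")
        if fields[0] == commentKey:
            return fields[1]

    raise KeyError("no %s comment in BAM header" % commentKey)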
index adfa37522a5f2c9af6d70b2a117102a6ab4163c8..d20ef640899392ec1f08f474328298d1274e69c9 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -17,7 +17,7 @@ def main(argv=None):
         argv = sys.argv
 
     print "runStandardAnalysis: version %s" % VERSION
-    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+    usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
 
     parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -27,15 +27,15 @@ def main(argv=None):
         sys.exit(1)
 
     genome = args[0]
-    rdsprefix = args[1]
+    fileprefix = args[1]
     repeatmaskdb = args[2]
-    bpradius = args[3]
+    bpradius = int(args[3])
     try:
         modelfile = args[4]
     except IndexError:
         modelfile = ""
 
-    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+    runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
 
 
 def getParser(usage):
@@ -50,82 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
     """ based on original script runStandardAnalysis.sh
     """ based on original script runStandardAnalysis.sh
-        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStandardAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
     writeLog(logfile, "runStandardAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
 
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # Alternative 1: find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
                            replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
+    #TODO: remove when not tracking
+    writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
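For orientation, a hypothetical invocation of the converted pipeline, following the usage string above (the genome name and file prefix are placeholders; sample.bam must already carry the count comments and the .bai index produced during preprocessing):

python runStandardAnalysis.py mmusculus sample none 20000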
index 55c87ca76e48a1e272bf1184ec6fd3bd5ff9b1e9..35a67008d196ea0c942e2fb94614a7e76f612f7e 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -40,58 +40,57 @@ def getParser(usage):
     return parser
 
 
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
     """ based on original script runStrandedAnalysis.sh
-        usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStrandedAnalysis.sh genome fileprefix repeatmaskdb bpradius
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
     """ based on original script runStrandedAnalysis.sh
         usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
                where rdsprefix is the name of the rds file without the .rds extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStrandedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
     writeLog(logfile, "runStrandedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
 
     # recount the unique reads with weights calculated during the first pass
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
 
     # filter out new regions that overlap repeats more than a certain fraction
-    newregionfilename = "%s.newregions.txt" % rdsprefix
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     #TODO: these calls look wrong
@@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
index 58d9e3059ad8a587c10badd3fb8c7cc820462047..9c0c38fa197cbd18669f65269c6f94672cc6e604 100755 (executable)
@@ -127,7 +127,7 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSite
 
         siteNotCommon = True
         #TODO: as handed to me this is O(n^m) - probably could be better...
-        #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
+        #TODO: if there are multiple matches will count them but only write the first match to the file
         for (rstart, rstop, rline) in siteDict[chrom]:
             if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
index 3fd6bd8e151b3f909034e856aeb5eb329db7c17f..f4b1691a07a08b15988539f75f1fbb2a62e148b2 100755 (executable)
@@ -168,7 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
+            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1