convert standard analysis pipelines to use bam format natively
author	Sean Upchurch <sau@caltech.edu>	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
committer	Sean Upchurch <sau@caltech.edu>	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
27 files changed:
ReadDataset.py
Region.py
bamPreprocessing.py [new file with mode: 0644]
checkrmask.py
combineRPKMs.py
combinerds.py
commoncode.py
docs/README.chip-seq
docs/RNA-seq.analysisSteps.txt
docs/runRNAPairedAnalysis.sh
docs/runSNPAnalysis.sh
docs/runStandardAnalysis.sh
docs/runStrandedAnalysis.sh
findall.py
geneLocusBins.py
geneMrnaCounts.py
geneMrnaCountsWeighted.py
geneStartBins.py
getsplicefa.py
normalizeExpandedExonic.py
regionCounts.py
rnafarPairs.py
runRNAPairedAnalysis.py
runStandardAnalysis.py
runStrandedAnalysis.py
siteintersects.py
weighMultireads.py

index 9a795c8cde5c5ba6d257da963ebf4c32f7d6e42d..dffa76de9436f4379f42ba2175bec17fc1a8fa59 100644 (file)
@@ -6,11 +6,10 @@ import re
 import sys
 import pysam
 from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, isSpliceEntry
 
 currentRDSVersion = "3.0"
 
 
-
 class ReadDatasetError(Exception):
     pass
@@ -68,7 +67,7 @@ class MaxCoordFinder(ReadCounter):
 
 
 
-class BamReadDataset():
+class ReadDataset():
     """ Class for storing reads from experiments. Assumes that custom scripts
     will translate incoming data into a format that can be inserted into the
     class using the insert* methods. Default class subtype ('DNA') includes
@@ -454,7 +453,7 @@ class BamReadDataset():
                 newrow["chrom"] = chrom
 
             if withPairID:
-                newrow["pairID"] = pairID
+                newrow["pairID"] = pairReadSuffix[1:]
 
             try:
                 resultsDict[dictKey].append(newrow)
@@ -973,16 +972,6 @@ def getReadSense(reverse):
     return sense
 
 
-def isSpliceEntry(cigarTupleList):
-    isSplice = False
-    for operation,length in cigarTupleList:
-        if operation == 3:
-            isSplice = True
-            break
-
-    return isSplice
-
-
 def getSpliceRightStart(start, cigarTupleList):
     offset = 0
 
index 0d0a15b1cd91e7f3ceccee518f448ffccf2daad4..39ba811bc1d94100ed63516467e0a99fb2f637a0 100644 (file)
--- a/Region.py
+++ b/Region.py
@@ -6,7 +6,7 @@ class Region(object):
     """
 
 
     """
 
 
-    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0., foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
         self.label = label
         self.index = index
         self.chrom = chrom
         self.label = label
         self.index = index
         self.chrom = chrom
diff --git a/bamPreprocessing.py b/bamPreprocessing.py
new file mode 100644 (file)
index 0000000..dd5e861
--- /dev/null
@@ -0,0 +1,193 @@
+'''
+User specs:
+1. Checks if there are NH tags present
+2. If there aren't, it goes through the BAM file and calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written, but this time with the NH tags
+4. Then deletes the original file and replaces it with the temp file / renames the temp file to have the name of the original file
+5. Also writes the read counts into the header and checks for their presence; if they are present, use them, if not, replace
+the file with one that has that information in the same manner.  This saves having to count all the reads every time.
+The counts are several numbers - total reads, unique reads, multi reads, and reads mapping on the plus and minus strands.
+
+The only place to have the gene flag is to attach it to the individual reads.  We'll use the 'ZG' tag, and it is only
+populated when --markGID is given.  Each read is tagged with the feature label, or 'NM' if it is not in a feature.
+'''
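+# Example invocations (hypothetical file names; the input BAM must be indexed,
+# e.g. with 'samtools index sample.bam'):
+#
+#     python bamPreprocessing.py sample.bam sample.processed.bam
+#
+# or, to also tag reads with gene IDs from a cistematic genome:
+#
+#     python bamPreprocessing.py sample.bam sample.processed.bam --markGID --genome <genomeName>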
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+
+    usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+    if len(sys.argv) < 3:
+        print usage
+        print '       BAM file has to be indexed'
+        sys.exit(1)
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    BAM = args[0]
+    outputfilename = args[1]
+
+    samfile = pysam.Samfile(BAM, "rb")
+    readMultiplicityDict = getReadMultiplicity(samfile)
+
+    counts = getReadCounts(samfile, readMultiplicityDict)
+    outheader = buildHeader(samfile.header, counts)
+    if options.markGID:
+        genome = Genome(options.genomeName, inRAM=True)
+        if options.extendGenome != "":
+            genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+        print "getting gene features...."
+        featuresByChromDict = getFeaturesByChromDict(genome)    
+        outheader['CO'].append(options.genomeComment)
+
+    outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+    for alignedread in samfile.fetch(until_eof=True):
+        geneModelFlag = ""
+        if options.markGID:
+            chrom = samfile.getrname(alignedread.tid)
+            chromNum = chrom[3:]  # cistematic feature dicts are keyed without the "chr" prefix
+            geneModelFlag = "NM"
+            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
+                if start < alignedread.pos and stop > alignedread.pos:
+                    geneModelFlag = gid
+                    break
+
+        ID = getReadID(alignedread)
+        try:
+            alignedread.opt('NH')
+            # NH tag already present - write the alignment through unchanged
+            outfile.write(alignedread)
+        except KeyError:
+            scaleby = readMultiplicityDict[ID]
+            writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+    outfile.close()
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--genome", dest="genomeName")
+    parser.add_option("--genomeName", dest="genomeComment")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID=False)
+
+    return parser
+
+
+def getReadMultiplicity(samfile):
+    """ There is an assumption here that the set of all reads will not have a mixture of
+        reads bearing the NH tag and reads without.
+    """
+    readMultiplicityDict = {}
+    processedReads = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        try:
+            alignedread.opt('NH')
+            # NH tags are already present - no multiplicity map is needed
+            return {}
+        except KeyError:
+            ID = getReadID(alignedread)
+            try:
+                readMultiplicityDict[ID] += 1
+            except KeyError:
+                readMultiplicityDict[ID] = 1
+
+        processedReads += 1
+        if processedReads % 5000000 == 0:
+            print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+    return readMultiplicityDict
+
+
+def getReadCounts(samfile, readMultiplicityDict):
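+    """ Tally read counts by category for the output header comments.
+        Multireads contribute 1/NH to their totals, so the multi counts
+        are fractional and are truncated when formatted with %d below.
+    """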
+    uniques = 0
+    multis = 0.0
+    minusUnique = 0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    minusMulti = 0.0
+    readLength = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        if readLength == 0:
+            readLength = getReadLength(alignedread.cigar)
+
+        try:
+            readMultiplicity = alignedread.opt('NH')
+        except KeyError:
+            ID = getReadID(alignedread)
+            readMultiplicity = readMultiplicityDict[ID]
+
+        if readMultiplicity == 1:
+            if alignedread.is_reverse:
+                minusUnique += 1
+
+            if isSpliceEntry(alignedread.cigar):
+                uniqueSplice += 1
+            else:
+                uniques += 1
+        else:
+            if alignedread.is_reverse:
+                minusMulti += 1.0/readMultiplicity
+
+            if isSpliceEntry(alignedread.cigar):
+                multiSplice += 1.0/readMultiplicity
+            else:
+                multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+    plusUnique = uniques + uniqueSplice - minusUnique
+    plusMulti = multis + multiSplice - minusMulti
+    counts = ["Total\t%d" % totalReads,
+              "Unique\t%d" % uniques,
+              "Multis\t%d" % multis,
+              "UniqueSplices\t%d" % uniqueSplice,
+              "Multisplices\t%d" % multiSplice,
+              "PlusUnique\t%d" % plusUnique,
+              "PlusMulti\t%d" % plusMulti,
+              "MinusUnique\t%d" % minusUnique,
+              "MinusMulti\t%d" % minusMulti,
+              "ReadLength\t%d" % readLength
+    ]
+
+    return counts
+
+
+def buildHeader(templateheader, commentList):
+    header = templateheader
+    header["CO"] = commentList
+
+    return header
+
+
+def getReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getReadLength(cigar):
+    take = (0, 1) # CIGAR operation (M/match, I/insertion)
+
+    return sum([length for op,length in cigar if op in take])
+
+
+def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
+    newAlignedRead = originalRead
+    newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity)]
+    if geneModelFlag:
+        newAlignedRead.tags = newAlignedRead.tags + [("ZG", geneModelFlag)]
+
+    outfile.write(newAlignedRead)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
index 457e4c1f9ebedb7b4daf6137f9f8b850635afb28..8ae30ee958dd10fc0d7e598b7bb25fa4b4ed17c2 100755 (executable)
@@ -55,6 +55,27 @@ def makeParser(usage=""):
 
 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
 
+    if os.path.isfile(dbfile):
+        checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
+    else:
+        outfile = open(outFileName, "w")
+        goodfile = open(goodFileName, "w")
+        infile = open(filename)
+        print "No database - passing through"
+        if logfilename is not None:
+            writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+            writeLog(logfilename, versionString, "No database - passing through")
+
+        for line in infile:
+            outfile.write("%s\tNR\tNR\t0.00\n" % line)
+            goodfile.write(line)
+
+        outfile.close()
+        goodfile.close()
+
+
+def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+
     outfile = open(outFileName, "w")
     goodfile = open(goodFileName, "w")
     if startField < 0:
@@ -63,29 +84,14 @@ def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cacheP
     if cachePages < 250000:
         cachePages = 250000
 
-    doLog = False
     if logfilename is not None:
         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-        doLog = True
 
     infile = open(filename)
-    if os.path.isfile(dbfile):
-        db = sqlite.connect(dbfile)
-        sql = db.cursor()
-        sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
-        sql.execute("PRAGMA temp_store = MEMORY")
-    else:
-        print "No database - passing through"
-        if doLog:
-            writeLog(logfilename, versionString, "No database - passing through")
-
-        for line in infile:
-            outfile.write("%s\tNR\tNR\t0.00\n" % line)
-            goodfile.write(line)
-
-        outfile.close()
-        goodfile.close()
-        sys.exit(0)
+    db = sqlite.connect(dbfile)
+    sql = db.cursor()
+    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    sql.execute("PRAGMA temp_store = MEMORY")
 
     featureList = []
     featureDict = {}
index 88b57ab547edeea3f9be5387780796aadbe72bc2..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
@@ -63,7 +63,7 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.write(header)
 
     finalfile = open(finalfileName)
-    #TODO: QForAli - the output lines are driven by finalfile.  If there are genes in the first 2 that
+    #TODO: the output lines are driven by finalfile.  If there are genes in the first 2 that
     #      are not in the finalfile then they will be lost.
     for line in finalfile:
         fields = line.strip().split()
index 2695e6b512c677735fc61be2931108e947d0ab43..d6960c2783908d67db405315b797aab3db5a02e9 100755 (executable)
@@ -21,7 +21,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
@@ -98,9 +98,7 @@ def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, c
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
             dbColumns = "*"
-            if table == "uniqs":
-                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
-            elif table == "multi":
+            if table in ["uniqs", "multi"]:
                 dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
                 dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
index cd449620f0fc88304f4e8adceee0e96e37e1295d..0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55 100755 (executable)
@@ -10,8 +10,8 @@ from cistematic.genomes import Genome
 import Region
 
 commoncodeVersion = 5.6
-currentRDSversion = 2.0
 
 
+#TODO: This is a function dumping ground - needs to be reworked
 class ErangeError(Exception):
     pass
 
@@ -49,6 +49,78 @@ def writeLog(logFile, messenger, message):
     logfile.close()
 
 
+def getChromInfoList(chrominfo):
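+    """ Read a tab-delimited chromosome info file (one name<TAB>length entry
+        per line, e.g. a UCSC chrom.sizes file) and return a list of
+        (chrom, start, end) tuples with start fixed at 0.
+    """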
+    chromInfoList = []
+    linelist = open(chrominfo)
+    for line in linelist:
+        fields = line.strip().split('\t')
+        chr = fields[0]
+        start = 0
+        end = int(fields[1])
+        chromInfoList.append((chr,start,end))
+
+    linelist.close()
+
+    return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
+    isSplice = False
+    if cigarTupleList is not None:
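+        # CIGAR operation 3 ('N') is a skipped reference region, i.e. a splice junction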
+        for operation,length in cigarTupleList:
+            if operation == 3:
+                isSplice = True
+                break
+
+    return isSplice
+
+
+def addPairIDtoReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
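+    """ Return the value from a key<TAB>value comment line in the BAM header,
+        e.g. the read counts ("Total", "Unique", "ReadLength", ...) written
+        by bamPreprocessing.py.
+    """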
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
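+    """ Decide whether a read should be counted given the strand restriction
+        ('+' or '-'; anything else counts both strands) and whether unique
+        (NH == 1), multi (NH > 1), and spliced reads are wanted.
+    """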
+    countRead = True
+
+    if sense in ['-', '+'] and sense != getReadSense(read):
+        countRead = False
+    elif not countSplices and isSpliceEntry(read.cigar):
+        countRead = False
+    elif not countUniqs and read.opt('NH') == 1:
+        countRead = False
+    elif not countMulti and read.opt('NH') > 1:
+        countRead = False
+
+    return countRead
+
+
+# Cistematic functions
 def getGeneInfoDict(genome, cache=False):
     idb = geneinfoDB(cache=cache)
     if genome == "dmelanogaster":
@@ -73,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA
     return geneannotDict
 
 
+# Configuration File Support
 def getConfigParser(fileList=[]):
     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
     for filename in fileList:
@@ -129,6 +202,7 @@ def getAllConfigSectionOptions(parser, section):
     return setting
 
 
+# All the Other Stuff from before
 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
                      doMerge=True, keepPeak=False, returnTop=0):
@@ -192,7 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
     #      be merged with A but that will exit the loop and the merge with C will be missed.
-    #TODO: QForAli - Are we going to require sorted region files to have this even work?
+    #TODO: Are we going to require sorted region files to have this even work?
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -341,10 +415,6 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 
     # implementing a triangular smooth
     smoothArray = array("f", [0.] * length)
-    #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
-    # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
-    # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
-    # edge cases?
     for pos in range(2,length -2):
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
index 846a441a434f8e3cff4506108f27126855a2fb47..ea7b0a347661d59a40e838cd666156f9c9b8bfe4 100644 (file)
@@ -57,18 +57,24 @@ options are case sensitive and that they could well
 fail silently.
 
 
-3. MAKING THE NECESSARY INPUT (RDS) FILES
-
-You will want to first convert your read mappings to the 
-native ERANGE read store. Please see the file 
-README.build-rds for instructions on how to do this.
-
-Build an RDS file for both the ChIP, and if available and 
-appropriate, the control. Note that we *HIGHLY* recommend 
-the use of a matched control sample to account for some 
-of the general background artifacts that can be present 
-in ChIP-seq samples (e.g. DNAse hypersensitivity, 
-assembly collapse of some sattelite repeats, etc....). 
+3. MAKING THE NECESSARY INPUT FILES
+
+Erange uses BAM format files, but there are a couple of
+modifications that need to be made to the header and
+individual entries.  The python script bamPreprocessing.py
+will do the following:
+1. Count the reads by type and write these counts to the
+header as comments.
+2. Verify that every read has a value in the NH tag or add
+it if needed.
+3. Optionally annotate the reads with the geneID using
+the ZG tag (see the example below).
+
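+For example (a hypothetical invocation - file names are
+placeholders, and the input BAM must be indexed first,
+e.g. with samtools index):
+
+python $ERANGEPATH/bamPreprocessing.py chip.bam chip.processed.bam
+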
+Note that we *HIGHLY* recommend the use of a matched
+control sample to account for some of the general
+background artifacts that can be present in ChIP-seq
+samples (e.g. DNAse hypersensitivity, assembly collapse
+of some satellite repeats, etc....).
 
 
 4. WEIGHING MULTIREADS
index b0a1c0f09e270ad2cc47c9af05c84564b5325aaf..96ea795586a252968d1d214f53dfa4a9cdec46fc 100644 (file)
@@ -1,5 +1,5 @@
 # analysis steps for an ERANGE analysis of RNA-seq data
-# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.py
 
 # preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
 #              export PYTHONPATH=/my/path/to/cistematic
index cf75b6a5c9047ebb94332f26e8af234b68f3e382..bd8290127d305d6a184df30deeecffe815043a07 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runRNAPairedAnalysis.sh
 # ENRAGE
 #
index 103b576e73ac450c0af315645abfca41d7abbb40..51f2e4ccd8bc9ad166cd8f7a5609e6648413c929 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runSNPAnalysis.sh
 #
 # Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
@@ -59,4 +61,4 @@ python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt
 
 # make bed file for displaying the snps on UCSC genome browser
 python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
-fi
\ No newline at end of file
+fi
index aa5fe60588a30ceba6a65301db81b13ac4f6051f..830abea6ecec73a603597d0989e49763cc5f67bd 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStandardAnalysis.sh
 # ENRAGE
 #
@@ -88,4 +90,4 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi
 echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
 python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
 
-fi
\ No newline at end of file
+fi
index 44459019f601732e67f652eed96becc38d0d920d..ee9d1151d91dcb97e3c532a62afdf002563b120c 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStrandedAnalysis.sh
 # ENRAGE
 #
index 9dd85df733f86417debf9b060e53f59230110b76..f8907a66c2db6d3b36a4ac9a13872554c1f1d9ca 100755 (executable)
@@ -1,6 +1,6 @@
 """
 """
-    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
-           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+    usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+           [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
@@ -50,8 +50,8 @@ import math
 import string
 import optparse
 import operator
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
 import Region
 
 
@@ -65,7 +65,7 @@ class RegionDirectionError(Exception):
 class RegionFinder():
     def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
                  minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
-                 normalize=True, listPeak=False, reportshift=False, stringency=1.0):
+                 normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
 
         self.statistics = {"index": 0,
                            "total": 0,
@@ -77,8 +77,8 @@ class RegionFinder():
 
         self.regionLabel = label
         self.rnaSettings = False
-        self.controlRDSsize = 1
-        self.sampleRDSsize = 1
+        self.controlRDSsize = 1.0
+        self.sampleRDSsize = 1.0
         self.minRatio = minRatio
         self.minPeak = minPeak
         self.leftPlusRatio = leftPlusRatio
@@ -115,6 +115,10 @@ class RegionFinder():
         self.listPeak = listPeak
         self.reportshift = reportshift
         self.stringency = max(stringency, 1.0)
+        self.controlfile = controlfile
+        self.doControl = self.controlfile is not None
+        self.doPvalue = False
+        self.doRevBackground = doRevBackground
 
 
     def useRNASettings(self, readlen):
 
         self.maxSpacing = readlen
 
 
         self.maxSpacing = readlen
 
 
-    def getHeader(self, doPvalue):
+    def getHeader(self):
         if self.normalize:
             countType = "RPM"
         else:
@@ -142,33 +146,33 @@ class RegionFinder():
         if self.reportshift:
             headerFields.append("readShift")
 
-        if doPvalue:
+        if self.doPvalue:
             headerFields.append("pValue")
 
         return string.join(headerFields, "\t")
 
 
-    def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType):
+    def printSettings(self, ptype, useMulti, pValueType):
         print
-        self.printStatusMessages(doRevBackground, ptype, doControl, useMulti)
-        self.printOptionsSummary(useMulti, doCache, pValueType)
+        self.printStatusMessages(ptype, useMulti)
+        self.printOptionsSummary(useMulti, pValueType)
 
 
-    def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti):
+    def printStatusMessages(self, ptype, useMulti):
         if self.shiftValue == "learn":
             print "Will try to learn shift"
 
         if self.normalize:
             print "Normalizing to RPM"
 
-        if doRevBackground:
+        if self.doRevBackground:
             print "Swapping IP and background to calculate FDR"
 
         if ptype != "":
             if ptype in ["NONE", "SELF"]:
                 pass
             elif ptype == "BACK":
-                if doControl and doRevBackground:
+                if self.doControl and self.doRevBackground:
                     pass
                 else:
                     print "must have a control dataset and -revbackground for pValue type 'back'"
@@ -190,9 +194,9 @@ class RegionFinder():
             print "only analyzing reads on the minus strand"
 
 
-    def printOptionsSummary(self, useMulti, doCache, pValueType):
+    def printOptionsSummary(self, useMulti, pValueType):
 
-        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache)
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
         print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
         try:
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
@@ -200,18 +204,18 @@ class RegionFinder():
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
 
 
-    def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl):
+    def getAnalysisDescription(self, hitfile, useMulti, pValueType):
 
         description = ["#ERANGE %s" % versionString]
-        if doControl:
-            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        if self.doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
         else:
             description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
 
         if self.withFlag != "":
             description.append("#restrict to Flag = %s" % self.withFlag)
 
-        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache))
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
         description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
         try:
             description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
@@ -221,6 +225,31 @@ class RegionFinder():
         return string.join(description, "\n")
 
 
+    def getFooter(self, bestShift):
+        index = self.statistics["index"]
+        mIndex = self.statistics["mIndex"]
+        footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+        if self.doDirectionality:
+            footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+        if self.doRevBackground:
+            try:
+                percent = min(100. * (float(mIndex)/index), 100.)
+            except ZeroDivisionError:
+                percent = 0.
+
+            footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+        if self.shiftValue == "auto" and self.reportshift:
+            footerLines.append("#mode of shift values: %d" % bestShift)
+
+        if self.statistics["badRegionTrim"] > 0:
+            footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+        return string.join(footerLines, "\n")
+
+
     def updateControlStatistics(self, peak, sumAll, peakScore):
 
         plusRatio = float(peak.numPlus)/peak.numHits
@@ -281,11 +310,11 @@ def main(argv=None):
                                 minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
                                 doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
                                 withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
-                                reportshift=options.reportshift, stringency=options.stringency)
+                                reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+                                doRevBackground=options.doRevBackground)
 
     findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
-            options.cachePages, options.ptype, options.controlfile, options.doRevBackground,
-            options.useMulti, options.combine5p)
+            options.ptype, options.useMulti, options.combine5p)
 
 
 def makeParser():
@@ -363,51 +392,67 @@ def makeParser():
     return parser
 
 
-def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None,
-            ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+            ptype="", useMulti=True, combine5p=False):
 
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    doCache = cachePages is not None
-    controlRDS = None
-    doControl = controlfile is not None
-    if doControl:
+    controlBAM = None
+    if regionFinder.doControl:
         print "\ncontrol:" 
-        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
-        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+        controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+        regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
 
     print "\nsample:" 
-    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
-    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
-    pValueType = getPValueType(ptype, doControl, doRevBackground)
-    doPvalue = not pValueType == "none"
-    regionFinder.readlen = hitRDS.getReadSize()
+    sampleBAM = pysam.Samfile(hitfile, "rb")
+    regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+    pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+    regionFinder.doPvalue = not pValueType == "none"
+    regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
     if rnaSettings:
         regionFinder.useRNASettings(regionFinder.readlen)
 
-    regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType)
+    regionFinder.printSettings(ptype, useMulti, pValueType)
     outfile = open(outfilename, outputMode)
-    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl)
+    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
     shiftDict = {}
-    chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
-    for chromosome in chromosomeList:
-        #TODO: QForAli -Really? Use first chr shift value for all of them
+    chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+    for chromosome in chromList:
+        #TODO: Really? Use first chr shift value for all of them
+        maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
         if regionFinder.shiftValue == "learn":
-            learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
+            regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
 
-        allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
-        if doRevBackground:
-            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti)
-            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+        allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+        if regionFinder.doRevBackground:
+            maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+            backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
         else:
-            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header)
+            backregions = []
+            pValueType = "self"
+
+        writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
 
-    footer = getFooter(regionFinder, shiftDict, doRevBackground)
+    try:
+        bestShift = getBestShiftInDict(shiftDict)
+    except ValueError:
+        bestShift = 0
+
+    footer = regionFinder.getFooter(bestShift)
     print footer
     print >> outfile, footer
     outfile.close()
     writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
 
 
+def getHeaderComment(bamHeader, commentKey):
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
 def getPValueType(ptype, doControl, doRevBackground):
     pValueType = "self"
     if ptype in ["NONE", "SELF", "BACK"]:
@@ -424,52 +469,41 @@ def getPValueType(ptype, doControl, doRevBackground):
     return pValueType
 
 
-def openRDSFile(filename, cachePages=None, doCache=False):
-    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
-    if cachePages > rds.getDefaultCacheSize():
-        rds.setDBcache(cachePages)
-
-    return rds
-
-
-def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl):
-    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl)
-    header = regionFinder.getHeader(doPvalue)
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+    header = regionFinder.getHeader()
     print >> outfile, header
 
     return header
 
 
-def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False):
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        controlChromList = controlRDS.getChromosomes()
-        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+    if controlBAM is not None:
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
     else:
-        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
 
     return chromosomeList
 
 
-def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-                    outfile, useMulti, doControl, controlRDS, combine5p):
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+                    outfile, useMulti, controlBAM, combine5p):
 
     outregions = []
     allregions = []
     print "chromosome %s" % (chromosome)
     previousHit = - 1 * regionFinder.maxSpacing
     readStartPositions = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     uniqueReadCount = 0
     reads = []
-    numStarts = 0
-    badRegion = False
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+    numStartsInRegion = 0
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = readStartPositions[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -485,8 +519,8 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
             allregions.append(int(region.numReads))
             regionLength = lastReadPos - region.start
-            if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+            if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+                region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
                 if region.foldRatio >= regionFinder.minRatio:
                     # first pass, with absolute numbers
@@ -495,25 +529,24 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                         try:
                             lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
                         except IndexError:
-                            badRegion = True
+                            regionFinder.statistics["badRegionTrim"] += 1
                             continue
 
-                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+                        region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
                         if regionFinder.normalize:
                             peakScore /= regionFinder.sampleRDSsize
-                    except:
+                    except (IndexError, AttributeError, ZeroDivisionError):
                         continue
 
                     if regionFinder.listPeak:
-                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
+                        region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
 
                     if useMulti:
-                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+                        setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
                                                regionFinder.normalize, regionFinder.doTrim)
 
                     region.shift = peak.shift
@@ -530,43 +563,75 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                             regionFinder.statistics["failed"] += 1
 
             readStartPositions = []
-            totalWeight = 0
+            totalWeight = 0.0
             uniqueReadCount = 0
             reads = []
-            numStarts = 0
-            if badRegion:
-                badRegion = False
-                regionFinder.statistics["badRegionTrim"] += 1
+            numStartsInRegion = 0
 
         if pos not in readStartPositions:
-            numStarts += 1
+            numStartsInRegion += 1
 
         readStartPositions.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
         if weight == 1.0:
             uniqueReadCount += 1
 
-        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return allregions, outregions
 
 
-def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti):
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
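+    # Note: combine5p is accepted for interface compatibility with the old
+    # RDS reader but is not used in this BAM-based filter.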
+    if read.opt('NH') > 1 and not doMulti:
+        return True
+
+    if strand == "+" and read.is_reverse:
+        return True
+
+    if strand == "-" and not read.is_reverse:
+        return True
+        
+    return False
+
+
+def getMaxCoordinate(samfile, chr, doMulti=False):
+    maxCoord = 0
+    for alignedread in samfile.fetch(chr):
+        if alignedread.opt('NH') > 1:
+            if doMulti:
+                maxCoord = max(maxCoord, alignedread.pos)
+        else:
+            maxCoord = max(maxCoord, alignedread.pos)
+
+    return maxCoord
+
+
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
     #TODO: this is *almost* the same calculation - there are small yet important differences
     print "calculating background..."
     previousHit = - 1 * regionFinder.maxSpacing
     currentHitList = [-1]
-    currentTotalWeight = 0
+    currentTotalWeight = 0.0
     currentReadList = []
     backregions = []
     numStarts = 0
     badRegion = False
-    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in controlBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = currentHitList[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -578,7 +643,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
             regionLength = lastReadPos - region.start
             if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
             region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
             regionLength = lastReadPos - region.start
             if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                 if regionFinder.normalize:
                     numMock /= regionFinder.sampleRDSsize
 
                 if regionFinder.normalize:
                     numMock /= regionFinder.sampleRDSsize
 
@@ -595,13 +660,12 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                             badRegion = True
                             continue
 
                             badRegion = True
                             continue
 
-                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                         if regionFinder.normalize:
                             numMock /= regionFinder.sampleRDSsize
 
                         foldRatio = region.numReads / numMock
 
                         if regionFinder.normalize:
                             numMock /= regionFinder.sampleRDSsize
 
                         foldRatio = region.numReads / numMock
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
@@ -618,7 +682,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                         regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
             currentHitList = []
                         regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
             currentHitList = []
-            currentTotalWeight = 0
+            currentTotalWeight = 0.0
             currentReadList = []
             numStarts = 0
             if badRegion:
             currentReadList = []
             numStarts = 0
             if badRegion:
@@ -629,32 +693,31 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             numStarts += 1
 
         currentHitList.append(pos)
             numStarts += 1
 
         currentHitList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         currentTotalWeight += weight
         currentTotalWeight += weight
-        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return backregions
 
 
         previousHit = pos
 
     return backregions
 
 
-def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-               outfile, useMulti, doControl, controlRDS, combine5p):
-
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+               outfile, useMulti, controlBAM, combine5p):
 
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
     print "learning shift.... will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
     print "learning shift.... will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
@@ -663,22 +726,22 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
-                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
+                foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
-            totalWeight = 0
+            totalWeight = 0.0
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
         totalWeight += weight
-        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
@@ -689,12 +752,14 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
 
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
-    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
-    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % shiftValue
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
 
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
 
+    return shiftValue, shiftDict
+
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
@@ -705,7 +770,7 @@ def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringen
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
-    return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
+    return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
@@ -758,12 +823,12 @@ def peakEdgeLocated(peak, position, minSignalThresh):
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
-    if doControl:
-        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+    if regionFinder.doControl:
+        numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
         if regionFinder.normalize:
             numMock /= regionFinder.controlRDSsize
 
         if regionFinder.normalize:
             numMock /= regionFinder.controlRDSsize
 
@@ -774,6 +839,25 @@ def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regi
     return foldRatio
 
 
     return foldRatio
 
 
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
+    count = 0.0
+    for alignedread in bamfile.fetch(chr, start, end):
+        if alignedread.opt('NH') > 1:
+            if countMulti:
+                if isSpliceEntry(alignedread.cigar):
+                    if countSplices:
+                        count += 1.0/alignedread.opt('NH')
+                else:
+                    count += 1.0/alignedread.opt('NH')
+        elif uniqs:
+            if isSpliceEntry(alignedread.cigar):
+                if countSplices:
+                    count += 1.0
+            else:
+                count += 1.0
+
+    return count
+
 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
     peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
     try:
 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
     peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
     try:
@@ -819,9 +903,12 @@ def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRa
     return region
 
 
     return region
 
 
-def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
     if doTrim:
     if doTrim:
-        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+        sumMulti = 0.0
+        for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+            if alignedread.opt('NH') > 1:
+                sumMulti += 1.0/alignedread.opt('NH')
     else:
         sumMulti = currentTotalWeight - currentUniqueCount
 
     else:
         sumMulti = currentTotalWeight - currentUniqueCount
 
@@ -856,25 +943,33 @@ def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plus
             raise RegionDirectionError
 
 
             raise RegionDirectionError
 
 
-def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                                allregions, header):
-
-    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                           allregions, header, backregions=[], pValueType="self")
-
-
-def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
                            allregions, header, backregions=[], pValueType="none"):
 
     print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
                            allregions, header, backregions=[], pValueType="none"):
 
     print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
-    if doPvalue:
+    if regionFinder.doPvalue:
         if pValueType == "self":
             poissonmean = calculatePoissonMean(allregions)
         else:
             poissonmean = calculatePoissonMean(backregions)
 
     print header
         if pValueType == "self":
             poissonmean = calculatePoissonMean(allregions)
         else:
             poissonmean = calculatePoissonMean(backregions)
 
     print header
-    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict)
+    for region in outregions:
+        if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, regionFinder.reportshift)
+
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if regionFinder.doPvalue:
+            pValue = calculatePValue(int(region.numReads), poissonmean)
+            outline += "\t%1.2g" % pValue
+
+        print outline
+        print >> outfile, outline
 
 
 def calculatePoissonMean(dataList):
 
 
 def calculatePoissonMean(dataList):
@@ -890,29 +985,8 @@ def calculatePoissonMean(dataList):
     return poissonmean
 
 
     return poissonmean
 
 
-def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
-    for region in outregions:
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region.shift] += 1
-            except KeyError:
-                shiftDict[region.shift] = 1
-
-        outline = getRegionString(region, reportshift)
-
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if doPvalue:
-            sumAll = int(region.numReads)
-            pValue = calculatePValue(sumAll, poissonmean)
-            outline += "\t%1.2g" % pValue
-
-        print outline
-        print >> outfile, outline
-
-
 def calculatePValue(sum, poissonmean):
     pValue = math.exp(-poissonmean)
 def calculatePValue(sum, poissonmean):
     pValue = math.exp(-poissonmean)
-    #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
     for i in xrange(sum):
         pValue *= poissonmean
         pValue /= i+1
     for i in xrange(sum):
         pValue *= poissonmean
         pValue /= i+1
@@ -929,34 +1003,9 @@ def getRegionString(region, reportShift):
     return outline
 
 
     return outline
 
 
-def getFooter(regionFinder, shiftDict, doRevBackground):
-    index = regionFinder.statistics["index"]
-    mIndex = regionFinder.statistics["mIndex"]
-    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
-    if regionFinder.doDirectionality:
-        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
-
-    if doRevBackground:
-        try:
-            percent = min(100. * (float(mIndex)/index), 100.)
-        except ZeroDivisionError:
-            percent = 0.
-
-        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
-
-    if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
-        bestShift = getBestShiftInDict(shiftDict)
-        footerLines.append("#mode of shift values: %d" % bestShift)
-
-    if regionFinder.statistics["badRegionTrim"] > 0:
-        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
-
-    return string.join(footerLines, "\n")
-
-
 def getBestShiftInDict(shiftDict):
     return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
 if __name__ == "__main__":
 def getBestShiftInDict(shiftDict):
     return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 03cade292225b55513b300a0fca2d8f2c4f5d50a..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
@@ -154,7 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
-                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
index 7b4a2cc819c30976692d9721ddb363756ebdf70a..10ccc837dca3e6b36f3c7a14d94d3ebd4938ccd4 100755 (executable)
@@ -6,12 +6,11 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -33,7 +32,7 @@ def main(argv=None):
 
     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
 
     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+                   options.searchGID, options.countFeats, options.cachePages)
 
 
 def getParser(usage):
 
 
 def getParser(usage):
@@ -47,7 +46,6 @@ def getParser(usage):
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
@@ -60,17 +58,16 @@ def getParser(usage):
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    markGID = getConfigBoolOption(configParser, section, "markGID", False)
 
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
 
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
-                        countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+                        countFeats=countFeats, cachePages=cachePages)
 
     return parser
 
 
     return parser
 
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
-                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+                   searchGID=False, countFeats=False):
 
     if trackStrand:
         print "will track strandedness"
 
     if trackStrand:
         print "will track strandedness"
@@ -86,16 +83,6 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     else:
         replaceModels = False
 
     else:
         replaceModels = False
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        cachePages = 100000
-        doCache = False
-
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
@@ -111,24 +98,15 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     for gid in gidList:
         gidCount[gid] = 0
 
     for gid in gidList:
         gidCount[gid] = 0
 
-    chromList = hitRDS.getChromosomes(fullChrom=False)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
-    if markGID:
-        print "Flagging all reads as NM"
-        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    for chrom in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chrom in chromosomeList:
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
             seenFeaturesByChromDict[chrom] = set([])
 
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
             seenFeaturesByChromDict[chrom] = set([])
 
-        print "\nchr%s" % chrom
-        fullchrom = "chr%s" % chrom
-        regionList = []        
+        print "\nchr%s" % chrom      
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
@@ -137,34 +115,31 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                     if featureSense == "R":
                         checkSense = "-"
 
                     if featureSense == "R":
                         checkSense = "-"
 
-                    regionData = (gid, fullchrom, start, stop, checkSense)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                 else:
                 else:
-                    regionData = (gid, fullchrom, start, stop)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
                 gidCount[gid] += count
 
                 gidCount[gid] += count
-                if markGID:
-                    regionList.append(regionData)
-
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
-        if markGID:
-            print "marking GIDs"
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
-            print "finished marking"
-
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
-    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
-    if markGID and doCache:
-        hitRDS.saveCacheDB(hitfile)
+    writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+    count = 0.0
+    for alignedread in bamfile.fetch(chrom, start, stop):
+        if countThisRead(alignedread, uniqs, multi, splices, sense):
+            count += 1.0/alignedread.opt('NH')
+
+    return count
 
 
 def countFeatures(seenFeaturesByChromDict):
 
 
 def countFeatures(seenFeaturesByChromDict):
@@ -178,18 +153,15 @@ def countFeatures(seenFeaturesByChromDict):
     return count
 
 
     return count
 
 
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
-    for gid in gidList:
+    for gid in gidCount:
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
-        if gid in gidCount:
-            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
-        else:
-            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 
     outfile.close()
 
 
     outfile.close()
 
index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
 
         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
index d929d41a89a2da712b1ee953770e0b01bb2f96bb..3c5183730c288f9c61e29f41fb1df69c3e0f992f 100755 (executable)
@@ -1,8 +1,3 @@
-#
-#  geneStartBins.py
-#  ENRAGE
-#
-
 try:
     import psyco
     psyco.full()
 try:
     import psyco
     psyco.full()
@@ -10,11 +5,10 @@ except:
     pass
 
 import sys
     pass
 
 import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
 from cistematic.genomes import Genome
 
 from cistematic.genomes import Genome
 
-
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
     print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
@@ -24,7 +18,6 @@ if len(sys.argv) < 4:
 genome = sys.argv[1]
 hitfile =  sys.argv[2]
 outfilename = sys.argv[3]
 genome = sys.argv[1]
 hitfile =  sys.argv[2]
 outfilename = sys.argv[3]
-
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
@@ -42,23 +35,21 @@ if '-cache' in sys.argv:
 
 bins = 10
 standardMinThresh = standardMinDist / bins
 
 bins = 10
 standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = getHeaderComment(bamfile.header, "ReadLength")
 normalizationFactor = 1.0
 if normalize:
 normalizationFactor = 1.0
 if normalize:
-    totalCount = len(hitRDS)
+    totalCount = getHeaderComment(bamfile.header, "Total")
     normalizationFactor = totalCount / 1000000.
 
 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
     normalizationFactor = totalCount / 1000000.
 
 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
-
 outfile = open(outfilename,'w')
 outfile = open(outfilename,'w')
-
 gidList = hg.allGIDs()
 gidList.sort()
 gidList = hg.allGIDs()
 gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
@@ -78,7 +69,7 @@ for gid in gidList:
         if (start, stop) not in newfeatureList:
             newfeatureList.append((start, stop))
 
         if (start, stop) not in newfeatureList:
             newfeatureList.append((start, stop))
 
-    if chrom not in hitDict:
+    if chrom not in chromosomeList:
         continue
 
     newfeatureList.sort()
         continue
 
     newfeatureList.sort()
@@ -109,15 +100,16 @@ for gid in gidList:
 
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen
 
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen
+
     tagCount = 0
     if glen < standardMinDist / 2:
         continue
 
     binList = [0] * bins
     tagCount = 0
     if glen < standardMinDist / 2:
         continue
 
     binList = [0] * bins
-    for read in hitDict[chrom]:
-        tagStart = read["start"] - gstart
-        sense = read["sense"]
-        weight = read["weight"]
+    for alignedread in bamfile.fetch(chrom):
+        tagStart = alignedread.pos - gstart
+        sense = getReadSense(alignedread)
+        weight = 1.0/alignedread.opt('NH')
         if tagStart >= 2 * glen:
             break
 
         if tagStart >= 2 * glen:
             break
 
index 197800e09fab70bf2824bcf3e9caa1ecfb364ef1..819675f26f869895fc12b8058daf195f02b2d07b 100755 (executable)
@@ -20,6 +20,7 @@ def main(argv=None):
     delimiter = "|"
 
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
     delimiter = "|"
 
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+            \n\twhere ucscModels is the gene model file in same format as the UCSC Table Browser\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
index cf02cb3af4e26d987a971817057c9e790dae874d..e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d 100644 (file)
@@ -6,10 +6,10 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-import ReadDataset
+import pysam
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
 
 print "normalizeExpandedExonic: version 5.7"
 
 
 print "normalizeExpandedExonic: version 5.7"
 
@@ -18,7 +18,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
 
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -34,22 +34,19 @@ def main(argv=None):
     splicecountfile = args[3]
     outfile = args[4]
 
     splicecountfile = args[3]
     outfile = args[4]
 
-    candidateLines = []
+    candidatefilename = ""
     acceptedfilename = ""
     acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")    
+    uniqcount = getHeaderComment(bamfile.header, "Unique")
     
     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
     
     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+                            candidatefilename, acceptedfilename, options.fieldID,
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)
 
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)
 
@@ -77,102 +74,22 @@ def makeParser(usage=""):
 
 
 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
 
 
 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
+                            outfilename, candidatefilename="", acceptedfilename="",
                             fieldID=0, maxLength=1000000000., doCache=False,
                             extendGenome="", replaceModels=False):
                             fieldID=0, maxLength=1000000000., doCache=False,
                             extendGenome="", replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
+  
+    print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename="")
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")
 
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")
 
-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
-
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
-
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
-
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
-
-    
-    print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
     outfile = open(outfilename, "w")
     outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
-    totalCount = (uniqcount + splicecount) / 1000000.
-    uniqScale = uniqcount / 1000000.
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
     for gid in gidList:
     for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
         featureList = []
         try:
             featureList = featuresDict[gid]
         featureList = []
         try:
             featureList = featuresDict[gid]
@@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
         elif geneLength > maxLength:
             geneLength = maxLength
 
         elif geneLength > maxLength:
             geneLength = maxLength
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                 if 10. * cratio < rpkm:
                     continue
 
                 if 10. * cratio < rpkm:
                     continue
 
-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
 
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 
-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
         if geneLength < 0.1:
             continue
 
         if geneLength < 0.1:
             continue
 
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                try:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+                except TypeError:
+                    pass
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 
@@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
     except:
         pass
 
     except:
         pass
 
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candiates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+
     if doCache:
         uncacheGeneDB(genome)
 
     if doCache:
         uncacheGeneDB(genome)
 
+    return featuresDict
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index ae005cb3146b686fd3934cd9a152c331900ebde4..e11508ea65e7a61614afb971f88fdb8f00824da7 100755 (executable)
@@ -12,8 +12,8 @@ except:
 import sys
 import string
 import optparse
 import sys
 import string
 import optparse
-from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
 
 versionString = "regionCounts: version 3.10"
 print versionString
 
 versionString = "regionCounts: version 3.10"
 print versionString
@@ -32,10 +32,12 @@ def main(argv=None):
         sys.exit(1)
 
     regionfilename = args[0]
         sys.exit(1)
 
     regionfilename = args[0]
-    hitfile =  args[1]
+    bamfilename =  args[1]
     outfilename = args[2]
 
     outfilename = args[2]
 
-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
@@ -89,7 +91,7 @@ def getParser(usage):
     return parser
 
 
-def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
                  useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
                  merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
                  cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
@@ -99,11 +101,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"
 
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"
 
-    if cachePages != -1:
-        doCache = True
-    else:
-        doCache = False
-
     normalize = True
     doRPKM = False
     if doRPKM == True:
@@ -119,24 +116,13 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     labeltoRegionDict = {}
     regionCount = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
-    totalCount = len(hitRDS)
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    totalCount = float(getHeaderComment(bamfile.header, "Total"))
     if normalize:
         normalizationFactor = totalCount / 1000000.
 
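# A worked example of the per-million scaling above (numbers hypothetical):
# with Total = 12,500,000 reads, normalizationFactor = 12.5, so a region
# holding 250 weighted reads is reported as 250 / 12.5 = 20.0 reads per
# million mapped.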
-    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
-
+    chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
     chromList.sort()
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("OFF")        
-
     for rchrom in regionDict:
         if forceRegion and rchrom not in chromList:
             print rchrom
@@ -146,7 +132,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
                 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
 
     for rchrom in chromList:
-        regionList = []
+        #regionList = []
         if rchrom not in regionDict:
             continue
 
@@ -156,11 +142,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         else:
             fullchrom = "chr%s" % rchrom
 
-        if usePeak:
-            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
-            rindex = 0
-            dictLen = len(readDict[fullchrom])
-
         for region in regionDict[rchrom]:
             label = region.label
             start = region.start
@@ -168,17 +149,12 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
             regionCount[label] = 0
             labelList.append(label)
             labeltoRegionDict[label] = (rchrom, start, stop)
-            regionList.append((label, fullchrom, start, stop))
+            #regionList.append((label, fullchrom, start, stop))
             if usePeak:
                 readList = []
-                for localIndex in xrange(rindex, dictLen):
-                    read = readDict[fullchrom][localIndex]
-                    if read["start"] < start:
-                        rindex += 1
-                    elif start <= read["start"] <= stop:
-                        readList.append(read)
-                    else:
-                        break
+                for alignedread in bamfile.fetch(fullchrom, start, stop):
+                    weight = 1.0/alignedread.opt('NH')
+                    readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
 
                 if len(readList) < 1:
                     continue
@@ -193,13 +169,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
 
                 regionCount[label] += topValue
             else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-        if flagRDS:
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("ON")    
+                regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
 
     if normalize:
         for label in regionCount:
@@ -235,10 +205,29 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         outfile.write("\n")
 
     outfile.close()
-    if doCache and flagRDS:
-        hitRDS.saveCacheDB(hitfile)
-
-    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+    writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
+
+
+def getRegionReadCounts(bamfile, chrom, start, end, doUniqs=True, doMulti=False, doSplices=False):
+    uniques = 0
+    multis = 0.0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+        readMultiplicity = alignedread.opt('NH')
+        if doSplices and isSpliceEntry(alignedread.cigar):
+            if readMultiplicity == 1 and doUniqs:
+                uniqueSplice += 1
+            elif doMulti:
+                multiSplice += 1.0/readMultiplicity
+        elif readMultiplicity == 1 and doUniqs:
+            uniques += 1
+        elif doMulti:
+            multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+
+    return totalReads
 
 
if __name__ == "__main__":
    main(sys.argv)
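A usage sketch for the new getRegionReadCounts helper; it assumes a coordinate-sorted, indexed BAM whose reads carry the NH tag (the file name and coordinates are placeholders):

import pysam
from regionCounts import getRegionReadCounts

bamfile = pysam.Samfile("sample.bam", "rb")    # fetch() requires sample.bam.bai
# unique reads count 1 each; each multiread contributes 1/NH
count = getRegionReadCounts(bamfile, "chr1", 100000, 101000, doUniqs=True, doMulti=True, doSplices=True)
print "chr1:100000-101000 -> %.2f weighted reads" % count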
index 4d70c4965ef77c2eea4f646e58526db83682ffa6..2c5aff43ced96fe650d5cc52562cb998033b4d04 100755 (executable)
@@ -1,9 +1,3 @@
-#
-#  RNAFARpairs.py
-#  ENRAGE
-#
-#  Created by Ali Mortazavi on 11/2/08.
-#
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
@@ -16,8 +10,8 @@ except:
 import sys
 import time
 import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
 
 
 def main(argv=None):
@@ -36,10 +30,12 @@ def main(argv=None):
 
     genome = args[0]
     goodfilename = args[1]
-    rdsfile = args[2]
+    bamfilename = args[2]
     outfilename = args[3]
 
-    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
 
 
 def makeParser(usage=""):
@@ -62,16 +58,26 @@ def makeParser(usage=""):
     return parser
 
 
-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+    """ map all candidate regions that have paired ends overlapping with known genes
+    """
+
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    regions = {}
+    for chromosome in chromosomeList:
+        regions[chromosome] = []
+
     goodDict = {}
     goodfile = open(goodfilename)
     for line in goodfile:
         fields = line.split()
-        goodDict[fields[0]] = line
+        # region lines are "label chrom start stop ..."; index each region by
+        # its own chromosome rather than a stale loop variable
+        label = fields[0]
+        chromosome = fields[1]
+        start = int(fields[2])
+        stop = int(fields[3])
+        goodDict[label] = line
+        if chromosome in regions:
+            regions[chromosome].append((start, stop, label))
 
     goodfile.close()
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
-    chromosomeList = RDS.getChromosomes()
     if doVerbose:
         print time.ctime()
 
@@ -83,23 +89,18 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     assigned = {}
     farConnected = {}
     for chromosome in chromosomeList:
-        if doNotProcessChromosome(chromosome):
-            continue
-
         print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+        # list.sort() sorts in place and returns None, so sort first, then take the list
+        regions[chromosome].sort()
+        regionList = regions[chromosome]
+        uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
         if doVerbose:
             print len(uniqDict), time.ctime()    
 
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                if processReads(readList[:2], maxDist):
-                    flags = (readList[0]["flag"], readList[1]["flag"])
-                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
-                    total += processed
-                    distinct += distinctPairs
+        total += pairCount
+        for readID, readList in uniqDict.items():
+            flags = (readList[0]["flag"], readList[1]["flag"])
+            processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+            total += processed
+            distinct += distinctPairs
 
     entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
     distinct += entriesWritten
@@ -109,15 +110,55 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     print time.ctime()
 
 
-def doNotProcessChromosome(chromosome):
-    return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+    """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+        Only returns unique non-spliced read pairs matching the criteria given in processReads().
+    """
+    start = 1
+    readDict = {}
+    for regionstart, regionstop, regionname in regions:
+        for alignedread in bamfile.fetch(chromosome, start, regionstop):
+            if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+                if alignedread.pos >= regionstart:
+                    flag = regionname
+                else:
+                    flag = alignedread.opt("ZG")
 
 
+                try:
+                    readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+                except KeyError:
+                    readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
 
 
-def processReads(reads, maxDist):
+        start = regionstop + 1
+
+    for alignedread in bamfile.fetch(chromosome, start):
+        if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+            flag = alignedread.opt("ZG")
+
+            try:
+                readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+            except KeyError:
+                readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+    pairCount = len(readDict)
+    # items() returns a list in Python 2, so entries can safely be deleted inside the loop
+    for readID, readList in readDict.items():
+        if len(readList) != 2:
+            del readDict[readID]
+            pairCount -= 1
+        elif not processReads(readList, maxDist):
+            del readDict[readID]
+
+    return readDict, pairCount
+
+
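# Notes on the flag convention above (inferred from this commit): a unique,
# non-spliced read that starts inside a sorted candidate region is flagged
# with that region's label, while every other read keeps the gene ID stored
# in its ZG tag by the BAM preprocessor ("NM" when it hit no gene model).
# One caveat: a read overlapping the boundary between two consecutive fetch
# windows can be returned twice, and its pair would then be discarded by the
# len(readList) != 2 filter.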
+def processReads(reads, maxDist=500000):
+    """ For a pair of readID's to be acceptable:
+            - flags must be different
+            - neither flag can be 'NM'
+            - the read starts have to be within maxDist
+    """
     process = False
-    start1 = reads[0]["start"]
-    start2 = reads[1]["start"]
-    dist = abs(start1 - start2)
+    dist = abs(reads[0]["start"] - reads[1]["start"])
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]
 
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]
 
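A quick sanity check of the pairing rule documented above (the data is made up):

reads = [{"start": 10000, "flag": "RNAFAR7"}, {"start": 12500, "flag": "84557"}]
print processReads(reads)    # True: flags differ, neither is "NM", starts are close
reads[1]["flag"] = "NM"
print processReads(reads)    # False: an unannotated mate is rejected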
@@ -128,6 +169,11 @@ def processReads(reads, maxDist):
 
 
 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """ Writes out the region information along with symbol and geneID for paired reads where one read
+        of the pair is in a known gene and the other is in a new region.  If both reads in the pair are
+        in new regions, the region is added to farConnected.  No action is taken if both reads in the
+        pair are in known genes.
+    """
     flag1, flag2 = flags
     total = 0
     distinct = 0
@@ -147,7 +193,7 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
         except KeyError:
             farConnected[geneID] = [farFlag]
     elif read1IsGood or read2IsGood:
-        total += 1
+        total = 1
         if read2IsGood:
             farFlag = flag2
             geneID = flag1
@@ -155,30 +201,35 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
             farFlag = flag1
             geneID = flag2
 
-        try:
-            if genome == "dmelanogaster":
-                symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
-            else:
-                symbol = geneInfoDict[geneID][0][0]
-        except (KeyError, IndexError):
-            try:
-                symbol = geneAnnotDict[(genome, geneID)][0]
-            except (KeyError, IndexError):
-                symbol = "LOC%s" % geneID
-
-        symbol = symbol.strip()
-        symbol = symbol.replace(" ","|")
-        symbol = symbol.replace("\t","|")
-
+        symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
         if farFlag not in assigned:
             assigned[farFlag] = (symbol, geneID)
             print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
             outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-            distinct += 1
+            distinct = 1
 
     return total, distinct
 
 
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+    try:
+        if genome == "dmelanogaster":
+            symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+        else:
+            symbol = geneInfoDict[geneID][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    symbol = symbol.strip()
+    symbol = symbol.replace(" ","|")
+    symbol = symbol.replace("\t","|")
+
+    return symbol
+
+
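The replace calls above keep symbols single-token for the whitespace-delimited output written by writeFarPairsToFile; for example (hypothetical annotation text):

symbol = " putative kinase\tisoform 2 "
symbol = symbol.strip().replace(" ", "|").replace("\t", "|")
print symbol    # putative|kinase|isoform|2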
 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
     total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
     writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
index ebc25a530af4166ea69b17a8228a6943b19960c6..0c01de47d6a4b0aa7a1f4476ad82c024da669809 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -50,84 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
     """ based on original script runRNAPairedAnalysis.sh
     """ based on original script runRNAPairedAnalysis.sh
-        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
-            where rdsprefix is the name of the rds file without the .rds extension
+        usage: runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+            where fileprefix is the name of the bam file without the .bam extension
             use "none" for the repeatmaskdb if you do not have one
     """
 
             use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
     writeLog(logfile, "runRNAPairedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
-    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    regionfilename = "%s.newregions.checked" % fileprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)
 
     # calculate the read densities
-    regionfilename = "%s.newregions.checked" % rdsprefix
-    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+    goodFileName = "%s.newregions.good" % fileprefix
+    regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that have paired ends overlapping with known genes
-    candidatefilename = "%s.candidates.txt" % rdsprefix
-    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+    candidatefilename = "%s.candidates.txt" % fileprefix
+    rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)
 
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    acceptfile = "%s.accepted.rpkm" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+    multicountfilename = "%s.multi.count" % fileprefix
+    acceptfile = "%s.accepted.rpkm" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
                            extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate final exonic read density
                            extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
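The scripts above lean on getHeaderComment to pull the preprocessing totals back out of the BAM header. That helper lives in commoncode.py and is not shown in this diff, so the following is only a plausible sketch of the convention, assuming "Key:Value" @CO comment lines written by bamPreprocessing.py:

def getHeaderComment(bamHeader, commentKey):
    # scan the @CO comment lines for the requested key
    for comment in bamHeader.get("CO", []):
        fields = comment.split(":")
        if fields[0] == commentKey:
            return fields[1]

    raise KeyError("no %s comment in BAM header" % commentKey)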
index adfa37522a5f2c9af6d70b2a117102a6ab4163c8..d20ef640899392ec1f08f474328298d1274e69c9 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -17,7 +17,7 @@ def main(argv=None):
         argv = sys.argv
 
     print "runStandardAnalysis: version %s" % VERSION
-    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+    usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
 
     parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -27,15 +27,15 @@ def main(argv=None):
         sys.exit(1)
 
     genome = args[0]
-    rdsprefix = args[1]
+    fileprefix = args[1]
     repeatmaskdb = args[2]
-    bpradius = args[3]
+    bpradius = int(args[3])
     try:
         modelfile = args[4]
     except IndexError:
         modelfile = ""
 
-    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+    runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
 
 
 def getParser(usage):
@@ -50,82 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
     """ based on original script runStandardAnalysis.sh
     """ based on original script runStandardAnalysis.sh
-        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStandardAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
     writeLog(logfile, "runStandardAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models and not the
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
 
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # Alternative 1: find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
                            replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
+    #TODO: remove when not tracking
+    writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
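For orientation, a hypothetical invocation of the converted pipeline, following the usage string above (the genome name and file prefix are placeholders; sample.bam must already carry the count comments and the .bai index produced during preprocessing):

python runStandardAnalysis.py mmusculus sample none 20000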
index 55c87ca76e48a1e272bf1184ec6fd3bd5ff9b1e9..35a67008d196ea0c942e2fb94614a7e76f612f7e 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -40,58 +40,57 @@ def getParser(usage):
     return parser
 
 
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
     """ based on original script runStrandedAnalysis.sh
-        usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStrandedAnalysis.sh genome fileprefix repeatmaskdb bpradius
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
     """ based on original script runStrandedAnalysis.sh
         usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
                where rdsprefix is the name of the rds file without the .rds extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStrandedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
     writeLog(logfile, "runStrandedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
 
     # recount the unique reads with weights calculated during the first pass
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
 
     # filter out new regions that overlap repeats more than a certain fraction
-    newregionfilename = "%s.newregions.txt" % rdsprefix
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     #TODO: these calls look wrong
@@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
index 58d9e3059ad8a587c10badd3fb8c7cc820462047..9c0c38fa197cbd18669f65269c6f94672cc6e604 100755 (executable)
@@ -127,7 +127,7 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSite
 
         siteNotCommon = True
         #TODO: as handed to me this is O(n^m) - probably could be better...
-        #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
+        #TODO: if there are multiple matches will count them but only write the first match to the file
         for (rstart, rstop, rline) in siteDict[chrom]:
             if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
index 3fd6bd8e151b3f909034e856aeb5eb329db7c17f..f4b1691a07a08b15988539f75f1fbb2a62e148b2 100755 (executable)
@@ -168,7 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
+            #TODO: Is this right? If match index is 0 are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1