convert standard analysis pipelines to use bam format natively
author	Sean Upchurch <sau@caltech.edu>
	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
committer	Sean Upchurch <sau@caltech.edu>
	Wed, 19 Jun 2013 18:56:48 +0000 (11:56 -0700)
27 files changed:
ReadDataset.py
Region.py
bamPreprocessing.py [new file with mode: 0644]
checkrmask.py
combineRPKMs.py
combinerds.py
commoncode.py
docs/README.chip-seq
docs/RNA-seq.analysisSteps.txt
docs/runRNAPairedAnalysis.sh
docs/runSNPAnalysis.sh
docs/runStandardAnalysis.sh
docs/runStrandedAnalysis.sh
findall.py
geneLocusBins.py
geneMrnaCounts.py
geneMrnaCountsWeighted.py
geneStartBins.py
getsplicefa.py
normalizeExpandedExonic.py
regionCounts.py
rnafarPairs.py
runRNAPairedAnalysis.py
runStandardAnalysis.py
runStrandedAnalysis.py
siteintersects.py
weighMultireads.py

index 9a795c8cde5c5ba6d257da963ebf4c32f7d6e42d..dffa76de9436f4379f42ba2175bec17fc1a8fa59 100644 (file)
@@ -6,11 +6,10 @@ import re
 import sys
 import pysam
 from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, isSpliceEntry
 
 currentRDSVersion = "3.0"
 
-
 class ReadDatasetError(Exception):
     pass
 
@@ -68,7 +67,7 @@ class MaxCoordFinder(ReadCounter):
 
 
 
-class BamReadDataset():
+class ReadDataset():
     """ Class for storing reads from experiments. Assumes that custom scripts
     will translate incoming data into a format that can be inserted into the
     class using the insert* methods. Default class subtype ('DNA') includes
@@ -454,7 +453,7 @@ class BamReadDataset():
                 newrow["chrom"] = chrom
 
             if withPairID:
-                newrow["pairID"] = pairID
+                newrow["pairID"] = pairReadSuffix[1:]
 
             try:
                 resultsDict[dictKey].append(newrow)
@@ -973,16 +972,6 @@ def getReadSense(reverse):
     return sense
 
 
-def isSpliceEntry(cigarTupleList):
-    isSplice = False
-    for operation,length in cigarTupleList:
-        if operation == 3:
-            isSplice = True
-            break
-
-    return isSplice
-
-
 def getSpliceRightStart(start, cigarTupleList):
     offset = 0
 
index 0d0a15b1cd91e7f3ceccee518f448ffccf2daad4..39ba811bc1d94100ed63516467e0a99fb2f637a0 100644 (file)
--- a/Region.py
+++ b/Region.py
@@ -6,7 +6,7 @@ class Region(object):
     """
 
 
-    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0, foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
+    def __init__(self, start, stop, label="", index=0, chrom="", numReads=0., foldRatio=0., multiP=0., peakDescription="", shift=0, peakPos=0, peakHeight=0):
         self.label = label
         self.index = index
         self.chrom = chrom
diff --git a/bamPreprocessing.py b/bamPreprocessing.py
new file mode 100644 (file)
index 0000000..dd5e861
--- /dev/null
@@ -0,0 +1,193 @@
+'''
+User specs:
+1. Checks if there are NH tags present
+2. If there aren't, it goes through the BAM file and calculates the read multiplicity
+3. Then makes a temp BAM file into which all alignments (and unaligned reads) are written, but this time with the NH tags
+4. Then deletes the original file and renames the temp file to have the name of the original file
+5. Also writes the read counts into the header and checks for their presence; if present, they are used, which saves
+the need to count all the reads every time.  The counts are several numbers - total reads, unique reads, multireads,
+and reads mapping on the plus and minus strands.
+
+The only place to have the gene flag is to attach it to the individual reads.  We use the 'ZG' tag and populate it
+here when requested.  Each read is tagged with the feature label, or 'NM' if it is not in a feature.
+'''
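+# A typical invocation (the file and genome names below are illustrative):
+#
+#   python bamPreprocessing.py accepted_hits.bam sample.processed.bam \
+#          --markGID --genome hsapiens --genomeName 'human'
+#
+# The read counts are written into the header as CO entries of the form
+# "Total\t<count>", "Unique\t<count>", etc., which downstream scripts
+# retrieve with getHeaderComment().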
+import sys
+import pysam
+import optparse
+from cistematic.genomes import Genome
+from commoncode import isSpliceEntry, getFeaturesByChromDict
+
+
+def main(argv=None):
+
+    usage = 'usage: python %s BAMfilename outputfilename [options]' % sys.argv[0]
+    if len(sys.argv) < 3:
+        print usage
+        print '       BAM file has to be indexed'
+        sys.exit(1)
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    BAM = args[0]
+    outputfilename = args[1]
+
+    samfile = pysam.Samfile(BAM, "rb" )
+    readMultiplicityDict = getReadMultiplicity(samfile)
+
+    counts = getReadCounts(samfile, readMultiplicityDict)
+    outheader = buildHeader(samfile.header, counts)
+    if options.markGID:
+        genome = Genome(options.genomeName, inRAM=True)
+        if options.extendGenome != "":
+            genome.extendFeatures(options.extendGenome, replace=options.replaceModels)
+
+        print "getting gene features...."
+        featuresByChromDict = getFeaturesByChromDict(genome)
+        outheader['CO'].append(options.genomeComment)
+
+    outfile = pysam.Samfile(outputfilename, "wb", header=outheader)
+    for alignedread in samfile.fetch(until_eof=True):
+        geneModelFlag = ""
+        # unaligned reads have no reference and cannot be feature-tagged
+        if options.markGID and not alignedread.is_unmapped:
+            chrom = samfile.getrname(alignedread.tid)
+            chromNum = chrom[3:]
+            geneModelFlag = "NM"
+            for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chromNum]:
+                if start < alignedread.pos and stop > alignedread.pos:
+                    # tag with the first containing feature
+                    geneModelFlag = gid
+                    break
+
+        ID = getReadID(alignedread)
+        try:
+            scaleby = alignedread.opt('NH')
+            outfile.write(alignedread)
+        except KeyError:
+            scaleby = readMultiplicityDict[ID]
+            writeBAMEntry(outfile, alignedread, scaleby, geneModelFlag)
+
+    outfile.close()
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--genome", dest="genomeName")
+    parser.add_option("--genomeName", dest="genomeComment")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--markGID", action="store_true", dest="markGID")
+
+    parser.set_defaults(genomeName="", genomeComment="", extendGenome="", replaceModels=False, markGID=False)
+
+    return parser
+
+
+def getReadMultiplicity(samfile):
+    """ There is an assumption here that the set of all reads will not have a mixture of
+        reads bearing the NH tag and reads without.
+    """
+    readMultiplicityDict = {}
+    processedReads = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        try:
+            readMultiplicity = alignedread.opt('NH')
+            return {}
+        except KeyError:
+            ID = getReadID(alignedread)
+            try:
+                readMultiplicityDict[ID] += 1
+            except KeyError:
+                readMultiplicityDict[ID] = 1
+
+        processedReads += 1
+        if processedReads % 5000000 == 0:
+            print str(processedReads/1000000) + 'M alignments processed in multiplicity examination'
+
+    return readMultiplicityDict
+
+
+def getReadCounts(samfile, readMultiplicityDict):
+    uniques = 0
+    multis = 0.0
+    minusUnique = 0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    minusMulti = 0.0
+    readLength = 0
+    for alignedread in samfile.fetch(until_eof=True):
+        if readLength == 0 and alignedread.cigar is not None:    # unmapped reads have no CIGAR
+            readLength = getReadLength(alignedread.cigar)
+
+        try:
+            readMultiplicity = alignedread.opt('NH')
+        except KeyError:
+            ID = getReadID(alignedread)
+            readMultiplicity = readMultiplicityDict[ID]
+
+        if readMultiplicity == 1:
+            if alignedread.is_reverse:
+                minusUnique += 1
+
+            if isSpliceEntry(alignedread.cigar):
+                uniqueSplice += 1
+            else:
+                uniques += 1
+        else:
+            if alignedread.is_reverse:
+                minusMulti += 1.0/readMultiplicity
+
+            if isSpliceEntry(alignedread.cigar):
+                multiSplice += 1.0/readMultiplicity
+            else:
+                multis += 1.0/readMultiplicity
+
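+    # plus-strand totals are derived by subtracting the tallied
+    # minus-strand counts from the unique and multiread totals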
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+    plusUnique = uniques + uniqueSplice - minusUnique
+    plusMulti = multis + multiSplice - minusMulti
+    counts = ["Total\t%d" % totalReads,
+              "Unique\t%d" % uniques,
+              "Multis\t%d" % multis,
+              "UniqueSplices\t%d" % uniqueSplice,
+              "Multisplices\t%d" % multiSplice,
+              "PlusUnique\t%d" % plusUnique,
+              "PlusMulti\t%d" % plusMulti,
+              "MinusUnique\t%d" % minusUnique,
+              "MinusMulti\t%d" % minusMulti,
+              "ReadLength\t%d" % readLength
+    ]
+
+    return counts
+
+
+def buildHeader(templateheader, commentList):
+    header = templateheader
+    header["CO"] = commentList
+
+    return header
+
+
+def getReadID(alignedread):
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getReadLength(cigar):
+    take = (0, 1) # CIGAR operation (M/match, I/insertion)
+
+    return sum([length for op,length in cigar if op in take])
+
+
+def writeBAMEntry(outfile, originalRead, multiplicity, geneModelFlag):
+    newAlignedRead = originalRead
+    newAlignedRead.tags = newAlignedRead.tags + [("NH", multiplicity), ("ZG", geneModelFlag)]
+    outfile.write(newAlignedRead)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
index 457e4c1f9ebedb7b4daf6137f9f8b850635afb28..8ae30ee958dd10fc0d7e598b7bb25fa4b4ed17c2 100755 (executable)
@@ -55,6 +55,27 @@ def makeParser(usage=""):
 
 def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
 
+    if os.path.isfile(dbfile):
+        checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField, cachePages, logfilename)
+    else:
+        outfile = open(outFileName, "w")
+        goodfile = open(goodFileName, "w")
+        infile = open(filename)
+        print "No database - passing through"
+        if logfilename is not None:
+            writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+            writeLog(logfilename, versionString, "No database - passing through")
+
+        for line in infile:
+            outfile.write("%s\tNR\tNR\t0.00\n" % line.strip())
+            goodfile.write(line)
+            goodfile.write(line)
+
+        outfile.close()
+        goodfile.close()
+
+
+def checkrmaskdb(dbfile, filename, outFileName, goodFileName, startField=0, cachePages=500000, logfilename=None):
+
     outfile = open(outFileName, "w")
     goodfile = open(goodFileName, "w")
     if startField < 0:
@@ -63,29 +84,14 @@ def checkrmask(dbfile, filename, outFileName, goodFileName, startField=0, cacheP
     if cachePages < 250000:
         cachePages = 250000
 
-    doLog = False
     if logfilename is not None:
         writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-        doLog = True
 
     infile = open(filename)
-    if os.path.isfile(dbfile):
-        db = sqlite.connect(dbfile)
-        sql = db.cursor()
-        sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
-        sql.execute("PRAGMA temp_store = MEMORY")
-    else:
-        print "No database - passing through"
-        if doLog:
-            writeLog(logfilename, versionString, "No database - passing through")
-
-        for line in infile:
-            outfile.write("%s\tNR\tNR\t0.00\n" % line)
-            goodfile.write(line)
-
-        outfile.close()
-        goodfile.close()
-        sys.exit(0)
+    db = sqlite.connect(dbfile)
+    sql = db.cursor()
+    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
+    sql.execute("PRAGMA temp_store = MEMORY")
 
     featureList = []
     featureDict = {}
index 88b57ab547edeea3f9be5387780796aadbe72bc2..1e2d2aaedbf3476fc43058500c30c76c2b7c3e03 100755 (executable)
@@ -63,7 +63,7 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
     outfile.write(header)
 
     finalfile = open(finalfileName)
-    #TODO: QForAli - the output lines are driven by finalfile.  If there are genes in the first 2 that
+    #TODO: the output lines are driven by finalfile.  If there are genes in the first 2 that
     #      are not in the finalfile then they will be lost.
     for line in finalfile:
         fields = line.strip().split()
index 2695e6b512c677735fc61be2931108e947d0ab43..d6960c2783908d67db405315b797aab3db5a02e9 100755 (executable)
@@ -21,7 +21,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [-table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
+    usage = "usage: python %s destinationRDS inputrds1 [inputrds2 ....] [--table table_name] [--init] [--initrna] [--index] [--cache pages]" % argv[0]
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
@@ -98,9 +98,7 @@ def combinerds(datafile, infileList, tableList=[], withFlag="", doIndex=False, c
         for table in tableList:
             print "importing table %s from file %s" % (table, inputfile)
             dbColumns = "*"
-            if table == "uniqs":
-                dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
-            elif table == "multi":
+            if table in ["uniqs", "multi"]:
                 dbColumns = "NULL, '%s' || readID, chrom, start, stop, sense, weight, flag, mismatch" % dbName
             elif table == "splices":
                 dbColumns = "NULL, '%s' || readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch" % dbName
index cd449620f0fc88304f4e8adceee0e96e37e1295d..0e4fdbbb7229b5f0c7689218dd9f1cbf2ba40b55 100755 (executable)
@@ -10,8 +10,8 @@ from cistematic.genomes import Genome
 import Region
 
 commoncodeVersion = 5.6
-currentRDSversion = 2.0
 
+#TODO: This is a function dumping ground - needs to be reworked
 class ErangeError(Exception):
     pass
 
@@ -49,6 +49,78 @@ def writeLog(logFile, messenger, message):
     logfile.close()
 
 
+def getChromInfoList(chrominfo):
+    chromInfoList = []
+    linelist = open(chrominfo)
+    for line in linelist:
+        fields = line.strip().split('\t')
+        chr = fields[0]
+        start = 0
+        end = int(fields[1])
+        chromInfoList.append((chr,start,end))
+
+    linelist.close()
+
+    return chromInfoList
+
+
+# BAM file support functions
+def isSpliceEntry(cigarTupleList):
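+    """ Returns True if the cigar contains operation 3 (BAM CIGAR 'N',
+        a skipped reference region), which marks a spliced alignment.
+    """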
+    isSplice = False
+    if cigarTupleList is not None:
+        for operation,length in cigarTupleList:
+            if operation == 3:
+                isSplice = True
+                break
+
+    return isSplice
+
+
+def addPairIDtoReadID(alignedread):
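+    # suffix the read name with /1 or /2 so that paired-end mates get
+    # distinct IDs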
+    ID = alignedread.qname
+    if alignedread.is_read1:
+        ID = ID + '/1'
+
+    if alignedread.is_read2:
+        ID = ID + '/2'
+
+    return ID
+
+
+def getHeaderComment(bamHeader, commentKey):
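+    """ Returns the value of the bamHeader comment keyed by commentKey,
+        e.g. getHeaderComment(bam.header, "Total") for the count entries
+        written by bamPreprocessing.py; raises KeyError if absent.
+    """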
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
+
+
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+#TODO: is this where we need to take into account the NH > 10 restriction?
+def countThisRead(read, countUniqs, countMulti, countSplices, sense):
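+    """ Returns False for reads on the wrong strand (when sense is '+'
+        or '-') or of a category (unique/multi/splice) that the caller
+        excluded; assumes every read carries an NH tag.
+    """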
+    countRead = True
+
+    if sense in ['-', '+'] and sense != getReadSense(read):
+        countRead = False
+    elif not countSplices and isSpliceEntry(read.cigar):
+        countRead = False
+    elif not countUniqs and read.opt('NH') == 1:
+        countRead = False
+    elif not countMulti and read.opt('NH') > 1:
+        countRead = False
+
+    return countRead
+
+
+# Cistematic functions
 def getGeneInfoDict(genome, cache=False):
     idb = geneinfoDB(cache=cache)
     if genome == "dmelanogaster":
@@ -73,6 +145,7 @@ def getExtendedGeneAnnotDict(genomeName, extendGenome, replaceModels=False, inRA
     return geneannotDict
 
 
+# Configuration File Support
 def getConfigParser(fileList=[]):
     configFiles = ["erange.config", os.path.expanduser("~/.erange.config")]
     for filename in fileList:
@@ -129,6 +202,7 @@ def getAllConfigSectionOptions(parser, section):
     return setting
 
 
+# All the Other Stuff from before
 def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False,
                      fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False,
                      doMerge=True, keepPeak=False, returnTop=0):
@@ -192,7 +266,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
     #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
     #      be merged with A but that will exit the loop and the merge with C will be missed.
-    #TODO: QForAli - Are we going to require sorted region files to have this even work?
+    #TODO: Are we going to require sorted region files to have this even work?
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -341,10 +415,6 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
 
     # implementing a triangular smooth
     smoothArray = array("f", [0.] * length)
-    #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
-    # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
-    # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
-    # edge cases?
     for pos in range(2,length -2):
         smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
 
index 846a441a434f8e3cff4506108f27126855a2fb47..ea7b0a347661d59a40e838cd666156f9c9b8bfe4 100644 (file)
@@ -57,18 +57,24 @@ options are case sensitive and that they could well
 fail silently.
 
 
-3. MAKING THE NECESSARY INPUT (RDS) FILES
-
-You will want to first convert your read mappings to the 
-native ERANGE read store. Please see the file 
-README.build-rds for instructions on how to do this.
-
-Build an RDS file for both the ChIP, and if available and 
-appropriate, the control. Note that we *HIGHLY* recommend 
-the use of a matched control sample to account for some 
-of the general background artifacts that can be present 
-in ChIP-seq samples (e.g. DNAse hypersensitivity, 
-assembly collapse of some sattelite repeats, etc....). 
+3. MAKING THE NECESSARY INPUT FILES
+
+Erange uses BAM format files, but there are a couple of
+modifications that need to be made to the header and
+individual entries.  The python script bamPreprocessing.py
+will do the following:
+1. Count the reads by type and write these counts to the
+header as comments.
+2. Verify that every read has a value in the NH tag or add
+it if needed.
+3. Optionally annotate the reads with the geneID using the
+ZG tag.
+
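+A typical invocation (the file names here are
+illustrative) is:
+
+python $ERANGEPATH/bamPreprocessing.py chip.bam \
+       chip.processed.bam
+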
+Note that we *HIGHLY* recommend the use of a matched
+control sample to account for some of the general
+background artifacts that can be present in ChIP-seq
+samples (e.g. DNase hypersensitivity, assembly collapse
+of some satellite repeats, etc....).
 
 
 4. WEIGHING MULTIREADS
index b0a1c0f09e270ad2cc47c9af05c84564b5325aaf..96ea795586a252968d1d214f53dfa4a9cdec46fc 100644 (file)
@@ -1,5 +1,5 @@
 # analysis steps for an ERANGE analysis of RNA-seq data
-# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.sh
+# This is an example of the command-line settings used to run each of the scripts in runStandardAnalysis.py
 
 # preliminary: set PYTHONPATH to point to the parent directory of the Cistematic, e.g.
 #              export PYTHONPATH=/my/path/to/cistematic
index cf75b6a5c9047ebb94332f26e8af234b68f3e382..bd8290127d305d6a184df30deeecffe815043a07 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runRNAPairedAnalysis.sh
 # ENRAGE
 #
index 103b576e73ac450c0af315645abfca41d7abbb40..51f2e4ccd8bc9ad166cd8f7a5609e6648413c929 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runSNPAnalysis.sh
 #
 # Usages: $ERANGEPATH/runSNPAnalysis.sh mouse rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile cachepages
@@ -59,4 +61,4 @@ python $ERANGEPATH/getNovelSNPs.py $1 $3.nr_dbsnp_geneinfo.txt $3.nr.final.txt
 
 # make bed file for displaying the snps on UCSC genome browser
 python $ERANGEPATH/makeSNPtrack.py $3.nr_snps.txt $3 $3.nr_snps.bed
-fi
\ No newline at end of file
+fi
index aa5fe60588a30ceba6a65301db81b13ac4f6051f..830abea6ecec73a603597d0989e49763cc5f67bd 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStandardAnalysis.sh
 # ENRAGE
 #
@@ -88,4 +90,4 @@ python $ERANGEPATH/geneMrnaCountsWeighted.py $1 $2.rds $2.expanded.rpkm $2.multi
 echo "python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache"
 python $ERANGEPATH/normalizeFinalExonic.py $2.rds $2.expanded.rpkm $2.multi.count $2.final.rpkm --multifraction --withGID --cache
 
-fi
\ No newline at end of file
+fi
index 44459019f601732e67f652eed96becc38d0d920d..ee9d1151d91dcb97e3c532a62afdf002563b120c 100755 (executable)
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# This is no longer supported.  It is recommended that the python script of the same name be used instead.
+#
 # runStrandedAnalysis.sh
 # ENRAGE
 #
index 9dd85df733f86417debf9b060e53f59230110b76..f8907a66c2db6d3b36a4ac9a13872554c1f1d9ca 100755 (executable)
@@ -1,6 +1,6 @@
 """
-    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
-           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+    usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+           [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
@@ -50,8 +50,8 @@ import math
 import string
 import optparse
 import operator
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry, getHeaderComment, getReadSense
 import Region
 
 
@@ -65,7 +65,7 @@ class RegionDirectionError(Exception):
 class RegionFinder():
     def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
                  minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
-                 normalize=True, listPeak=False, reportshift=False, stringency=1.0):
+                 normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
 
         self.statistics = {"index": 0,
                            "total": 0,
@@ -77,8 +77,8 @@ class RegionFinder():
 
         self.regionLabel = label
         self.rnaSettings = False
-        self.controlRDSsize = 1
-        self.sampleRDSsize = 1
+        self.controlRDSsize = 1.0
+        self.sampleRDSsize = 1.0
         self.minRatio = minRatio
         self.minPeak = minPeak
         self.leftPlusRatio = leftPlusRatio
@@ -115,6 +115,10 @@ class RegionFinder():
         self.listPeak = listPeak
         self.reportshift = reportshift
         self.stringency = max(stringency, 1.0)
+        self.controlfile = controlfile
+        self.doControl = self.controlfile is not None
+        self.doPvalue = False
+        self.doRevBackground = doRevBackground
 
 
     def useRNASettings(self, readlen):
@@ -125,7 +129,7 @@ class RegionFinder():
         self.maxSpacing = readlen
 
 
-    def getHeader(self, doPvalue):
+    def getHeader(self):
         if self.normalize:
             countType = "RPM"
         else:
@@ -142,33 +146,33 @@ class RegionFinder():
         if self.reportshift:
             headerFields.append("readShift")
 
-        if doPvalue:
+        if self.doPvalue:
             headerFields.append("pValue")
 
         return string.join(headerFields, "\t")
 
 
-    def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType):
+    def printSettings(self, ptype, useMulti, pValueType):
         print
-        self.printStatusMessages(doRevBackground, ptype, doControl, useMulti)
-        self.printOptionsSummary(useMulti, doCache, pValueType)
+        self.printStatusMessages(ptype, useMulti)
+        self.printOptionsSummary(useMulti, pValueType)
 
 
-    def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti):
+    def printStatusMessages(self, ptype, useMulti):
         if self.shiftValue == "learn":
             print "Will try to learn shift"
 
         if self.normalize:
             print "Normalizing to RPM"
 
-        if doRevBackground:
+        if self.doRevBackground:
             print "Swapping IP and background to calculate FDR"
 
         if ptype != "":
             if ptype in ["NONE", "SELF"]:
                 pass
             elif ptype == "BACK":
-                if doControl and doRevBackground:
+                if self.doControl and self.doRevBackground:
                     pass
                 else:
                     print "must have a control dataset and -revbackground for pValue type 'back'"
@@ -190,9 +194,9 @@ class RegionFinder():
             print "only analyzing reads on the minus strand"
 
 
-    def printOptionsSummary(self, useMulti, doCache, pValueType):
+    def printOptionsSummary(self, useMulti, pValueType):
 
-        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache)
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
         print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
         try:
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
@@ -200,18 +204,18 @@ class RegionFinder():
             print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
 
 
-    def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl):
+    def getAnalysisDescription(self, hitfile, useMulti, pValueType):
 
         description = ["#ERANGE %s" % versionString]
-        if doControl:
-            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        if self.doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
         else:
             description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
 
         if self.withFlag != "":
             description.append("#restrict to Flag = %s" % self.withFlag)
 
-        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache))
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
         description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
         try:
             description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
@@ -221,6 +225,31 @@ class RegionFinder():
         return string.join(description, "\n")
 
 
+    def getFooter(self, bestShift):
+        index = self.statistics["index"]
+        mIndex = self.statistics["mIndex"]
+        footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+        if self.doDirectionality:
+            footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+        if self.doRevBackground:
+            try:
+                percent = min(100. * (float(mIndex)/index), 100.)
+            except ZeroDivisionError:
+                percent = 0.
+
+            footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+        if self.shiftValue == "auto" and self.reportshift:
+            footerLines.append("#mode of shift values: %d" % bestShift)
+
+        if self.statistics["badRegionTrim"] > 0:
+            footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+        return string.join(footerLines, "\n")
+
+
     def updateControlStatistics(self, peak, sumAll, peakScore):
 
         plusRatio = float(peak.numPlus)/peak.numHits
@@ -281,11 +310,11 @@ def main(argv=None):
                                 minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
                                 doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
                                 withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
-                                reportshift=options.reportshift, stringency=options.stringency)
+                                reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+                                doRevBackground=options.doRevBackground)
 
     findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
-            options.cachePages, options.ptype, options.controlfile, options.doRevBackground,
-            options.useMulti, options.combine5p)
+            options.ptype, options.useMulti, options.combine5p)
 
 
 def makeParser():
@@ -363,51 +392,67 @@ def makeParser():
     return parser
 
 
-def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None,
-            ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+            ptype="", useMulti=True, combine5p=False):
 
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    doCache = cachePages is not None
-    controlRDS = None
-    doControl = controlfile is not None
-    if doControl:
+    controlBAM = None
+    if regionFinder.doControl:
         print "\ncontrol:" 
-        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
-        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+        controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+        regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
 
     print "\nsample:" 
-    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
-    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
-    pValueType = getPValueType(ptype, doControl, doRevBackground)
-    doPvalue = not pValueType == "none"
-    regionFinder.readlen = hitRDS.getReadSize()
+    sampleBAM = pysam.Samfile(hitfile, "rb")
+    regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+    pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+    regionFinder.doPvalue = not pValueType == "none"
+    regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
     if rnaSettings:
         regionFinder.useRNASettings(regionFinder.readlen)
 
-    regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType)
+    regionFinder.printSettings(ptype, useMulti, pValueType)
     outfile = open(outfilename, outputMode)
-    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl)
+    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
     shiftDict = {}
-    chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
-    for chromosome in chromosomeList:
-        #TODO: QForAli -Really? Use first chr shift value for all of them
+    chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+    for chromosome in chromList:
+        #TODO: Really? Use first chr shift value for all of them
+        maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
         if regionFinder.shiftValue == "learn":
-            learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
+            regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
 
-        allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
-        if doRevBackground:
-            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti)
-            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+        allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+        if regionFinder.doRevBackground:
+            maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+            backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
         else:
-            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header)
+            backregions = []
+            pValueType = "self"
+
+        writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
 
-    footer = getFooter(regionFinder, shiftDict, doRevBackground)
+    try:
+        bestShift = getBestShiftInDict(shiftDict)
+    except ValueError:
+        bestShift = 0
+
+    footer = regionFinder.getFooter(bestShift)
     print footer
     print >> outfile, footer
     outfile.close()
     writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
 
 
 def getPValueType(ptype, doControl, doRevBackground):
     pValueType = "self"
     if ptype in ["NONE", "SELF", "BACK"]:
@@ -424,52 +469,41 @@ def getPValueType(ptype, doControl, doRevBackground):
     return pValueType
 
 
-def openRDSFile(filename, cachePages=None, doCache=False):
-    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
-    if cachePages > rds.getDefaultCacheSize():
-        rds.setDBcache(cachePages)
-
-    return rds
-
-
-def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl):
-    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl)
-    header = regionFinder.getHeader(doPvalue)
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+    header = regionFinder.getHeader()
     print >> outfile, header
 
     return header
 
 
-def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False):
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        controlChromList = controlRDS.getChromosomes()
-        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+    if controlBAM is not None:
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
     else:
-        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
 
     return chromosomeList
 
 
-def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-                    outfile, useMulti, doControl, controlRDS, combine5p):
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+                    outfile, useMulti, controlBAM, combine5p):
 
     outregions = []
     allregions = []
     print "chromosome %s" % (chromosome)
     previousHit = - 1 * regionFinder.maxSpacing
     readStartPositions = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     uniqueReadCount = 0
     reads = []
-    numStarts = 0
-    badRegion = False
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+    numStartsInRegion = 0
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = readStartPositions[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -485,8 +519,8 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
             allregions.append(int(region.numReads))
             regionLength = lastReadPos - region.start
-            if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+            if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+                region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
                 if region.foldRatio >= regionFinder.minRatio:
                     # first pass, with absolute numbers
@@ -495,25 +529,24 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                         try:
                             lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
                         except IndexError:
-                            badRegion = True
+                            regionFinder.statistics["badRegionTrim"] += 1
                             continue
 
-                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
+                        region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
                         if regionFinder.normalize:
                             peakScore /= regionFinder.sampleRDSsize
-                    except:
+                    except (IndexError, AttributeError, ZeroDivisionError):
                         continue
 
                     if regionFinder.listPeak:
-                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
+                        region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
 
                     if useMulti:
-                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+                        setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
                                                regionFinder.normalize, regionFinder.doTrim)
 
                     region.shift = peak.shift
@@ -530,43 +563,75 @@ def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
                             regionFinder.statistics["failed"] += 1
 
             readStartPositions = []
-            totalWeight = 0
+            totalWeight = 0.0
             uniqueReadCount = 0
             reads = []
-            numStarts = 0
-            if badRegion:
-                badRegion = False
-                regionFinder.statistics["badRegionTrim"] += 1
+            numStartsInRegion = 0
 
         if pos not in readStartPositions:
-            numStarts += 1
+            numStartsInRegion += 1
 
         readStartPositions.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
         if weight == 1.0:
             uniqueReadCount += 1
 
-        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return allregions, outregions
 
 
-def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti):
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
+    # NOTE: combine5p is accepted for interface compatibility but is not
+    # used here
+    if read.opt('NH') > 1 and not doMulti:
+        return True
+
+    if strand == "+" and read.is_reverse:
+        return True
+
+    if strand == "-" and not read.is_reverse:
+        return True
+
+    return False
+
+
+def getMaxCoordinate(samfile, chr, doMulti=False):
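+    # returns the highest alignment start position on chr; multireads
+    # (NH > 1) contribute only when doMulti is set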
+    maxCoord = 0
+    for alignedread in samfile.fetch(chr):
+        if alignedread.opt('NH') > 1:
+            if doMulti:
+                maxCoord = max(maxCoord, alignedread.pos)
+        else:
+            maxCoord = max(maxCoord, alignedread.pos)
+
+    return maxCoord
+
+
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
     #TODO: this is *almost* the same calculation - there are small yet important differences
     print "calculating background..."
     previousHit = - 1 * regionFinder.maxSpacing
     currentHitList = [-1]
-    currentTotalWeight = 0
+    currentTotalWeight = 0.0
     currentReadList = []
     backregions = []
     numStarts = 0
     badRegion = False
-    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in controlBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             lastReadPos = currentHitList[-1]
             lastBasePosition = lastReadPos + regionFinder.readlen - 1
@@ -578,7 +643,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
             regionLength = lastReadPos - region.start
             if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
-                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                 if regionFinder.normalize:
                     numMock /= regionFinder.sampleRDSsize
 
@@ -595,13 +660,12 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                             badRegion = True
                             continue
 
-                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
                         if regionFinder.normalize:
                             numMock /= regionFinder.sampleRDSsize
 
                         foldRatio = region.numReads / numMock
 
-                    # just in case it changed, use latest data
                     try:
                         bestPos = peak.topPos[0]
                         peakScore = peak.smoothArray[bestPos]
@@ -618,7 +682,7 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
                         regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
             currentHitList = []
-            currentTotalWeight = 0
+            currentTotalWeight = 0.0
             currentReadList = []
             numStarts = 0
             if badRegion:
@@ -629,32 +693,31 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti
             numStarts += 1
 
         currentHitList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         currentTotalWeight += weight
-        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     return backregions
 
 
-def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
-               outfile, useMulti, doControl, controlRDS, combine5p):
-
-    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
-                                  strand=regionFinder.stranded, combine5p=combine5p)
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+               outfile, useMulti, controlBAM, combine5p):
 
-    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
     print "learning shift.... will need at least 30 training sites"
     stringency = regionFinder.stringency
     previousHit = -1 * regionFinder.maxSpacing
     positionList = [-1]
-    totalWeight = 0
+    totalWeight = 0.0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for read in hitDict[chromosome]:
-        pos = read["start"]
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
+
+        pos = alignedread.pos
         if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if regionFinder.normalize:
                 totalWeight /= regionFinder.sampleRDSsize
@@ -663,22 +726,22 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
             regionStop = positionList[-1]
             regionLength = regionStop - regionStart
             if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
-                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
+                foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
                 if foldRatio >= regionFinder.minRatio:
                     updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
             positionList = []
-            totalWeight = 0
+            totalWeight = 0.0
             readList = []
 
         if pos not in positionList:
             numStarts += 1
 
         positionList.append(pos)
-        weight = read["weight"]
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
-        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
@@ -689,12 +752,14 @@ def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
 
     print outline
     writeLog(logfilename, versionString, outfilename + outline)
-    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
-    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % shiftValue
     print outline
     print >> outfile, outline
     writeLog(logfilename, versionString, outfilename + outline)
 
+    return shiftValue
+
 
 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
     return abs(pos - previousHit) > maxSpacing or pos == maxCoord
@@ -705,7 +770,7 @@ def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringen
     minNumReadStarts = stringency * regionFinder.minRatio
     minRegionLength = stringency * regionFinder.readlen
 
-    return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
+    return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
 
 
 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
@@ -758,12 +823,12 @@ def peakEdgeLocated(peak, position, minSignalThresh):
     return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
 
 
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
     """ Fold ratio calculated is total read weight over control
     """
     #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
-    if doControl:
-        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+    if regionFinder.doControl:
+        numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
         if regionFinder.normalize:
             numMock /= regionFinder.controlRDSsize
 
@@ -774,6 +839,25 @@ def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regi
     return foldRatio
 
 
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
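+    """ Weighted count of reads overlapping chr:start-end: uniques (when
+        uniqs is set) count 1.0, multireads (when countMulti is set)
+        count 1.0/NH, and spliced reads are included only when
+        countSplices is set.
+    """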
+    count = 0.0
+    for alignedread in bamfile.fetch(chr, start, end):
+        if alignedread.opt('NH') > 1:
+            if countMulti:
+                if isSpliceEntry(alignedread.cigar):
+                    if countSplices:
+                        count += 1.0/alignedread.opt('NH')
+                else:
+                    count += 1.0/alignedread.opt('NH')
+        elif uniqs:
+            if isSpliceEntry(alignedread.cigar):
+                if countSplices:
+                    count += 1.0
+            else:
+                count += 1.0
+
+    return count
+
+
 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
     peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
     try:
@@ -819,9 +903,12 @@ def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRa
     return region
 
 
-def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
     if doTrim:
-        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+        sumMulti = 0.0
+        for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+            if alignedread.opt('NH') > 1:
+                sumMulti += 1.0/alignedread.opt('NH')
     else:
         sumMulti = currentTotalWeight - currentUniqueCount
 
@@ -856,25 +943,33 @@ def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plus
             raise RegionDirectionError
 
 
-def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                                allregions, header):
-
-    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
-                           allregions, header, backregions=[], pValueType="self")
-
-
-def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
                            allregions, header, backregions=[], pValueType="none"):
 
     print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
-    if doPvalue:
+    if regionFinder.doPvalue:
         if pValueType == "self":
             poissonmean = calculatePoissonMean(allregions)
         else:
             poissonmean = calculatePoissonMean(backregions)
 
     print header
-    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict)
+    for region in outregions:
+        if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, regionFinder.reportshift)
+
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if regionFinder.doPvalue:
+            pValue = calculatePValue(int(region.numReads), poissonmean)
+            outline += "\t%1.2g" % pValue
+
+        print outline
+        print >> outfile, outline
 
 
 def calculatePoissonMean(dataList):
@@ -890,29 +985,8 @@ def calculatePoissonMean(dataList):
     return poissonmean
 
 
-def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
-    for region in outregions:
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region.shift] += 1
-            except KeyError:
-                shiftDict[region.shift] = 1
-
-        outline = getRegionString(region, reportshift)
-
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if doPvalue:
-            sumAll = int(region.numReads)
-            pValue = calculatePValue(sumAll, poissonmean)
-            outline += "\t%1.2g" % pValue
-
-        print outline
-        print >> outfile, outline
-
-
 def calculatePValue(sum, poissonmean):
     pValue = math.exp(-poissonmean)
-    #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
     for i in xrange(sum):
         pValue *= poissonmean
         pValue /= i+1
@@ -929,34 +1003,9 @@ def getRegionString(region, reportShift):
     return outline
 
 
-def getFooter(regionFinder, shiftDict, doRevBackground):
-    index = regionFinder.statistics["index"]
-    mIndex = regionFinder.statistics["mIndex"]
-    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
-    if regionFinder.doDirectionality:
-        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
-
-    if doRevBackground:
-        try:
-            percent = min(100. * (float(mIndex)/index), 100.)
-        except ZeroDivisionError:
-            percent = 0.
-
-        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
-
-    if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
-        bestShift = getBestShiftInDict(shiftDict)
-        footerLines.append("#mode of shift values: %d" % bestShift)
-
-    if regionFinder.statistics["badRegionTrim"] > 0:
-        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
-
-    return string.join(footerLines, "\n")
-
-
 def getBestShiftInDict(shiftDict):
     return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)
index 03cade292225b55513b300a0fca2d8f2c4f5d50a..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
@@ -154,7 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
-                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
index 7b4a2cc819c30976692d9721ddb363756ebdf70a..10ccc837dca3e6b36f3c7a14d94d3ebd4938ccd4 100755 (executable)
@@ -6,12 +6,11 @@ except:
 
 import sys
 import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
 
 
 def main(argv=None):
@@ -33,7 +32,7 @@ def main(argv=None):
 
-    geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
+    bamfile = pysam.Samfile(hitfile, "rb")
+    geneMrnaCounts(genomeName, bamfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+                   options.searchGID, options.countFeats)
 
 
 def getParser(usage):
@@ -47,7 +46,6 @@ def getParser(usage):
     parser.add_option("--searchGID", action="store_true", dest="searchGID")
     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
     parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
 
     configParser = getConfigParser()
     section = "geneMrnaCounts"
@@ -60,17 +58,16 @@ def getParser(usage):
     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    markGID = getConfigBoolOption(configParser, section, "markGID", False)
 
     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
-                        countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+                        countFeats=countFeats, cachePages=cachePages)
 
     return parser
 
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
-                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+                   searchGID=False, countFeats=False):
 
     if trackStrand:
         print "will track strandedness"
@@ -86,16 +83,6 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     else:
         replaceModels = False
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        cachePages = 100000
-        doCache = False
-
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
@@ -111,24 +98,15 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     for gid in gidList:
         gidCount[gid] = 0
 
-    chromList = hitRDS.getChromosomes(fullChrom=False)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
-    if markGID:
-        print "Flagging all reads as NM"
-        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    for chrom in chromList:
+    # BAM references carry the "chr" prefix; the Cistematic feature dictionaries are keyed without it
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for fullchrom in chromosomeList:
+        chrom = fullchrom.replace("chr", "", 1)
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
             seenFeaturesByChromDict[chrom] = set([])
 
-        print "\nchr%s" % chrom
-        fullchrom = "chr%s" % chrom
-        regionList = []        
+        print "\nchr%s" % chrom      
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
@@ -137,34 +115,31 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                     if featureSense == "R":
                         checkSense = "-"
 
-                    regionData = (gid, fullchrom, start, stop, checkSense)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                    count = getCounts(bamfile, fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                 else:
-                    regionData = (gid, fullchrom, start, stop)
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                    count = getCounts(bamfile, fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
                 gidCount[gid] += count
-                if markGID:
-                    regionList.append(regionData)
-
                 if countFeats:
                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
-        if markGID:
-            print "marking GIDs"
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
-            print "finished marking"
-
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
-    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
-    if markGID and doCache:
-        hitRDS.saveCacheDB(hitfile)
+    writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+    count = 0.0
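+    # countThisRead applies the uniqs/multi/splices/sense filters; each
+    # accepted alignment adds 1/NH so multireads are counted fractionally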
+    for alignedread in bamfile.fetch(chrom, start, stop):
+        if countThisRead(alignedread, uniqs, multi, splices, sense):
+            count += 1.0/alignedread.opt('NH')
+
+    return count
 
 
 def countFeatures(seenFeaturesByChromDict):
@@ -178,18 +153,15 @@ def countFeatures(seenFeaturesByChromDict):
     return count
 
 
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
-    for gid in gidList:
+    for gid in gidCount:
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
-        if gid in gidCount:
-            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
-        else:
-            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 
     outfile.close()
 
index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    # strip the "chr" prefix from the BAM references to match the feature dictionary keys
+    chromosomeList = [chrom.replace("chr", "", 1) for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, "chr%s" % chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
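+        # tag the read ID with its pair suffix so the two ends of a pair stay distinct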
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
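+    # spliced alignments are always skipped here; otherwise skip whichever
+    # read class (unique or multi) was not requested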
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif read.opt('NH') > 1 and not doMulti:
+        doNotProcess = True
+    elif read.opt('NH') == 1 and not doUniqs:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
index d929d41a89a2da712b1ee953770e0b01bb2f96bb..3c5183730c288f9c61e29f41fb1df69c3e0f992f 100755 (executable)
@@ -1,8 +1,3 @@
-#
-#  geneStartBins.py
-#  ENRAGE
-#
-
 try:
     import psyco
     psyco.full()
@@ -10,11 +5,10 @@ except:
     pass
 
 import sys
-from commoncode import *
-import ReadDataset
+from commoncode import getReadSense, getGeneInfoDict, getHeaderComment
+import pysam
 from cistematic.genomes import Genome
 
-
 print "geneStartBins: version 2.1"
 if len(sys.argv) < 4:
-    print 'usage: python %s genome rdsfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
+    print 'usage: python %s genome bamfile outfilename [-max regionSize] [-raw] [-cache]' % sys.argv[0]
@@ -24,7 +18,6 @@ if len(sys.argv) < 4:
 genome = sys.argv[1]
 hitfile =  sys.argv[2]
 outfilename = sys.argv[3]
-
 standardMinDist = 3000
 if '-max' in sys.argv:
     standardMinDist = int(sys.argv[sys.argv.index('-max') + 1])
@@ -42,23 +35,21 @@ if '-cache' in sys.argv:
 
 bins = 10
 standardMinThresh = standardMinDist / bins
-
-hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
-readlen = hitRDS.getReadSize()
+bamfile = pysam.Samfile(hitfile, "rb")
+readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
 normalizationFactor = 1.0
 if normalize:
-    totalCount = len(hitRDS)
+    totalCount = getHeaderComment(bamfile.header, "Total")
     normalizationFactor = totalCount / 1000000.
 
 hg = Genome(genome)
 gidDict = {}
 geneinfoDict = getGeneInfoDict(genome, cache=True)
 featuresDict = hg.getallGeneFeatures()
-
 outfile = open(outfilename,'w')
-
 gidList = hg.allGIDs()
 gidList.sort()
+chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
 for gid in gidList:
     symbol = 'LOC' + gid
     geneinfo = ''
@@ -78,7 +69,7 @@ for gid in gidList:
         if (start, stop) not in newfeatureList:
             newfeatureList.append((start, stop))
 
-    if chrom not in hitDict:
+    if chrom not in chromosomeList:
         continue
 
     newfeatureList.sort()
@@ -109,15 +100,16 @@ for gid in gidList:
 
         gstart = newfeatureList[-1][1] - glen
         gstop = newfeatureList[-1][1] + glen
+
     tagCount = 0
     if glen < standardMinDist / 2:
         continue
 
     binList = [0] * bins
-    for read in hitDict[chrom]:
-        tagStart = read["start"] - gstart
-        sense = read["sense"]
-        weight = read["weight"]
+    for alignedread in bamfile.fetch(chrom):
+        tagStart = alignedread.pos - gstart
+        sense = getReadSense(alignedread)
+        weight = 1.0/alignedread.opt('NH')
         if tagStart >= 2 * glen:
             break
 
index 197800e09fab70bf2824bcf3e9caa1ecfb364ef1..819675f26f869895fc12b8058daf195f02b2d07b 100755 (executable)
@@ -20,6 +20,7 @@ def main(argv=None):
     delimiter = "|"
 
     usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+            \n\twhere ucscModels is the gene model file in the same format as UCSC Table Browser output\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
index cf02cb3af4e26d987a971817057c9e790dae874d..e4eb12e7d1e1b35a6e09a104346e6d22d2d92a6d 100644 (file)
@@ -6,10 +6,10 @@ except:
 
 import sys
 import optparse
-import ReadDataset
+import pysam
 from cistematic.genomes import Genome
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getConfigFloatOption, getHeaderComment
 
 print "normalizeExpandedExonic: version 5.7"
 
@@ -18,7 +18,7 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    usage = "usage: python %s genome rdsfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
+    usage = "usage: python %s genome bamfile uniqcountfile splicecountfile outfile [candidatefile acceptfile] [--gidField fieldID] [--maxLength kblength] [--cache]"
 
     parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -34,22 +34,19 @@ def main(argv=None):
     splicecountfile = args[3]
     outfile = args[4]
 
-    candidateLines = []
+    candidatefilename = ""
     acceptedfilename = ""
-    if len(args) > 5:
-        try:
-            candidatefile = open(args[5])
-            candidateLines = candidatefile.readlines()
-            candidatefile.close()
-            acceptedfilename = args[6]
-        except IndexError:
-            pass
-
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+    try:
+        candidatefilename = args[5]
+        acceptedfilename = args[6]
+    except IndexError:
+        pass
+
+    bamfile = pysam.Samfile(hitfile, "rb")    
+    uniqcount = getHeaderComment(bamfile.header, "Unique")
     
     normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
-                            candidateLines, acceptedfilename, options.fieldID,
+                            candidatefilename, acceptedfilename, options.fieldID,
                             options.maxLength, options.doCache, options.extendGenome,
                             options.replaceModels)
 
@@ -77,102 +74,22 @@ def makeParser(usage=""):
 
 
 def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
-                            outfilename, candidateLines=[], acceptedfilename="",
+                            outfilename, candidatefilename="", acceptedfilename="",
                             fieldID=0, maxLength=1000000000., doCache=False,
                             extendGenome="", replaceModels=False):
-
-    uniquecountfile = open(uniquecountfilename)
-
+
+    print "%d unique reads" % uniqcount
+    gidList, gidToGeneDict, candidateDict, farList, splicecount = getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=candidatefilename)
+    totalCount = (uniqcount + splicecount) / 1000000.
+    uniqScale = uniqcount / 1000000.
     if acceptedfilename:
         acceptedfile = open(acceptedfilename, "w")
 
-    dosplicecount = False
-    if splicecountfilename != "none":
-        dosplicecount = True
-        splicecountfile = open(splicecountfilename)
-
-    if extendGenome:
-        if replaceModels:
-            print "will replace gene models with %s" % extendGenome
-        else:
-            print "will extend gene models with %s" % extendGenome
-
-    if doCache:
-        cacheGeneDB(genome)
-        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        print "%s cached" % genome
-    else:
-        hg = Genome(genome, inRAM=True)
-
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace=replaceModels)
-
-    
-    print "%d unique reads" % uniqcount
-
-    splicecount = 0
-    countDict = {}
-    gidList = []
-    farList = []
-    candidateDict = {}
-
-    gidToGeneDict = {}
-
-    featuresDict = hg.getallGeneFeatures()
-    print "got featuresDict"
-
     outfile = open(outfilename, "w")
-
-    for line in uniquecountfile:
-        fields = line.strip().split()
-        gid = fields[fieldID]
-        gene = fields[1]
-        countDict[gid] = float(fields[-1])
-        gidList.append(gid)
-        gidToGeneDict[gid] = gene
-
-    uniquecountfile.close()
-
-    if dosplicecount:
-        for line in splicecountfile:
-            fields = line.strip().split()
-            gid = fields[fieldID]
-            try:
-                countDict[gid] += float(fields[-1])
-            except:
-                print fields
-                continue
-
-            splicecount += float(fields[-1])
-
-        splicecountfile.close()
-
-    for line in candidateLines:
-        if "#" in line:
-            continue
-
-        fields = line.strip().split()
-        gid = fields[1]
-        gene = fields[0]
-        if gid not in gidList:
-            if gid not in farList:
-                farList.append(gid)
-                gidToGeneDict[gid] = gene
-
-            if gid not in countDict:
-                countDict[gid] = 0
-
-            countDict[gid] += float(fields[6])
-
-        if gid not in candidateDict:
-            candidateDict[gid] = []
-
-        candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
-
-    totalCount = (uniqcount + splicecount) / 1000000.
-    uniqScale = uniqcount / 1000000.
+    featuresDict = getGenomeFeatures(genome, doCache, extendGenome, replaceModels)
+    print "got featuresDict"
     for gid in gidList:
-        gene = gidToGeneDict[gid]
+        gene = gidToGeneDict[gid]["name"]
         featureList = []
         try:
             featureList = featuresDict[gid]
@@ -194,38 +111,41 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
         elif geneLength > maxLength:
             geneLength = maxLength
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         if gid in candidateDict:
             for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-                cratio = cCount / (cLength / 1000.)
-                cratio = (uniqScale * cratio) / totalCount
+                kilobaseLength = cLength / 1000.
+                cratio = (uniqScale * (cCount / kilobaseLength)) / totalCount
                 if 10. * cratio < rpkm:
                     continue
 
-                countDict[gid] += cCount
-                geneLength += cLength / 1000.
-                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
+                gidToGeneDict[gid]["count"] += cCount
+                geneLength += kilobaseLength
+                if acceptedfile is not None:
+                    acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gid, chrom, cStart, cStop, cratio, cLength, gene))
 
-        rpm = countDict[gid] / totalCount
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write("%s\t%s\t%.4f\t%.2f\n" %  (gid, gene, geneLength, rpkm))
 
-    for gid in farList:
-        gene = gidToGeneDict[gid]
-        geneLength = 0
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            geneLength += cLength / 1000.
-
+    for (gid, geneLength) in farList:
         if geneLength < 0.1:
             continue
 
-        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
-            cratio = cCount / (cLength / 1000.)
-            cratio = cratio / totalCount
-            acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
-
-        rpm = countDict[gid] / totalCount
+        gene = gidToGeneDict[gid]["name"]
+        if acceptedfilename:
+            for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+                kilobaseLength = cLength / 1000.
+                cratio = (cCount / kilobaseLength) / totalCount
+                acceptedfile.write("%s\t%s\t%s\t%s\t%.2f\t%d\t%s\n" % (gene, chrom, cStart, cStop, cratio, cLength, gene))
+
+        rpm = gidToGeneDict[gid]["count"] / totalCount
         rpkm = rpm / geneLength
         outfile.write('%s\t%s\t%.4f\t%.2f\n' %  (gene, gene, geneLength, rpkm))
 
@@ -235,9 +155,115 @@ def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountf
     except:
         pass
 
+
+def getDictionariesFromFiles(uniquecountfilename, splicecountfilename, fieldID, candidatefilename=""):
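+    """ Builds the per-gid {"name", "count"} dictionary from the unique and
+        splice count files, folding in candidate regions when a candidates
+        file is given.  Returns the unique gid list, the gene dictionary,
+        the candidate dictionary, (gid, kb length) tuples for far gids and
+        the total splice count.
+    """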
+
+    gidToGeneDict, splicecount = getGeneDict(uniquecountfilename, splicecountfilename, fieldID)
+    uniqueGidList = gidToGeneDict.keys()
+    if candidatefilename:
+        candidateDict, geneDictUpdates = processCandidatesFile(candidatefilename, fieldID, uniqueGidList)
+    else:
+        candidateDict = {}
+        geneDictUpdates = {}
+
+    farList = []
+    for gid in geneDictUpdates:
+        gidToGeneDict[gid] = geneDictUpdates[gid]
+        geneLength = 0.
+        for (cCount, cLength, chrom, cStart, cStop) in candidateDict[gid]:
+            geneLength += cLength / 1000.
+
+        farList.append((gid, geneLength))
+
+    return uniqueGidList, gidToGeneDict, candidateDict, farList, splicecount
+
+
+def getGeneDict(uniquecountfilename, splicecountfilename, fieldID):
+
+    gidToGeneDict = {}
+    uniquecountfile = open(uniquecountfilename)
+    for line in uniquecountfile:
+        fields = line.strip().split()
+        gid = fields[fieldID]
+        gene = fields[1]
+        gidToGeneDict[gid] = {"name": gene, "count": float(fields[-1])}
+
+    uniquecountfile.close()
+    splicecount = 0.
+    if splicecountfilename != "none":
+        splicecountfile = open(splicecountfilename)
+        for line in splicecountfile:
+            fields = line.strip().split()
+            gid = fields[fieldID]
+            try:
+                gidToGeneDict[gid]["count"] += float(fields[-1])
+            except:
+                print fields
+                continue
+
+            splicecount += float(fields[-1])
+
+        splicecountfile.close()
+
+    return gidToGeneDict, splicecount
+
+
+def processCandidatesFile(candidatefilename, fieldID, gidList):
+    """ Reads entries from the candiates file
+        gid entries not in gidList are returned for inclusion
+        creates and returns a dictionary keyed off gid with a single list of
+        tuples (count, length, chrom, start, stop) for each gid
+    """
+    geneDictUpdates = {}
+    candidateDict = {}
+    candidatefile = open(candidatefilename)
+    for line in candidatefile.readlines():
+        if "#" in line:
+            continue
+
+        fields = line.strip().split()
+        gid = fields[1]
+        gene = fields[0]
+        if gid not in gidList:
+            try:
+                geneDictUpdates[gid]["count"] += float(fields[6])
+            except KeyError:
+                geneDictUpdates[gid] = {"name": gene, "count": float(fields[6])}
+
+        try:
+            candidateDict[gid].append((float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5]))
+        except KeyError:
+            candidateDict[gid] = [(float(fields[6]), abs(int(fields[5]) - int(fields[4])), fields[3], fields[4], fields[5])]
+
+    candidatefile.close()
+
+    return candidateDict, geneDictUpdates
+
+
+def getGenomeFeatures(genome, doCache=False, extendGenome="", replaceModels=False):
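+    """ Loads the genome, optionally through the cached gene database,
+        applies any extended or replacement gene models and returns the
+        gene features dictionary.
+    """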
+    if extendGenome:
+        if replaceModels:
+            print "will replace gene models with %s" % extendGenome
+        else:
+            print "will extend gene models with %s" % extendGenome
+
+    if doCache:
+        cacheGeneDB(genome)
+        hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
+        print "%s cached" % genome
+    else:
+        hg = Genome(genome, inRAM=True)
+
+    if extendGenome != "":
+        hg.extendFeatures(extendGenome, replace=replaceModels)
+
+    featuresDict = hg.getallGeneFeatures()
+
     if doCache:
         uncacheGeneDB(genome)
 
+    return featuresDict
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
index ae005cb3146b686fd3934cd9a152c331900ebde4..e11508ea65e7a61614afb971f88fdb8f00824da7 100755 (executable)
@@ -12,8 +12,8 @@ except:
 import sys
 import string
 import optparse
-from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption, getHeaderComment, isSpliceEntry, getReadSense
 
 versionString = "regionCounts: version 3.10"
 print versionString
@@ -32,10 +32,12 @@ def main(argv=None):
         sys.exit(1)
 
     regionfilename = args[0]
-    hitfile =  args[1]
+    bamfilename =  args[1]
     outfilename = args[2]
 
-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    regionCounts(regionfilename, bamfile, outfilename, options.flagRDS, options.cField,
                  options.useFullchrom, options.normalize, options.padregion,
                  options.mergeregion, options.merging, options.doUniqs, options.doMulti,
                  options.doSplices, options.usePeak, options.cachePages, options.logfilename,
@@ -89,7 +91,7 @@ def getParser(usage):
     return parser
 
 
-def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
+def regionCounts(regionfilename, bamfile, outfilename, flagRDS=False, cField=1,
                  useFullchrom=False, normalize=True, padregion=0, mergeregion=0,
                  merging=True, doUniqs=True, doMulti=True, doSplices=False, usePeak=False,
                  cachePages=-1, logfilename="regionCounts.log", doRPKM=False, doLength=False,
@@ -99,11 +101,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     print "merging regions closer than %d bp" % mergeregion
     print "will use peak values"
 
-    if cachePages != -1:
-        doCache = True
-    else:
-        doCache = False
-
     normalize = True
     doRPKM = False
     if doRPKM == True:
@@ -119,24 +116,13 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     labeltoRegionDict = {}
     regionCount = {}
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
-    totalCount = len(hitRDS)
+    readlen = int(getHeaderComment(bamfile.header, "ReadLength"))
+    totalCount = int(getHeaderComment(bamfile.header, "Total"))
     if normalize:
         normalizationFactor = totalCount / 1000000.
 
-    chromList = hitRDS.getChromosomes(fullChrom=useFullchrom)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=useFullchrom)
-
+    chromList = [chrom for chrom in bamfile.references if chrom != "chrM"]
     chromList.sort()
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("OFF")        
-
     for rchrom in regionDict:
         if forceRegion and rchrom not in chromList:
             print rchrom
@@ -146,7 +132,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
                 labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
 
     for rchrom in chromList:
-        regionList = []
         if rchrom not in regionDict:
             continue
 
@@ -156,11 +142,6 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         else:
             fullchrom = "chr%s" % rchrom
 
-        if usePeak:
-            readDict = hitRDS.getReadsDict(chrom=fullchrom, withWeight=True, doMulti=True, findallOptimize=True)
-            rindex = 0
-            dictLen = len(readDict[fullchrom])
-
         for region in regionDict[rchrom]:
             label = region.label
             start = region.start
@@ -168,17 +149,12 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
             regionCount[label] = 0
             labelList.append(label)
             labeltoRegionDict[label] = (rchrom, start, stop)
-            regionList.append((label, fullchrom, start, stop))
             if usePeak:
                 readList = []
-                for localIndex in xrange(rindex, dictLen):
-                    read = readDict[fullchrom][localIndex]
-                    if read["start"] < start:
-                        rindex += 1
-                    elif start <= read["start"] <= stop:
-                        readList.append(read)
-                    else:
-                        break
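+                # build the read dictionaries that findPeak() expects,
+                # weighting each alignment by 1/NH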
+                for alignedread in bamfile.fetch(fullchrom, start, stop):
+                    weight = 1.0/alignedread.opt('NH')
+                    readList.append({"start": alignedread.pos, "sense": getReadSense(alignedread), "weight": weight})
 
                 if len(readList) < 1:
                     continue
@@ -193,13 +169,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
 
                 regionCount[label] += topValue
             else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-        if flagRDS:
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    if flagRDS:
-        hitRDS.setSynchronousPragma("ON")    
+                regionCount[label] += getRegionReadCounts(bamfile, fullchrom, start, stop, doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices)
 
     if normalize:
         for label in regionCount:
@@ -235,10 +205,29 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
         outfile.write("\n")
 
     outfile.close()
-    if doCache and flagRDS:
-        hitRDS.saveCacheDB(hitfile)
-
-    writeLog(logfilename, versionString, "returned %d region counts for %s (%.2f M reads)" % (len(labelList), hitfile, totalCount / 1000000.))
+    writeLog(logfilename, versionString, "returned %d region counts (%.2f M reads)" % (len(labelList), totalCount / 1000000.))
+
+
+def getRegionReadCounts(bamfile, chrom, start, end, doUniqs=True, doMulti=False, doSplices=False):
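+    """ Returns the number of reads in the region, weighting each multiread
+        alignment by 1/NH; spliced reads are included only when doSplices is set.
+    """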
+    uniques = 0
+    multis = 0.0
+    uniqueSplice = 0
+    multiSplice = 0.0
+    for alignedread in bamfile.fetch(chrom, start, end):
+        readMultiplicity = alignedread.opt('NH')
+        if isSpliceEntry(alignedread.cigar):
+            # spliced reads are only counted when explicitly requested
+            if not doSplices:
+                continue
+
+            if readMultiplicity == 1 and doUniqs:
+                uniqueSplice += 1
+            elif doMulti:
+                multiSplice += 1.0/readMultiplicity
+        elif readMultiplicity == 1 and doUniqs:
+            uniques += 1
+        elif doMulti:
+            multis += 1.0/readMultiplicity
+
+    totalReads = uniques + multis + uniqueSplice + multiSplice
+
+    return totalReads
 
 
 if __name__ == "__main__":
index 4d70c4965ef77c2eea4f646e58526db83682ffa6..2c5aff43ced96fe650d5cc52562cb998033b4d04 100755 (executable)
@@ -1,9 +1,3 @@
-#
-#  RNAFARpairs.py
-#  ENRAGE
-#
-#  Created by Ali Mortazavi on 11/2/08.
-#
 """ usage: python rnafarpairs.py genome goodfile rdsfile outfile [options]
            looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM
 """
@@ -16,8 +10,8 @@ except:
 import sys
 import time
 import optparse
-import ReadDataset
-from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption
+import pysam
+from commoncode import getGeneInfoDict, getGeneAnnotDict, getConfigParser, getConfigIntOption, getConfigBoolOption, isSpliceEntry
 
 
 def main(argv=None):
@@ -36,10 +30,12 @@ def main(argv=None):
 
     genome = args[0]
     goodfilename = args[1]
-    rdsfile = args[2]
+    bamfilename = args[2]
     outfilename = args[3]
 
-    rnaFarPairs(genome, goodfilename, rdsfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
+    bamfile = pysam.Samfile(bamfilename, "rb")
+
+    rnaFarPairs(genome, goodfilename, bamfile, outfilename, options.doVerbose, options.doCache, options.maxDist)
 
 
 def makeParser(usage=""):
@@ -62,16 +58,26 @@ def makeParser(usage=""):
     return parser
 
 
-def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+def rnaFarPairs(genome, goodfilename, bamfile, outfilename, doVerbose=False, doCache=False, maxDist=500000):
+    """ map all candidate regions that have paired ends overlapping with known genes
+    """
+
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    regions = {}
+    for chromosome in chromosomeList:
+        regions[chromosome] = []
+
     goodDict = {}
     goodfile = open(goodfilename)
     for line in goodfile:
         fields = line.split()
-        goodDict[fields[0]] = line
+        # region lines are: label chrom start stop ...
+        label = fields[0]
+        chromosome = fields[1]
+        start = int(fields[2])
+        stop = int(fields[3])
+        goodDict[label] = line
+        regions[chromosome].append((start, stop, label))
 
     goodfile.close()
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
-    chromosomeList = RDS.getChromosomes()
     if doVerbose:
         print time.ctime()
 
@@ -83,23 +89,18 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     assigned = {}
     farConnected = {}
     for chromosome in chromosomeList:
-        if doNotProcessChromosome(chromosome):
-            continue
-
         print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+        regionList = regions[chromosome]
+        regionList.sort()
+        uniqDict, pairCount = getUniqueReadIDFlags(bamfile, regionList, chromosome, maxDist)
         if doVerbose:
             print len(uniqDict), time.ctime()    
 
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                if processReads(readList[:2], maxDist):
-                    flags = (readList[0]["flag"], readList[1]["flag"])
-                    processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
-                    total += processed
-                    distinct += distinctPairs
+        total += pairCount
+        for readID, readList in uniqDict.items():
+            flags = (readList[0]["flag"], readList[1]["flag"])
+            processed, distinctPairs = writeFarPairsToFile(flags, goodDict, genome, geneinfoDict, geneannotDict, outfile, assigned, farConnected)
+            total += processed
+            distinct += distinctPairs
 
     entriesWritten = writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile)
     distinct += entriesWritten
@@ -109,15 +110,55 @@ def rnaFarPairs(genome, goodfilename, rdsfile, outfilename, doVerbose=False, doC
     print time.ctime()
 
 
-def doNotProcessChromosome(chromosome):
-    return chromosome == "chrM"
+def getUniqueReadIDFlags(bamfile, regions, chromosome, maxDist):
+    """ Returns dictionary of readsIDs with each entry consisting of a list of dictionaries of read start and read flag.
+        Only returns unique non-spliced read pairs matching the criteria given in processReads().
+    """
+    start = 1
+    readDict = {}
+    for regionstart, regionstop, regionname in regions:
+        for alignedread in bamfile.fetch(chromosome, start, regionstop):
+            if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+                if alignedread.pos >= regionstart:
+                    flag = regionname
+                else:
+                    flag = alignedread.opt("ZG")
 
+                try:
+                    readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+                except KeyError:
+                    readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
 
-def processReads(reads, maxDist):
+        start = regionstop + 1
+
+    for alignedread in bamfile.fetch(chromosome, start):
+        if alignedread.opt("NH") == 1 and not isSpliceEntry(alignedread.cigar):
+            flag = alignedread.opt("ZG")
+
+            try:
+                readDict[alignedread.qname].append({"start": alignedread.pos, "flag": flag})
+            except KeyError:
+                readDict[alignedread.qname] = [{"start": alignedread.pos, "flag": flag}]
+
+    pairCount = len(readDict.keys())
+    for readID, readList in readDict.items():
+        if len(readList) != 2:
+            del readDict[readID]
+            pairCount -= 1
+        elif not processReads(readList, maxDist):
+            del readDict[readID]
+
+    return readDict, pairCount
+
+
+def processReads(reads, maxDist=500000):
+    """ For a pair of readID's to be acceptable:
+            - flags must be different
+            - neither flag can be 'NM'
+            - the read starts have to be within maxDist
+    """
     process = False
-    start1 = reads[0]["start"]
-    start2 = reads[1]["start"]
-    dist = abs(start1 - start2)
+    dist = abs(reads[0]["start"] - reads[1]["start"])
     flag1 = reads[0]["flag"]
     flag2 = reads[1]["flag"]
 
@@ -128,6 +169,11 @@ def processReads(reads, maxDist):
 
 
 def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, outfile, assigned, farConnected):
+    """ Writes out the region information along with symbol and geneID for paired reads where one read
+        of the pair is in a known gene and the other is in a new region.  If both reads in the pair are
+        in new regions the region is added to farConnected.  No action is taken if both reads in the
+        pair are in known genes.
+    """
     flag1, flag2 = flags
     total = 0
     distinct = 0
@@ -147,7 +193,7 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
         except KeyError:
             farConnected[geneID] = [farFlag]
     elif read1IsGood or read2IsGood:
-        total += 1
+        total = 1
         if read2IsGood:
             farFlag = flag2
             geneID = flag1
@@ -155,30 +201,35 @@ def writeFarPairsToFile(flags, goodDict, genome, geneInfoDict, geneAnnotDict, ou
             farFlag = flag1
             geneID = flag2
 
-        try:
-            if genome == "dmelanogaster":
-                symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
-            else:
-                symbol = geneInfoDict[geneID][0][0]
-        except (KeyError, IndexError):
-            try:
-                symbol = geneAnnotDict[(genome, geneID)][0]
-            except (KeyError, IndexError):
-                symbol = "LOC%s" % geneID
-
-        symbol = symbol.strip()
-        symbol = symbol.replace(" ","|")
-        symbol = symbol.replace("\t","|")
-
+        symbol = getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict)
         if farFlag not in assigned:
             assigned[farFlag] = (symbol, geneID)
             print "%s %s %s" % (symbol, geneID, goodDict[farFlag].strip())
             outfile.write("%s %s %s" % (symbol, geneID, goodDict[farFlag]))
-            distinct += 1
+            distinct = 1
 
     return total, distinct
 
 
+def getGeneSymbol(genome, geneID, geneInfoDict, geneAnnotDict):
+    try:
+        if genome == "dmelanogaster":
+            symbol = geneInfoDict["Dmel_%s" % geneID][0][0]
+        else:
+            symbol = geneInfoDict[geneID][0][0]
+    except (KeyError, IndexError):
+        try:
+            symbol = geneAnnotDict[(genome, geneID)][0]
+        except (KeyError, IndexError):
+            symbol = "LOC%s" % geneID
+
+    symbol = symbol.strip()
+    symbol = symbol.replace(" ","|")
+    symbol = symbol.replace("\t","|")
+
+    return symbol
+
+
 def writeUnassignedEntriesToFile(farConnected, assigned, goodDict, outfile):
     total, written = writeUnassignedPairsToFile(farConnected, assigned, goodDict, outfile)
     writeUnassignedGoodReadsToFile(total, goodDict, assigned, outfile)
index ebc25a530af4166ea69b17a8228a6943b19960c6..0c01de47d6a4b0aa7a1f4476ad82c024da669809 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -50,84 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
     """ based on original script runRNAPairedAnalysis.sh
-        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
-            where rdsprefix is the name of the rds file without the .rds extension
+        usage:runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+            where fileprefix is the name of the bam file without the .bam extension
             use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
     writeLog(logfile, "runRNAPairedAnalysis.py", message)
 
     # count the unique reads falling on the gene models ; the nomatch files are 
     # mappable reads that fell outside of the Cistematic gene models and not the 
     # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
-    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    regionfilename = "%s.newregions.checked" % fileprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)
 
     # calculate the read densities
-    regionfilename = "%s.newregions.checked" % rdsprefix
-    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+    goodFileName = "%s.newregions.good" % fileprefix
+    regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that have paired ends overlapping with known genes
-    candidatefilename = "%s.candidates.txt" % rdsprefix
-    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+    candidatefilename = "%s.candidates.txt" % fileprefix
+    rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
                             acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    acceptfile = "%s.accepted.rpkm" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+    multicountfilename = "%s.multi.count" % fileprefix
+    acceptfile = "%s.accepted.rpkm" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
                            extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
 
index adfa37522a5f2c9af6d70b2a117102a6ab4163c8..d20ef640899392ec1f08f474328298d1274e69c9 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -17,7 +17,7 @@ def main(argv=None):
         argv = sys.argv
 
     print "runStandardAnalysis: version %s" % VERSION
-    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+    usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
 
     parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
@@ -27,15 +27,15 @@ def main(argv=None):
         sys.exit(1)
 
     genome = args[0]
-    rdsprefix = args[1]
+    fileprefix = args[1]
     repeatmaskdb = args[2]
-    bpradius = args[3]
+    bpradius = int(args[3])
     try:
         modelfile = args[4]
     except IndexError:
         modelfile = ""
 
-    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+    runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
 
 
 def getParser(usage):
@@ -50,82 +50,78 @@ def getParser(usage):
     return parser
 
 
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
     """ based on original script runStandardAnalysis.sh
-        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStandardAnalysis.py", message)
 
     # count the unique reads falling on the gene models; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models, not the
     # reads Eland deemed unmappable (i.e., the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # Gene IDs must be marked beforehand by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    splicecountfilename = "none"
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
 
     # Alternative 1: find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
     # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
                            replaceModels=replacemodels)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
 
+    #TODO: remove this log entry once analysis completion no longer needs to be tracked
+    writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
 
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file
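
Note: the first-pass read counts above now come from comments stored in the
BAM header instead of a full RDS scan. The exact layout is up to the BAM
preprocessor; below is a sketch of a getHeaderComment()-style lookup, assuming
each count sits in an @CO line as a "Key:Value" string (format assumed, not
confirmed by this patch):

    import pysam

    def getHeaderComment(bamHeader, commentKey):
        # @CO header entries are free-form strings; assume "Key:Value"
        for comment in bamHeader.get("CO", []):
            fields = comment.split(":")
            if fields[0] == commentKey:
                return fields[1]

        raise KeyError("no %s comment in BAM header" % commentKey)

    bamfile = pysam.Samfile("example.bam", "rb")
    readCounts = {}
    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))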
index 55c87ca76e48a1e272bf1184ec6fd3bd5ff9b1e9..35a67008d196ea0c942e2fb94614a7e76f612f7e 100644 (file)
@@ -1,7 +1,7 @@
 import sys
 import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
 from checkrmask import checkrmask
 from geneMrnaCounts import geneMrnaCounts
 from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -40,58 +40,57 @@ def getParser(usage):
     return parser
 
 
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
     """ based on original script runStrandedAnalysis.sh
-        usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStrandedAnalysis.sh genome fileprefix repeatmaskdb bpradius
+               where fileprefix is the name of the bam file without the .bam extension
                use "none" for the repeatmaskdb if you do not have one
     """
 
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
     logfile = "rna.log"
 
     # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
     writeLog(logfile, "runStrandedAnalysis.py", message)
 
     # count the unique reads falling on the gene models; the nomatch files are
     # mappable reads that fell outside of the Cistematic gene models, not the
     # reads Eland deemed unmappable (i.e., the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
 
     # calculate a first-pass RPKM to re-weigh the unique reads,
     # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
     readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
     normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
 
     # recount the unique reads with weights calculated during the first pass
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
 
     # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
 
     # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
     regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
 
-    regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="minus")
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
 
     # filter out new regions that overlap repeats more than a certain fraction
-    newregionfilename = "%s.newregions.txt" % rdsprefix
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
     checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
 
     #TODO: these calls look wrong
@@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
     #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
 
     # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
     getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
 
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
     # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True)
 
     # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
 
     # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
     normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
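
Note: both run scripts now trust the preprocessed BAM to separate unique reads
from multireads. A hedged sketch of how that split can be recomputed with
pysam, assuming the standard NH tag (number of reported alignments) has been
filled in for every aligned read:

    import pysam

    bamfile = pysam.Samfile("example.bam", "rb")
    uniqs = 0
    multis = 0.0
    for read in bamfile:
        if read.is_unmapped:
            continue

        numAlignments = read.opt("NH")
        if numAlignments == 1:
            uniqs += 1
        else:
            # each alignment of a multiread contributes 1/NH, so the total
            # counts multireads rather than multiread alignments
            multis += 1.0 / numAlignments

    print "Unique:%d Multis:%.0f" % (uniqs, multis)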
 
 
index 58d9e3059ad8a587c10badd3fb8c7cc820462047..9c0c38fa197cbd18669f65269c6f94672cc6e604 100755 (executable)
@@ -127,7 +127,7 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSite
 
         siteNotCommon = True
         #TODO: as handed to me this is O(n*m) - probably could be better...
-        #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
+        #TODO: multiple matches are all counted, but only the first match is written to the file
         for (rstart, rstop, rline) in siteDict[chrom]:
             if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
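
Note: the matching loop above is a plain interval intersection. A sketch of
what regionsOverlap() presumably checks (the real helper is imported from
commoncode.py; inclusive coordinates assumed):

    def regionsOverlap(start, stop, rstart, rstop):
        # two intervals overlap unless one ends before the other begins
        return start <= rstop and rstart <= stop

One standard answer to the O(n*m) TODO above is to sort each chromosome's
(rstart, rstop, rline) list by rstart and break out of the inner loop as soon
as rstart > stop, since no later entry can overlap.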
index 3fd6bd8e151b3f909034e856aeb5eb329db7c17f..f4b1691a07a08b15988539f75f1fbb2a62e148b2 100755 (executable)
@@ -168,7 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                 chrom = "chr%s" % multiReadList[index][1]
                 reweighList.append((round(score,3), chrom, start, theID))
 
-            #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
+            #TODO: Is this right? If the match index is 0, are we doing nothing?
             if theMatch > 0:
                 RDS.reweighMultireads(reweighList)
                 fixedPair += 1
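
Note: making the TODO above concrete, a match index of 0 means the best match
is the first alignment in the list, and the current guard skips the reweighing
entirely in that case. If that turns out to be unintended, a possible fix is
sketched below; this assumes the no-match sentinel is a negative value, which
this hunk does not show:

    if theMatch >= 0:
        # also reweigh when the best match is the first alignment (index 0);
        # the original guard (theMatch > 0) silently skipped that case
        RDS.reweighMultireads(reweighList)
        fixedPair += 1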