convert standard analysis pipelines to use bam format natively

[erange.git] / runRNAPairedAnalysis.py
diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py

index ebc25a530af4166ea69b17a8228a6943b19960c6..0c01de47d6a4b0aa7a1f4476ad82c024da669809 100644 (file)
--- a/runRNAPairedAnalysis.py
+++ b/runRNAPairedAnalysis.py
@@ -1,7 +1,7 @@
  import sys
  import optparse
  import sys
  import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
  from checkrmask import checkrmask
  from geneMrnaCounts import geneMrnaCounts
  from geneMrnaCountsWeighted import geneMrnaCountsWeighted
  from checkrmask import checkrmask
  from geneMrnaCounts import geneMrnaCounts
  from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -50,84 +50,78 @@ def getParser(usage):
      return parser
  
  
      return parser
  
  
-def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+def runRNAPairedAnalysis(genome, fileprefix, repeatmaskdb, modelfile="", replacemodels=False):
      """ based on original script runRNAPairedAnalysis.sh
      """ based on original script runRNAPairedAnalysis.sh
-        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
-            where rdsprefix is the name of the rds file without the .rds extension
+        usage:runRNAPairedAnalysis.sh genome fileprefix repeatmaskdb [modelfile] [--replacemodels]
+            where fileprefix is the name of the bam file without the .bam extension
              use "none" for the repeatmaskdb if you do not have one
      """
  
              use "none" for the repeatmaskdb if you do not have one
      """
  
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
      logfile = "rna.log"
  
      # log the parameters
      logfile = "rna.log"
  
      # log the parameters
-    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    message = "with parameters: %s %s %s" % (genome, fileprefix, repeatmaskdb)
      writeLog(logfile, "runRNAPairedAnalysis.py", message)
  
      # count the unique reads falling on the gene models ; the nomatch files are 
      # mappable reads that fell outside of the Cistematic gene models and not the 
      # unmappable of Eland (i.e, the "NM" reads)
      writeLog(logfile, "runRNAPairedAnalysis.py", message)
  
      # count the unique reads falling on the gene models ; the nomatch files are 
      # mappable reads that fell outside of the Cistematic gene models and not the 
      # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
  
      # calculate a first-pass RPKM to re-weigh the unique reads,
      # using 'none' for the splice count
  
      # calculate a first-pass RPKM to re-weigh the unique reads,
      # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
      readCounts = {}
      readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
      normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # recount the unique reads with weights calculated during the first pass
      normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
  
      # count splice reads
  
      # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
  
      # find new regions outside of gene models with reads piled up 
  
      # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
      regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
      regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
  
      # filter out new regions that overlap repeats more than a certain fraction
  
      # filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
-    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    regionfilename = "%s.newregions.checked" % fileprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, regionfilename, startField=1, cachePages=1, logfilename=logfile)
  
      # calculate the read densities
  
      # calculate the read densities
-    regionfilename = "%s.newregions.checked" % rdsprefix
-    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+    goodFileName = "%s.newregions.good" % fileprefix
+    regionCounts(regionfilename, bamfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
  
      # map all candidate regions that have paired ends overlapping with known genes
  
      # map all candidate regions that have paired ends overlapping with known genes
-    candidatefilename = "%s.candidates.txt" % rdsprefix
-    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+    candidatefilename = "%s.candidates.txt" % fileprefix
+    rnaFarPairs(genome, goodFileName, bamfile, candidatefilename, doCache=True)
  
  
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
      # calculate expanded exonic read density
      # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
                              acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # weigh multi-reads
                              acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    acceptfile = "%s.accepted.rpkm" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+    multicountfilename = "%s.multi.count" % fileprefix
+    acceptfile = "%s.accepted.rpkm" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
                             extendGenome=modelfile, replaceModels=replacemodels)
  
      # calculate final exonic read density
                             extendGenome=modelfile, replaceModels=replacemodels)
  
      # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
      normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
  
  
      normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)