convert standard analysis pipelines to use bam format natively

[erange.git] / runStrandedAnalysis.py
diff --git a/runStrandedAnalysis.py b/runStrandedAnalysis.py

index 55c87ca76e48a1e272bf1184ec6fd3bd5ff9b1e9..35a67008d196ea0c942e2fb94614a7e76f612f7e 100644 (file)
--- a/runStrandedAnalysis.py
+++ b/runStrandedAnalysis.py
@@ -1,7 +1,7 @@
  import sys
  import optparse
  import sys
  import optparse
-import ReadDataset
-from commoncode import writeLog
+import pysam
+from commoncode import writeLog, getHeaderComment
  from checkrmask import checkrmask
  from geneMrnaCounts import geneMrnaCounts
  from geneMrnaCountsWeighted import geneMrnaCountsWeighted
  from checkrmask import checkrmask
  from geneMrnaCounts import geneMrnaCounts
  from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -40,58 +40,57 @@ def getParser(usage):
      return parser
  
  
      return parser
  
  
-def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+def runStrandedAnalysis(genome, fileprefix, repeatmaskdb, bpradius):
      """ based on original script runStrandedAnalysis.sh
          usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
                 where rdsprefix is the name of the rds file without the .rds extension
                 use "none" for the repeatmaskdb if you do not have one
      """
  
      """ based on original script runStrandedAnalysis.sh
          usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
                 where rdsprefix is the name of the rds file without the .rds extension
                 use "none" for the repeatmaskdb if you do not have one
      """
  
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
      logfile = "rna.log"
  
      # log the parameters
      logfile = "rna.log"
  
      # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
      writeLog(logfile, "runStrandedAnalysis.py", message)
  
      # count the unique reads falling on the gene models ; the nomatch files are 
      # mappable reads that fell outside of the Cistematic gene models and not the 
      # unmappable of Eland (i.e, the "NM" reads)
      writeLog(logfile, "runStrandedAnalysis.py", message)
  
      # count the unique reads falling on the gene models ; the nomatch files are 
      # mappable reads that fell outside of the Cistematic gene models and not the 
      # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, trackStrand=True, markGID=True)
  
      # calculate a first-pass RPKM to re-weigh the unique reads,
      # using 'none' for the splice count
  
      # calculate a first-pass RPKM to re-weigh the unique reads,
      # using 'none' for the splice count
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
      readCounts = {}
      readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
      normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
  
      # recount the unique reads with weights calculated during the first pass
      normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
  
      # recount the unique reads with weights calculated during the first pass
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
  
      # count splice reads
  
      # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False)
  
      # find new regions outside of gene models with reads piled up 
  
      # find new regions outside of gene models with reads piled up 
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
      regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
      regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
  
      regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
  
      regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+    findall(regionFinder, bamfile, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False, outputMode="a")
  
      # filter out new regions that overlap repeats more than a certain fraction
  
      # filter out new regions that overlap repeats more than a certain fraction
-    newregionfilename = "%s.newregions.txt" % rdsprefix
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
      checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
  
      #TODO: these calls look wrong
      checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
  
      #TODO: these calls look wrong
@@ -100,28 +99,21 @@ def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
      #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
  
      # map all candidate regions that are within a given radius of a gene in bp
      #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
  
      # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
      getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
  
      getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
  
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
      # calculate expanded exonic read density
      # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True)
  
      # weigh multi-reads
  
      # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
  
      # calculate final exonic read density
  
      # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
      normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
  
  
      normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)