convert standard analysis pipelines to use bam format natively

[erange.git] / runStandardAnalysis.py
diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py

index adfa37522a5f2c9af6d70b2a117102a6ab4163c8..d20ef640899392ec1f08f474328298d1274e69c9 100644 (file)
--- a/runStandardAnalysis.py
+++ b/runStandardAnalysis.py
@@ -1,7 +1,7 @@
  import sys
  import optparse
-import ReadDataset
-from commoncode import getConfigParser, getConfigBoolOption, writeLog
+import pysam
+from commoncode import getConfigParser, getConfigBoolOption, writeLog, getHeaderComment
  from checkrmask import checkrmask
  from geneMrnaCounts import geneMrnaCounts
  from geneMrnaCountsWeighted import geneMrnaCountsWeighted
@@ -17,7 +17,7 @@ def main(argv=None):
          argv = sys.argv
  
      print "runStandardAnalysis: version %s" % VERSION
-    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+    usage = "usage: python runStandardAnalysis.py genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
  
      parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
@@ -27,15 +27,15 @@ def main(argv=None):
          sys.exit(1)
  
      genome = args[0]
-    rdsprefix = args[1]
+    fileprefix = args[1]
      repeatmaskdb = args[2]
-    bpradius = args[3]
+    bpradius = int(args[3])
      try:
          modelfile = args[4]
      except IndexError:
          modelfile = ""
  
-    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+    runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
  
  
  def getParser(usage):
@@ -50,82 +50,78 @@ def getParser(usage):
      return parser
  
  
-def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+def runStandardAnalysis(genome, fileprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
      """ based on original script runStandardAnalysis.sh
-        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
-               where rdsprefix is the name of the rds file without the .rds extension
+        usage: runStandardAnalysis.sh genome fileprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where fileprefix is the name of the bam file without the .bam extension
                 use "none" for the repeatmaskdb if you do not have one
      """
  
-    rdsfile = "%s.rds" % rdsprefix
+    bamfilename = "%s.bam" % fileprefix
+    bamfile = pysam.Samfile(bamfilename, "rb")
      logfile = "rna.log"
  
      # log the parameters
-    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    message = "with parameters: %s %s %s %s" % (genome, fileprefix, repeatmaskdb, bpradius)
      writeLog(logfile, "runStandardAnalysis.py", message)
  
      # count the unique reads falling on the gene models ; the nomatch files are
      # mappable reads that fell outside of the Cistematic gene models and not the
      # unmappable of Eland (i.e, the "NM" reads)
-    uniquecountfilename = "%s.uniqs.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+    #
+    # These will need to be marked by running the BAM preprocessor with the markGID option
+    uniquecountfilename = "%s.uniqs.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels)
  
      # calculate a first-pass RPKM to re-weigh the unique reads,
      # using 'none' for the splice count
-    splicecountfilename = "none"
-    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
-    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
-    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    initialrpkmfilename = "%s.firstpass.rpkm" % fileprefix
      readCounts = {}
-    readCounts["uniq"] = ucount
-    readCounts["splice"] = mcount
-    readCounts["multi"] = scount
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    readCounts["uniq"] = int(getHeaderComment(bamfile.header, "Unique"))
+    readCounts["splice"] = int(getHeaderComment(bamfile.header, "UniqueSplices"))
+    readCounts["multi"] = int(getHeaderComment(bamfile.header, "Multis"))
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # recount the unique reads with weights calculated during the first pass
-    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+    uniquerecountfilename = "%s.uniqs.recount" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
  
      # count splice reads
-    splicecountfilename = "%s.splices.count" % rdsprefix
-    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+    splicecountfilename = "%s.splices.count" % fileprefix
+    geneMrnaCounts(genome, bamfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels)
  
      # Alternative 1: find new regions outside of gene models with reads piled up
-    newregionfilename = "%s.newregions.txt" % rdsprefix
+    newregionfilename = "%s.newregions.txt" % fileprefix
      regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
-    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+    findall(regionFinder, bamfilename, newregionfilename, logfilename=logfile, rnaSettings=True, useMulti=False)
  
      # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
-    outFileName = "%s.newregions.repstatus" % rdsprefix
-    goodFileName = "%s.newregions.good" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % fileprefix
+    goodFileName = "%s.newregions.good" % fileprefix
      checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
  
      # map all candidate regions that are within a given radius of a gene in bp
-    candidatefilename = "%s.candidates.txt" % rdsprefix
+    candidatefilename = "%s.candidates.txt" % fileprefix
      getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
-    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    expandedRPKMfilename = "%s.expanded.rpkm" % fileprefix
      # calculate expanded exonic read density
-    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
-    try:
-        candidatefile = open(candidatefilename)
-        candidateLines = candidatefile.readlines()
-        candidatefile.close()
-    except IOError:
-        candidateLines = []
-
-    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
-                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+    acceptedfilename = "%s.accepted.rpkm" % fileprefix
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidatefilename=candidatefilename,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
  
      # weigh multi-reads
-    multicountfilename = "%s.multi.count" % rdsprefix
-    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+    multicountfilename = "%s.multi.count" % fileprefix
+    geneMrnaCountsWeighted(genome, bamfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
                             replaceModels=replacemodels)
  
      # calculate final exonic read density
-    outfilename = "%s.final.rpkm" % rdsprefix
+    outfilename = "%s.final.rpkm" % fileprefix
      normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
  
+    #TODO: remove when not tracking
+    writeLog(logfile, "runStandardAnalysis.py", "analysis complete")
+
  
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file