add --extract-results to scripts/runfolder
authorDiane Trout <diane@caltech.edu>
Thu, 5 Jun 2008 22:24:22 +0000 (22:24 +0000)
committerDiane Trout <diane@caltech.edu>
Thu, 5 Jun 2008 22:24:22 +0000 (22:24 +0000)
this will build a directory tree with <flowcellID>/<cycle count>/
with the various eland result files, run_*.xml files, etc.

gaworkflow/pipeline/runfolder.py
scripts/runfolder

index 19962b1d0e6e42247c57e80bdca7117736aca890..25d933110877ad3bfd853bfdf1e3456f020f90ee 100644 (file)
@@ -6,6 +6,7 @@ import logging
 import os
 import re
 import stat
+import subprocess
 import sys
 import time
 
@@ -114,11 +115,14 @@ class PipelineRun(object):
         return self._name
     name = property(_get_run_name)
 
-    def save(self):
+    def save(self, destdir=None):
+        if destdir is None:
+            destdir = ''
         logging.info("Saving run report "+ self.name)
         xml = self.get_elements()
         indent(xml)
-        ElementTree.ElementTree(xml).write(self.name)
+        dest_pathname = os.path.join(destdir, self.name)
+        ElementTree.ElementTree(xml).write(dest_pathname)
 
     def load(self, filename):
         logging.info("Loading run report from " + filename)
@@ -216,3 +220,63 @@ def summary_report(runs):
             report.append('---')
             report.append('')
         return os.linesep.join(report)
+
+def extract_results(runs, output_base_dir=None):
+    if output_base_dir is None:
+        output_base_dir = os.getcwd()
+
+    for r in runs:
+      result_dir = os.path.join(output_base_dir, r.flowcell_id)
+      logging.info("Using %s as result directory" % (result_dir,))
+      if not os.path.exists(result_dir):
+        os.mkdir(result_dir)
+      
+      # create cycle_dir
+      cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop)
+      logging.info("Filling in %s" % (cycle,))
+      cycle_dir = os.path.join(result_dir, cycle)
+      if os.path.exists(cycle_dir):
+        logging.error("%s already exists, not overwriting" % (cycle_dir,))
+        continue
+      else:
+        os.mkdir(cycle_dir)
+
+      # copy stuff out of the main run
+      g = r.gerald
+
+      # save run file
+      r.save(cycle_dir)
+
+      # tar score files
+      score_files = []
+      for f in os.listdir(g.pathname):
+          if re.match('.*_score.txt', f):
+              score_files.append(f)
+
+      tar_cmd = ['/bin/tar', 'c'] + score_files
+      bzip_cmd = [ 'bzip2', '-9', '-c' ]
+      tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2')
+      tar_dest = open(tar_dest_name, 'w')
+      logging.info("Compressing score files in %s" % (g.pathname,))
+      logging.info("Running tar: " + " ".join(tar_cmd[:10]))
+      logging.info("Running bzip2: " + " ".join(bzip_cmd))
+      logging.info("Writing to %s" %(tar_dest_name))
+      
+      tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname)
+      bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
+      tar.wait()
+
+      # copy & bzip eland files
+      for eland_lane in g.eland_results.values():
+          source_name = eland_lane.pathname
+          path, name = os.path.split(eland_lane.pathname)
+          dest_name = os.path.join(cycle_dir, name+'.bz2')
+
+          args = ['bzip2', '-9', '-c', source_name]
+          logging.info('Running: %s' % ( " ".join(args) ))
+          bzip_dest = open(dest_name, 'w')
+          bzip = subprocess.Popen(args, stdout=bzip_dest)
+          logging.info('Saving to %s' % (dest_name, ))
+          bzip.wait()
+
+      
index af89015a86e4854d2ab89c982cc03b3fbb0cba9f..d54bf60f3ce9734fcfe5644b6b9e1201ca18e344 100644 (file)
@@ -46,6 +46,9 @@ def make_parser():
     parser.add_option('-a', '--archive', dest='archive', action='store_true',
                       default=False,
                       help='generate run configuration archive')
+    parser.add_option('--extract-results', action='store_true',
+           default=False,
+           help='extract result files out of runfolder into a simpler archive')
     parser.add_option('--run-xml', dest='run_xml',
            default=None,
            help='specify a run_<FlowCell>.xml file for summary reports')
@@ -73,6 +76,8 @@ def main(cmdlist=None):
             print runfolder.summary_report(runs)
         if opt.archive:
             runfolder.extract_run_parameters(runs)
+        if opt.extract_results:
+            runfolder.extract_results(runs)
 
     return 0