Add support for scanning for results in the IPAR directory.

[htsworkflow.git] / htsworkflow / pipelines / runfolder.py
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py

index dee3231068e95150e9e699d96dc3cc80b90614f5..fc2beeb425e3f1338874549e0f9b41502f392f87 100644 (file)
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -25,7 +25,6 @@ LANES_PER_FLOWCELL = 8
  from htsworkflow.util.alphanum import alphanum
  from htsworkflow.util.ethelp import indent, flatten
  
-
  class PipelineRun(object):
      """
      Capture "interesting" information about a pipeline run
@@ -34,20 +33,20 @@ class PipelineRun(object):
      PIPELINE_RUN = 'PipelineRun'
      FLOWCELL_ID = 'FlowcellID'
  
-    def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+    def __init__(self, pathname=None, xml=None):
          if pathname is not None:
            self.pathname = os.path.normpath(pathname)
          else:
            self.pathname = None
          self._name = None
          self._flowcell_id = None
-        self.firecrest = firecrest
-        self.bustard = bustard
-        self.gerald = gerald
+        self.image_analysis = None
+        self.bustard = None
+        self.gerald = None
  
          if xml is not None:
            self.set_elements(xml)
-    
+
      def _get_flowcell_id(self):
          # extract flowcell ID
          if self._flowcell_id is None:
@@ -63,7 +62,7 @@ class PipelineRun(object):
                flowcell_id = path_fields[-1]
              else:
                flowcell_id = 'unknown'
-              
+
             logging.warning(
               "Flowcell id was not found, guessing %s" % (
                  flowcell_id))
@@ -78,7 +77,7 @@ class PipelineRun(object):
          root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
          flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
          flowcell.text = self.flowcell_id
-        root.append(self.firecrest.get_elements())
+        root.append(self.image_analysis.get_elements())
          root.append(self.bustard.get_elements())
          root.append(self.gerald.get_elements())
          return root
@@ -87,6 +86,7 @@ class PipelineRun(object):
          # this file gets imported by all the others,
          # so we need to hide the imports to avoid a cyclic imports
          from htsworkflow.pipelines import firecrest
+        from htsworkflow.pipelines import ipar
          from htsworkflow.pipelines import bustard
          from htsworkflow.pipelines import gerald
  
@@ -99,8 +99,11 @@ class PipelineRun(object):
            if tag == PipelineRun.FLOWCELL_ID.lower():
              self._flowcell_id = element.text
            #ok the xword.Xword.XWORD pattern for module.class.constant is lame
+          # you should only have Firecrest or IPAR, never both of them.
            elif tag == firecrest.Firecrest.FIRECREST.lower():
-            self.firecrest = firecrest.Firecrest(xml=element)
+            self.image_analysis = firecrest.Firecrest(xml=element)
+          elif tag == ipar.IPAR.IPAR.lower():
+            self.image_analysis = ipar.IPAR(xml=element)
            elif tag == bustard.Bustard.BUSTARD.lower():
              self.bustard = bustard.Bustard(xml=element)
            elif tag == gerald.Gerald.GERALD.lower():
@@ -113,7 +116,7 @@ class PipelineRun(object):
          Given a run tuple, find the latest date and use that as our name
          """
          if self._name is None:
-          tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+          tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
            self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
          return self._name
@@ -143,28 +146,56 @@ def get_runs(runfolder):
      in there gerald component.
      """
      from htsworkflow.pipelines import firecrest
+    from htsworkflow.pipelines import ipar
      from htsworkflow.pipelines import bustard
      from htsworkflow.pipelines import gerald
  
-    datadir = os.path.join(runfolder, 'Data')
-
-    logging.info('Searching for runs in ' + datadir)
-    runs = []
-    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
-        f = firecrest.firecrest(firecrest_pathname)
-        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+    def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
+        """
+        Add a PipelineRun to runs for each Bustard*/GERALD* pair in pathname.
+
+        runs is modified in place; nothing is returned. image_analysis is
+        the object built from pathname by the caller and is attached to
+        every run created here. A GERALD directory that raises IOError
+        while loading is reported and skipped.
+        """
+        logging.info("Looking for bustard directories in %s" % (pathname,))
+        bustard_glob = os.path.join(pathname, "Bustard*")
          for bustard_pathname in glob(bustard_glob):
+            logging.info("Found bustard directory %s" % (bustard_pathname,))
              b = bustard.bustard(bustard_pathname)
              gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+            logging.info("Looking for gerald directories in %s" % (bustard_pathname,))
              for gerald_pathname in glob(gerald_glob):
+                logging.info("Found gerald directory %s" % (gerald_pathname,))
                  try:
                      g = gerald.gerald(gerald_pathname)
-                    runs.append(PipelineRun(runfolder, f, b, g))
+                    p = PipelineRun(runfolder)
+                    p.image_analysis = image_analysis
+                    p.bustard = b
+                    p.gerald = g
+                    runs.append(p)
                  except IOError, e:
                      print "Ignoring", str(e)
+
+    datadir = os.path.join(runfolder, 'Data')
+
+    logging.info('Searching for runs in ' + datadir)
+    runs = []
+    # scan for firecrest directories
+    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+        logging.info('Found firecrest in ' + datadir)
+        image_analysis = firecrest.firecrest(firecrest_pathname)
+        scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname)
+    # scan for IPAR directories
+    for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
+        logging.info('Found ipar directories in ' + datadir)
+        image_analysis = ipar.ipar(ipar_pathname)
+        scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname)
+
      return runs
-                
-    
+
+
  def extract_run_parameters(runs):
      """
      Search through runfolder_path for various runs and grab their parameters
@@ -190,6 +213,45 @@ def summarize_mapped_reads(mapped_reads):
      summarized_reads[genome] = genome_reads
      return summarized_reads
  
+def summarize_lane(gerald, lane_id):
+    """
+    Build the report lines describing one lane of a GERALD run.
+
+    gerald must provide summary.lane_results and eland_results.results;
+    lane_id is a key of eland_results.results.
+    Returns a list of strings, one per report line.
+    """
+    report = []
+    summary_results = gerald.summary.lane_results
+    eland_result = gerald.eland_results.results[lane_id]
+    report.append("Sample name %s" % (eland_result.sample_name))
+    report.append("Lane id %s" % (eland_result.lane_id,))
+    cluster = summary_results[eland_result.lane_id].cluster
+    report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+    report.append("Total Reads: %d" % (eland_result.reads))
+    mc = eland_result._match_codes
+    nm = mc['NM']
+    qc = mc['QC']
+    # a lane without reads would otherwise raise ZeroDivisionError
+    if eland_result.reads:
+        nm_percent = float(nm)/eland_result.reads * 100
+        qc_percent = float(qc)/eland_result.reads * 100
+    else:
+        nm_percent = qc_percent = 0.0
+
+    # %.2f rather than %2.2g: with %g, 100 percent is printed as 1e+02
+    report.append("No Match: %d (%.2f %%)" % (nm, nm_percent))
+    report.append("QC Failed: %d (%.2f %%)" % (qc, qc_percent))
+    report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+                  (mc['U0'], mc['U1'], mc['U2']))
+    report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+                  (mc['R0'], mc['R1'], mc['R2']))
+    report.append("Mapped Reads")
+    mapped_reads = summarize_mapped_reads(eland_result.mapped_reads)
+    for name, counts in mapped_reads.items():
+      report.append("  %s: %d" % (name, counts))
+    return report
+
  def summary_report(runs):
      """
      Summarize cluster numbers and mapped read counts for a runfolder
@@ -202,30 +252,8 @@ def summary_report(runs):
         eland_keys = run.gerald.eland_results.results.keys()
         eland_keys.sort(alphanum)
  
-        lane_results = run.gerald.summary.lane_results
         for lane_id in eland_keys:
-           result = run.gerald.eland_results.results[lane_id]
-            report.append("Sample name %s" % (result.sample_name))
-            report.append("Lane id %s" % (result.lane_id,))
-            cluster = lane_results[result.lane_id].cluster
-            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
-            report.append("Total Reads: %d" % (result.reads))
-            mc = result._match_codes
-            nm = mc['NM']
-            nm_percent = float(nm)/result.reads  * 100
-            qc = mc['QC']
-            qc_percent = float(qc)/result.reads * 100
-
-           report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
-           report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
-            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
-                          (mc['U0'], mc['U1'], mc['U2']))
-            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
-                          (mc['R0'], mc['R1'], mc['R2']))
-            report.append("Mapped Reads")
-            mapped_reads = summarize_mapped_reads(result.mapped_reads)
-            for name, counts in mapped_reads.items():
-              report.append("  %s: %d" % (name, counts))
+            report.extend(summarize_lane(run.gerald, lane_id))
              report.append('---')
              report.append('')
          return os.linesep.join(report)
@@ -239,9 +267,9 @@ def extract_results(runs, output_base_dir=None):
        logging.info("Using %s as result directory" % (result_dir,))
        if not os.path.exists(result_dir):
          os.mkdir(result_dir)
-      
+
        # create cycle_dir
-      cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop)
+      cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
        logging.info("Filling in %s" % (cycle,))
        cycle_dir = os.path.join(result_dir, cycle)
        if os.path.exists(cycle_dir):
@@ -278,7 +306,7 @@ def extract_results(runs, output_base_dir=None):
        logging.info("Running tar: " + " ".join(tar_cmd[:10]))
        logging.info("Running bzip2: " + " ".join(bzip_cmd))
        logging.info("Writing to %s" %(tar_dest_name))
-      
+
        tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname)
        bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
        tar.wait()