From 87f2f6a72beeeaf42f8657878d80817a6cef3b8f Mon Sep 17 00:00:00 2001
From: Diane Trout <diane@caltech.edu>
Date: Fri, 13 Jul 2012 17:31:41 -0700
Subject: [PATCH] Attempt to guess the raw sequence type for a run. Also check
 for the Unaligned directory before looking for the BaseCalls directory, as
 there is still a BaseCalls dir. (This is done by checking for the aligned
 directory first and, if it exists, using the name in the aligned config file
 to find the corresponding base call directory.)

---
 htsworkflow/pipelines/bustard.py              | 16 +++++
 htsworkflow/pipelines/runfolder.py            | 71 ++++++++++++-------
 .../pipelines/test/test_runfolder_rta160.py   |  7 ++
 .../pipelines/test/test_runfolder_rta1_12.py  |  6 +-
 .../testdata/1_12/aligned_config_1_12.xml     |  2 +-
 5 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/htsworkflow/pipelines/bustard.py b/htsworkflow/pipelines/bustard.py
index cb32515..eefaef4 100644
--- a/htsworkflow/pipelines/bustard.py
+++ b/htsworkflow/pipelines/bustard.py
@@ -239,6 +239,22 @@ class Bustard(object):
         self.date = date(*t[0:3])
         self.user = groups[2]
 
+    def _get_sequence_format(self):
+        """Guess the raw sequence format found under self.pathname.
+
+        Returns 'fastq' when self.pathname contains Project_* entries
+        (it looks like a demultiplexed run) and 'qseq' otherwise.
+        """
+        # Only demultiplexed runs are detected. Telling 'qseq' from 'srf'
+        # by globbing for '*_qseq.txt' / '*_seq.txt' was left disabled, so
+        # every other directory is reported as 'qseq' and None is never
+        # returned.
+        if glob(os.path.join(self.pathname, 'Project_*')):
+            # Hey we look like a demultiplexed run
+            return 'fastq'
+        return 'qseq'
+    sequence_format = property(_get_sequence_format)
+
     def _get_software_version(self):
         """return software name, version tuple"""
         if self.bustard_config is None:
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py
index 3eab735..71d5b6d 100644
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -201,7 +201,13 @@ def get_runs(runfolder, flowcell_id=None):
     from htsworkflow.pipelines import bustard
     from htsworkflow.pipelines import gerald
 
-    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
+    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis,
+                                 pathname):
+        added = build_aligned_runs(image_analysis, runs, datadir, runfolder)
+        # If we're a multiplexed run, don't look for older run type.
+        if added > 0:
+            return
+
         LOGGER.info("Looking for bustard directories in %s" % (pathname,))
         bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
         # RTA BaseCalls looks enough like Bustard.
@@ -211,9 +217,9 @@ def get_runs(runfolder, flowcell_id=None):
             b = bustard.bustard(bustard_pathname)
             build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)
 
-            build_aligned_runs(image_analysis, runs, b, datadir, runfolder)
 
     def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
+        start = len(runs)
         gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
         LOGGER.info("Looking for gerald directories in %s" % (pathname,))
         for gerald_pathname in glob(gerald_glob):
@@ -228,23 +234,27 @@ def get_runs(runfolder, flowcell_id=None):
                 runs.append(p)
             except IOError, e:
                 LOGGER.error("Ignoring " + str(e))
+        return len(runs) - start
 
 
-    def build_aligned_runs(image_analysis, runs, b, datadir, runfolder):
+    def build_aligned_runs(image_analysis, runs, datadir, runfolder):
+        start = len(runs)
         aligned_glob = os.path.join(runfolder, 'Aligned*')
         for aligned in glob(aligned_glob):
             LOGGER.info("Found aligned directory %s" % (aligned,))
             try:
                 g = gerald.gerald(aligned)
                 p = PipelineRun(runfolder, flowcell_id)
+                bustard_pathname = os.path.join(runfolder, g.runfolder_name)
+
                 p.datadir = datadir
                 p.image_analysis = image_analysis
-                p.bustard = b
+                p.bustard = bustard.bustard(bustard_pathname)
                 p.gerald = g
                 runs.append(p)
             except IOError, e:
                 LOGGER.error("Ignoring " + str(e))
-
+        return len(runs) - start
     datadir = os.path.join(runfolder, 'Data')
 
     LOGGER.info('Searching for runs in ' + datadir)
@@ -560,7 +570,7 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
       q.run()
 
 
-def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
+def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
     """
     Iterate over runfolders in runs extracting the most useful information.
       * run parameters (in run-*.xml)
@@ -593,7 +603,8 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       r.save(cycle_dir)
 
       # save illumina flowcell status report
-      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)
+      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),
+                            cycle_dir)
 
       # save stuff from bustard
       # grab IVC plot
@@ -601,26 +612,7 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
 
       # build base call saving commands
       if site is not None:
-        lanes = []
-        for lane in range(1, 9):
-          lane_parameters = r.gerald.lanes.get(lane, None)
-          if lane_parameters is not None and lane_parameters.analysis != 'none':
-            lanes.append(lane)
-
-        run_name = srf.pathname_to_run_name(r.pathname)
-        seq_cmds = []
-        LOGGER.info("Raw Format is: %s" % (raw_format, ))
-        if raw_format == 'fastq':
-            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
-            LOGGER.info("raw data = %s" % (rawpath,))
-            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
-        elif raw_format == 'qseq':
-            seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
-        elif raw_format == 'srf':
-            seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
-        else:
-            raise ValueError('Unknown --raw-format=%s' % (raw_format))
-        srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
+          save_raw_data(num_jobs, r, site, raw_format, cycle_dir)
 
       # save stuff from GERALD
       # copy stuff out of the main run
@@ -636,6 +628,31 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       md5_commands = srf.make_md5_commands(cycle_dir)
       srf.run_commands(cycle_dir, md5_commands, num_jobs)
 
+def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
+    """Save raw data for run r to cycle_dir; raw_format=None autodetects."""
+    lanes = []
+    for lane_number in range(1, 9):
+        parameters = r.gerald.lanes.get(lane_number, None)
+        if parameters is None or parameters.analysis == 'none':
+            continue
+        lanes.append(lane_number)
+    run_name = srf.pathname_to_run_name(r.pathname)
+    if raw_format is None:
+        raw_format = r.bustard.sequence_format
+    LOGGER.info("Raw Format is: %s" % (raw_format, ))
+    commands = []
+    if raw_format == 'fastq':
+        rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
+        LOGGER.info("raw data = %s" % (rawpath,))
+        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
+    elif raw_format == 'qseq':
+        commands = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
+    elif raw_format == 'srf':
+        commands = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
+    else:
+        raise ValueError('Unknown --raw-format=%s' % (raw_format))
+    srf.run_commands(r.bustard.pathname, commands, num_jobs)
+
 def rm_list(files, dry_run=True):
     for f in files:
         if os.path.exists(f):
diff --git a/htsworkflow/pipelines/test/test_runfolder_rta160.py b/htsworkflow/pipelines/test/test_runfolder_rta160.py
index 49f8ffa..caf82a0 100644
--- a/htsworkflow/pipelines/test/test_runfolder_rta160.py
+++ b/htsworkflow/pipelines/test/test_runfolder_rta160.py
@@ -258,9 +258,15 @@ class RunfolderTests(unittest.TestCase):
         name = 'run_207BTAAXY_%s.xml' % ( date.today().strftime('%Y-%m-%d'),)
         self.failUnlessEqual(runs[0].name, name)
 
+        bustard_dir = os.path.join(self.runfolder_dir, 'Data',
+                                   'Intensities', 'BaseCalls')
         r1 = runs[0]
         xml = r1.get_elements()
         xml_str = ElementTree.tostring(xml)
+        self.failUnlessEqual(r1.bustard.sequence_format, 'qseq')
+        self.failUnlessEqual(r1.bustard.pathname, bustard_dir)
+        self.failUnlessEqual(r1.gerald.runfolder_name,
+                             '090220_HWI-EAS229_0093_30VR0AAXX')
 
         r2 = runfolder.PipelineRun(xml=xml)
         self.failUnlessEqual(r1.name, r2.name)
@@ -269,6 +275,7 @@ class RunfolderTests(unittest.TestCase):
         self.failIfEqual(r2.gerald, None)
 
 
+
 def suite():
     return unittest.makeSuite(RunfolderTests,'test')
 
diff --git a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py
index 81f7263..fdb6001 100644
--- a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py
+++ b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py
@@ -258,7 +258,12 @@ class RunfolderTests(unittest.TestCase):
                                    date.today().strftime('%Y-%m-%d'),)
         self.failUnlessEqual(runs[0].name, name)
 
+        bustard_dir = os.path.join(self.runfolder_dir, 'Unaligned')
         r1 = runs[0]
+        self.failUnlessEqual(r1.bustard.sequence_format, 'fastq')
+        self.failUnlessEqual(r1.bustard.pathname, bustard_dir)
+        self.failUnlessEqual(r1.gerald.runfolder_name, 'Unaligned')
+
         xml = r1.get_elements()
         xml_str = ElementTree.tostring(xml)
 
@@ -268,7 +273,6 @@ class RunfolderTests(unittest.TestCase):
         self.failIfEqual(r2.bustard, None)
         self.failIfEqual(r2.gerald, None)
 
-
 def suite():
     return unittest.makeSuite(RunfolderTests,'test')
 
diff --git a/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml b/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml
index 1a5471f..14ee3e4 100644
--- a/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml
+++ b/htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml
@@ -25,7 +25,7 @@
     <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
     <EMAIL_LIST>${logName}</EMAIL_LIST>
     <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
-    <EXPT_DIR>/mmjggl/nicodemus/data01/sequencer/110815_SN787_0101_AD07K6ACXX/Unaligned_1MM</EXPT_DIR>
+    <EXPT_DIR>/mmjggl/nicodemus/data01/sequencer/110815_SN787_0101_AD07K6ACXX/Unaligned</EXPT_DIR>
     <FLAT_TXT_GZ_SUFFIX>Flat.txt.gz</FLAT_TXT_GZ_SUFFIX>
     <FLOWCELL>D07K6ACXX</FLOWCELL>
     <GENE_MD_GZ_SUFFIX>_gene.md.gz</GENE_MD_GZ_SUFFIX>
-- 
2.30.2