Attempt to guess raw sequence type for a run.
authorDiane Trout <diane@caltech.edu>
Sat, 14 Jul 2012 00:31:41 +0000 (17:31 -0700)
committerDiane Trout <diane@caltech.edu>
Sat, 14 Jul 2012 00:31:41 +0000 (17:31 -0700)
Also will check for the Unaligned directory first
before looking for the BaseCalls directory, as there's still
a BaseCalls dir.
(It does this by checking for the aligned directory first, and
if it exists using the name in the aligned config file to find
the corresponding base call directory)

htsworkflow/pipelines/bustard.py
htsworkflow/pipelines/runfolder.py
htsworkflow/pipelines/test/test_runfolder_rta160.py
htsworkflow/pipelines/test/test_runfolder_rta1_12.py
htsworkflow/pipelines/test/testdata/1_12/aligned_config_1_12.xml

index cb32515dca06e2f22fed1e836a1c30288d33940b..eefaef4d82acc122b8fda2e35ffcbb9cfca100b5 100644 (file)
@@ -239,6 +239,22 @@ class Bustard(object):
         self.date = date(*t[0:3])
         self.user = groups[2]
 
+    def _get_sequence_format(self):
+        """Guess sequence format"""
+        projects = glob(os.path.join(self.pathname, 'Project_*'))
+        if len(projects) > 0:
+            # Hey we look like a demultiplexed run
+            return 'fastq'
+        return 'qseq'
+        #qseqs = glob(os.path.join(self.pathname, '*_qseq.txt'))
+        #if len(qseqs) > 0:
+        #    return 'qseq'
+        #seqs = glob(os.path.join(self.pathname, '*_seq.txt'))
+        #if len(seqs) > 0:
+        #    return 'srf'
+        return None
+    sequence_format = property(_get_sequence_format)
+
     def _get_software_version(self):
         """return software name, version tuple"""
         if self.bustard_config is None:
index 3eab735873cc9e0087b4e3bf9b0fa77ac60e13c3..71d5b6d0a5ac3d1db396ecd061c340f2d1ef8441 100644 (file)
@@ -201,7 +201,13 @@ def get_runs(runfolder, flowcell_id=None):
     from htsworkflow.pipelines import bustard
     from htsworkflow.pipelines import gerald
 
-    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
+    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis,
+                                 pathname):
+        added = build_aligned_runs(image_analysis, runs, datadir, runfolder)
+        # If we're a multiplexed run, don't look for older run type.
+        if added > 0:
+            return
+
         LOGGER.info("Looking for bustard directories in %s" % (pathname,))
         bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
         # RTA BaseCalls looks enough like Bustard.
@@ -211,9 +217,9 @@ def get_runs(runfolder, flowcell_id=None):
             b = bustard.bustard(bustard_pathname)
             build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)
 
-            build_aligned_runs(image_analysis, runs, b, datadir, runfolder)
 
     def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
+        start = len(runs)
         gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
         LOGGER.info("Looking for gerald directories in %s" % (pathname,))
         for gerald_pathname in glob(gerald_glob):
@@ -228,23 +234,27 @@ def get_runs(runfolder, flowcell_id=None):
                 runs.append(p)
             except IOError, e:
                 LOGGER.error("Ignoring " + str(e))
+        return len(runs) - start
 
 
-    def build_aligned_runs(image_analysis, runs, b, datadir, runfolder):
+    def build_aligned_runs(image_analysis, runs, datadir, runfolder):
+        start = len(runs)
         aligned_glob = os.path.join(runfolder, 'Aligned*')
         for aligned in glob(aligned_glob):
             LOGGER.info("Found aligned directory %s" % (aligned,))
             try:
                 g = gerald.gerald(aligned)
                 p = PipelineRun(runfolder, flowcell_id)
+                bustard_pathname = os.path.join(runfolder, g.runfolder_name)
+
                 p.datadir = datadir
                 p.image_analysis = image_analysis
-                p.bustard = b
+                p.bustard = bustard.bustard(bustard_pathname)
                 p.gerald = g
                 runs.append(p)
             except IOError, e:
                 LOGGER.error("Ignoring " + str(e))
-
+        return len(runs) - start
     datadir = os.path.join(runfolder, 'Data')
 
     LOGGER.info('Searching for runs in ' + datadir)
@@ -560,7 +570,7 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
       q.run()
 
 
-def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
+def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
     """
     Iterate over runfolders in runs extracting the most useful information.
       * run parameters (in run-*.xml)
@@ -593,7 +603,8 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       r.save(cycle_dir)
 
       # save illumina flowcell status report
-      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)
+      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),
+                            cycle_dir)
 
       # save stuff from bustard
       # grab IVC plot
@@ -601,26 +612,7 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
 
       # build base call saving commands
       if site is not None:
-        lanes = []
-        for lane in range(1, 9):
-          lane_parameters = r.gerald.lanes.get(lane, None)
-          if lane_parameters is not None and lane_parameters.analysis != 'none':
-            lanes.append(lane)
-
-        run_name = srf.pathname_to_run_name(r.pathname)
-        seq_cmds = []
-        LOGGER.info("Raw Format is: %s" % (raw_format, ))
-        if raw_format == 'fastq':
-            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
-            LOGGER.info("raw data = %s" % (rawpath,))
-            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
-        elif raw_format == 'qseq':
-            seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
-        elif raw_format == 'srf':
-            seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
-        else:
-            raise ValueError('Unknown --raw-format=%s' % (raw_format))
-        srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
+          save_raw_data(num_jobs, r, site, raw_format, cycle_dir)
 
       # save stuff from GERALD
       # copy stuff out of the main run
@@ -636,6 +628,31 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       md5_commands = srf.make_md5_commands(cycle_dir)
       srf.run_commands(cycle_dir, md5_commands, num_jobs)
 
+def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
+    lanes = []
+    for lane in range(1, 9):
+        lane_parameters = r.gerald.lanes.get(lane, None)
+        if lane_parameters is not None and lane_parameters.analysis != 'none':
+            lanes.append(lane)
+
+    run_name = srf.pathname_to_run_name(r.pathname)
+    seq_cmds = []
+    if raw_format is None:
+        raw_format = r.bustard.sequence_format
+
+    LOGGER.info("Raw Format is: %s" % (raw_format, ))
+    if raw_format == 'fastq':
+        rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
+        LOGGER.info("raw data = %s" % (rawpath,))
+        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
+    elif raw_format == 'qseq':
+        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
+    elif raw_format == 'srf':
+        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
+    else:
+        raise ValueError('Unknown --raw-format=%s' % (raw_format))
+    srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
+
 def rm_list(files, dry_run=True):
     for f in files:
         if os.path.exists(f):
index 49f8ffaba7fdfde5ca6a88934cf26d170c357221..caf82a0d7cc811facd6ef155d236abcaf16fc303 100644 (file)
@@ -258,9 +258,15 @@ class RunfolderTests(unittest.TestCase):
         name = 'run_207BTAAXY_%s.xml' % ( date.today().strftime('%Y-%m-%d'),)
         self.failUnlessEqual(runs[0].name, name)
 
+        bustard_dir = os.path.join(self.runfolder_dir, 'Data',
+                                   'Intensities', 'BaseCalls')
         r1 = runs[0]
         xml = r1.get_elements()
         xml_str = ElementTree.tostring(xml)
+        self.failUnlessEqual(r1.bustard.sequence_format, 'qseq')
+        self.failUnlessEqual(r1.bustard.pathname, bustard_dir)
+        self.failUnlessEqual(r1.gerald.runfolder_name,
+                             '090220_HWI-EAS229_0093_30VR0AAXX')
 
         r2 = runfolder.PipelineRun(xml=xml)
         self.failUnlessEqual(r1.name, r2.name)
@@ -269,6 +275,7 @@ class RunfolderTests(unittest.TestCase):
         self.failIfEqual(r2.gerald, None)
 
 
+
 def suite():
     return unittest.makeSuite(RunfolderTests,'test')
 
index 81f72632f8b6ca2c27880bc705983a26199d7fc9..fdb60011df9f12785ba224ccc29a62617c0a30d1 100644 (file)
@@ -258,7 +258,12 @@ class RunfolderTests(unittest.TestCase):
                                    date.today().strftime('%Y-%m-%d'),)
         self.failUnlessEqual(runs[0].name, name)
 
+        bustard_dir = os.path.join(self.runfolder_dir, 'Unaligned')
         r1 = runs[0]
+        self.failUnlessEqual(r1.bustard.sequence_format, 'fastq')
+        self.failUnlessEqual(r1.bustard.pathname, bustard_dir)
+        self.failUnlessEqual(r1.gerald.runfolder_name, 'Unaligned')
+
         xml = r1.get_elements()
         xml_str = ElementTree.tostring(xml)
 
@@ -268,7 +273,6 @@ class RunfolderTests(unittest.TestCase):
         self.failIfEqual(r2.bustard, None)
         self.failIfEqual(r2.gerald, None)
 
-
 def suite():
     return unittest.makeSuite(RunfolderTests,'test')
 
index 1a5471fa8662b7fc5b897f2493fe7bc4de2316df..14ee3e4c04fbe0731fbb52ae293c29a5a5a5664c 100644 (file)
@@ -25,7 +25,7 @@
     <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
     <EMAIL_LIST>${logName}</EMAIL_LIST>
     <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
-    <EXPT_DIR>/mmjggl/nicodemus/data01/sequencer/110815_SN787_0101_AD07K6ACXX/Unaligned_1MM</EXPT_DIR>
+    <EXPT_DIR>/mmjggl/nicodemus/data01/sequencer/110815_SN787_0101_AD07K6ACXX/Unaligned</EXPT_DIR>
     <FLAT_TXT_GZ_SUFFIX>Flat.txt.gz</FLAT_TXT_GZ_SUFFIX>
     <FLOWCELL>D07K6ACXX</FLOWCELL>
     <GENE_MD_GZ_SUFFIX>_gene.md.gz</GENE_MD_GZ_SUFFIX>