Initial port to python3

[htsworkflow.git] / scripts / rerun_eland.py
diff --git a/scripts/rerun_eland.py b/scripts/rerun_eland.py

index 25aecc445ddc143b74622155444c2da7af72dceb..6741bca9053a37b72ccbf48291c344e2a700e26a 100644 (file)
--- a/scripts/rerun_eland.py
+++ b/scripts/rerun_eland.py
@@ -6,40 +6,44 @@ import os
  import subprocess
  import sys
  
-from gaworkflow.pipeline import gerald
+from htsworkflow.pipelines import gerald
+from htsworkflow.pipelines.eland import extract_eland_sequence
+from htsworkflow.pipelines import runfolder
+
+LOGGER = logging.getLogger(__name__)
  
  def make_query_filename(eland_obj, output_dir):
-    query_name = '%s_%s_eland_query.txt' 
+    query_name = '%s_%s_eland_query.txt'
      query_name %= (eland_obj.sample_name, eland_obj.lane_id)
  
      query_pathname = os.path.join(output_dir, query_name)
-    
+
      if os.path.exists(query_pathname):
-        logging.warn("overwriting %s" % (query_pathname,))
+        LOGGER.warn("overwriting %s" % (query_pathname,))
  
      return query_pathname
  
  def make_result_filename(eland_obj, output_dir):
-    result_name = '%s_%s_eland_result.txt' 
+    result_name = '%s_%s_eland_result.txt'
      result_name %= (eland_obj.sample_name, eland_obj.lane_id)
  
      result_pathname = os.path.join(output_dir, result_name)
-    
+
      if os.path.exists(result_pathname):
-        logging.warn("overwriting %s" % (result_pathname,))
+        LOGGER.warn("overwriting %s" % (result_pathname,))
  
      return result_pathname
  
  def extract_sequence(inpathname, query_pathname, length, dry_run=False):
-    logging.info('extracting %d bases' %(length,))
-    logging.info('extracting from %s' %(inpathname,))
-    logging.info('extracting to %s' %(query_pathname,))
-    
-    if not dry_run: 
+    LOGGER.info('extracting %d bases' %(length,))
+    LOGGER.info('extracting from %s' %(inpathname,))
+    LOGGER.info('extracting to %s' %(query_pathname,))
+
+    if not dry_run:
          try:
              instream = open(inpathname, 'r')
              outstream = open(query_pathname, 'w')
-            gerald.extract_eland_sequence(instream, outstream, 0, length)
+            extract_eland_sequence(instream, outstream, 0, length)
          finally:
              outstream.close()
              instream.close()
@@ -49,7 +53,7 @@ def run_eland(length, query_name, genome, result_name, multi=False, dry_run=Fals
      if multi:
          cmdline += ['--multi']
  
-    logging.info('running eland: ' + " ".join(cmdline))
+    LOGGER.info('running eland: ' + " ".join(cmdline))
      if not dry_run:
          return subprocess.Popen(cmdline)
      else:
@@ -60,11 +64,16 @@ def rerun(gerald_dir, output_dir, length=25, dry_run=False):
      """
      look for eland files in gerald_dir and write a subset to output_dir
      """
-    logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
+    LOGGER.info("Extracting %d bp from files in %s" % (length, gerald_dir))
      g = gerald.gerald(gerald_dir)
  
+    # this will only work if we're only missing the last dir in output_dir
+    if not os.path.exists(output_dir):
+        LOGGER.info("Making %s" %(output_dir,))
+        if not dry_run: os.mkdir(output_dir)
+
      processes = []
-    for lane_id, lane_param in g.lanes.items():
+    for lane_id, lane_param in list(g.lanes.items()):
          eland = g.eland_results[lane_id]
  
          inpathname = eland.pathname
@@ -73,23 +82,23 @@ def rerun(gerald_dir, output_dir, length=25, dry_run=False):
  
          extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
  
-        p = run_eland(length, 
-                      query_pathname, 
-                      lane_param.eland_genome, 
-                      result_pathname, 
+        p = run_eland(length,
+                      query_pathname,
+                      lane_param.eland_genome,
+                      result_pathname,
                        dry_run=dry_run)
          if p is not None:
              processes.append(p)
  
      for p in processes:
          p.wait()
-        
+
  def make_parser():
-    usage = '%prog: --gerald <gerald dir> -o <new dir>'
+    usage = '%prog: [options] runfolder'
  
      parser = OptionParser(usage)
-    
-    parser.add_option('--gerald', 
+
+    parser.add_option('--gerald',
                        help='specify location of GERALD directory',
                        default=None)
      parser.add_option('-o', '--output',
@@ -115,17 +124,32 @@ def main(cmdline=None):
      parser = make_parser()
      opts, args = parser.parse_args(cmdline)
  
-    if opts.gerald is None:
+    if opts.length < 16 or opts.length > 32:
+        parser.error("eland can only process reads in the range 16-32")
+
+    if len(args) > 1:
+        parser.error("Can only process one runfolder directory")
+    elif len(args) == 1:
+        runs = runfolder.get_runs(args[0])
+        if len(runs) != 1:
+            parser.error("Not a runfolder")
+        opts.gerald = runs[0].gerald.pathname
+        if opts.output is None:
+            opts.output = os.path.join(
+                runs[0].pathname,
+                'Data',
+                # pythons 0..n ==> elands 1..n+1
+                'C1-%d' % (opts.length+1,)
+            )
+
+    elif opts.gerald is None:
          parser.error("need gerald directory")
-    
+
      if opts.output is None:
          parser.error("specify location for the new eland files")
  
-    if opts.length < 16 or opts.length > 32:
-        parser.error("eland can only process reads in the range 16-32")
-
      if opts.verbose:
-        root_logger = logging.getLogger()
+        root_logger = logging.getLogger('rerun_eland')
          root_logger.setLevel(logging.INFO)
  
      rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)