Utility to create srf files from a bustard directory

author Diane Trout <diane@caltech.edu>

Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)

committer Diane Trout <diane@caltech.edu>

Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)
author Diane Trout <diane@caltech.edu>
Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)
committer Diane Trout <diane@caltech.edu>
Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)
diff --git a/gaworkflow/util/queuecommands.py b/gaworkflow/util/queuecommands.py

index 4873a51e5bb63f7e843b34449cfe7e2674c76f65..0c342920590079a561130465af81b3eefa8f1a98 100644 (file)
--- a/gaworkflow/util/queuecommands.py
+++ b/gaworkflow/util/queuecommands.py
@@ -14,7 +14,7 @@ class QueueCommands(object):
      finish.
      """
  
-    def __init__(self, cmd_list, N=0):
+    def __init__(self, cmd_list, N=0, cwd=None):
          """
          cmd_list is a list of elements suitable for subprocess
          N is the  number of simultanious processes to run. 
@@ -27,6 +27,7 @@ class QueueCommands(object):
          self.to_run = cmd_list[:]
          self.running = {}
          self.N = N
+        self.cwd = cwd
  
      def under_process_limit(self):
          """
@@ -46,10 +47,11 @@ class QueueCommands(object):
          (or have run out of jobs)
          """
          queue_log = logging.getLogger('queue')
+        queue_log.info('using %s as cwd' % (self.cwd,))
  
          while (len(self.to_run) > 0) and self.under_process_limit():
              cmd = self.to_run.pop(0)
-            p = subprocess.Popen(cmd, stdout=PIPE)
+            p = subprocess.Popen(cmd, stdout=PIPE, cwd=self.cwd, shell=True)
              self.running[p.stdout] = p
              queue_log.info("Created process %d from %s" % (p.pid, str(cmd)))
  
diff --git a/scripts/srf b/scripts/srf

new file mode 100644 (file)

index 0000000..fcca2ea
--- /dev/null
+++ b/scripts/srf
@@ -0,0 +1,157 @@
+#!/usr/bin/python
+
+from glob import glob
+import logging
+import optparse
+import os
+import subprocess
+import sys
+
+from gaworkflow.util import queuecommands
+
+def make_commands(run_name, lanes, site_name, destdir):
+  """
+  make a subprocess-friendly list of command line arguments to run solexa2srf
+  generates files like: 
+  woldlab:080514_HWI-EAS229_0029_20768AAXX-8.srf
+   site        run name                    lane
+             
+  run_name - most of the file name (run folder name is a good choice)
+  lanes - list of integers corresponding to which lanes to process
+  site_name - name of your "sequencing site" or "Individual"
+  destdir - where to write all the srf files
+  """
+  cmd_list = []
+  for lane in lanes:
+    name_prefix = '%s_%%l_%%t_' % (run_name,)
+    destname = '%s:%s-%d.srf' % (site_name, run_name, lane)
+    destdir = os.path.normpath(destdir)
+    dest_path = os.path.join(destdir, destname)
+    seq_pattern = 's_%d_*_seq.txt' % (lane,)
+
+    cmd = ['solexa2srf', 
+           '-N', name_prefix,
+           '-n', '%3x:%3y', 
+           '-o', dest_path, 
+          seq_pattern]
+
+    cmd_list.append(" ".join(cmd))
+  return cmd_list
+
+def pathname_to_run_name(pathname):
+  """
+  Convert a pathname to a base runfolder name
+  handle the case with a trailing /
+  """
+  name = None
+  while name is None:
+    base, name = os.path.split(pathname)
+    if len(base) == 0:
+      return None
+  return name
+
+def find_bustard_dir(pathname):
+  # fixme: to avoid repeating ourselves, this should somehow be related
+  # fixme: to pipeline.runfolder
+
+  datadir = os.path.join(pathname, 'Data')
+  logging.info("searching in %s" % (datadir,))
+  
+  bustard_dirs = []
+  for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+    bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+    for bustard_pathname in glob(bustard_glob):
+      bustard_dirs.append(bustard_pathname)
+  return bustard_dirs
+    
+
+
+def make_parser():
+  usage = '%prog: [options] runfolder -l 1,2,3 [runfolder -l 5,6 ...]'
+
+  parser = optparse.OptionParser(usage)
+  parser.add_option('--dry-run', action='store_true',
+                    help='print what would be done',
+                    default=False)
+
+  parser.add_option('-d', '--dest-dir', dest='dest_dir',
+                    help='location to write srf files to',
+                    default='.')
+  parser.add_option('-s', '--site',
+                    help='site name',
+                    default='Individual')
+  parser.add_option('-l', '--lanes', dest='lanes', action="append",
+         default=[],
+         help='comma seperated list of lanes to add to srf'
+  )
+  parser.add_option('-j', '--jobs', default=1, type='int',
+                    help='how many jobs to run simultaneously')
+                     
+
+  return parser
+
+def parse_lane_arg(lane_arg):
+    """
+    Convert a comma-separated list of lane ids to a list of integers
+    """
+    lanes = []
+    for lane in lane_arg.split(','):
+        try:
+            lane = int(lane)
+            if lane < 1 or lane > 8:
+                parser.error('Lanes must be in range [1..8]')
+            lanes.append(lane)
+        except ValueError:
+            parser.error('Lane selections must be integers')
+    return lanes
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+  
+    if len(args) == 0:
+        parser.error('need runfolder arguments')
+
+    # parse lane arguments
+    lanes_list = []
+    if len(opts.lanes) == 0:
+        lanes_list = [[1,2,3,4,5,6,7,8]] * len(args)
+    elif len(opts.lanes) == len(args):
+        for lane_arg in opts.lanes:
+            lanes_list.append(parse_lane_arg(lane_arg))
+    else:
+        parser.error(
+          "Number of lane arguments must match number of runfolders"
+        )
+    
+    # build list of commands
+    cmds = {}
+    for runfolder_path, lanes in zip(args, lanes_list):
+        name = pathname_to_run_name(runfolder_path)
+        bustard_dir = find_bustard_dir(runfolder_path)
+        if len(bustard_dir) == 1:
+          bustard_dir = bustard_dir[0]
+          cmds[bustard_dir] = make_commands(name, lanes, opts.site, opts.dest_dir)
+        else:
+          print 'ERROR: Too many bustard directories'
+          print "\n ".join(bustard_dir)
+          return 1
+
+    if not opts.dry_run:
+      for cwd, cmd_list in cmds.items():
+        curdir = os.getcwd()
+        os.chdir(cwd)
+        q = queuecommands.QueueCommands(cmd_list, opts.jobs)
+        q.start_jobs()
+        os.chdir(curdir)
+    else:
+      for cwd, cmd_list in cmds.items():
+        print cwd
+        print cmd_list
+        print 'jobs: ', opts.jobs
+
+    return 0
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    sys.exit(main(sys.argv[1:]))
author	Diane Trout <diane@caltech.edu>
	Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)
committer	Diane Trout <diane@caltech.edu>
	Thu, 14 Aug 2008 00:09:09 +0000 (00:09 +0000)
gaworkflow/util/queuecommands.py		patch \| blob \| history
scripts/srf	[new file with mode: 0644]	patch \| blob