scripts/rerun_eland.py

   1 #!/usr/bin/env python
   2
   3 import logging
   4 from optparse import OptionParser
   5 import os
   6 import subprocess
   7 import sys
   8
   9 from htsworkflow.pipelines import gerald
  10 from htsworkflow.pipelines.eland import extract_eland_sequence
  11 from htsworkflow.pipelines import runfolder
  12
# Module-level logger named after this module. When the file is executed
# directly as a script, __name__ (and therefore the logger name) is "__main__".
LOGGER = logging.getLogger(__name__)
  14
  15 def make_query_filename(eland_obj, output_dir):
  16     query_name = '%s_%s_eland_query.txt'
  17     query_name %= (eland_obj.sample_name, eland_obj.lane_id)
  18
  19     query_pathname = os.path.join(output_dir, query_name)
  20
  21     if os.path.exists(query_pathname):
  22         LOGGER.warn("overwriting %s" % (query_pathname,))
  23
  24     return query_pathname
  25
  26 def make_result_filename(eland_obj, output_dir):
  27     result_name = '%s_%s_eland_result.txt'
  28     result_name %= (eland_obj.sample_name, eland_obj.lane_id)
  29
  30     result_pathname = os.path.join(output_dir, result_name)
  31
  32     if os.path.exists(result_pathname):
  33         LOGGER.warn("overwriting %s" % (result_pathname,))
  34
  35     return result_pathname
  36
  37 def extract_sequence(inpathname, query_pathname, length, dry_run=False):
  38     LOGGER.info('extracting %d bases' %(length,))
  39     LOGGER.info('extracting from %s' %(inpathname,))
  40     LOGGER.info('extracting to %s' %(query_pathname,))
  41
  42     if not dry_run:
  43         try:
  44             instream = open(inpathname, 'r')
  45             outstream = open(query_pathname, 'w')
  46             extract_eland_sequence(instream, outstream, 0, length)
  47         finally:
  48             outstream.close()
  49             instream.close()
  50
  51 def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
  52     cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
  53     if multi:
  54         cmdline += ['--multi']
  55
  56     LOGGER.info('running eland: ' + " ".join(cmdline))
  57     if not dry_run:
  58         return subprocess.Popen(cmdline)
  59     else:
  60         return None
  61
  62
  63 def rerun(gerald_dir, output_dir, length=25, dry_run=False):
  64     """
  65     look for eland files in gerald_dir and write a subset to output_dir
  66     """
  67     LOGGER.info("Extracting %d bp from files in %s" % (length, gerald_dir))
  68     g = gerald.gerald(gerald_dir)
  69
  70     # this will only work if we're only missing the last dir in output_dir
  71     if not os.path.exists(output_dir):
  72         LOGGER.info("Making %s" %(output_dir,))
  73         if not dry_run: os.mkdir(output_dir)
  74
  75     processes = []
  76     for lane_id, lane_param in g.lanes.items():
  77         eland = g.eland_results[lane_id]
  78
  79         inpathname = eland.pathname
  80         query_pathname = make_query_filename(eland, output_dir)
  81         result_pathname = make_result_filename(eland, output_dir)
  82
  83         extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
  84
  85         p = run_eland(length,
  86                       query_pathname,
  87                       lane_param.eland_genome,
  88                       result_pathname,
  89                       dry_run=dry_run)
  90         if p is not None:
  91             processes.append(p)
  92
  93     for p in processes:
  94         p.wait()
  95
  96 def make_parser():
  97     usage = '%prog: [options] runfolder'
  98
  99     parser = OptionParser(usage)
 100
 101     parser.add_option('--gerald',
 102                       help='specify location of GERALD directory',
 103                       default=None)
 104     parser.add_option('-o', '--output',
 105                       help='specify output location of files',
 106                       default=None)
 107     parser.add_option('-l', '--read-length', type='int',
 108                       help='specify new eland length',
 109                       dest='length',
 110                       default=25)
 111     parser.add_option('--dry-run', action='store_true',
 112                       help='only pretend to run',
 113                       default=False)
 114     parser.add_option('-v', '--verbose', action='store_true',
 115                       help='increase verbosity',
 116                       default=False)
 117
 118     return parser
 119
 120
 121 def main(cmdline=None):
 122     logging.basicConfig(level=logging.WARNING)
 123
 124     parser = make_parser()
 125     opts, args = parser.parse_args(cmdline)
 126
 127     if opts.length < 16 or opts.length > 32:
 128         parser.error("eland can only process reads in the range 16-32")
 129
 130     if len(args) > 1:
 131         parser.error("Can only process one runfolder directory")
 132     elif len(args) == 1:
 133         runs = runfolder.get_runs(args[0])
 134         if len(runs) != 1:
 135             parser.error("Not a runfolder")
 136         opts.gerald = runs[0].gerald.pathname
 137         if opts.output is None:
 138             opts.output = os.path.join(
 139                 runs[0].pathname,
 140                 'Data',
 141                 # pythons 0..n ==> elands 1..n+1
 142                 'C1-%d' % (opts.length+1,)
 143             )
 144
 145     elif opts.gerald is None:
 146         parser.error("need gerald directory")
 147
 148     if opts.output is None:
 149         parser.error("specify location for the new eland files")
 150
 151     if opts.verbose:
 152         root_logger = logging.getLogger('rerun_eland')
 153         root_logger.setLevel(logging.INFO)
 154
 155     rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
 156
 157     return 0
 158
 159 if __name__ == "__main__":
 160     sys.exit(main(sys.argv[1:]))