From c514bfc62b3534b5f5cb90d49832d4cc6a5d22e2 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Thu, 15 May 2008 00:32:47 +0000 Subject: [PATCH] add rerun_eland.py which extracts sub-sequences from eland files and runs eland on them with a new sequence length. The script also helpfully uses the gerald config file to figure out the correct genome path. --- gaworkflow/pipeline/gerald.py | 12 +++ scripts/elandseq | 13 +--- scripts/rerun_eland.py | 136 ++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 11 deletions(-) create mode 100644 scripts/rerun_eland.py diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py index e7e09de..8364607 100644 --- a/gaworkflow/pipeline/gerald.py +++ b/gaworkflow/pipeline/gerald.py @@ -537,6 +537,18 @@ class ElandLane(object): else: logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) +def extract_eland_sequence(instream, outstream, start, end): + """ + Extract a chunk of sequence out of an eland file + """ + for line in instream: + record = line.split() + if len(record) > 1: + result = [record[0], record[1][start:end]] + else: + result = [record[0][start:end]] + outstream.write("\t".join(result)) + outstream.write(os.linesep) class ELAND(object): """ diff --git a/scripts/elandseq b/scripts/elandseq index 1dedc38..ece4bf3 100755 --- a/scripts/elandseq +++ b/scripts/elandseq @@ -3,16 +3,7 @@ import optparse import os import sys -def extract_sequence(instream, outstream, start, end): - for line in instream: - record = line.split() - if len(record) > 1: - result = [record[0], record[1][start:end]] - else: - result = [record[0][start:end]] - outstream.write("\t".join(result)) - outstream.write(os.linesep) - +from gaworkflow.pipeline.gerald import extract_eland_sequence def make_parser(): usage = "usage: %prog [options] infile [outfile]" @@ -53,7 +44,7 @@ def main(argv): else: outstream = sys.stdout - extract_sequence(instream, outstream, start, end) + extract_eland_sequence(instream, outstream, start, end) if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/scripts/rerun_eland.py b/scripts/rerun_eland.py new file mode 100644 index 0000000..25aecc4 --- /dev/null +++ b/scripts/rerun_eland.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +import logging +from optparse import OptionParser +import os +import subprocess +import sys + +from gaworkflow.pipeline import gerald + +def make_query_filename(eland_obj, output_dir): + query_name = '%s_%s_eland_query.txt' + query_name %= (eland_obj.sample_name, eland_obj.lane_id) + + query_pathname = os.path.join(output_dir, query_name) + + if os.path.exists(query_pathname): + logging.warn("overwriting %s" % (query_pathname,)) + + return query_pathname + +def make_result_filename(eland_obj, output_dir): + result_name = '%s_%s_eland_result.txt' + result_name %= (eland_obj.sample_name, eland_obj.lane_id) + + result_pathname = os.path.join(output_dir, result_name) + + if os.path.exists(result_pathname): + logging.warn("overwriting %s" % (result_pathname,)) + + return result_pathname + +def extract_sequence(inpathname, query_pathname, length, dry_run=False): + logging.info('extracting %d bases' %(length,)) + logging.info('extracting from %s' %(inpathname,)) + logging.info('extracting to %s' %(query_pathname,)) + + if not dry_run: + try: + instream = open(inpathname, 'r') + outstream = open(query_pathname, 'w') + gerald.extract_eland_sequence(instream, outstream, 0, length) + finally: + outstream.close() + instream.close() + +def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False): + cmdline = ['eland_%d' % (length,), query_name, genome, result_name] + if multi: + cmdline += ['--multi'] + + logging.info('running eland: ' + " ".join(cmdline)) + if not dry_run: + return subprocess.Popen(cmdline) + else: + return None + + +def rerun(gerald_dir, output_dir, length=25, dry_run=False): + """ + look for eland files in gerald_dir and write a subset to output_dir + """ + logging.info("Extracting %d bp from files in %s" % (length, gerald_dir)) + g = gerald.gerald(gerald_dir) + + processes = [] + for lane_id, lane_param in g.lanes.items(): + eland = g.eland_results[lane_id] + + inpathname = eland.pathname + query_pathname = make_query_filename(eland, output_dir) + result_pathname = make_result_filename(eland, output_dir) + + extract_sequence(inpathname, query_pathname, length, dry_run=dry_run) + + p = run_eland(length, + query_pathname, + lane_param.eland_genome, + result_pathname, + dry_run=dry_run) + if p is not None: + processes.append(p) + + for p in processes: + p.wait() + +def make_parser(): + usage = '%prog: --gerald -o ' + + parser = OptionParser(usage) + + parser.add_option('--gerald', + help='specify location of GERALD directory', + default=None) + parser.add_option('-o', '--output', + help='specify output location of files', + default=None) + parser.add_option('-l', '--read-length', type='int', + help='specify new eland length', + dest='length', + default=25) + parser.add_option('--dry-run', action='store_true', + help='only pretend to run', + default=False) + parser.add_option('-v', '--verbose', action='store_true', + help='increase verbosity', + default=False) + + return parser + + +def main(cmdline=None): + logging.basicConfig(level=logging.WARNING) + + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + if opts.gerald is None: + parser.error("need gerald directory") + + if opts.output is None: + parser.error("specify location for the new eland files") + + if opts.length < 16 or opts.length > 32: + parser.error("eland can only process reads in the range 16-32") + + if opts.verbose: + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + + rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run) + + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) -- 2.30.2