4 from optparse import OptionParser
9 from htsworkflow.pipelines import gerald
10 from htsworkflow.pipelines.eland import extract_eland_sequence
11 from htsworkflow.pipelines import runfolder
13 LOGGER = logging.getLogger(__name__)
15 def make_query_filename(eland_obj, output_dir):
16 query_name = '%s_%s_eland_query.txt'
17 query_name %= (eland_obj.sample_name, eland_obj.lane_id)
19 query_pathname = os.path.join(output_dir, query_name)
21 if os.path.exists(query_pathname):
22 LOGGER.warn("overwriting %s" % (query_pathname,))
26 def make_result_filename(eland_obj, output_dir):
27 result_name = '%s_%s_eland_result.txt'
28 result_name %= (eland_obj.sample_name, eland_obj.lane_id)
30 result_pathname = os.path.join(output_dir, result_name)
32 if os.path.exists(result_pathname):
33 LOGGER.warn("overwriting %s" % (result_pathname,))
35 return result_pathname
37 def extract_sequence(inpathname, query_pathname, length, dry_run=False):
38 LOGGER.info('extracting %d bases' %(length,))
39 LOGGER.info('extracting from %s' %(inpathname,))
40 LOGGER.info('extracting to %s' %(query_pathname,))
44 instream = open(inpathname, 'r')
45 outstream = open(query_pathname, 'w')
46 extract_eland_sequence(instream, outstream, 0, length)
51 def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
52 cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
54 cmdline += ['--multi']
56 LOGGER.info('running eland: ' + " ".join(cmdline))
58 return subprocess.Popen(cmdline)
63 def rerun(gerald_dir, output_dir, length=25, dry_run=False):
65 look for eland files in gerald_dir and write a subset to output_dir
67 LOGGER.info("Extracting %d bp from files in %s" % (length, gerald_dir))
68 g = gerald.gerald(gerald_dir)
70 # this will only work if we're only missing the last dir in output_dir
71 if not os.path.exists(output_dir):
72 LOGGER.info("Making %s" %(output_dir,))
73 if not dry_run: os.mkdir(output_dir)
76 for lane_id, lane_param in g.lanes.items():
77 eland = g.eland_results[lane_id]
79 inpathname = eland.pathname
80 query_pathname = make_query_filename(eland, output_dir)
81 result_pathname = make_result_filename(eland, output_dir)
83 extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
87 lane_param.eland_genome,
97 usage = '%prog: [options] runfolder'
99 parser = OptionParser(usage)
101 parser.add_option('--gerald',
102 help='specify location of GERALD directory',
104 parser.add_option('-o', '--output',
105 help='specify output location of files',
107 parser.add_option('-l', '--read-length', type='int',
108 help='specify new eland length',
111 parser.add_option('--dry-run', action='store_true',
112 help='only pretend to run',
114 parser.add_option('-v', '--verbose', action='store_true',
115 help='increase verbosity',
121 def main(cmdline=None):
122 logging.basicConfig(level=logging.WARNING)
124 parser = make_parser()
125 opts, args = parser.parse_args(cmdline)
127 if opts.length < 16 or opts.length > 32:
128 parser.error("eland can only process reads in the range 16-32")
131 parser.error("Can only process one runfolder directory")
133 runs = runfolder.get_runs(args[0])
135 parser.error("Not a runfolder")
136 opts.gerald = runs[0].gerald.pathname
137 if opts.output is None:
138 opts.output = os.path.join(
141 # pythons 0..n ==> elands 1..n+1
142 'C1-%d' % (opts.length+1,)
145 elif opts.gerald is None:
146 parser.error("need gerald directory")
148 if opts.output is None:
149 parser.error("specify location for the new eland files")
152 root_logger = logging.getLogger('rerun_eland')
153 root_logger.setLevel(logging.INFO)
155 rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
159 if __name__ == "__main__":
160 sys.exit(main(sys.argv[1:]))