4 from optparse import OptionParser
9 from htsworkflow.pipelines import gerald
10 from htsworkflow.pipelines import runfolder
12 def make_query_filename(eland_obj, output_dir):
13 query_name = '%s_%s_eland_query.txt'
14 query_name %= (eland_obj.sample_name, eland_obj.lane_id)
16 query_pathname = os.path.join(output_dir, query_name)
18 if os.path.exists(query_pathname):
19 logging.warn("overwriting %s" % (query_pathname,))
23 def make_result_filename(eland_obj, output_dir):
24 result_name = '%s_%s_eland_result.txt'
25 result_name %= (eland_obj.sample_name, eland_obj.lane_id)
27 result_pathname = os.path.join(output_dir, result_name)
29 if os.path.exists(result_pathname):
30 logging.warn("overwriting %s" % (result_pathname,))
32 return result_pathname
34 def extract_sequence(inpathname, query_pathname, length, dry_run=False):
35 logging.info('extracting %d bases' %(length,))
36 logging.info('extracting from %s' %(inpathname,))
37 logging.info('extracting to %s' %(query_pathname,))
41 instream = open(inpathname, 'r')
42 outstream = open(query_pathname, 'w')
43 gerald.extract_eland_sequence(instream, outstream, 0, length)
48 def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
49 cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
51 cmdline += ['--multi']
53 logging.info('running eland: ' + " ".join(cmdline))
55 return subprocess.Popen(cmdline)
60 def rerun(gerald_dir, output_dir, length=25, dry_run=False):
62 look for eland files in gerald_dir and write a subset to output_dir
64 logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
65 g = gerald.gerald(gerald_dir)
67 # this will only work if we're only missing the last dir in output_dir
68 if not os.path.exists(output_dir):
69 logging.info("Making %s" %(output_dir,))
70 if not dry_run: os.mkdir(output_dir)
73 for lane_id, lane_param in g.lanes.items():
74 eland = g.eland_results[lane_id]
76 inpathname = eland.pathname
77 query_pathname = make_query_filename(eland, output_dir)
78 result_pathname = make_result_filename(eland, output_dir)
80 extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
84 lane_param.eland_genome,
94 usage = '%prog: [options] runfolder'
96 parser = OptionParser(usage)
98 parser.add_option('--gerald',
99 help='specify location of GERALD directory',
101 parser.add_option('-o', '--output',
102 help='specify output location of files',
104 parser.add_option('-l', '--read-length', type='int',
105 help='specify new eland length',
108 parser.add_option('--dry-run', action='store_true',
109 help='only pretend to run',
111 parser.add_option('-v', '--verbose', action='store_true',
112 help='increase verbosity',
118 def main(cmdline=None):
119 logging.basicConfig(level=logging.WARNING)
121 parser = make_parser()
122 opts, args = parser.parse_args(cmdline)
124 if opts.length < 16 or opts.length > 32:
125 parser.error("eland can only process reads in the range 16-32")
128 parser.error("Can only process one runfolder directory")
130 runs = runfolder.get_runs(args[0])
132 parser.error("Not a runfolder")
133 opts.gerald = runs[0].gerald.pathname
134 if opts.output is None:
135 opts.output = os.path.join(
138 # pythons 0..n ==> elands 1..n+1
139 'C1-%d' % (opts.length+1,)
142 elif opts.gerald is None:
143 parser.error("need gerald directory")
145 if opts.output is None:
146 parser.error("specify location for the new eland files")
149 root_logger = logging.getLogger()
150 root_logger.setLevel(logging.INFO)
152 rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
156 if __name__ == "__main__":
157 sys.exit(main(sys.argv[1:]))