Merge branch 'master' into debianized
[htsworkflow.git] / scripts / rerun_eland.py
1 #!/usr/bin/env python
2
3 import logging
4 from optparse import OptionParser
5 import os
6 import subprocess
7 import sys
8
9 from htsworkflow.pipelines import gerald
10 from htsworkflow.pipelines.eland import extract_eland_sequence
11 from htsworkflow.pipelines import runfolder
12
13 def make_query_filename(eland_obj, output_dir):
14     query_name = '%s_%s_eland_query.txt' 
15     query_name %= (eland_obj.sample_name, eland_obj.lane_id)
16
17     query_pathname = os.path.join(output_dir, query_name)
18     
19     if os.path.exists(query_pathname):
20         logging.warn("overwriting %s" % (query_pathname,))
21
22     return query_pathname
23
24 def make_result_filename(eland_obj, output_dir):
25     result_name = '%s_%s_eland_result.txt' 
26     result_name %= (eland_obj.sample_name, eland_obj.lane_id)
27
28     result_pathname = os.path.join(output_dir, result_name)
29     
30     if os.path.exists(result_pathname):
31         logging.warn("overwriting %s" % (result_pathname,))
32
33     return result_pathname
34
35 def extract_sequence(inpathname, query_pathname, length, dry_run=False):
36     logging.info('extracting %d bases' %(length,))
37     logging.info('extracting from %s' %(inpathname,))
38     logging.info('extracting to %s' %(query_pathname,))
39     
40     if not dry_run: 
41         try:
42             instream = open(inpathname, 'r')
43             outstream = open(query_pathname, 'w')
44             extract_eland_sequence(instream, outstream, 0, length)
45         finally:
46             outstream.close()
47             instream.close()
48     
49 def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
50     cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
51     if multi:
52         cmdline += ['--multi']
53
54     logging.info('running eland: ' + " ".join(cmdline))
55     if not dry_run:
56         return subprocess.Popen(cmdline)
57     else:
58         return None
59
60
61 def rerun(gerald_dir, output_dir, length=25, dry_run=False):
62     """
63     look for eland files in gerald_dir and write a subset to output_dir
64     """
65     logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
66     g = gerald.gerald(gerald_dir)
67
68     # this will only work if we're only missing the last dir in output_dir
69     if not os.path.exists(output_dir):
70         logging.info("Making %s" %(output_dir,))
71         if not dry_run: os.mkdir(output_dir)
72
73     processes = []
74     for lane_id, lane_param in g.lanes.items():
75         eland = g.eland_results[lane_id]
76
77         inpathname = eland.pathname
78         query_pathname = make_query_filename(eland, output_dir)
79         result_pathname = make_result_filename(eland, output_dir)
80
81         extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
82
83         p = run_eland(length, 
84                       query_pathname, 
85                       lane_param.eland_genome, 
86                       result_pathname, 
87                       dry_run=dry_run)
88         if p is not None:
89             processes.append(p)
90
91     for p in processes:
92         p.wait()
93         
94 def make_parser():
95     usage = '%prog: [options] runfolder'
96
97     parser = OptionParser(usage)
98     
99     parser.add_option('--gerald', 
100                       help='specify location of GERALD directory',
101                       default=None)
102     parser.add_option('-o', '--output',
103                       help='specify output location of files',
104                       default=None)
105     parser.add_option('-l', '--read-length', type='int',
106                       help='specify new eland length',
107                       dest='length',
108                       default=25)
109     parser.add_option('--dry-run', action='store_true',
110                       help='only pretend to run',
111                       default=False)
112     parser.add_option('-v', '--verbose', action='store_true',
113                       help='increase verbosity',
114                       default=False)
115
116     return parser
117
118
119 def main(cmdline=None):
120     logging.basicConfig(level=logging.WARNING)
121
122     parser = make_parser()
123     opts, args = parser.parse_args(cmdline)
124
125     if opts.length < 16 or opts.length > 32:
126         parser.error("eland can only process reads in the range 16-32")
127
128     if len(args) > 1:
129         parser.error("Can only process one runfolder directory")
130     elif len(args) == 1:
131         runs = runfolder.get_runs(args[0])
132         if len(runs) != 1:
133             parser.error("Not a runfolder")
134         opts.gerald = runs[0].gerald.pathname
135         if opts.output is None:
136             opts.output = os.path.join(
137                 runs[0].pathname, 
138                 'Data', 
139                 # pythons 0..n ==> elands 1..n+1
140                 'C1-%d' % (opts.length+1,) 
141             )
142
143     elif opts.gerald is None:
144         parser.error("need gerald directory")
145     
146     if opts.output is None:
147         parser.error("specify location for the new eland files")
148
149     if opts.verbose:
150         root_logger = logging.getLogger()
151         root_logger.setLevel(logging.INFO)
152
153     rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
154
155     return 0
156
157 if __name__ == "__main__":
158     sys.exit(main(sys.argv[1:]))