25aecc445ddc143b74622155444c2da7af72dceb
[htsworkflow.git] / scripts / rerun_eland.py
1 #!/usr/bin/env python
2
3 import logging
4 from optparse import OptionParser
5 import os
6 import subprocess
7 import sys
8
9 from gaworkflow.pipeline import gerald
10
11 def make_query_filename(eland_obj, output_dir):
12     query_name = '%s_%s_eland_query.txt' 
13     query_name %= (eland_obj.sample_name, eland_obj.lane_id)
14
15     query_pathname = os.path.join(output_dir, query_name)
16     
17     if os.path.exists(query_pathname):
18         logging.warn("overwriting %s" % (query_pathname,))
19
20     return query_pathname
21
22 def make_result_filename(eland_obj, output_dir):
23     result_name = '%s_%s_eland_result.txt' 
24     result_name %= (eland_obj.sample_name, eland_obj.lane_id)
25
26     result_pathname = os.path.join(output_dir, result_name)
27     
28     if os.path.exists(result_pathname):
29         logging.warn("overwriting %s" % (result_pathname,))
30
31     return result_pathname
32
33 def extract_sequence(inpathname, query_pathname, length, dry_run=False):
34     logging.info('extracting %d bases' %(length,))
35     logging.info('extracting from %s' %(inpathname,))
36     logging.info('extracting to %s' %(query_pathname,))
37     
38     if not dry_run: 
39         try:
40             instream = open(inpathname, 'r')
41             outstream = open(query_pathname, 'w')
42             gerald.extract_eland_sequence(instream, outstream, 0, length)
43         finally:
44             outstream.close()
45             instream.close()
46
47 def run_eland(length, query_name, genome, result_name, multi=False, dry_run=False):
48     cmdline = ['eland_%d' % (length,), query_name, genome, result_name]
49     if multi:
50         cmdline += ['--multi']
51
52     logging.info('running eland: ' + " ".join(cmdline))
53     if not dry_run:
54         return subprocess.Popen(cmdline)
55     else:
56         return None
57
58
59 def rerun(gerald_dir, output_dir, length=25, dry_run=False):
60     """
61     look for eland files in gerald_dir and write a subset to output_dir
62     """
63     logging.info("Extracting %d bp from files in %s" % (length, gerald_dir))
64     g = gerald.gerald(gerald_dir)
65
66     processes = []
67     for lane_id, lane_param in g.lanes.items():
68         eland = g.eland_results[lane_id]
69
70         inpathname = eland.pathname
71         query_pathname = make_query_filename(eland, output_dir)
72         result_pathname = make_result_filename(eland, output_dir)
73
74         extract_sequence(inpathname, query_pathname, length, dry_run=dry_run)
75
76         p = run_eland(length, 
77                       query_pathname, 
78                       lane_param.eland_genome, 
79                       result_pathname, 
80                       dry_run=dry_run)
81         if p is not None:
82             processes.append(p)
83
84     for p in processes:
85         p.wait()
86         
87 def make_parser():
88     usage = '%prog: --gerald <gerald dir> -o <new dir>'
89
90     parser = OptionParser(usage)
91     
92     parser.add_option('--gerald', 
93                       help='specify location of GERALD directory',
94                       default=None)
95     parser.add_option('-o', '--output',
96                       help='specify output location of files',
97                       default=None)
98     parser.add_option('-l', '--read-length', type='int',
99                       help='specify new eland length',
100                       dest='length',
101                       default=25)
102     parser.add_option('--dry-run', action='store_true',
103                       help='only pretend to run',
104                       default=False)
105     parser.add_option('-v', '--verbose', action='store_true',
106                       help='increase verbosity',
107                       default=False)
108
109     return parser
110
111
112 def main(cmdline=None):
113     logging.basicConfig(level=logging.WARNING)
114
115     parser = make_parser()
116     opts, args = parser.parse_args(cmdline)
117
118     if opts.gerald is None:
119         parser.error("need gerald directory")
120     
121     if opts.output is None:
122         parser.error("specify location for the new eland files")
123
124     if opts.length < 16 or opts.length > 32:
125         parser.error("eland can only process reads in the range 16-32")
126
127     if opts.verbose:
128         root_logger = logging.getLogger()
129         root_logger.setLevel(logging.INFO)
130
131     rerun(opts.gerald, opts.output, opts.length, dry_run=opts.dry_run)
132
133     return 0
134
135 if __name__ == "__main__":
136     sys.exit(main(sys.argv[1:]))