Remove ReadPrep and Thumbnail_Images being generated now.
[htsworkflow.git] / scripts / htsw-runfolder
1 #!/usr/bin/env python
2 """htsw-runfolder archives summary information from a runfolder.
3 The information currently being captured includes:
4
5   * Flowcell ID
6   * run dates
7   * start/stop cycle numbers
8   * Firecrest, bustard, gerald version numbers
9   * Eland analysis types, and everything in the eland configuration file.
10   * cluster numbers and other values from the Summary.htm
11     LaneSpecificParameters table.
12   * How many reads mapped to a genome from an eland file
13
14
The ELAND "mapped reads" counter will also check for eland squashed files
that were symlinked from another directory. This lets me track how
many reads landed on the genome of interest and on the spike-ins.
18
Basically my subdirectories look something like:
20
21 genomes/hg18
22 genomes/hg18/chr*.2bpb <- files for hg18 genome
23 genomes/hg18/chr*.vld
24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
25 genomes/spikein
26
27 htsw-runfolder can also spit out a simple summary report (-s option)
28 that contains the per lane post filter cluster numbers and the mapped
29 read counts. (The report isn't currently very pretty)
30
31 In addition if you provide a --site name it will also archive the raw
32 reads.
33 """
34 from glob import glob
35 import logging
36 import optparse
37 import os
38 import sys
39
40 from htsworkflow.pipelines import runfolder
41 from htsworkflow.pipelines.runfolder import ElementTree
42
43
def main(cmdlist=None):
    """Run the htsw-runfolder command line tool.

    Parses the command line, gathers runs from every source that was
    specified (--run-xml, --use-run and positional runfolder patterns) and
    then executes each requested command against those runs.

    :param cmdlist: list of command line arguments to parse; when None,
                    optparse falls back to sys.argv[1:].
    :returns: 0 on success.  Usage problems are reported through
              parser.error(), which prints the message and exits.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdlist)

    logging.basicConfig()
    root_log = logging.getLogger()
    # --debug takes precedence over --verbose.
    if opts.debug:
        root_log.setLevel(logging.DEBUG)
    elif opts.verbose:
        root_log.setLevel(logging.INFO)

    logging.info('Starting htsworkflow illumina runfolder processing tool.')
    runs = []
    runs.extend(load_run_xml_file(parser, args, opts))
    runs.extend(load_specific_runfolder_analysis(parser, args, opts))
    runs.extend(load_runfolders(parser, args, opts))

    if not runs:
        parser.error("Please specify some run folders to process")

    # Commands are not mutually exclusive; run every one that was requested.
    command_run = False
    if opts.summary:
        # Single-argument call form prints identically on Python 2 and 3.
        print(runfolder.summary_report(runs))
        command_run = True
    if opts.archive:
        runfolder.extract_run_parameters(runs)
        command_run = True
    if opts.extract_results:
        command_run = True
        extract_results(parser, args, opts, runs)
    if opts.clean:
        runfolder.clean_runs(runs, opts.dry_run)
        command_run = True

    if not command_run:
        # OptionParser has no perror() method; error() is the usage-error hook.
        parser.error("No commands provided")

    return 0
82
83
def load_run_xml_file(parser, args, opts):
    """Load a run from the xml file named by --run-xml, if one was given.

    :param parser: the option parser (unused here; kept so all the load_*
                   helpers share one signature).
    :param args: positional command line arguments (unused here).
    :param opts: parsed options; opts.run_xml is read and, when set,
                 rewritten with any leading ~ expanded.
    :returns: a list holding one runfolder.PipelineRun built from the parsed
              xml root, or an empty list when --run-xml was not provided.
    """
    runs = []
    if opts.run_xml:
        # handle ~ shortcut
        opts.run_xml = os.path.expanduser(opts.run_xml)
        tree = ElementTree.parse(opts.run_xml).getroot()
        runs.append(runfolder.PipelineRun(xml=tree))
    return runs
92
93
def load_specific_runfolder_analysis(parser, args, opts):
    """Load the manually specified run named by --use-run, if one was given.

    :param parser: the option parser (unused here; kept so all the load_*
                   helpers share one signature).
    :param args: positional command line arguments (unused here).
    :param opts: parsed options; opts.use_run is passed to
                 runfolder.get_specific_run().
    :returns: a list holding the run that was found, or an empty list when
              --use-run was not given or no run could be located (a warning
              is logged in the latter case).
    """
    runs = []
    if opts.use_run is not None:
        specific_run = runfolder.get_specific_run(opts.use_run)
        if specific_run is not None:
            runs.append(specific_run)
        else:
            # logging.warn is a deprecated alias; warning() is the supported name.
            logging.warning("Couldn't find a run in %s", opts.use_run)
    return runs
104
105
def load_runfolders(parser, args, opts):
    """Scan the runfolders named on the command line for runs.

    Each positional argument is treated as a glob pattern and expanded here,
    so patterns the shell left unexpanded still work.

    :param parser: the option parser, used to report a usage error when a
                   flowcell id is forced while not operating on exactly one
                   argument.
    :param args: runfolder paths or glob patterns.
    :param opts: parsed options; opts.flowcell_id is handed to
                 runfolder.get_runs() for every directory.
    :returns: list of all runs found across the matching directories.
    """
    forcing_flowcell = opts.flowcell_id is not None
    if forcing_flowcell and len(args) != 1:
        parser.error(
            'Can only force flowcell ID when operating on one run')

    found = []
    for pattern in args:
        matching_dirs = glob(pattern)
        for directory in matching_dirs:
            found += runfolder.get_runs(directory, opts.flowcell_id)
    return found
118
119
def extract_results(parser, args, opts, runs):
    """Hand the collected runs to runfolder.extract_results().

    :param parser: the option parser, used to reject --dry-run, which this
                   command does not support.
    :param args: positional command line arguments (unused here).
    :param opts: parsed options supplying output_dir, site, max_jobs and
                 raw_format.
    :param runs: the runs to extract results from.
    """
    if opts.dry_run:
        parser.error("Dry-run is not supported for extract-results")

    extraction_args = (
        runs,
        opts.output_dir,
        opts.site,
        opts.max_jobs,
        opts.raw_format,
    )
    runfolder.extract_results(*extraction_args)
128
129
def make_parser():
    """Build the optparse parser for the htsw-runfolder command line.

    The options fall into general flags (verbose/debug/dry-run), a
    'Commands' group selecting what to do (summary, archive,
    extract-results, clean) and settings that tune those commands.

    :returns: a configured optparse.OptionParser.
    """
    usage = 'usage: %prog [options] runfolder_root_dir'
    parser = optparse.OptionParser(usage)

    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
                      default=False,
                      help='turn on verbose mode')
    parser.add_option('--debug', action='store_true',
                      default=False,
                      help='turn on debug logging (implies verbose)')
    parser.add_option('--dry-run', action='store_true', default=False,
                      help="Don't delete anything (in clean mode)")

    commands = optparse.OptionGroup(parser, 'Commands')

    commands.add_option('-s', '--summary', dest='summary', action='store_true',
                        default=False,
                        help='produce summary report')
    commands.add_option('-a', '--archive', dest='archive', action='store_true',
                        default=False,
                        help='generate run configuration archive')
    commands.add_option('--extract-results', action='store_true',
                        default=False,
                        help='create run-xml summary, compress the eland '
                             'result files, build srf files and copy all that '
                             'and the Summary.htm file into an archival '
                             'directory.')
    commands.add_option('-c', '--clean', action='store_true', default=False,
                        help='Clean runfolder, preparing it for '
                             'long-term storage')
    parser.add_option_group(commands)

    parser.add_option('-f', '--flowcell-id', default=None,
                      help='force a particular flowcell id')
    # type='int' keeps a command line value consistent with the int default;
    # without it "-j 4" would be stored as the string '4'.
    parser.add_option('-j', '--max-jobs', type='int', default=1,
                      help='specify the maximum number of processes to run '
                           '(used in extract-results)')
    parser.add_option('-o', '--output-dir', default=None,
                      help="specify the default output directory for extract results")
    parser.add_option('--run-xml', dest='run_xml',
                      default=None,
                      help='specify a run_<FlowCell>.xml file for summary reports')
    parser.add_option('--site', default=None,
                      help='create srf files tagged with the provided '
                           'site name')
    parser.add_option('--raw-format', dest="raw_format", default='qseq',
                      choices=['qseq', 'srf'],
                      help='Specify which type of raw format to use. '
                           'Currently supported options: qseq, srf')
    parser.add_option('-u', '--use-run', dest='use_run', default=None,
                      help='Specify which run to use instead of autoscanning '
                           'the runfolder. You do this by providing the final '
                           ' GERALD directory, and it assumes the parent '
                           'directories are the bustard and image processing '
                           'directories.')

    return parser
187
if __name__ == "__main__":
    # Script entry point: the value returned by main() becomes the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)