mark the example submission rule files as being raw, so the escapes don't get confused
[htsworkflow.git] / scripts / htsw-runfolder
1 #!/usr/bin/env python
2 """htsw-runfolder archives summary information from a runfolder.
3 The information currently being captured includes:
4
5   * Flowcell ID
6   * run dates
7   * start/stop cycle numbers
8   * Firecrest, bustard, gerald version numbers
9   * Eland analysis types, and everything in the eland configuration file.
10   * cluster numbers and other values from the Summary.htm
11     LaneSpecificParameters table.
12   * How many reads mapped to a genome from an eland file
13
14
The ELAND "mapped reads" counter will also check for eland squashed files
that were symlinked from another directory. This is so I can track how
many reads landed on the genome of interest and on the spike ins.
18
Basically my subdirectories look something like:
20
21 genomes/hg18
22 genomes/hg18/chr*.2bpb <- files for hg18 genome
23 genomes/hg18/chr*.vld
24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
25 genomes/spikein
26
27 htsw-runfolder can also spit out a simple summary report (-s option)
28 that contains the per lane post filter cluster numbers and the mapped
29 read counts. (The report isn't currently very pretty)
30
31 In addition if you provide a --site name it will also archive the raw
32 reads.
33 """
34 from glob import glob
35 import logging
36 import optparse
37 import os
38 import sys
39
40 from htsworkflow.pipelines import runfolder
41 from htsworkflow.pipelines.runfolder import ElementTree
42
43 LOGGER = logging.getLogger(__name__)
44
def main(cmdlist=None):
    """Entry point: parse options, collect runs, and run requested commands.

    cmdlist -- optional argument list; when None, optparse falls back to
    sys.argv[1:].

    Returns 0 on success.  Usage problems exit via parser.error().
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdlist)

    # Root logging defaults to WARNING; --verbose/--debug raise verbosity.
    logging.basicConfig()
    root_log = logging.getLogger()
    if opts.debug:
        root_log.setLevel(logging.DEBUG)
    elif opts.verbose:
        root_log.setLevel(logging.INFO)

    LOGGER.info('Starting htsworkflow illumina runfolder processing tool.')

    # Runs may come from an explicit run xml file, a manually specified
    # GERALD directory, or by scanning runfolder paths given as arguments.
    runs = []
    runs.extend(load_run_xml_file(parser, args, opts))
    runs.extend(load_specific_runfolder_analysis(parser, args, opts))
    runs.extend(load_runfolders(parser, args, opts))

    if len(runs) == 0:
        parser.error("Please specify some run folders to process")

    command_run = False
    if opts.summary:
        # print() call form produces identical output and is also valid
        # under Python 3 (the old `print x` statement was Python-2-only).
        print(runfolder.summary_report(runs))
        command_run = True
    if opts.archive:
        runfolder.extract_run_parameters(runs)
        command_run = True
    if opts.extract_results:
        command_run = True
        extract_results(parser, args, opts, runs)
    if opts.clean:
        runfolder.clean_runs(runs, opts.dry_run)
        command_run = True

    if not command_run:
        # BUG FIX: this was parser.perror(), which does not exist on
        # optparse.OptionParser and raised AttributeError instead of
        # printing the usage error.
        parser.error("No commands provided")

    return 0
83
84
def load_run_xml_file(parser, args, opts):
    """Return a single-element run list parsed from a --run-xml file.

    Gives back an empty list when no run xml file was requested.
    """
    if not opts.run_xml:
        return []
    # Expand a leading ~ so paths typed without shell expansion still work.
    opts.run_xml = os.path.expanduser(opts.run_xml)
    xml_root = ElementTree.parse(opts.run_xml).getroot()
    return [runfolder.PipelineRun(xml=xml_root)]
93
94
def load_specific_runfolder_analysis(parser, args, opts):
    """Return a one-element run list for a manually specified GERALD dir.

    Returns an empty list when --use-run was not given, or when no run
    could be found in the given directory (a warning is logged then).
    """
    runs = []
    if opts.use_run is not None:
        specific_run = runfolder.get_specific_run(opts.use_run)
        if specific_run is not None:
            runs.append(specific_run)
        else:
            # Use the module-level LOGGER defined in this file (the old
            # code used the deprecated root-logger logging.warn alias),
            # with lazy %-style arguments.
            LOGGER.warning("Couldn't find a run in %s", opts.use_run)
    return runs
105
106
def load_runfolders(parser, args, opts):
    """Scan every runfolder path (or glob pattern) in args for runs.

    Errors out when a forced flowcell id is combined with more than one
    runfolder argument, since the id can only apply to a single run.
    """
    if opts.flowcell_id is not None and len(args) != 1:
        parser.error(
            'Can only force flowcell ID when operating on one run')
    # Expand glob patterns ourselves in case the shell did not.
    found = []
    for pattern in args:
        for candidate_dir in glob(pattern):
            found.extend(runfolder.get_runs(candidate_dir, opts.flowcell_id))
    return found
119
120
def extract_results(parser, args, opts, runs):
    """Hand the collected runs to runfolder.extract_results.

    Refuses to run under --dry-run, since result extraction has no
    non-destructive preview mode.
    """
    if opts.dry_run:
        parser.error("Dry-run is not supported for extract-results")
    runfolder.extract_results(
        runs, opts.output_dir, opts.site, opts.max_jobs, opts.raw_format)
129
130
def make_parser():
    """Build the command line parser for htsw-runfolder.

    Returns an optparse.OptionParser configured with the logging flags,
    the command options (summary / archive / extract-results / clean),
    and the parameters those commands consume.
    """
    usage = 'usage: %prog [options] runfolder_root_dir'
    parser = optparse.OptionParser(usage)

    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
                      default=False,
                      help='turn on verbose mode')
    parser.add_option('--debug', action='store_true',
                      default=False,
                      help='turn on debug logging (implies verbose)')
    parser.add_option('--dry-run', action='store_true', default=False,
                      help="Don't delete anything (in clean mode)")

    commands = optparse.OptionGroup(parser, 'Commands')

    commands.add_option('-s', '--summary', dest='summary', action='store_true',
                        default=False,
                        help='produce summary report')
    commands.add_option('-a', '--archive', dest='archive', action='store_true',
                        default=False,
                        help='generate run configuration archive')
    commands.add_option('--extract-results', action='store_true',
                        default=False,
                        help='create run-xml summary, compress the eland '
                             'result files, build srf files and copy all that '
                             'and the Summary.htm file into an archival '
                             'directory.')
    commands.add_option('-c', '--clean', action='store_true', default=False,
                        help='Clean runfolder, preparing it for '
                             'long-term storage')
    parser.add_option_group(commands)

    parser.add_option('-f', '--flowcell-id', default=None,
                      help='force a particular flowcell id')
    # BUG FIX: declare the option type so opts.max_jobs is an int even when
    # -j is given on the command line (it used to arrive as a string while
    # the default was the int 1).
    parser.add_option('-j', '--max-jobs', default=1, type='int',
                      help='specify the maximum number of processes to run '
                           '(used in extract-results)')
    parser.add_option('-o', '--output-dir', default=None,
                      help='specify the default output directory for '
                           'extract results')
    parser.add_option('--run-xml', dest='run_xml',
                      default=None,
                      help='specify a run_<FlowCell>.xml file for summary '
                           'reports')
    parser.add_option('--site', default=None,
                      help='create srf files tagged with the provided '
                           'site name')
    # BUG FIX: None removed from the choices list; optparse compares the
    # user's string against each choice, so None could never match anyway.
    # The default stays None (defaults are not validated against choices).
    parser.add_option('--raw-format', dest='raw_format', default=None,
                      choices=['qseq', 'srf', 'fastq'],
                      help='Specify which type of raw format to use. '
                           'Currently supported options: qseq, srf, fastq')
    parser.add_option('-u', '--use-run', dest='use_run', default=None,
                      help='Specify which run to use instead of autoscanning '
                           'the runfolder. You do this by providing the final '
                           'GERALD directory, and it assumes the parent '
                           'directories are the bustard and image processing '
                           'directories.')

    return parser
188
# Run as a script: exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))