2 """htsw-runfolder archives summary information from a runfolder.
3 The information currently being captured includes:
7 * start/stop cycle numbers
8 * Firecrest, bustard, gerald version numbers
9 * Eland analysis types, and everything in the eland configuration file.
10 * cluster numbers and other values from the Summary.htm
11 LaneSpecificParameters table.
12 * How many reads mapped to a genome from an eland file
The ELAND "mapped reads" counter will also check for eland squashed files
that were symlinked from another directory. This is so I can track how
many reads landed on the genome of interest and on the spike ins.
Basically my subdirectories look something like:
22 genomes/hg18/chr*.2bpb <- files for hg18 genome
24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
27 htsw-runfolder can also spit out a simple summary report (-s option)
28 that contains the per lane post filter cluster numbers and the mapped
29 read counts. (The report isn't currently very pretty)
31 In addition if you provide a --site name it will also archive the raw
40 from htsworkflow.pipelines import runfolder
41 from htsworkflow.pipelines.runfolder import ElementTree
# Module-level logger, named after this module via __name__.
LOGGER = logging.getLogger(__name__)
def main(cmdlist=None):
    """Run the htsw-runfolder command line tool.

    :param cmdlist: list of command line arguments to parse; it is handed
        straight to ``parser.parse_args``.
    :return: 0 once at least one command has been executed.

    Runs are gathered from three sources (a run-xml file, an explicitly
    named run, and the runfolder arguments) and every command selected on
    the command line is then applied to them.  Having no runs, or no
    command, is reported through ``parser.error``.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdlist)

    # NOTE(review): basicConfig() is assumed here so the root logger has a
    # handler before its level is adjusted -- confirm against the full file.
    logging.basicConfig()
    root_log = logging.getLogger()
    if opts.debug:
        root_log.setLevel(logging.DEBUG)
    elif opts.verbose:
        root_log.setLevel(logging.INFO)

    LOGGER.info('Starting htsworkflow illumina runfolder processing tool.')
    runs = []
    runs.extend(load_run_xml_file(parser, args, opts))
    runs.extend(load_specific_runfolder_analysis(parser, args, opts))
    runs.extend(load_runfolders(parser, args, opts))

    if not runs:
        parser.error("Please specify some run folders to process")

    command_run = False
    if opts.summary:
        # Single-argument call form prints identically on Python 2 and 3.
        print(runfolder.summary_report(runs))
        command_run = True
    if opts.archive:
        runfolder.extract_run_parameters(runs)
        command_run = True
    if opts.extract_results:
        command_run = True
        extract_results(parser, args, opts, runs)
    if opts.clean:
        runfolder.clean_runs(runs, opts.dry_run)
        command_run = True

    if not command_run:
        # OptionParser has no perror(); error() is the reporting method.
        parser.error("No commands provided")

    return 0
def load_run_xml_file(parser, args, opts):
    """Load the run described by the file named in ``opts.run_xml``.

    :param parser: option parser (unused here; kept so all loaders share
        one signature).
    :param args: positional command line arguments (unused here).
    :param opts: parsed options; ``opts.run_xml`` is read and, when set,
        rewritten with a leading ``~`` expanded.
    :return: a list holding one ``runfolder.PipelineRun`` built from the
        root element of the parsed file, or an empty list when no run-xml
        file was requested.
    """
    runs = []
    if opts.run_xml:
        # handle ~ shortcut
        opts.run_xml = os.path.expanduser(opts.run_xml)
        tree = ElementTree.parse(opts.run_xml).getroot()
        runs.append(runfolder.PipelineRun(xml=tree))
    return runs
def load_specific_runfolder_analysis(parser, args, opts):
    """Load the single run named explicitly by ``opts.use_run``.

    :param parser: option parser (unused here; kept so all loaders share
        one signature).
    :param args: positional command line arguments (unused here).
    :param opts: parsed options; only ``opts.use_run`` is read.
    :return: a list with the run returned by
        ``runfolder.get_specific_run``, or an empty list when no run was
        requested or none was found (the latter is logged as a warning).
    """
    runs = []
    # look for manually specified run
    if opts.use_run is None:
        return runs

    specific_run = runfolder.get_specific_run(opts.use_run)
    if specific_run is None:
        # warning() replaces the deprecated warn() alias; arguments are
        # passed separately so formatting only happens if the record is
        # actually emitted.
        LOGGER.warning("Couldn't find a run in %s", opts.use_run)
    else:
        runs.append(specific_run)
    return runs
def load_runfolders(parser, args, opts):
    """Collect runs from every runfolder matched by the positional args.

    Each argument is treated as a glob pattern and every matching path is
    passed to ``runfolder.get_runs`` together with ``opts.flowcell_id``.
    Forcing a flowcell ID while giving anything other than exactly one
    argument is reported through ``parser.error``.

    :return: list of all runs found, in pattern then match order.
    """
    forced_flowcell = opts.flowcell_id
    if forced_flowcell is not None and len(args) != 1:
        parser.error(
            'Can only force flowcell ID when operating on one run')

    # Expand the patterns ourselves in case the shell did not; the
    # generator keeps globbing interleaved with the run scanning.
    matched_dirs = (path for pattern in args for path in glob(pattern))

    found = []
    for run_dir in matched_dirs:
        found.extend(runfolder.get_runs(run_dir, forced_flowcell))
    return found
def extract_results(parser, args, opts, runs):
    """Run ``runfolder.extract_results`` over ``runs``.

    :param parser: option parser, used to report unusable options.
    :param args: positional command line arguments (unused here).
    :param opts: parsed options; ``dry_run``, ``output_dir``, ``site``,
        ``max_jobs`` and ``raw_format`` are read.
    :param runs: the runs to extract results from.

    ``--dry-run`` is rejected because extraction has no dry-run mode.
    ``--max-jobs`` is declared without a type, so a value given on the
    command line arrives as a string; it is converted to an integer here
    and a non-numeric value is reported through ``parser.error``.
    """
    if opts.dry_run:
        parser.error("Dry-run is not supported for extract-results")

    try:
        max_jobs = int(opts.max_jobs)
    except (TypeError, ValueError):
        parser.error(
            "--max-jobs must be an integer, got %r" % (opts.max_jobs,))

    # NOTE(review): positional order (output dir, site, job limit, raw
    # format) is assumed -- confirm against runfolder.extract_results.
    runfolder.extract_results(runs,
                              opts.output_dir,
                              opts.site,
                              max_jobs,
                              opts.raw_format)
132 usage = 'usage: %prog [options] runfolder_root_dir'
133 parser = optparse.OptionParser(usage)
135 parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
137 help='turn on verbose mode')
138 parser.add_option('--debug', action='store_true',
140 help='turn on debug logging (implies verbose)')
141 parser.add_option('--dry-run', action='store_true', default=False,
142 help="Don't delete anything (in clean mode)")
144 commands = optparse.OptionGroup(parser, 'Commands')
146 commands.add_option('-s', '--summary', dest='summary', action='store_true',
148 help='produce summary report')
149 commands.add_option('-a', '--archive', dest='archive', action='store_true',
151 help='generate run configuration archive')
152 commands.add_option('--extract-results', action='store_true',
154 help='create run-xml summary, compress the eland '\
155 'result files, build srf files and copy all that '\
156 'and the Summary.htm file into an archival '\
158 commands.add_option('-c', '--clean', action='store_true', default=False,
159 help='Clean runfolder, preparing it for '\
161 parser.add_option_group(commands)
163 parser.add_option('-f', '--flowcell-id', default=None,
164 help='force a particular flowcell id')
165 parser.add_option('-j', '--max-jobs', default=1,
166 help='specify the maximum number of processes to run '
167 '(used in extract-results)')
168 parser.add_option('-o', '--output-dir', default=None,
169 help="specify the default output directory for extract results")
170 parser.add_option('--run-xml', dest='run_xml',
172 help='specify a run_<FlowCell>.xml file for summary reports')
173 parser.add_option('--site', default=None,
174 help='create srf files tagged with the provided '\
176 parser.add_option('--raw-format', dest="raw_format", default='qseq',
177 choices=['qseq', 'srf', 'fastq'],
178 help='Specify which type of raw format to use. '
179 'Currently supported options: qseq, srf, fastq')
180 parser.add_option('-u', '--use-run', dest='use_run', default=None,
181 help='Specify which run to use instead of autoscanning '
182 'the runfolder. You do this by providing the final '
183 ' GERALD directory, and it assumes the parent '
184 'directories are the bustard and image processing '
if __name__ == "__main__":
    # Hand the command line arguments (without the program name) to main()
    # and use its return value as the process exit status.
    sys.exit(main(sys.argv[1:]))