2 """htsw-runfolder archives summary information from a runfolder.
3 The information currently being captured includes:
7 * start/stop cycle numbers
8 * Firecrest, bustard, gerald version numbers
9 * Eland analysis types, and everything in the eland configuration file.
10 * cluster numbers and other values from the Summary.htm
11 LaneSpecificParameters table.
12 * How many reads mapped to a genome from an eland file
15 The ELAND "mapped reads" counter will also check for eland squashed files
16 that were symlinked from another directory. This is so I can track how
17 many reads landed on the genome of interest and on the spike ins.
19 Basically my subdirectories look something like:
22 genomes/hg18/chr*.2bpb <- files for hg18 genome
24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
27 htsw-runfolder can also spit out a simple summary report (-s option)
28 that contains the per lane post filter cluster numbers and the mapped
29 read counts. (The report isn't currently very pretty)
31 In addition if you provide a --site name it will also archive the raw
40 from htsworkflow.pipelines import runfolder
41 from htsworkflow.pipelines.runfolder import ElementTree
def main(cmdlist=None):
    """Parse the command line, collect runs, and run the requested commands.

    Args:
        cmdlist: list of command line arguments handed to the option
            parser.  When None, optparse falls back to sys.argv[1:].

    Returns:
        0 after the selected commands have finished.  Usage problems (no
        runs found, no command selected) are reported through
        parser.error(), which prints the message and exits the process.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdlist)

    # --debug takes precedence over --verbose; with neither flag the root
    # logger keeps its default level.
    root_log = logging.getLogger()
    if opts.debug:
        root_log.setLevel(logging.DEBUG)
    elif opts.verbose:
        root_log.setLevel(logging.INFO)

    logging.info('Starting htsworkflow illumina runfolder processing tool.')

    # Gather runs from every supported source: a saved run xml file, an
    # explicitly named analysis directory, and scanned runfolders.
    runs = []
    runs.extend(load_run_xml_file(parser, args, opts))
    runs.extend(load_specific_runfolder_analysis(parser, args, opts))
    runs.extend(load_runfolders(parser, args, opts))

    if not runs:
        parser.error("Please specify some run folders to process")

    # Several commands may be combined in one invocation; remember whether
    # at least one of them was requested.
    command_run = False
    if opts.summary:
        # Parenthesised single-argument form behaves the same on
        # Python 2 and Python 3.
        print(runfolder.summary_report(runs))
        command_run = True
    if opts.archive:
        runfolder.extract_run_parameters(runs)
        command_run = True
    if opts.extract_results:
        command_run = True
        extract_results(parser, args, opts, runs)
    if opts.clean:
        runfolder.clean_runs(runs, opts.dry_run)
        command_run = True

    if not command_run:
        # OptionParser has no perror(); error() is the usage-failure hook.
        parser.error("No commands provided")

    return 0
def load_run_xml_file(parser, args, opts):
    """Load a run from the xml file named by the --run-xml option.

    Args:
        parser: option parser (unused here; kept so all loaders share
            one signature).
        args: positional command line arguments (unused here).
        opts: parsed options; opts.run_xml is read and, when set, is
            rewritten in place with '~' expanded.

    Returns:
        A list holding one runfolder.PipelineRun built from the root
        element of the xml file, or an empty list when no run xml file
        was requested.
    """
    runs = []
    if not opts.run_xml:
        return runs

    # handle the ~ shortcut before handing the path to the xml parser
    opts.run_xml = os.path.expanduser(opts.run_xml)
    root = ElementTree.parse(opts.run_xml).getroot()
    runs.append(runfolder.PipelineRun(xml=root))
    return runs
def load_specific_runfolder_analysis(parser, args, opts):
    """Load the single run named explicitly by the --use-run option.

    Args:
        parser: option parser (unused here; kept so all loaders share
            one signature).
        args: positional command line arguments (unused here).
        opts: parsed options; opts.use_run is the directory passed to
            runfolder.get_specific_run().

    Returns:
        A list with the run returned by runfolder.get_specific_run(), or
        an empty list when --use-run was not given or no run was found
        (the latter case is logged as a warning).
    """
    # look for manually specified run
    runs = []
    if opts.use_run is None:
        return runs

    specific_run = runfolder.get_specific_run(opts.use_run)
    if specific_run is not None:
        runs.append(specific_run)
    else:
        # logging.warn is a deprecated alias; pass the argument lazily.
        logging.warning("Couldn't find a run in %s", opts.use_run)
    return runs
def load_runfolders(parser, args, opts):
    """Scan the runfolders named on the command line for runs.

    Args:
        parser: option parser, used to report a usage error.
        args: positional command line arguments; each one is treated as
            a glob pattern naming runfolder directories.
        opts: parsed options; opts.flowcell_id, when not None, is passed
            to runfolder.get_runs() for every matched directory.

    Returns:
        A list of every run reported by runfolder.get_runs() for the
        matched directories; empty when nothing matched.
    """
    # A forced flowcell id only makes sense for a single runfolder.
    if opts.flowcell_id is not None and len(args) != 1:
        parser.error(
            'Can only force flowcell ID when operating on one run')

    # scan runfolders for runs
    runs = []
    for run_pattern in args:
        # expand args on our own if needed
        for run_dir in glob(run_pattern):
            runs.extend(runfolder.get_runs(run_dir, opts.flowcell_id))
    return runs
120 def extract_results(parser, args, opts, runs):
122 parser.error("Dry-run is not supported for extract-results")
123 runfolder.extract_results(runs,
131 usage = 'usage: %prog [options] runfolder_root_dir'
132 parser = optparse.OptionParser(usage)
134 parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
136 help='turn on verbose mode')
137 parser.add_option('--debug', action='store_true',
139 help='turn on debug logging (implies verbose)')
140 parser.add_option('--dry-run', action='store_true', default=False,
141 help="Don't delete anything (in clean mode)")
143 commands = optparse.OptionGroup(parser, 'Commands')
145 commands.add_option('-s', '--summary', dest='summary', action='store_true',
147 help='produce summary report')
148 commands.add_option('-a', '--archive', dest='archive', action='store_true',
150 help='generate run configuration archive')
151 commands.add_option('--extract-results', action='store_true',
153 help='create run-xml summary, compress the eland '\
154 'result files, build srf files and copy all that '\
155 'and the Summary.htm file into an archival '\
157 commands.add_option('-c', '--clean', action='store_true', default=False,
158 help='Clean runfolder, preparing it for '\
160 parser.add_option_group(commands)
162 parser.add_option('-f', '--flowcell-id', default=None,
163 help='force a particular flowcell id')
164 parser.add_option('-j', '--max-jobs', default=1,
165 help='specify the maximum number of processes to run '
166 '(used in extract-results)')
167 parser.add_option('-o', '--output-dir', default=None,
168 help="specify the default output directory for extract results")
169 parser.add_option('--run-xml', dest='run_xml',
171 help='specify a run_<FlowCell>.xml file for summary reports')
172 parser.add_option('--site', default=None,
173 help='create srf files tagged with the provided '\
175 parser.add_option('--raw-format', dest="raw_format", default='qseq',
176 choices=['qseq', 'srf'],
177 help='Specify which type of raw format to use. '
178 'Currently supported options: qseq, srf')
179 parser.add_option('-u', '--use-run', dest='use_run', default=None,
180 help='Specify which run to use instead of autoscanning '
181 'the runfolder. You do this by providing the final '
182 ' GERALD directory, and it assumes the parent '
183 'directories are the bustard and image processing '
if __name__ == "__main__":
    # Run the tool on the real command line and hand main()'s result to
    # the interpreter as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)