2 """htsw-runfolder archives summary information from a runfolder.
3 The information currently being captured includes:
7 * start/stop cycle numbers
8 * Firecrest, bustard, gerald version numbers
9 * Eland analysis types, and everything in the eland configuration file.
10 * cluster numbers and other values from the Summary.htm
11 LaneSpecificParameters table.
12 * How many reads mapped to a genome from an eland file
15 The ELAND "mapped reads" counter will also check for eland squashed files
16 that were symlinked from another directory. This is so I can track how
17 many reads landed on the genome of interest and on the spike ins.
19 Basically my subdirectories look something like:
22 genomes/hg18/chr*.2bpb <- files for hg18 genome
24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
27 htsw-runfolder can also spit out a simple summary report (-s option)
28 that contains the per lane post filter cluster numbers and the mapped
29 read counts. (The report isn't currently very pretty)
31 In addition if you provide a --site name it will also archive the raw
40 from htsworkflow.pipelines import runfolder
41 from htsworkflow.pipelines.runfolder import ElementTree
def main(cmdlist=None):
    """Run the htsw-runfolder command line tool.

    cmdlist is the list of command line arguments to parse; when it is
    None, optparse falls back to sys.argv[1:].

    Runs are collected from three sources (a --run-xml file, a --use-run
    analysis directory, and the runfolder patterns given as positional
    arguments) and every requested command is then applied to them.

    Returns 0 on success.  Usage problems are reported with
    parser.error(), which prints the message and exits with status 2.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdlist)

    logging.basicConfig()
    if opts.verbose:
        root_log = logging.getLogger()
        root_log.setLevel(logging.INFO)

    logging.info('Starting htsworkflow illumina runfolder processing tool.')
    runs = []
    runs.extend(load_run_xml_file(parser, args, opts))
    runs.extend(load_specific_runfolder_analysis(parser, args, opts))
    runs.extend(load_runfolders(parser, args, opts))

    if not runs:
        parser.error("Please specify some run folders to process")

    # Several commands may be combined in one invocation, so keep track
    # of whether at least one of them was requested.
    command_run = False
    if opts.summary:
        # Parenthesised so the line is valid under both Python 2 and 3.
        print(runfolder.summary_report(runs))
        command_run = True
    if opts.archive:
        runfolder.extract_run_parameters(runs)
        command_run = True
    if opts.extract_results:
        command_run = True
        extract_results(parser, args, opts, runs)
    if opts.clean:
        runfolder.clean_runs(runs, opts.dry_run)
        command_run = True

    if not command_run:
        # OptionParser has no perror() method; error() is the usage
        # failure hook.
        parser.error("No commands provided")

    return 0
def load_run_xml_file(parser, args, opts):
    """Load a run from the XML file named by the --run-xml option.

    When opts.run_xml is set, a leading ~ in the path is expanded (the
    expanded path is written back to opts.run_xml), the file is parsed
    and its root element is wrapped in a runfolder.PipelineRun.

    parser and args are unused; they are accepted so that all of the
    load_* helpers can be called the same way.

    Returns a list holding the loaded run, or an empty list when no
    --run-xml file was requested.
    """
    runs = []
    if opts.run_xml:
        # handle ~ shortcut
        opts.run_xml = os.path.expanduser(opts.run_xml)
        tree = ElementTree.parse(opts.run_xml).getroot()
        runs.append(runfolder.PipelineRun(xml=tree))
    return runs
def load_specific_runfolder_analysis(parser, args, opts):
    """Load the single analysis directory named by the --use-run option.

    opts.use_run is handed to runfolder.get_specific_run(); a warning is
    logged when that call returns None.

    parser and args are unused; they are accepted so that all of the
    load_* helpers can be called the same way.

    Returns a list holding the run that was found, or an empty list when
    --use-run was not given or no run could be located.
    """
    # look for manually specified run
    runs = []
    if opts.use_run is not None:
        specific_run = runfolder.get_specific_run(opts.use_run)
        if specific_run is not None:
            runs.append(specific_run)
        else:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning("Couldn't find a run in %s", opts.use_run)
    return runs
def load_runfolders(parser, args, opts):
    """Scan the runfolder patterns given on the command line for runs.

    Each entry of args is expanded with glob() (in case the shell did
    not already do so) and every matching path is searched with
    runfolder.get_runs().  opts.flowcell_id, when set, is passed along
    to force the flowcell ID; that is only accepted when exactly one
    positional argument was given, otherwise parser.error() is called.

    Returns a list of the runs that were found (possibly empty).
    """
    if opts.flowcell_id is not None and len(args) != 1:
        parser.error(
            'Can only force flowcell ID when operating on one run')
    # scan runfolders for runs
    runs = []
    for run_pattern in args:
        # expand args on our own if needed
        run_dirs = glob(run_pattern)
        if not run_dirs:
            # Name the pattern that matched nothing so a mistyped path is
            # visible in the log instead of vanishing silently.
            logging.warning("No runfolder matched %s", run_pattern)
        for run_dir in run_dirs:
            runs.extend(runfolder.get_runs(run_dir, opts.flowcell_id))
    return runs
118 def extract_results(parser, args, opts, runs):
120 parser.error("Dry-run is not supported for extract-results")
121 runfolder.extract_results(runs,
129 usage = 'usage: %prog [options] runfolder_root_dir'
130 parser = optparse.OptionParser(usage)
132 parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
134 help='turn on verbose mode')
135 parser.add_option('--dry-run', action='store_true', default=False,
136 help="Don't delete anything (in clean mode)")
138 commands = optparse.OptionGroup(parser, 'Commands')
140 commands.add_option('-s', '--summary', dest='summary', action='store_true',
142 help='produce summary report')
143 commands.add_option('-a', '--archive', dest='archive', action='store_true',
145 help='generate run configuration archive')
146 commands.add_option('--extract-results', action='store_true',
148 help='create run-xml summary, compress the eland '\
149 'result files, build srf files and copy all that '\
150 'and the Summary.htm file into an archival '\
152 commands.add_option('-c', '--clean', action='store_true', default=False,
153 help='Clean runfolder, preparing it for '\
155 parser.add_option_group(commands)
157 parser.add_option('-f', '--flowcell-id', default=None,
158 help='force a particular flowcell id')
159 parser.add_option('-j', '--max-jobs', default=1,
160 help='specify the maximum number of processes to run '
161 '(used in extract-results)')
162 parser.add_option('-o', '--output-dir', default=None,
163 help="specify the default output directory for extract results")
164 parser.add_option('--run-xml', dest='run_xml',
166 help='specify a run_<FlowCell>.xml file for summary reports')
167 parser.add_option('--site', default=None,
168 help='create srf files tagged with the provided '\
170 parser.add_option('--raw-format', dest="raw_format", default='qseq',
171 choices=['qseq', 'srf'],
172 help='Specify which type of raw format to use. '
173 'Currently supported options: qseq, srf')
174 parser.add_option('-u', '--use-run', dest='use_run', default=None,
175 help='Specify which run to use instead of autoscanning '
176 'the runfolder. You do this by providing the final '
177 ' GERALD directory, and it assumes the parent '
178 'directories are the bustard and image processing '
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)