scripts/htsw-runfolder

   1 #!/usr/bin/env python
   2 """htsw-runfolder archives summary information from a runfolder.
   3 The information currently being captured includes:
   4
   5   * Flowcell ID
   6   * run dates
   7   * start/stop cycle numbers
   8   * Firecrest, bustard, gerald version numbers
   9   * Eland analysis types, and everything in the eland configuration file.
  10   * cluster numbers and other values from the Summary.htm
  11     LaneSpecificParameters table.
  12   * How many reads mapped to a genome from an eland file
  13
  14
  15 The ELAND "mapped reads" counter will also check for eland squashed files
  16 that were symlinked from another directory. This is so I can track how
  17 many reads landed on the genome of interest and on the spike ins.
  18
  19 Basically my subdirectories look something like:
  20
  21 genomes/hg18
  22 genomes/hg18/chr*.2bpb <- files for hg18 genome
  23 genomes/hg18/chr*.vld
  24 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
  25 genomes/spikein
  26
  27 htsw-runfolder can also spit out a simple summary report (-s option)
  28 that contains the per lane post filter cluster numbers and the mapped
  29 read counts. (The report isn't currently very pretty)
  30
  31 In addition if you provide a --site name it will also archive the raw
  32 reads.
  33 """
  34 from glob import glob
  35 import logging
  36 import optparse
  37 import os
  38 import sys
  39
  40 from htsworkflow.pipelines import runfolder
  41 from htsworkflow.pipelines.runfolder import ElementTree
  42
  43
  44 def main(cmdlist=None):
  45     parser = make_parser()
  46     opts, args = parser.parse_args(cmdlist)
  47
  48     logging.basicConfig()
  49     if opts.verbose:
  50         root_log = logging.getLogger()
  51         root_log.setLevel(logging.INFO)
  52
  53     logging.info('Starting htsworkflow illumina runfolder processing tool.')
  54     runs = []
  55     runs.extend(load_run_xml_file(parser, args, opts))
  56     runs.extend(load_specific_runfolder_analysis(parser, args, opts))
  57     runs.extend(load_runfolders(parser, args, opts))
  58
  59     if len(runs) == 0:
  60         parser.error("Please specify some run folders to process")
  61
  62     command_run = False
  63     if opts.summary:
  64         print runfolder.summary_report(runs)
  65         command_run = True
  66     if opts.archive:
  67         runfolder.extract_run_parameters(runs)
  68         command_run = True
  69     if opts.extract_results:
  70         command_run = True
  71         extract_results(parser, args, opts, runs)
  72     if opts.clean:
  73         runfolder.clean_runs(runs, opts.dry_run)
  74         command_run = True
  75
  76     if command_run == False:
  77         parser.perror("No commands provided")
  78
  79     return 0
  80
  81
  82 def load_run_xml_file(parser, args, opts):
  83     runs = []
  84     if opts.run_xml:
  85         # handle ~ shortcut
  86         opt.run_xml = os.path.expanduser(opt.run_xml)
  87         tree = ElementTree.parse(opt.run_xml).getroot()
  88         runs.append(runfolder.PipelineRun(xml=tree))
  89     return runs
  90
  91
  92 def load_specific_runfolder_analysis(parser, args, opts):
  93     # look for manually specified run
  94     runs = []
  95     if opts.use_run is not None:
  96         specific_run = runfolder.get_specific_run(opts.use_run)
  97         if specific_run is not None:
  98             runs.append(specific_run)
  99         else:
 100             logging.warn("Couldn't find a run in %s" % (opts.use_run,))
 101     return runs
 102
 103
 104 def load_runfolders(parser, args, opts):
 105     if opts.flowcell_id is not None:
 106         if len(args) != 1:
 107             parser.error(
 108                 'Can only force flowcell ID when operating on one run')
 109     # scan runfolders for runs
 110     runs = []
 111     for run_pattern in args:
 112         # expand args on our own if needed
 113         for run_dir in glob(run_pattern):
 114             runs.extend(runfolder.get_runs(run_dir, opts.flowcell_id))
 115     return runs
 116
 117
 118 def extract_results(parser, args, opts, runs):
 119     if opts.dry_run:
 120         parser.error("Dry-run is not supported for extract-results")
 121     runfolder.extract_results(runs,
 122                               opts.output_dir,
 123                               opts.site,
 124                               opts.max_jobs,
 125                               opts.raw_format)
 126
 127
 128 def make_parser():
 129     usage = 'usage: %prog [options] runfolder_root_dir'
 130     parser = optparse.OptionParser(usage)
 131
 132     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
 133                       default=False,
 134                       help='turn on verbose mode')
 135     parser.add_option('--dry-run', action='store_true', default=False,
 136                       help="Don't delete anything (in clean mode)")
 137
 138     commands = optparse.OptionGroup(parser, 'Commands')
 139
 140     commands.add_option('-s', '--summary', dest='summary', action='store_true',
 141                         default=False,
 142                         help='produce summary report')
 143     commands.add_option('-a', '--archive', dest='archive', action='store_true',
 144                         default=False,
 145                         help='generate run configuration archive')
 146     commands.add_option('--extract-results', action='store_true',
 147                         default=False,
 148                         help='create run-xml summary, compress the eland '\
 149                         'result files, build srf files and copy all that '\
 150                         'and the Summary.htm file into an archival '\
 151                         'directory.')
 152     commands.add_option('-c', '--clean', action='store_true', default=False,
 153                         help='Clean runfolder, preparing it for '\
 154                              'long-term storage')
 155     parser.add_option_group(commands)
 156
 157     parser.add_option('-f', '--flowcell-id', default=None,
 158                       help='force a particular flowcell id')
 159     parser.add_option('-j', '--max-jobs', default=1,
 160                       help='specify the maximum number of processes to run '
 161                            '(used in extract-results)')
 162     parser.add_option('-o', '--output-dir', default=None,
 163            help="specify the default output directory for extract results")
 164     parser.add_option('--run-xml', dest='run_xml',
 165            default=None,
 166            help='specify a run_<FlowCell>.xml file for summary reports')
 167     parser.add_option('--site', default=None,
 168                       help='create srf files tagged with the provided '\
 169                       'site name')
 170     parser.add_option('--raw-format', dest="raw_format", default='qseq',
 171                       choices=['qseq', 'srf'],
 172                       help='Specify which type of raw format to use. '
 173                            'Currently supported options: qseq, srf')
 174     parser.add_option('-u', '--use-run', dest='use_run', default=None,
 175                       help='Specify which run to use instead of autoscanning '
 176                            'the runfolder. You do this by providing the final '
 177                            ' GERALD directory, and it assumes the parent '
 178                            'directories are the bustard and image processing '
 179                            'directories.')
 180
 181     return parser
 182
 183 if __name__ == "__main__":
 184     sys.exit(main(sys.argv[1:]))