scripts/htsw-runfolder

   1 #!/usr/bin/env python
   2 """
   3 Runfolder.py can generate an XML file capturing all the 'interesting' parameters from a finished pipeline run (using the -a option). The information currently being captured includes:
   4
   5   * Flowcell ID
   6   * run dates
   7   * start/stop cycle numbers
   8   * Firecrest, bustard, gerald version numbers
   9   * Eland analysis types, and everything in the eland configuration file.
  10   * cluster numbers and other values from the Summary.htm
  11     LaneSpecificParameters table.
  12   * How many reads mapped to a genome from an eland file
  13
  14 The ELAND "mapped reads" counter will also check for eland squashed files
  15 that were symlinked from another directory. This is so I can track how
  16 many reads landed on the genome of interest and on the spike ins.
  17
  18 Basically my subdirectories look something like:
  19
  20 genomes/hg18
  21 genomes/hg18/chr*.2bpb <- files for hg18 genome
  22 genomes/hg18/chr*.vld
  23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
  24 genomes/spikein
  25
  26 runfolder.py can also spit out a simple summary report (-s option)
  27 that contains the per lane post filter cluster numbers and the mapped
  28 read counts. (The report isn't currently very pretty)
  29 """
  30 from glob import glob
  31 import logging
  32 import optparse
  33 import os
  34 import sys
  35
  36 from htsworkflow.pipelines import runfolder
  37 from htsworkflow.pipelines.runfolder import ElementTree
  38
  39 def make_parser():
  40     usage = 'usage: %prog [options] runfolder_root_dir'
  41     parser = optparse.OptionParser(usage)
  42
  43     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
  44                       default=False,
  45                       help='turn on verbose mode')
  46     parser.add_option('--dry-run', action='store_true', default=False,
  47                       help="Don't delete anything (in clean mode)")
  48
  49     commands = optparse.OptionGroup(parser, 'Commands')
  50
  51     commands.add_option('-s', '--summary', dest='summary', action='store_true',
  52                         default=False,
  53                         help='produce summary report')
  54     commands.add_option('-a', '--archive', dest='archive', action='store_true',
  55                         default=False,
  56                         help='generate run configuration archive')
  57     commands.add_option('--extract-results', action='store_true',
  58            default=False,
  59            help='create run-xml summary, compress the eland result files, build srf files and '
  60                 'copy all that and the Summary.htm file into an archival directory.')
  61     commands.add_option('-c', '--clean', action='store_true', default=False,
  62                         help='Clean runfolder, preparing it for long-term storage')
  63     parser.add_option_group(commands)
  64
  65     parser.add_option('-j', '--max-jobs', default=1,
  66                       help='sepcify the maximum number of processes to run '
  67                            '(used in extract-results)')
  68     parser.add_option('-o', '--output-dir', default=None,
  69            help="specify the default output directory for extract results")
  70     parser.add_option('--run-xml', dest='run_xml',
  71            default=None,
  72            help='specify a run_<FlowCell>.xml file for summary reports')
  73     parser.add_option('--site', default=None,
  74                       help='create srf files tagged with the provided site name')
  75     parser.add_option('-u', '--use-run', dest='use_run', default=None,
  76                       help='Specify which run to use instead of autoscanning '
  77                            'the runfolder. You do this by providing the final '
  78                            ' GERALD directory, and it assumes the parent '
  79                            'directories are the bustard and image processing '
  80                            'directories.')
  81     parser.add_option('--raw-format', dest="raw_format", default='qseq',
  82                       choices=['qseq', 'srf'],
  83                       help='Specify which type of raw format to use. '
  84                            'Currently supported options: qseq, srf')
  85
  86     return parser
  87
  88 def main(cmdlist=None):
  89     parser = make_parser()
  90     opt, args = parser.parse_args(cmdlist)
  91
  92     logging.basicConfig()
  93     if opt.verbose:
  94         root_log = logging.getLogger()
  95         root_log.setLevel(logging.INFO)
  96
  97     logging.info('Starting htsworkflow illumina runfolder processing tool.')
  98     runs = []
  99     if opt.run_xml:
 100         # handle ~ shortcut
 101         opt.run_xml = os.path.expanduser(opt.run_xml)
 102         tree = ElementTree.parse(opt.run_xml).getroot()
 103         runs.append(runfolder.PipelineRun(xml=tree))
 104
 105     # look for manually specified run
 106     if opt.use_run is not None:
 107         specific_run = runfolder.get_specific_run(opt.use_run)
 108         if specific_run is not None:
 109             runs.append(specific_run)
 110         else:
 111             logging.warn("Couldn't find a run in %s" % (opt.use_run,))
 112
 113     # scan runfolders for runs
 114     for run_pattern in args:
 115         # expand args on our own if needed
 116         for run_dir in glob(run_pattern):
 117             runs.extend(runfolder.get_runs(run_dir))
 118
 119     if len(runs) > 0:
 120         command_run = False
 121         if opt.summary:
 122             print runfolder.summary_report(runs)
 123             command_run = True
 124         if opt.archive:
 125             runfolder.extract_run_parameters(runs)
 126             command_run = True
 127         if opt.extract_results:
 128             if opt.dry_run:
 129                 parser.error("Dry-run is not supported for extract-results")
 130             runfolder.extract_results(runs,
 131                                       opt.output_dir,
 132                                       opt.site,
 133                                       opt.max_jobs,
 134                                       opt.raw_format)
 135             command_run = True
 136         if opt.clean:
 137             runfolder.clean_runs(runs, opt.dry_run)
 138             command_run = True
 139
 140         if command_run == False:
 141             print "You need to specify a command." + os.linesep
 142             parser.print_help()
 143     else:
 144         print "You need to specify some run folders to process..." + os.linesep
 145         parser.print_help()
 146
 147     return 0
 148
 149 if __name__ == "__main__":
 150   sys.exit(main(sys.argv[1:]))