htswdataprod/scripts/runfolder

   1 #!/usr/bin/env python
   2 """
   3 Runfolder.py can generate an XML file capturing all the 'interesting' parameters from a finished pipeline run (using the -a option). The information currently being captured includes:
   4
   5   * Flowcell ID
   6   * run dates
   7   * start/stop cycle numbers
   8   * Firecrest, bustard, gerald version numbers
   9   * Eland analysis types, and everything in the eland configuration file.
  10   * cluster numbers and other values from the Summary.htm
  11     LaneSpecificParameters table.
  12   * How many reads mapped to a genome from an eland file
  13
  14 The ELAND "mapped reads" counter will also check for eland squashed files
  15 that were symlinked from another directory. This is so I can track how
  16 many reads landed on the genome of interest and on the spike ins.
  17
  18 Basically my subdirectories look something like:
  19
  20 genomes/hg18
  21 genomes/hg18/chr*.2bpb <- files for hg18 genome
  22 genomes/hg18/chr*.vld
  23 genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
  24 genomes/spikein
  25
  26 runfolder.py can also spit out a simple summary report (-s option)
  27 that contains the per lane post filter cluster numbers and the mapped
  28 read counts. (The report isn't currently very pretty)
  29 """
  30 import logging
  31 import optparse
  32 import sys
  33 import os
  34
  35 from htswdataprod import runfolder
  36 from htswdataprod.runfolder import ElementTree
  37
  38 def make_parser():
  39     usage = 'usage: %prog [options] runfolder_root_dir'
  40     parser = optparse.OptionParser(usage)
  41     parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
  42                       default=False,
  43                       help='turn on verbose mode')
  44     parser.add_option('-s', '--summary', dest='summary', action='store_true',
  45                       default=False,
  46                       help='produce summary report')
  47     parser.add_option('-a', '--archive', dest='archive', action='store_true',
  48                       default=False,
  49                       help='generate run configuration archive')
  50     parser.add_option('--extract-results', action='store_true',
  51            default=False,
  52            help='extract result files out of runfolder into a simpler archive')
  53     parser.add_option('--run-xml', dest='run_xml',
  54            default=None,
  55            help='specify a run_<FlowCell>.xml file for summary reports')
  56
  57     parser.add_option('--compat_mode', dest='compat', action='store_true',
  58                       default=False,
  59                       help="Stanford CollectReads compatibility mode")
  60
  61     return parser
  62
  63 def main(cmdlist=None):
  64     parser = make_parser()
  65     opt, args = parser.parse_args(cmdlist)
  66
  67     logging.basicConfig()
  68     if opt.verbose:
  69         root_log = logging.getLogger()
  70         root_log.setLevel(logging.INFO)
  71
  72     runs = []
  73     if opt.run_xml:
  74         tree = ElementTree.parse(opt.run_xml).getroot()
  75         runs.append(runfolder.PipelineRun(xml=tree))
  76     for run_dir in args:
  77         runs.extend(runfolder.get_runs(run_dir))
  78
  79     if len(runs) > 0:
  80         if opt.summary:
  81             print runfolder.summary_report(runs)
  82         if opt.archive:
  83             runfolder.extract_run_parameters(runs)
  84         if opt.extract_results:
  85
  86             # Caltech specific mode
  87             if not opt.compat:
  88                 runfolder.extract_results(runs)
  89
  90             # Stanford CollectReads compatibility mode
  91             else:
  92
  93                 # Define directories to be used
  94                 run_dir = args[0]
  95                 cr_dir = os.path.join(run_dir, 'CollectReads')
  96                 base_dir = os.path.join(cr_dir, 'caltech')
  97
  98                 # Create directories if they do not exist
  99                 if not os.path.exists(cr_dir):
 100                     os.mkdir(cr_dir)
 101                 if not os.path.exists(base_dir):
 102                     os.mkdir(base_dir)
 103
 104                 # Run extract results
 105                 runfolder.extract_results(runs, base_dir)
 106
 107     return 0
 108
 109 if __name__ == "__main__":
 110   sys.exit(main(sys.argv[1:]))