Pulling in Diane's runfolder script from trunk.

author Brandon King <kingb@caltech.edu>

Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)

committer Brandon King <kingb@caltech.edu>

Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)
author Brandon King <kingb@caltech.edu>
Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)
committer Brandon King <kingb@caltech.edu>
Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)
diff --git a/htswdataprod/scripts/runfolder b/htswdataprod/scripts/runfolder

new file mode 100644 (file)

index 0000000..d54bf60
--- /dev/null
+++ b/htswdataprod/scripts/runfolder
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""
+Runfolder.py can generate an XML file capturing all the 'interesting' parameters from a finished pipeline run (using the -a option). The information currently being captured includes:
+
+  * Flowcell ID
+  * run dates
+  * start/stop cycle numbers
+  * Firecrest, bustard, gerald version numbers
+  * Eland analysis types, and everything in the eland configuration file.
+  * cluster numbers and other values from the Summary.htm 
+    LaneSpecificParameters table. 
+  * How many reads mapped to a genome from an eland file
+
+The ELAND "mapped reads" counter will also check for eland squashed files
+that were symlinked from another directory. This is so I can track how
+many reads landed on the genome of interest and on the spike ins.
+
+Basically my subdirectories look something like:
+
+genomes/hg18
+genomes/hg18/chr*.2bpb <- files for hg18 genome
+genomes/hg18/chr*.vld  
+genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
+genomes/spikein 
+
+runfolder.py can also spit out a simple summary report (-s option)
+that contains the per-lane post-filter cluster numbers and the mapped
+read counts. (The report isn't currently very pretty.)
+"""
+import logging
+import optparse
+import sys
+
+from gaworkflow.pipeline import runfolder
+from gaworkflow.pipeline.runfolder import ElementTree
+        
+def make_parser():
+    usage = 'usage: %prog [options] runfolder_root_dir'
+    parser = optparse.OptionParser(usage)
+    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
+                      default=False,
+                      help='turn on verbose mode')
+    parser.add_option('-s', '--summary', dest='summary', action='store_true',
+                      default=False,
+                      help='produce summary report')
+    parser.add_option('-a', '--archive', dest='archive', action='store_true',
+                      default=False,
+                      help='generate run configuration archive')
+    parser.add_option('--extract-results', action='store_true',
+           default=False,
+           help='extract result files out of runfolder into a simpler archive')
+    parser.add_option('--run-xml', dest='run_xml',
+           default=None,
+           help='specify a run_<FlowCell>.xml file for summary reports')
+
+    return parser
+
+def main(cmdlist=None):
+    parser = make_parser()
+    opt, args = parser.parse_args(cmdlist)
+
+    logging.basicConfig()
+    if opt.verbose:
+        root_log = logging.getLogger()
+        root_log.setLevel(logging.INFO)
+
+    runs = []
+    if opt.run_xml:
+        tree = ElementTree.parse(opt.run_xml).getroot()
+        runs.append(runfolder.PipelineRun(xml=tree))
+    for run_dir in args:
+        runs.extend(runfolder.get_runs(run_dir))
+
+    if len(runs) > 0:
+        if opt.summary:
+            print runfolder.summary_report(runs)
+        if opt.archive:
+            runfolder.extract_run_parameters(runs)
+        if opt.extract_results:
+            runfolder.extract_results(runs)
+
+    return 0
+
+if __name__ == "__main__":
+  sys.exit(main(sys.argv[1:]))
author	Brandon King <kingb@caltech.edu>
	Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)
committer	Brandon King <kingb@caltech.edu>
	Thu, 7 Aug 2008 19:55:53 +0000 (19:55 +0000)