+ args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
+ bz_commands.append(" ".join(args))
+ #LOGGER.info('Running: %s' % ( " ".join(args) ))
+ #bzip_dest = open(dest_name, 'w')
+ #bzip = subprocess.Popen(args, stdout=bzip_dest)
+ #LOGGER.info('Saving to %s' % (dest_name, ))
+ #bzip.wait()
+
+ if len(bz_commands) > 0:
+ q = QueueCommands(bz_commands, num_jobs)
+ q.run()
+
+
def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
    """
    Walk the runfolders in runs and archive the most useful information.

    Each run is written to <output_base_dir>/<flowcell_id>/<run_dirname>;
    output_base_dir defaults to the current working directory.  A run
    whose destination directory already exists is logged as an error
    and skipped without being modified.

    Collected per run:
      * run parameters (in run-*.xml)
      * flowcell status reports
      * IVC plot
      * raw sequence data (skipped when site is None)
      * Summary.htm and eland_result files (only when the run has a
        gerald component)
      * md5 commands for the finished run directory
    """
    base_dir = os.getcwd() if output_base_dir is None else output_base_dir

    for r in runs:
        flowcell_dir = os.path.join(base_dir, r.flowcell_id)
        LOGGER.info("Using %s as result directory" % (flowcell_dir,))
        if not os.path.exists(flowcell_dir):
            os.mkdir(flowcell_dir)

        # every run gets its own, freshly created, directory
        LOGGER.info("Filling in %s" % (r.run_dirname,))
        run_dirname = os.path.abspath(os.path.join(flowcell_dir, r.run_dirname))
        if os.path.exists(run_dirname):
            LOGGER.error("%s already exists, not overwriting" % (run_dirname,))
            continue
        os.mkdir(run_dirname)

        # run file
        r.save(run_dirname)

        # illumina flowcell status report, found one level above
        # the image analysis directory
        report_source = os.path.join(r.image_analysis.pathname, '..')
        save_flowcell_reports(report_source, run_dirname)

        # bustard output: IVC plot
        save_ivc_plot(r.bustard, run_dirname)

        # base calls
        if site is not None:
            save_raw_data(num_jobs, r, site, raw_format, run_dirname)

        # GERALD output: summary file and compressed eland results
        gerald = r.gerald
        if gerald:
            save_summary_file(r, run_dirname)
            compress_eland_results(gerald, run_dirname, num_jobs)

        # checksum everything only after all files are in place
        md5_commands = srf.make_md5_commands(run_dirname)
        srf.run_commands(run_dirname, md5_commands, num_jobs)
+
def save_raw_data(num_jobs, r, site, raw_format, run_dirname):
    """
    Save the raw sequence data of run r into run_dirname.

    num_jobs is handed to srf.run_commands together with the
    generated commands.
    r is the run being archived; its gerald, bustard and pathname
    attributes are read.
    site is passed through unchanged to the srf helpers.
    raw_format selects the handler and must be one of 'fastq',
    'qseq' or 'srf'; None means use r.bustard.sequence_format.
    run_dirname is the destination directory given to the srf helpers.

    Raises ValueError when the (resolved) raw_format is not one of the
    supported formats.
    """
    if r.gerald:
        # only keep lanes that actually have an entry in gerald.lanes
        lanes = [lane for lane in r.gerald.lanes
                 if r.gerald.lanes.get(lane, None) is not None]
    else:
        # assume default list of lanes
        lanes = LANE_SAMPLE_KEYS

    run_name = srf.pathname_to_run_name(r.pathname)
    seq_cmds = []
    if raw_format is None:
        raw_format = r.bustard.sequence_format

    LOGGER.info("Raw Format is: %s" % (raw_format, ))
    if raw_format == 'fastq':
        LOGGER.info("Reading fastq files from %s", r.bustard.pathname)
        rawpath = os.path.join(r.pathname, r.bustard.pathname)
        LOGGER.info("raw data = %s" % (rawpath,))
        # this branch queues no commands; seq_cmds stays empty
        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, run_dirname)
    elif raw_format == 'qseq':
        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, run_dirname)
    elif raw_format == 'srf':
        # NOTE(review): the meaning of the trailing 0 is defined by
        # srf.make_srf_commands and is not visible from here
        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, run_dirname, 0)
    else:
        # format with a real 1-tuple so a tuple-valued raw_format is
        # reported via ValueError instead of breaking the % operator
        raise ValueError('Unknown --raw-format=%s' % (raw_format,))
    srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)