2 Core information needed to inspect a runfolder.
15 import lxml.etree as ElementTree
17 LOGGER = logging.getLogger(__name__)
# Date format / regex constants used when parsing runfolder and config names.
19 EUROPEAN_STRPTIME = "%d-%m-%Y"
20 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
# NOTE(review): '\.' inside a character class is redundant ('.' is literal
# there) and the non-raw string triggers an invalid-escape warning on
# modern Pythons; consider r"([0-9.]+)".
21 VERSION_RE = "([0-9\.]+)"
22 USER_RE = "([a-zA-Z0-9]+)"
# GA/HiSeq-era flowcells have 8 lanes; LANE_LIST is the 1-based lane numbers.
23 LANES_PER_FLOWCELL = 8
24 LANE_LIST = range(1, LANES_PER_FLOWCELL + 1)
26 from htsworkflow.util.alphanum import alphanum
27 from htsworkflow.util.ethelp import indent, flatten
28 from htsworkflow.util.queuecommands import QueueCommands
30 from htsworkflow.pipelines import srf
32 class PipelineRun(object):
# Aggregates the three stages of one Illumina pipeline run --
# image analysis (Firecrest/IPAR), base calling (Bustard), and
# alignment (GERALD/CASAVA) -- and serializes them to/from one XML tree.
34 Capture "interesting" information about a pipeline run
# XML tag names used by get_elements()/set_elements().
37 PIPELINE_RUN = 'PipelineRun'
38 FLOWCELL_ID = 'FlowcellID'
# Construct either from a runfolder path (+ optional explicit flowcell id)
# or from a previously saved XML tree via the `xml` argument.
40 def __init__(self, pathname=None, flowcell_id=None, xml=None):
41 if pathname is not None:
42 self.pathname = os.path.normpath(pathname)
46 self._flowcell_id = flowcell_id
48 self.image_analysis = None
# If an XML tree was given, populate the component objects from it.
53 self.set_elements(xml)
# Lazily resolve the flowcell id, trying progressively weaker sources:
# RunInfo.xml, then Config/FlowcellId.xml, then the runfolder path itself,
# finally falling back to the literal string 'unknown'.
55 def _get_flowcell_id(self):
57 if self._flowcell_id is None:
58 self._flowcell_id = self._get_flowcell_id_from_runinfo()
59 if self._flowcell_id is None:
60 self._flowcell_id = self._get_flowcell_id_from_flowcellid()
61 if self._flowcell_id is None:
62 self._flowcell_id = self._get_flowcell_id_from_path()
63 if self._flowcell_id is None:
64 self._flowcell_id = 'unknown'
# (part of a LOGGER warning about having to guess the id)
67 "Flowcell id was not found, guessing %s" % (
70 return self._flowcell_id
71 flowcell_id = property(_get_flowcell_id)
# Read the flowcell id from the <Text> node of Config/FlowcellId.xml,
# if that file exists; implicitly returns None otherwise.
73 def _get_flowcell_id_from_flowcellid(self):
74 """Extract flowcell id from a Config/FlowcellId.xml file
76 config_dir = os.path.join(self.pathname, 'Config')
77 flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
78 if os.path.exists(flowcell_id_path):
79 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
80 return flowcell_id_tree.findtext('Text')
# Read the flowcell id from RunInfo.xml via the lxml-only .xpath() API.
82 def _get_flowcell_id_from_runinfo(self):
83 """Read RunInfo file for flowcell id
85 runinfo = os.path.join(self.pathname, 'RunInfo.xml')
86 if os.path.exists(runinfo):
87 tree = ElementTree.parse(runinfo)
89 fc_nodes = root.xpath('/RunInfo/Run/Flowcell')
# Only trust the result if exactly one Flowcell node was found.
90 if len(fc_nodes) == 1:
91 return fc_nodes[0].text
# Last resort: runfolder names conventionally end with _<flowcellid>,
# so take whatever follows the final underscore.
94 def _get_flowcell_id_from_path(self):
95 """Guess a flowcell name from the path
97 path_fields = self.pathname.split('_')
98 if len(path_fields) > 0:
99 # guessing last element of filename
100 return path_fields[-1]
# Runfolder name is delegated to the gerald component when present.
102 def _get_runfolder_name(self):
103 if self.gerald is None:
106 return self.gerald.runfolder_name
107 runfolder_name = property(_get_runfolder_name)
# Serialize this run: a <PipelineRun> root holding the flowcell id plus
# the XML of each sub-component (image analysis, bustard, gerald).
109 def get_elements(self):
111 make one master xml file from all of our sub-components.
113 root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
114 flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
115 flowcell.text = self.flowcell_id
116 root.append(self.image_analysis.get_elements())
117 root.append(self.bustard.get_elements())
118 root.append(self.gerald.get_elements())
# Inverse of get_elements(): dispatch each child element to the matching
# component class by (case-insensitive) tag name.
121 def set_elements(self, tree):
122 # this file gets imported by all the others,
123 # so we need to hide the imports to avoid a cyclic imports
124 from htsworkflow.pipelines import firecrest
125 from htsworkflow.pipelines import ipar
126 from htsworkflow.pipelines import bustard
127 from htsworkflow.pipelines import gerald
129 tag = tree.tag.lower()
130 if tag != PipelineRun.PIPELINE_RUN.lower():
131 raise ValueError('Pipeline Run Expecting %s got %s' % (
132 PipelineRun.PIPELINE_RUN, tag))
# (iteration over the child elements of `tree`; loop header not visible here)
134 tag = element.tag.lower()
135 if tag == PipelineRun.FLOWCELL_ID.lower():
136 self._flowcell_id = element.text
137 #ok the xword.Xword.XWORD pattern for module.class.constant is lame
138 # you should only have Firecrest or IPAR, never both of them.
139 elif tag == firecrest.Firecrest.FIRECREST.lower():
140 self.image_analysis = firecrest.Firecrest(xml=element)
141 elif tag == ipar.IPAR.IPAR.lower():
142 self.image_analysis = ipar.IPAR(xml=element)
143 elif tag == bustard.Bustard.BUSTARD.lower():
144 self.bustard = bustard.Bustard(xml=element)
145 elif tag == gerald.Gerald.GERALD.lower():
146 self.gerald = gerald.Gerald(xml=element)
147 elif tag == gerald.CASAVA.GERALD.lower():
148 self.gerald = gerald.CASAVA(xml=element)
# NOTE(review): logging.Logger.warn() is a deprecated alias of .warning().
150 LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))
# Build (and cache) a report filename from the flowcell id and the
# newest timestamp among the three pipeline components.
152 def _get_run_name(self):
154 Given a run tuple, find the latest date and use that as our name
156 if self._name is None:
157 tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
158 timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
159 self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
161 name = property(_get_run_name)
# Write the serialized run report to destdir/<self.name>.
163 def save(self, destdir=None):
166 LOGGER.info("Saving run report " + self.name)
167 xml = self.get_elements()
169 dest_pathname = os.path.join(destdir, self.name)
170 ElementTree.ElementTree(xml).write(dest_pathname)
# Re-populate this object from a previously saved run report file.
172 def load(self, filename):
173 LOGGER.info("Loading run report from " + filename)
174 tree = ElementTree.parse(filename).getroot()
175 self.set_elements(tree)
# Module-level convenience: parse a saved run-report XML file into a
# fully initialized PipelineRun.
177 def load_pipeline_run_xml(pathname):
179 Load and instantiate a Pipeline run from a run xml file
182 - `pathname` : location of an run xml file
184 :Returns: initialized PipelineRun object
186 tree = ElementTree.parse(pathname).getroot()
187 run = PipelineRun(xml=tree)
# Scan a runfolder for every combination of image-analysis / bustard /
# gerald directories and return one PipelineRun per combination found.
190 def get_runs(runfolder, flowcell_id=None):
192 Search through a run folder for all the various sub component runs
193 and then return a PipelineRun for each different combination.
195 For example if there are two different GERALD runs, this will
196 generate two different PipelineRun objects, that differ
197 in there gerald component.
# Imports are deferred to avoid circular imports (see PipelineRun.set_elements).
199 from htsworkflow.pipelines import firecrest
200 from htsworkflow.pipelines import ipar
201 from htsworkflow.pipelines import bustard
202 from htsworkflow.pipelines import gerald
# For a given image-analysis directory, find each base-calling directory
# beneath it and build the downstream gerald/aligned runs.
204 def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
205 LOGGER.info("Looking for bustard directories in %s" % (pathname,))
206 bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
207 # RTA BaseCalls looks enough like Bustard.
208 bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
209 for bustard_pathname in bustard_dirs:
210 LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
211 b = bustard.bustard(bustard_pathname)
212 build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)
214 build_aligned_runs(image_analysis, runs, b, datadir, runfolder)
# One PipelineRun per GERALD* directory found under the bustard directory.
216 def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
217 gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
218 LOGGER.info("Looking for gerald directories in %s" % (pathname,))
219 for gerald_pathname in glob(gerald_glob):
220 LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
222 g = gerald.gerald(gerald_pathname)
223 p = PipelineRun(runfolder, flowcell_id)
225 p.image_analysis = image_analysis
# Failures to parse a single gerald directory are logged and skipped
# so the remaining combinations are still collected.
230 LOGGER.error("Ignoring " + str(e))
# CASAVA-style Aligned* directories at the runfolder top level also
# yield PipelineRun objects.
233 def build_aligned_runs(image_analysis, runs, b, datadir, runfolder):
234 aligned_glob = os.path.join(runfolder, 'Aligned*')
235 for aligned in glob(aligned_glob):
236 LOGGER.info("Found aligned directory %s" % (aligned,))
238 g = gerald.gerald(aligned)
239 p = PipelineRun(runfolder, flowcell_id)
241 p.image_analysis = image_analysis
246 LOGGER.error("Ignoring " + str(e))
# Image-analysis results live under <runfolder>/Data.
248 datadir = os.path.join(runfolder, 'Data')
250 LOGGER.info('Searching for runs in ' + datadir)
252 # scan for firecrest directories
253 for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
254 LOGGER.info('Found firecrest in ' + datadir)
255 image_analysis = firecrest.firecrest(firecrest_pathname)
# firecrest() returns None for directories it cannot parse.
256 if image_analysis is None:
258 "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
261 scan_post_image_analysis(
262 runs, runfolder, datadir, image_analysis, firecrest_pathname
264 # scan for IPAR directories
265 ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
266 # The Intensities directory from the RTA software looks a lot like IPAR
267 ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
268 for ipar_pathname in ipar_dirs:
269 LOGGER.info('Found ipar directories in ' + datadir)
270 image_analysis = ipar.ipar(ipar_pathname)
271 if image_analysis is None:
273 "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
276 scan_post_image_analysis(
277 runs, runfolder, datadir, image_analysis, ipar_pathname
# Build a single PipelineRun by walking UP from a user-specified gerald
# directory (gerald -> bustard -> image analysis -> runfolder), instead of
# scanning a whole runfolder like get_runs().
282 def get_specific_run(gerald_dir):
284 Given a gerald directory, construct a PipelineRun out of its parents
286 Basically this allows specifying a particular run instead of the previous
287 get_runs which scans a runfolder for various combinations of
288 firecrest/ipar/bustard/gerald runs.
# Deferred imports: avoid circular imports with the component modules.
290 from htsworkflow.pipelines import firecrest
291 from htsworkflow.pipelines import ipar
292 from htsworkflow.pipelines import bustard
293 from htsworkflow.pipelines import gerald
# Derive each ancestor directory from the gerald path by relative '..' hops.
295 gerald_dir = os.path.expanduser(gerald_dir)
296 bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
297 image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))
299 runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))
301 LOGGER.info('--- use-run detected options ---')
302 LOGGER.info('runfolder: %s' % (runfolder_dir,))
303 LOGGER.info('image_dir: %s' % (image_dir,))
304 LOGGER.info('bustard_dir: %s' % (bustard_dir,))
305 LOGGER.info('gerald_dir: %s' % (gerald_dir,))
307 # find our processed image dir
309 # split into parent, and leaf directory
310 # leaf directory should be an IPAR or firecrest directory
311 data_dir, short_image_dir = os.path.split(image_dir)
312 LOGGER.info('data_dir: %s' % (data_dir,))
# NOTE(review): 'short_iamge_dir' in the log label is a typo for
# 'short_image_dir' (runtime string -- left unchanged here).
313 LOGGER.info('short_iamge_dir: %s' % (short_image_dir,))
315 # guess which type of image processing directory we have by looking
316 # in the leaf directory name
317 if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
318 image_run = firecrest.firecrest(image_dir)
319 elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
320 image_run = ipar.ipar(image_dir)
# RTA's Intensities directory is parsed with the IPAR loader as well.
321 elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
322 image_run = ipar.ipar(image_dir)
324 # if we din't find a run, report the error and return
325 if image_run is None:
326 msg = '%s does not contain an image processing step' % (image_dir,)
330 # find our base calling
331 base_calling_run = bustard.bustard(bustard_dir)
332 if base_calling_run is None:
333 LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
# find our alignment step
337 gerald_run = gerald.gerald(gerald_dir)
338 if gerald_run is None:
339 LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
# All three components found: assemble the PipelineRun.
342 p = PipelineRun(runfolder_dir)
343 p.image_analysis = image_run
344 p.bustard = base_calling_run
345 p.gerald = gerald_run
347 LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
# Collect run parameters for a set of runs (body not visible in this excerpt).
350 def extract_run_parameters(runs):
352 Search through runfolder_path for various runs and grab their parameters
# Collapse per-chromosome mapped-read counts into per-genome totals,
# keeping spike-in / contamination entries (path components not present
# in genome_map) as their own separate keys.
357 def summarize_mapped_reads(genome_map, mapped_reads):
359 Summarize per chromosome reads into a genome count
360 But handle spike-in/contamination symlinks seperately.
362 summarized_reads = {}
# (initialization of the genome accumulator is not visible in this excerpt)
365 for k, v in mapped_reads.items():
# Split "path/chromosome" keys; a path outside genome_map marks a spike-in.
366 path, k = os.path.split(k)
367 if len(path) > 0 and path not in genome_map:
371 summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
372 summarized_reads[genome] = genome_reads
373 return summarized_reads
# Produce a human-readable, line-by-line report for one lane/end:
# cluster counts, total reads, match-code breakdown, and per-genome
# mapped-read counts. Returns the accumulated list of report lines.
375 def summarize_lane(gerald, lane_id):
377 lane_results = gerald.summary.lane_results
378 eland_result = gerald.eland_results[lane_id]
379 report.append("Sample name %s" % (eland_result.sample_name))
380 report.append("Lane id %s end %s" % (lane_id.lane, lane_id.read))
# Cluster statistics are only present when the summary covers this read/lane.
382 if lane_id.read < len(lane_results) and \
383 lane_id.lane in lane_results[lane_id.read]:
384 summary_results = lane_results[lane_id.read][lane_id.lane]
# cluster is a (mean, stddev) pair.
385 cluster = summary_results.cluster
386 report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
387 report.append("Total Reads: %d" % (eland_result.reads))
# ELAND match codes: NM = no match, QC = quality-filtered,
# U0/U1/U2 = unique with 0-2 mismatches, R0/R1/R2 = repeat hits.
389 if hasattr(eland_result, 'match_codes'):
390 mc = eland_result.match_codes
392 nm_percent = float(nm) / eland_result.reads * 100
394 qc_percent = float(qc) / eland_result.reads * 100
396 report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
397 report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
398 report.append('Unique (0,1,2 mismatches) %d %d %d' % \
399 (mc['U0'], mc['U1'], mc['U2']))
400 report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
401 (mc['R0'], mc['R1'], mc['R2']))
# Per-genome totals via summarize_mapped_reads (spike-ins kept separate).
403 if hasattr(eland_result, 'genome_map'):
404 report.append("Mapped Reads")
405 mapped_reads = summarize_mapped_reads(eland_result.genome_map,
406 eland_result.mapped_reads)
407 for name, counts in mapped_reads.items():
408 report.append(" %s: %d" % (name, counts))
# Join per-lane summaries for every run into a single report string.
413 def summary_report(runs):
415 Summarize cluster numbers and mapped read counts for a runfolder
420 report.append('Summary for %s' % (run.name,))
# Sort lane ids for a stable, deterministic report ordering.
422 eland_keys = sorted(run.gerald.eland_results.keys())
423 for lane_id in eland_keys:
424 report.extend(summarize_lane(run.gerald, lane_id))
427 return os.linesep.join(report)
# Report whether a filename already carries a known compression
# extension (.gz or .bz2), judged purely by its suffix.
429 def is_compressed(filename):
430 if os.path.splitext(filename)[1] == ".gz":
432 elif os.path.splitext(filename)[1] == '.bz2':
# Archive the flowcell quality reports directory (plus Status.xml/xsl
# when present) into <cycle_dir>/flowcell-reports.tar.bz2 via an
# external tar command queued through QueueCommands.
437 def save_flowcell_reports(data_dir, cycle_dir):
439 Save the flowcell quality reports
441 data_dir = os.path.abspath(data_dir)
442 status_file = os.path.join(data_dir, 'Status.xml')
443 reports_dir = os.path.join(data_dir, 'reports')
444 reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
445 if os.path.exists(reports_dir):
# Paths in cmd_list are relative; tar is presumably run with data_dir as
# its working directory (not visible in this excerpt).
446 cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
447 if os.path.exists(status_file):
448 cmd_list.extend(['Status.xml', 'Status.xsl'])
449 LOGGER.info("Saving reports from " + reports_dir)
452 q = QueueCommands([" ".join(cmd_list)])
# Copy the run's Summary.htm into cycle_dir, preferring the GERALD copy
# and falling back to the Status_Files copy.
457 def save_summary_file(pipeline, cycle_dir):
459 gerald_object = pipeline.gerald
460 gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
461 status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
462 if os.path.exists(gerald_summary):
463 LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
464 shutil.copy(gerald_summary, cycle_dir)
465 elif os.path.exists(status_files_summary):
466 LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
467 shutil.copy(status_files_summary, cycle_dir)
# NOTE(review): `summary_path` is not defined anywhere in the visible
# function -- this fallback branch looks like it would raise NameError
# if ever reached; verify against the full file.
469 LOGGER.info('Summary file %s was not found' % (summary_path,))
# Copy the Bustard IVC.htm page plus its per-lane plot PNGs into
# cycle_dir (plots go into a Plots/ subdirectory created on demand).
471 def save_ivc_plot(bustard_object, cycle_dir):
473 Save the IVC page and its supporting images
475 plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
476 plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
# Glob pattern matching s_<lane>_<type>.png plot images.
477 plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')
479 plot_target_path = os.path.join(cycle_dir, 'Plots')
481 if os.path.exists(plot_html):
482 LOGGER.debug("Saving %s" % (plot_html,))
483 LOGGER.debug("Saving %s" % (plot_images,))
484 shutil.copy(plot_html, cycle_dir)
485 if not os.path.exists(plot_target_path):
486 os.mkdir(plot_target_path)
487 for plot_file in glob(plot_images):
488 shutil.copy(plot_file, plot_target_path)
# Missing IVC page is treated as non-fatal: warn and continue.
490 LOGGER.warning('Missing IVC.html file, not archiving')
# Tar up the s_*_score.txt files from the bustard directory (or its Temp/
# subdirectory on pipeline >= 1.1rc1) and pipe the tar stream through
# bzip2 into <cycle_dir>/scores.tar.bz2.
493 def compress_score_files(bustard_object, cycle_dir):
495 Compress score files into our result directory
497 # check for g.pathname/Temp a new feature of 1.1rc1
498 scores_path = bustard_object.pathname
499 scores_path_temp = os.path.join(scores_path, 'Temp')
500 if os.path.isdir(scores_path_temp):
501 scores_path = scores_path_temp
503 # hopefully we have a directory that contains s_*_score files
505 for f in os.listdir(scores_path):
506 if re.match('.*_score.txt', f):
507 score_files.append(f)
# Build the tar | bzip2 pipeline writing to scores.tar.bz2.
509 tar_cmd = ['tar', 'c'] + score_files
510 bzip_cmd = [ 'bzip2', '-9', '-c' ]
511 tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
# NOTE(review): binary bz2 output written through a text-mode handle,
# and tar_dest is never explicitly closed in the visible code -- confirm
# against the full file.
512 tar_dest = open(tar_dest_name, 'w')
513 LOGGER.info("Compressing score files from %s" % (scores_path,))
# Only log the first few tar args; score_files can be long.
514 LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
515 LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
516 LOGGER.info("Writing to %s" % (tar_dest_name,))
# `env` is not defined in the visible excerpt; presumably built above.
519 tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
521 bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
# Copy every ELAND result file into cycle_dir, bzip2-compressing any that
# are not already .gz/.bz2; compression jobs are batched through
# QueueCommands with up to num_jobs running in parallel.
525 def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
527 Compress eland result files into the archive directory
529 # copy & bzip eland files
532 for key in gerald_object.eland_results:
533 eland_lane = gerald_object.eland_results[key]
534 for source_name in eland_lane.pathnames:
# A lane without a backing file is logged and skipped.
535 if source_name is None:
537 "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
539 path, name = os.path.split(source_name)
540 dest_name = os.path.join(cycle_dir, name)
541 LOGGER.info("Saving eland file %s to %s" % \
542 (source_name, dest_name))
# Already-compressed files are copied verbatim instead of recompressed.
544 if is_compressed(name):
545 LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
546 shutil.copy(source_name, dest_name)
# NOTE(review): the '>' redirect only works because these strings are
# presumably executed through a shell by QueueCommands -- confirm.
550 args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
551 bz_commands.append(" ".join(args))
552 #LOGGER.info('Running: %s' % ( " ".join(args) ))
553 #bzip_dest = open(dest_name, 'w')
554 #bzip = subprocess.Popen(args, stdout=bzip_dest)
555 #LOGGER.info('Saving to %s' % (dest_name, ))
# Run all queued compressions (if any) with num_jobs-way parallelism.
558 if len(bz_commands) > 0:
559 q = QueueCommands(bz_commands, num_jobs)
# Top-level archiver: for each PipelineRun, create
# <output_base_dir>/<flowcell_id>/C<start>-<stop>/ and fill it with the
# run report, flowcell reports, IVC plots, raw sequence (fastq/qseq/srf),
# the GERALD summary, compressed ELAND results, and md5 checksums.
563 def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
565 Iterate over runfolders in runs extracting the most useful information.
566 * run parameters (in run-*.xml)
570 * srf files (raw sequence & qualities)
# Default the destination to the current working directory.
572 if output_base_dir is None:
573 output_base_dir = os.getcwd()
576 result_dir = os.path.join(output_base_dir, r.flowcell_id)
577 LOGGER.info("Using %s as result directory" % (result_dir,))
578 if not os.path.exists(result_dir):
# Cycle directory is named from the image-analysis cycle range, e.g. C1-36.
582 cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
583 LOGGER.info("Filling in %s" % (cycle,))
584 cycle_dir = os.path.join(result_dir, cycle)
585 cycle_dir = os.path.abspath(cycle_dir)
# Never clobber a previously extracted cycle directory.
586 if os.path.exists(cycle_dir):
587 LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
595 # save illumina flowcell status report
596 save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)
598 # save stuff from bustard
600 save_ivc_plot(r.bustard, cycle_dir)
602 # build base call saving commands
# Collect the lanes whose GERALD analysis is not 'none'.
605 for lane in range(1, 9):
606 lane_parameters = r.gerald.lanes.get(lane, None)
607 if lane_parameters is not None and lane_parameters.analysis != 'none':
610 run_name = srf.pathname_to_run_name(r.pathname)
612 LOGGER.info("Raw Format is: %s" % (raw_format, ))
# Dispatch on the requested raw-data format.
613 if raw_format == 'fastq':
614 rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
615 LOGGER.info("raw data = %s" % (rawpath,))
616 srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
617 elif raw_format == 'qseq':
618 seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
619 elif raw_format == 'srf':
620 seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
622 raise ValueError('Unknown --raw-format=%s' % (raw_format))
623 srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
625 # save stuff from GERALD
626 # copy stuff out of the main run
630 save_summary_file(r, cycle_dir)
632 # compress eland result files
633 compress_eland_results(g, cycle_dir, num_jobs)
635 # md5 all the compressed files once we're done
636 md5_commands = srf.make_md5_commands(cycle_dir)
637 srf.run_commands(cycle_dir, md5_commands, num_jobs)
# Delete each existing path in `files`; in dry-run mode (the default)
# only log what would be deleted. Missing paths produce a warning.
639 def rm_list(files, dry_run=True):
641 if os.path.exists(f):
642 LOGGER.info('deleting %s' % (f,))
# NOTE(review): LOGGER.warn() is a deprecated alias of LOGGER.warning().
649 LOGGER.warn("%s doesn't exist." % (f,))
# Strip each runfolder of bulky intermediates (logs, Calibration_*,
# Images, ReadPrep*, Thumbnail_Images, and `make clean_intermediate`
# output) so the folder compresses well; honors dry_run via rm_list().
651 def clean_runs(runs, dry_run=True):
653 Clean up run folders to optimize for compression.
656 LOGGER.info('In dry-run mode')
659 LOGGER.info('Cleaninging %s' % (run.pathname,))
# Run-level log files.
661 runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
662 rm_list(runlogs, dry_run)
664 pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
665 rm_list(pipeline_logs, dry_run)
667 # rm NetCopy.log? Isn't this robocopy?
668 logs = glob(os.path.join(run.pathname, '*.log'))
669 rm_list(logs, dry_run)
# Calibration directories from the instrument software.
672 calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
673 rm_list(calibration_dir, dry_run)
675 LOGGER.info("Cleaning images")
676 image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
677 rm_list(image_dirs, dry_run)
679 LOGGER.info("Cleaning ReadPrep*")
680 read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
681 rm_list(read_prep_dirs, dry_run)
683 LOGGER.info("Cleaning Thubmnail_images")
684 thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
685 rm_list(thumbnail_dirs, dry_run)
687 # make clean_intermediate
# NOTE(review): uses the root logging module here instead of the
# module-level LOGGER used everywhere else -- likely unintentional.
688 logging.info("Cleaning intermediate files")
# If the image-analysis dir has a Makefile, delegate to its
# clean_intermediate target (note: not guarded by dry_run in visible code).
689 if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
690 clean_process = subprocess.Popen(['make', 'clean_intermediate'],
691 cwd=run.image_analysis.pathname,)