htsworkflow/pipelines/runfolder.py

   1 """
   2 Core information needed to inspect a runfolder.
   3 """
   4 from glob import glob
   5 import logging
   6 import os
   7 import re
   8 import shutil
   9 import stat
  10 import subprocess
  11 import sys
  12 import tarfile
  13 import time
  14
  15 try:
  16     from xml.etree import ElementTree
  17 except ImportError, e:
  18     from elementtree import ElementTree
  19
  20 LOGGER = logging.getLogger(__name__)
  21
  22 EUROPEAN_STRPTIME = "%d-%m-%Y"
  23 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
  24 VERSION_RE = "([0-9\.]+)"
  25 USER_RE = "([a-zA-Z0-9]+)"
  26 LANES_PER_FLOWCELL = 8
  27 LANE_LIST = range(1, LANES_PER_FLOWCELL + 1)
  28
  29 from htsworkflow.util.alphanum import alphanum
  30 from htsworkflow.util.ethelp import indent, flatten
  31 from htsworkflow.util.queuecommands import QueueCommands
  32
  33 from htsworkflow.pipelines import srf
  34
  35 class PipelineRun(object):
  36     """
  37     Capture "interesting" information about a pipeline run
  38     """
  39     XML_VERSION = 1
  40     PIPELINE_RUN = 'PipelineRun'
  41     FLOWCELL_ID = 'FlowcellID'
  42
  43     def __init__(self, pathname=None, flowcell_id=None, xml=None):
  44         if pathname is not None:
  45           self.pathname = os.path.normpath(pathname)
  46         else:
  47           self.pathname = None
  48         self._name = None
  49         self._flowcell_id = flowcell_id
  50         self.image_analysis = None
  51         self.bustard = None
  52         self.gerald = None
  53
  54         if xml is not None:
  55           self.set_elements(xml)
  56
  57     def _get_flowcell_id(self):
  58         # extract flowcell ID
  59         if self._flowcell_id is None:
  60             config_dir = os.path.join(self.pathname, 'Config')
  61             flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
  62             if os.path.exists(flowcell_id_path):
  63                 flowcell_id_tree = ElementTree.parse(flowcell_id_path)
  64                 self._flowcell_id = flowcell_id_tree.findtext('Text')
  65             else:
  66                 path_fields = self.pathname.split('_')
  67                 if len(path_fields) > 0:
  68                     # guessing last element of filename
  69                    self._flowcell_id = path_fields[-1]
  70                 else:
  71                    self._flowcell_id = 'unknown'
  72
  73                    LOGGER.warning(
  74                        "Flowcell id was not found, guessing %s" % (
  75                        self._flowcell_id))
  76
  77         return self._flowcell_id
  78     flowcell_id = property(_get_flowcell_id)
  79
  80     def _get_runfolder_name(self):
  81         if self.gerald is None:
  82             return None
  83         else:
  84             return self.gerald.runfolder_name
  85     runfolder_name = property(_get_runfolder_name)
  86
  87     def get_elements(self):
  88         """
  89         make one master xml file from all of our sub-components.
  90         """
  91         root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
  92         flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
  93         flowcell.text = self.flowcell_id
  94         root.append(self.image_analysis.get_elements())
  95         root.append(self.bustard.get_elements())
  96         root.append(self.gerald.get_elements())
  97         return root
  98
  99     def set_elements(self, tree):
 100         # this file gets imported by all the others,
 101         # so we need to hide the imports to avoid a cyclic imports
 102         from htsworkflow.pipelines import firecrest
 103         from htsworkflow.pipelines import ipar
 104         from htsworkflow.pipelines import bustard
 105         from htsworkflow.pipelines import gerald
 106
 107         tag = tree.tag.lower()
 108         if tag != PipelineRun.PIPELINE_RUN.lower():
 109           raise ValueError('Pipeline Run Expecting %s got %s' % (
 110               PipelineRun.PIPELINE_RUN, tag))
 111         for element in tree:
 112           tag = element.tag.lower()
 113           if tag == PipelineRun.FLOWCELL_ID.lower():
 114             self._flowcell_id = element.text
 115           #ok the xword.Xword.XWORD pattern for module.class.constant is lame
 116           # you should only have Firecrest or IPAR, never both of them.
 117           elif tag == firecrest.Firecrest.FIRECREST.lower():
 118             self.image_analysis = firecrest.Firecrest(xml=element)
 119           elif tag == ipar.IPAR.IPAR.lower():
 120             self.image_analysis = ipar.IPAR(xml=element)
 121           elif tag == bustard.Bustard.BUSTARD.lower():
 122             self.bustard = bustard.Bustard(xml=element)
 123           elif tag == gerald.Gerald.GERALD.lower():
 124             self.gerald = gerald.Gerald(xml=element)
 125           else:
 126             LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))
 127
 128     def _get_run_name(self):
 129         """
 130         Given a run tuple, find the latest date and use that as our name
 131         """
 132         if self._name is None:
 133           tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
 134           timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
 135           self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
 136         return self._name
 137     name = property(_get_run_name)
 138
 139     def save(self, destdir=None):
 140         if destdir is None:
 141             destdir = ''
 142         LOGGER.info("Saving run report " + self.name)
 143         xml = self.get_elements()
 144         indent(xml)
 145         dest_pathname = os.path.join(destdir, self.name)
 146         ElementTree.ElementTree(xml).write(dest_pathname)
 147
 148     def load(self, filename):
 149         LOGGER.info("Loading run report from " + filename)
 150         tree = ElementTree.parse(filename).getroot()
 151         self.set_elements(tree)
 152
 153 def load_pipeline_run_xml(pathname):
 154     """
 155     Load and instantiate a Pipeline run from a run xml file
 156
 157     :Parameters:
 158       - `pathname` : location of an run xml file
 159
 160     :Returns: initialized PipelineRun object
 161     """
 162     tree = ElementTree.parse(pathname).getroot()
 163     run = PipelineRun(xml=tree)
 164     return run
 165
 166 def get_runs(runfolder, flowcell_id=None):
 167     """
 168     Search through a run folder for all the various sub component runs
 169     and then return a PipelineRun for each different combination.
 170
 171     For example if there are two different GERALD runs, this will
 172     generate two different PipelineRun objects, that differ
 173     in there gerald component.
 174     """
 175     from htsworkflow.pipelines import firecrest
 176     from htsworkflow.pipelines import ipar
 177     from htsworkflow.pipelines import bustard
 178     from htsworkflow.pipelines import gerald
 179
 180     def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
 181         LOGGER.info("Looking for bustard directories in %s" % (pathname,))
 182         bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
 183         # RTA BaseCalls looks enough like Bustard.
 184         bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
 185         for bustard_pathname in bustard_dirs:
 186             LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
 187             b = bustard.bustard(bustard_pathname)
 188             gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
 189             LOGGER.info("Looking for gerald directories in %s" % (pathname,))
 190             for gerald_pathname in glob(gerald_glob):
 191                 LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
 192                 try:
 193                     g = gerald.gerald(gerald_pathname)
 194                     p = PipelineRun(runfolder, flowcell_id)
 195                     p.image_analysis = image_analysis
 196                     p.bustard = b
 197                     p.gerald = g
 198                     runs.append(p)
 199                 except IOError, e:
 200                     LOGGER.error("Ignoring " + str(e))
 201
 202     datadir = os.path.join(runfolder, 'Data')
 203
 204     LOGGER.info('Searching for runs in ' + datadir)
 205     runs = []
 206     # scan for firecrest directories
 207     for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
 208         LOGGER.info('Found firecrest in ' + datadir)
 209         image_analysis = firecrest.firecrest(firecrest_pathname)
 210         if image_analysis is None:
 211             LOGGER.warn(
 212                 "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
 213             )
 214         else:
 215             scan_post_image_analysis(
 216                 runs, runfolder, image_analysis, firecrest_pathname
 217             )
 218     # scan for IPAR directories
 219     ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
 220     # The Intensities directory from the RTA software looks a lot like IPAR
 221     ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
 222     for ipar_pathname in ipar_dirs:
 223         LOGGER.info('Found ipar directories in ' + datadir)
 224         image_analysis = ipar.ipar(ipar_pathname)
 225         if image_analysis is None:
 226             LOGGER.warn(
 227                 "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
 228             )
 229         else:
 230             scan_post_image_analysis(
 231                 runs, runfolder, image_analysis, ipar_pathname
 232             )
 233
 234     return runs
 235
 236 def get_specific_run(gerald_dir):
 237     """
 238     Given a gerald directory, construct a PipelineRun out of its parents
 239
 240     Basically this allows specifying a particular run instead of the previous
 241     get_runs which scans a runfolder for various combinations of
 242     firecrest/ipar/bustard/gerald runs.
 243     """
 244     from htsworkflow.pipelines import firecrest
 245     from htsworkflow.pipelines import ipar
 246     from htsworkflow.pipelines import bustard
 247     from htsworkflow.pipelines import gerald
 248
 249     gerald_dir = os.path.expanduser(gerald_dir)
 250     bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
 251     image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))
 252
 253     runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))
 254
 255     LOGGER.info('--- use-run detected options ---')
 256     LOGGER.info('runfolder: %s' % (runfolder_dir,))
 257     LOGGER.info('image_dir: %s' % (image_dir,))
 258     LOGGER.info('bustard_dir: %s' % (bustard_dir,))
 259     LOGGER.info('gerald_dir: %s' % (gerald_dir,))
 260
 261     # find our processed image dir
 262     image_run = None
 263     # split into parent, and leaf directory
 264     # leaf directory should be an IPAR or firecrest directory
 265     data_dir, short_image_dir = os.path.split(image_dir)
 266     LOGGER.info('data_dir: %s' % (data_dir,))
 267     LOGGER.info('short_iamge_dir: %s' % (short_image_dir,))
 268
 269     # guess which type of image processing directory we have by looking
 270     # in the leaf directory name
 271     if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
 272         image_run = firecrest.firecrest(image_dir)
 273     elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
 274         image_run = ipar.ipar(image_dir)
 275     elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
 276         image_run = ipar.ipar(image_dir)
 277
 278     # if we din't find a run, report the error and return
 279     if image_run is None:
 280         msg = '%s does not contain an image processing step' % (image_dir,)
 281         LOGGER.error(msg)
 282         return None
 283
 284     # find our base calling
 285     base_calling_run = bustard.bustard(bustard_dir)
 286     if base_calling_run is None:
 287         LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
 288         return None
 289
 290     # find alignments
 291     gerald_run = gerald.gerald(gerald_dir)
 292     if gerald_run is None:
 293         LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
 294         return None
 295
 296     p = PipelineRun(runfolder_dir)
 297     p.image_analysis = image_run
 298     p.bustard = base_calling_run
 299     p.gerald = gerald_run
 300
 301     LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
 302     return p
 303
 304 def extract_run_parameters(runs):
 305     """
 306     Search through runfolder_path for various runs and grab their parameters
 307     """
 308     for run in runs:
 309       run.save()
 310
 311 def summarize_mapped_reads(genome_map, mapped_reads):
 312     """
 313     Summarize per chromosome reads into a genome count
 314     But handle spike-in/contamination symlinks seperately.
 315     """
 316     summarized_reads = {}
 317     genome_reads = 0
 318     genome = 'unknown'
 319     for k, v in mapped_reads.items():
 320         path, k = os.path.split(k)
 321         if len(path) > 0 and not genome_map.has_key(path):
 322             genome = path
 323             genome_reads += v
 324         else:
 325             summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
 326     summarized_reads[genome] = genome_reads
 327     return summarized_reads
 328
 329 def summarize_lane(gerald, lane_id):
 330     report = []
 331     summary_results = gerald.summary.lane_results
 332     for end in range(len(summary_results)):
 333       eland_result = gerald.eland_results.results[end][lane_id]
 334       report.append("Sample name %s" % (eland_result.sample_name))
 335       report.append("Lane id %s end %s" % (eland_result.lane_id, end))
 336       if end < len(summary_results) and summary_results[end].has_key(eland_result.lane_id):
 337           cluster = summary_results[end][eland_result.lane_id].cluster
 338           report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
 339       report.append("Total Reads: %d" % (eland_result.reads))
 340
 341       if hasattr(eland_result, 'match_codes'):
 342           mc = eland_result.match_codes
 343           nm = mc['NM']
 344           nm_percent = float(nm) / eland_result.reads * 100
 345           qc = mc['QC']
 346           qc_percent = float(qc) / eland_result.reads * 100
 347
 348           report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
 349           report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
 350           report.append('Unique (0,1,2 mismatches) %d %d %d' % \
 351                         (mc['U0'], mc['U1'], mc['U2']))
 352           report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
 353                         (mc['R0'], mc['R1'], mc['R2']))
 354
 355       if hasattr(eland_result, 'genome_map'):
 356           report.append("Mapped Reads")
 357           mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads)
 358           for name, counts in mapped_reads.items():
 359             report.append("  %s: %d" % (name, counts))
 360
 361       report.append('')
 362     return report
 363
 364 def summary_report(runs):
 365     """
 366     Summarize cluster numbers and mapped read counts for a runfolder
 367     """
 368     report = []
 369     for run in runs:
 370         # print a run name?
 371         report.append('Summary for %s' % (run.name,))
 372         # sort the report
 373         eland_keys = run.gerald.eland_results.results[0].keys()
 374         eland_keys.sort(alphanum)
 375
 376         for lane_id in eland_keys:
 377             report.extend(summarize_lane(run.gerald, lane_id))
 378             report.append('---')
 379             report.append('')
 380         return os.linesep.join(report)
 381
 382 def is_compressed(filename):
 383     if os.path.splitext(filename)[1] == ".gz":
 384         return True
 385     elif os.path.splitext(filename)[1] == '.bz2':
 386         return True
 387     else:
 388         return False
 389
 390 def save_flowcell_reports(data_dir, cycle_dir):
 391     """
 392     Save the flowcell quality reports
 393     """
 394     data_dir = os.path.abspath(data_dir)
 395     status_file = os.path.join(data_dir, 'Status.xml')
 396     reports_dir = os.path.join(data_dir, 'reports')
 397     reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
 398     if os.path.exists(reports_dir):
 399         cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
 400         if os.path.exists(status_file):
 401             cmd_list.extend(['Status.xml', 'Status.xsl'])
 402         LOGGER.info("Saving reports from " + reports_dir)
 403         cwd = os.getcwd()
 404         os.chdir(data_dir)
 405         q = QueueCommands([" ".join(cmd_list)])
 406         q.run()
 407         os.chdir(cwd)
 408
 409
 410 def save_summary_file(gerald_object, cycle_dir):
 411     # Copy Summary.htm
 412     summary_path = os.path.join(gerald_object.pathname, 'Summary.htm')
 413     if os.path.exists(summary_path):
 414         LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir))
 415         shutil.copy(summary_path, cycle_dir)
 416     else:
 417         LOGGER.info('Summary file %s was not found' % (summary_path,))
 418
 419 def save_ivc_plot(bustard_object, cycle_dir):
 420     """
 421     Save the IVC page and its supporting images
 422     """
 423     plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
 424     plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
 425     plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')
 426
 427     plot_target_path = os.path.join(cycle_dir, 'Plots')
 428
 429     if os.path.exists(plot_html):
 430         LOGGER.debug("Saving %s" % (plot_html,))
 431         LOGGER.debug("Saving %s" % (plot_images,))
 432         shutil.copy(plot_html, cycle_dir)
 433         if not os.path.exists(plot_target_path):
 434             os.mkdir(plot_target_path)
 435         for plot_file in glob(plot_images):
 436             shutil.copy(plot_file, plot_target_path)
 437     else:
 438         LOGGER.warning('Missing IVC.html file, not archiving')
 439
 440
 441 def compress_score_files(bustard_object, cycle_dir):
 442     """
 443     Compress score files into our result directory
 444     """
 445     # check for g.pathname/Temp a new feature of 1.1rc1
 446     scores_path = bustard_object.pathname
 447     scores_path_temp = os.path.join(scores_path, 'Temp')
 448     if os.path.isdir(scores_path_temp):
 449         scores_path = scores_path_temp
 450
 451     # hopefully we have a directory that contains s_*_score files
 452     score_files = []
 453     for f in os.listdir(scores_path):
 454         if re.match('.*_score.txt', f):
 455             score_files.append(f)
 456
 457     tar_cmd = ['tar', 'c'] + score_files
 458     bzip_cmd = [ 'bzip2', '-9', '-c' ]
 459     tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
 460     tar_dest = open(tar_dest_name, 'w')
 461     LOGGER.info("Compressing score files from %s" % (scores_path,))
 462     LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
 463     LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
 464     LOGGER.info("Writing to %s" % (tar_dest_name,))
 465
 466     env = {'BZIP': '-9'}
 467     tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
 468                            cwd=scores_path)
 469     bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
 470     tar.wait()
 471
 472
 473 def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
 474     """
 475     Compress eland result files into the archive directory
 476     """
 477     # copy & bzip eland files
 478     bz_commands = []
 479
 480     for lanes_dictionary in gerald_object.eland_results.results:
 481         for eland_lane in lanes_dictionary.values():
 482             source_name = eland_lane.pathname
 483             if source_name is None:
 484               LOGGER.info(
 485                 "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
 486             else:
 487               path, name = os.path.split(source_name)
 488               dest_name = os.path.join(cycle_dir, name)
 489               LOGGER.info("Saving eland file %s to %s" % \
 490                          (source_name, dest_name))
 491
 492               if is_compressed(name):
 493                 LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
 494                 shutil.copy(source_name, dest_name)
 495               else:
 496                 # not compressed
 497                 dest_name += '.bz2'
 498                 args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
 499                 bz_commands.append(" ".join(args))
 500                 #LOGGER.info('Running: %s' % ( " ".join(args) ))
 501                 #bzip_dest = open(dest_name, 'w')
 502                 #bzip = subprocess.Popen(args, stdout=bzip_dest)
 503                 #LOGGER.info('Saving to %s' % (dest_name, ))
 504                 #bzip.wait()
 505
 506     if len(bz_commands) > 0:
 507       q = QueueCommands(bz_commands, num_jobs)
 508       q.run()
 509
 510
 511 def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
 512     """
 513     Iterate over runfolders in runs extracting the most useful information.
 514       * run parameters (in run-*.xml)
 515       * eland_result files
 516       * score files
 517       * Summary.htm
 518       * srf files (raw sequence & qualities)
 519     """
 520     if output_base_dir is None:
 521         output_base_dir = os.getcwd()
 522
 523     for r in runs:
 524       result_dir = os.path.join(output_base_dir, r.flowcell_id)
 525       LOGGER.info("Using %s as result directory" % (result_dir,))
 526       if not os.path.exists(result_dir):
 527         os.mkdir(result_dir)
 528
 529       # create cycle_dir
 530       cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
 531       LOGGER.info("Filling in %s" % (cycle,))
 532       cycle_dir = os.path.join(result_dir, cycle)
 533       cycle_dir = os.path.abspath(cycle_dir)
 534       if os.path.exists(cycle_dir):
 535         LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
 536         continue
 537       else:
 538         os.mkdir(cycle_dir)
 539
 540       # save run file
 541       r.save(cycle_dir)
 542
 543       # save illumina flowcell status report
 544       save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)
 545
 546       # save stuff from bustard
 547       # grab IVC plot
 548       save_ivc_plot(r.bustard, cycle_dir)
 549
 550       # build base call saving commands
 551       if site is not None:
 552         lanes = []
 553         for lane in range(1, 9):
 554           if r.gerald.lanes[lane].analysis != 'none':
 555             lanes.append(lane)
 556
 557         run_name = srf.pathname_to_run_name(r.pathname)
 558         seq_cmds = []
 559         if raw_format == 'fastq':
 560             srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, site, cycle_dir)
 561         elif raw_format == 'qseq':
 562             seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
 563         elif raw_format == 'srf':
 564             seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
 565         else:
 566             raise ValueError('Unknown --raw-format=%s' % (raw_format))
 567         srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
 568
 569       # save stuff from GERALD
 570       # copy stuff out of the main run
 571       g = r.gerald
 572
 573       # save summary file
 574       save_summary_file(g, cycle_dir)
 575
 576       # compress eland result files
 577       compress_eland_results(g, cycle_dir, num_jobs)
 578
 579       # md5 all the compressed files once we're done
 580       md5_commands = srf.make_md5_commands(cycle_dir)
 581       srf.run_commands(cycle_dir, md5_commands, num_jobs)
 582
 583 def rm_list(files, dry_run=True):
 584     for f in files:
 585         if os.path.exists(f):
 586             LOGGER.info('deleting %s' % (f,))
 587             if not dry_run:
 588                 if os.path.isdir(f):
 589                     shutil.rmtree(f)
 590                 else:
 591                     os.unlink(f)
 592         else:
 593             LOGGER.warn("%s doesn't exist." % (f,))
 594
 595 def clean_runs(runs, dry_run=True):
 596     """
 597     Clean up run folders to optimize for compression.
 598     """
 599     if dry_run:
 600         LOGGER.info('In dry-run mode')
 601
 602     for run in runs:
 603         LOGGER.info('Cleaninging %s' % (run.pathname,))
 604         # rm RunLog*.xml
 605         runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
 606         rm_list(runlogs, dry_run)
 607         # rm pipeline_*.txt
 608         pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
 609         rm_list(pipeline_logs, dry_run)
 610         # rm gclog.txt?
 611         # rm NetCopy.log? Isn't this robocopy?
 612         logs = glob(os.path.join(run.pathname, '*.log'))
 613         rm_list(logs, dry_run)
 614         # rm nfn.log?
 615         # Calibration
 616         calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
 617         rm_list(calibration_dir, dry_run)
 618         # rm Images/L*
 619         LOGGER.info("Cleaning images")
 620         image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
 621         rm_list(image_dirs, dry_run)
 622         # rm ReadPrep
 623         LOGGER.info("Cleaning ReadPrep*")
 624         read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
 625         rm_list(read_prep_dirs, dry_run)
 626         # rm ReadPrep
 627         LOGGER.info("Cleaning Thubmnail_images")
 628         thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
 629         rm_list(thumbnail_dirs, dry_run)
 630
 631         # make clean_intermediate
 632         logging.info("Cleaning intermediate files")
 633         if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
 634             clean_process = subprocess.Popen(['make', 'clean_intermediate'],
 635                                              cwd=run.image_analysis.pathname,)
 636             clean_process.wait()
 637
 638
 639