Minimal changes needed to get raw data archived for loxcyc.
[htsworkflow.git] / htsworkflow / pipelines / runfolder.py
1 """
2 Core information needed to inspect a runfolder.
3 """
4 from glob import glob
5 import logging
6 import os
7 import re
8 import shutil
9 import stat
10 import subprocess
11 import sys
12 import tarfile
13 import time
14
15 try:
16     from xml.etree import ElementTree
17 except ImportError, e:
18     from elementtree import ElementTree
19
LOGGER = logging.getLogger(__name__)

# Date format used in European-style runfolder names (DD-MM-YYYY).
EUROPEAN_STRPTIME = "%d-%m-%Y"
# Raw strings keep the regex backslashes literal and avoid the
# invalid-escape-sequence warning for '\.' in newer Pythons.
EUROPEAN_DATE_RE = r"([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
VERSION_RE = r"([0-9\.]+)"
USER_RE = r"([a-zA-Z0-9]+)"
LANES_PER_FLOWCELL = 8
LANE_LIST = range(1, LANES_PER_FLOWCELL + 1)
28
29 from htsworkflow.util.alphanum import alphanum
30 from htsworkflow.util.ethelp import indent, flatten
31 from htsworkflow.util.queuecommands import QueueCommands
32
33 from htsworkflow.pipelines import srf
34
class PipelineRun(object):
    """
    Capture "interesting" information about a pipeline run
    """
    # Version of the XML layout produced by get_elements().
    XML_VERSION = 1
    # Tag names used when (de)serializing to XML.
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, flowcell_id=None, xml=None):
        """Describe one pipeline run.

        pathname -- runfolder location (normalized); may be None when the
                    state is going to be restored from xml
        flowcell_id -- known flowcell id; if None it is discovered lazily
                       by the flowcell_id property
        xml -- previously saved ElementTree node to restore state from
        """
        if pathname is not None:
          self.pathname = os.path.normpath(pathname)
        else:
          self.pathname = None
        self._name = None
        self._flowcell_id = flowcell_id
        # The following components are filled in by the scanning code in
        # get_runs() / get_specific_run(), or by set_elements().
        self.datadir = None
        self.image_analysis = None
        self.bustard = None
        self.gerald = None

        if xml is not None:
          self.set_elements(xml)

    def _get_flowcell_id(self):
        """Return the flowcell id, reading Config/FlowcellId.xml if needed.

        Falls back to guessing from the last '_'-separated field of the
        runfolder pathname.
        """
        # extract flowcell ID
        if self._flowcell_id is None:
            config_dir = os.path.join(self.pathname, 'Config')
            flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
            if os.path.exists(flowcell_id_path):
                flowcell_id_tree = ElementTree.parse(flowcell_id_path)
                self._flowcell_id = flowcell_id_tree.findtext('Text')
            else:
                path_fields = self.pathname.split('_')
                if len(path_fields) > 0:
                    # guessing last element of filename
                   self._flowcell_id = path_fields[-1]
                else:
                   self._flowcell_id = 'unknown'

                   # NOTE(review): as indented, this warning only fires in
                   # the 'unknown' branch; the path_fields guess above is
                   # silent — confirm whether that is intended.
                   LOGGER.warning(
                       "Flowcell id was not found, guessing %s" % (
                       self._flowcell_id))

        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def _get_runfolder_name(self):
        # Delegate to gerald, which knows the runfolder it was built from.
        if self.gerald is None:
            return None
        else:
            return self.gerald.runfolder_name
    runfolder_name = property(_get_runfolder_name)

    def get_elements(self):
        """
        make one master xml file from all of our sub-components.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        # image_analysis, bustard and gerald must all be set before calling.
        root.append(self.image_analysis.get_elements())
        root.append(self.bustard.get_elements())
        root.append(self.gerald.get_elements())
        return root

    def set_elements(self, tree):
        """Restore this run's components from a get_elements() tree.

        Raises ValueError when the root tag is not PipelineRun.
        """
        # this file gets imported by all the others,
        # so we need to hide the imports to avoid a cyclic imports
        from htsworkflow.pipelines import firecrest
        from htsworkflow.pipelines import ipar
        from htsworkflow.pipelines import bustard
        from htsworkflow.pipelines import gerald

        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
          raise ValueError('Pipeline Run Expecting %s got %s' % (
              PipelineRun.PIPELINE_RUN, tag))
        for element in tree:
          tag = element.tag.lower()
          if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
          #ok the xword.Xword.XWORD pattern for module.class.constant is lame
          # you should only have Firecrest or IPAR, never both of them.
          elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.image_analysis = firecrest.Firecrest(xml=element)
          elif tag == ipar.IPAR.IPAR.lower():
            self.image_analysis = ipar.IPAR(xml=element)
          elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
          elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
          else:
            LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))

    def _get_run_name(self):
        """
        Given a run tuple, find the latest date and use that as our name
        """
        if self._name is None:
          # The newest of the three analysis-step timestamps names the run.
          tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
          self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        return self._name
    name = property(_get_run_name)

    def save(self, destdir=None):
        """Write the combined run xml to destdir (default: current dir)."""
        if destdir is None:
            destdir = ''
        LOGGER.info("Saving run report " + self.name)
        xml = self.get_elements()
        indent(xml)
        dest_pathname = os.path.join(destdir, self.name)
        ElementTree.ElementTree(xml).write(dest_pathname)

    def load(self, filename):
        """Replace this run's state with the contents of a run xml file."""
        LOGGER.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
153
def load_pipeline_run_xml(pathname):
    """
    Load and instantiate a Pipeline run from a run xml file

    :Parameters:
      - `pathname` : location of an run xml file

    :Returns: initialized PipelineRun object
    """
    root = ElementTree.parse(pathname).getroot()
    return PipelineRun(xml=root)
166
def get_runs(runfolder, flowcell_id=None):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in there gerald component.
    """
    # hidden imports to avoid cyclic imports with the pipeline modules
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    def build_run(runs, datadir, image_analysis, bustard_run, gerald_pathname):
        # Construct one PipelineRun for a gerald directory; incomplete
        # gerald runs raise IOError and are logged and skipped.
        try:
            g = gerald.gerald(gerald_pathname)
            p = PipelineRun(runfolder, flowcell_id)
            p.datadir = datadir
            p.image_analysis = image_analysis
            p.bustard = bustard_run
            p.gerald = g
            runs.append(p)
        except IOError as e:
            LOGGER.error("Ignoring " + str(e))

    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
        # Look under an image-analysis directory for bustard (or RTA
        # BaseCalls) directories and their GERALD / Aligned children.
        LOGGER.info("Looking for bustard directories in %s" % (pathname,))
        bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
        # RTA BaseCalls looks enough like Bustard.
        bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
        for bustard_pathname in bustard_dirs:
            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
            b = bustard.bustard(bustard_pathname)
            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
            LOGGER.info("Looking for gerald directories in %s" % (pathname,))
            for gerald_pathname in glob(gerald_glob):
                LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
                build_run(runs, datadir, image_analysis, b, gerald_pathname)

            aligned_glob = os.path.join(runfolder, 'Aligned*')
            for aligned in glob(aligned_glob):
                LOGGER.info("Found aligned directory %s" % (aligned,))
                build_run(runs, datadir, image_analysis, b, aligned)

    datadir = os.path.join(runfolder, 'Data')

    LOGGER.info('Searching for runs in ' + datadir)
    runs = []
    # scan for firecrest directories
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        LOGGER.info('Found firecrest in ' + datadir)
        image_analysis = firecrest.firecrest(firecrest_pathname)
        if image_analysis is None:
            LOGGER.warn(
                "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
            )
        else:
            # BUG FIX: datadir was previously missing from this call,
            # shifting every following argument by one position.
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, firecrest_pathname
            )
    # scan for IPAR directories
    ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
    # The Intensities directory from the RTA software looks a lot like IPAR
    ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
    for ipar_pathname in ipar_dirs:
        LOGGER.info('Found ipar directories in ' + datadir)
        image_analysis = ipar.ipar(ipar_pathname)
        if image_analysis is None:
            LOGGER.warn(
                "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
            )
        else:
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, ipar_pathname
            )

    return runs
251
def get_specific_run(gerald_dir):
    """
    Given a gerald directory, construct a PipelineRun out of its parents

    Basically this allows specifying a particular run instead of the previous
    get_runs which scans a runfolder for various combinations of
    firecrest/ipar/bustard/gerald runs.

    Returns None (after logging an error) when any of the three component
    directories cannot be recognized.
    """
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    gerald_dir = os.path.expanduser(gerald_dir)
    # runfolder/Data/<image>/<bustard>/<gerald> is the expected layout
    bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
    image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))

    runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))

    LOGGER.info('--- use-run detected options ---')
    LOGGER.info('runfolder: %s' % (runfolder_dir,))
    LOGGER.info('image_dir: %s' % (image_dir,))
    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
    LOGGER.info('gerald_dir: %s' % (gerald_dir,))

    # find our processed image dir
    image_run = None
    # split into parent, and leaf directory
    # leaf directory should be an IPAR or firecrest directory
    data_dir, short_image_dir = os.path.split(image_dir)
    LOGGER.info('data_dir: %s' % (data_dir,))
    # BUG FIX: the label used to read 'short_iamge_dir'
    LOGGER.info('short_image_dir: %s' % (short_image_dir,))

    # guess which type of image processing directory we have by looking
    # in the leaf directory name
    if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
        image_run = firecrest.firecrest(image_dir)
    elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)
    elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)

    # if we didn't find a run, report the error and return
    if image_run is None:
        msg = '%s does not contain an image processing step' % (image_dir,)
        LOGGER.error(msg)
        return None

    # find our base calling
    base_calling_run = bustard.bustard(bustard_dir)
    if base_calling_run is None:
        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
        return None

    # find alignments
    gerald_run = gerald.gerald(gerald_dir)
    if gerald_run is None:
        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
        return None

    p = PipelineRun(runfolder_dir)
    p.image_analysis = image_run
    p.bustard = base_calling_run
    p.gerald = gerald_run

    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
    return p
319
def extract_run_parameters(runs):
    """
    Write out the run parameter xml file for every run in *runs*.

    Each PipelineRun persists its own run-*.xml via save().
    """
    for pipeline_run in runs:
        pipeline_run.save()
326
def summarize_mapped_reads(genome_map, mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.

    Keys of mapped_reads with a directory component that is *not* listed
    in genome_map are counted as hits to the (single) reference genome;
    everything else is reported per leaf name.
    """
    summarized_reads = {}
    genome_reads = 0
    genome = 'unknown'
    for k, v in mapped_reads.items():
        path, k = os.path.split(k)
        # BUG FIX: dict.has_key() was removed in Python 3; use 'in'.
        if len(path) > 0 and path not in genome_map:
            genome = path
            genome_reads += v
        else:
            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
    summarized_reads[genome] = genome_reads
    return summarized_reads
344
def summarize_lane(gerald, lane_id):
    """Return a list of report lines for one lane of a gerald run.

    For each sequencing end: sample name, cluster counts (when the
    summary covers the lane), total reads, eland match-code breakdown
    and per-genome mapped read counts, followed by a blank line.
    """
    report = []
    summary_results = gerald.summary.lane_results
    for end in range(len(summary_results)):
      eland_result = gerald.eland_results.results[end][lane_id]
      report.append("Sample name %s" % (eland_result.sample_name))
      report.append("Lane id %s end %s" % (eland_result.lane_id, end))
      # BUG FIX: dict.has_key() was removed in Python 3; use 'in'.
      if end < len(summary_results) and eland_result.lane_id in summary_results[end]:
          cluster = summary_results[end][eland_result.lane_id].cluster
          report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
      report.append("Total Reads: %d" % (eland_result.reads))

      if hasattr(eland_result, 'match_codes'):
          mc = eland_result.match_codes
          nm = mc['NM']
          nm_percent = float(nm) / eland_result.reads * 100
          qc = mc['QC']
          qc_percent = float(qc) / eland_result.reads * 100

          report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
          report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
          report.append('Unique (0,1,2 mismatches) %d %d %d' % \
                        (mc['U0'], mc['U1'], mc['U2']))
          report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                        (mc['R0'], mc['R1'], mc['R2']))

      if hasattr(eland_result, 'genome_map'):
          report.append("Mapped Reads")
          mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads)
          for name, counts in mapped_reads.items():
            report.append("  %s: %d" % (name, counts))

      report.append('')
    return report
379
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    Returns a single string with one summary section per run; an empty
    runs list yields an empty string.
    """
    report = []
    for run in runs:
        # print a run name?
        report.append('Summary for %s' % (run.name,))
        # sort the report
        eland_keys = run.gerald.eland_results.results[0].keys()
        eland_keys.sort(alphanum)

        for lane_id in eland_keys:
            report.extend(summarize_lane(run.gerald, lane_id))
            report.append('---')
            report.append('')
    # BUG FIX: this return used to sit inside the for loop, so only the
    # first run was ever summarized (and [] returned None).
    return os.linesep.join(report)
397
def is_compressed(filename):
    """Return True when filename carries a known compression suffix."""
    extension = os.path.splitext(filename)[1]
    return extension in ('.gz', '.bz2')
405
def save_flowcell_reports(data_dir, cycle_dir):
    """
    Save the flowcell quality reports

    Tars data_dir/reports (plus Status.xml/Status.xsl when present) into
    cycle_dir/flowcell-reports.tar.bz2. Does nothing if reports/ is absent.
    """
    data_dir = os.path.abspath(data_dir)
    status_file = os.path.join(data_dir, 'Status.xml')
    reports_dir = os.path.join(data_dir, 'reports')
    reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
    if os.path.exists(reports_dir):
        cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
        if os.path.exists(status_file):
            cmd_list.extend(['Status.xml', 'Status.xsl'])
        LOGGER.info("Saving reports from " + reports_dir)
        cwd = os.getcwd()
        os.chdir(data_dir)
        try:
            q = QueueCommands([" ".join(cmd_list)])
            q.run()
        finally:
            # BUG FIX: restore the working directory even when the tar
            # command queue raises; previously a failure left the process
            # chdir'd into data_dir.
            os.chdir(cwd)
424
425
def save_summary_file(pipeline, cycle_dir):
    """Copy the run's Summary.htm into cycle_dir.

    Prefers the gerald directory's Summary.htm and falls back to the
    Status_Files copy; logs when neither exists.
    """
    # Copy Summary.htm
    gerald_object = pipeline.gerald
    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
    if os.path.exists(gerald_summary):
        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
        shutil.copy(gerald_summary, cycle_dir)
    elif os.path.exists(status_files_summary):
        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
        shutil.copy(status_files_summary, cycle_dir)
    else:
        # BUG FIX: this branch referenced an undefined name summary_path,
        # raising NameError whenever neither summary file existed.
        LOGGER.info('Summary file %s was not found' % (gerald_summary,))
439
def save_ivc_plot(bustard_object, cycle_dir):
    """
    Save the IVC page and its supporting images

    Copies IVC.htm into cycle_dir and the matching per-lane png plots
    into cycle_dir/Plots; warns and does nothing when IVC.htm is absent.
    """
    plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
    plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
    # glob pattern for the per-lane plot images
    plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')

    plot_target_path = os.path.join(cycle_dir, 'Plots')

    if os.path.exists(plot_html):
        LOGGER.debug("Saving %s" % (plot_html,))
        LOGGER.debug("Saving %s" % (plot_images,))
        shutil.copy(plot_html, cycle_dir)
        if not os.path.exists(plot_target_path):
            os.mkdir(plot_target_path)
        for plot_file in glob(plot_images):
            shutil.copy(plot_file, plot_target_path)
    else:
        # BUG FIX: the file checked above is IVC.htm; the old message
        # misleadingly said IVC.html.
        LOGGER.warning('Missing IVC.htm file, not archiving')
460
461
def compress_score_files(bustard_object, cycle_dir):
    """
    Compress score files into our result directory

    Pipes 'tar c <score files>' through bzip2 into cycle_dir/scores.tar.bz2.
    """
    # check for g.pathname/Temp a new feature of 1.1rc1
    scores_path = bustard_object.pathname
    scores_path_temp = os.path.join(scores_path, 'Temp')
    if os.path.isdir(scores_path_temp):
        scores_path = scores_path_temp

    # hopefully we have a directory that contains s_*_score files
    score_files = []
    for f in os.listdir(scores_path):
        if re.match('.*_score.txt', f):
            score_files.append(f)

    tar_cmd = ['tar', 'c'] + score_files
    bzip_cmd = [ 'bzip2', '-9', '-c' ]
    tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
    # BUG FIX: open in binary mode; the archive contents are not text.
    tar_dest = open(tar_dest_name, 'wb')
    LOGGER.info("Compressing score files from %s" % (scores_path,))
    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
    LOGGER.info("Writing to %s" % (tar_dest_name,))

    env = {'BZIP': '-9'}
    tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
                           cwd=scores_path)
    bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
    # Close our copy of the pipe so tar receives SIGPIPE if bzip2 exits early.
    tar.stdout.close()
    tar.wait()
    # BUG FIX: wait for bzip2 and close the destination file; previously
    # the function could return before the archive was fully written.
    bzip.wait()
    tar_dest.close()
492
493
def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
    """
    Compress eland result files into the archive directory

    Already-compressed results are copied as-is; everything else is
    queued as a bzip2 shell command and run with num_jobs workers.
    """
    # one bzip2 shell command per uncompressed result file
    bz_commands = []

    for lanes_dictionary in gerald_object.eland_results.results:
        for eland_lane in lanes_dictionary.values():
            source_name = eland_lane.pathname
            if source_name is None:
              LOGGER.info(
                "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
              continue
            path, name = os.path.split(source_name)
            dest_name = os.path.join(cycle_dir, name)
            LOGGER.info("Saving eland file %s to %s" % \
                       (source_name, dest_name))

            if is_compressed(name):
              LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
              shutil.copy(source_name, dest_name)
            else:
              # not compressed yet; compress via a queued shell command
              dest_name += '.bz2'
              command = ['bzip2', '-9', '-c', source_name, '>', dest_name]
              bz_commands.append(" ".join(command))

    if len(bz_commands) > 0:
      q = QueueCommands(bz_commands, num_jobs)
      q.run()
530
531
def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
    """
    Iterate over runfolders in runs extracting the most useful information.
      * run parameters (in run-*.xml)
      * eland_result files
      * score files
      * Summary.htm
      * srf files (raw sequence & qualities)

    runs -- list of PipelineRun objects to archive
    output_base_dir -- where to create one result directory per flowcell
                       (defaults to the current working directory)
    site -- site label passed to the srf helpers; None skips the raw
            base-call archiving step entirely
    num_jobs -- parallelism for the srf and md5 command queues
    raw_format -- 'qseq', 'srf' or 'fastq'; anything else raises ValueError

    Existing cycle directories are left untouched (the run is skipped).
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    for r in runs:
      result_dir = os.path.join(output_base_dir, r.flowcell_id)
      LOGGER.info("Using %s as result directory" % (result_dir,))
      if not os.path.exists(result_dir):
        os.mkdir(result_dir)

      # create cycle_dir named after the image-analysis cycle range
      cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
      LOGGER.info("Filling in %s" % (cycle,))
      cycle_dir = os.path.join(result_dir, cycle)
      cycle_dir = os.path.abspath(cycle_dir)
      if os.path.exists(cycle_dir):
        # never overwrite a previously extracted cycle
        LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
        continue
      else:
        os.mkdir(cycle_dir)

      # save run file
      r.save(cycle_dir)

      # save illumina flowcell status report (lives next to the image dir)
      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)

      # save stuff from bustard
      # grab IVC plot
      save_ivc_plot(r.bustard, cycle_dir)

      # build base call saving commands
      if site is not None:
        # collect the lanes that were actually analyzed
        lanes = []
        for lane in range(1, 9):
          lane_parameters = r.gerald.lanes.get(lane, None)
          if lane_parameters is not None and lane_parameters.analysis != 'none':
            lanes.append(lane)

        run_name = srf.pathname_to_run_name(r.pathname)
        seq_cmds = []
        LOGGER.info("Raw Format is: %s" % (raw_format, ))
        if raw_format == 'fastq':
            # fastq copies files directly instead of queuing commands
            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
            LOGGER.info("raw data = %s" % (rawpath,))
            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
        elif raw_format == 'qseq':
            seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
        elif raw_format == 'srf':
            seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
        else:
            raise ValueError('Unknown --raw-format=%s' % (raw_format))
        srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)

      # save stuff from GERALD
      # copy stuff out of the main run
      g = r.gerald

      # save summary file
      save_summary_file(r, cycle_dir)

      # compress eland result files
      compress_eland_results(g, cycle_dir, num_jobs)

      # md5 all the compressed files once we're done
      md5_commands = srf.make_md5_commands(cycle_dir)
      srf.run_commands(cycle_dir, md5_commands, num_jobs)
607
def rm_list(files, dry_run=True):
    """Delete every path in files, logging each action.

    Directories are removed recursively. With dry_run=True the deletions
    are only logged, never performed.
    """
    for target in files:
        if not os.path.exists(target):
            LOGGER.warn("%s doesn't exist." % (target,))
            continue
        LOGGER.info('deleting %s' % (target,))
        if dry_run:
            continue
        if os.path.isdir(target):
            shutil.rmtree(target)
        else:
            os.unlink(target)
619
def clean_runs(runs, dry_run=True):
    """
    Clean up run folders to optimize for compression.

    Removes logs, calibration data, images, thumbnails and other
    intermediate files from each run's pathname. With dry_run=True the
    deletions are only logged.
    """
    if dry_run:
        LOGGER.info('In dry-run mode')

    for run in runs:
        # BUG FIX: message used to say 'Cleaninging'
        LOGGER.info('Cleaning %s' % (run.pathname,))
        # rm RunLog*.xml
        runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
        rm_list(runlogs, dry_run)
        # rm pipeline_*.txt
        pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
        rm_list(pipeline_logs, dry_run)
        # rm gclog.txt?
        # rm NetCopy.log? Isn't this robocopy?
        logs = glob(os.path.join(run.pathname, '*.log'))
        rm_list(logs, dry_run)
        # rm nfn.log?
        # Calibration
        calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
        rm_list(calibration_dir, dry_run)
        # rm Images/L*
        LOGGER.info("Cleaning images")
        image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
        rm_list(image_dirs, dry_run)
        # rm ReadPrep
        LOGGER.info("Cleaning ReadPrep*")
        read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
        rm_list(read_prep_dirs, dry_run)
        # rm Thumbnail_Images
        # BUG FIX: message used to say 'Thubmnail_images'
        LOGGER.info("Cleaning Thumbnail_Images")
        thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
        rm_list(thumbnail_dirs, dry_run)

        # make clean_intermediate
        # BUG FIX: use the module LOGGER (was the root 'logging' logger)
        LOGGER.info("Cleaning intermediate files")
        if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
            clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                             cwd=run.image_analysis.pathname,)
            clean_process.wait()
662
663
664