Clean up flowcell ID detection and add support for reading HiSeq IDs
[htsworkflow.git] / htsworkflow / pipelines / runfolder.py
1 """
2 Core information needed to inspect a runfolder.
3 """
4 from glob import glob
5 import logging
6 import os
7 import re
8 import shutil
9 import stat
10 import subprocess
11 import sys
12 import tarfile
13 import time
14
15 import lxml.etree as ElementTree
16
17 LOGGER = logging.getLogger(__name__)
18
EUROPEAN_STRPTIME = "%d-%m-%Y"
# raw strings avoid invalid escape-sequence warnings on newer pythons
EUROPEAN_DATE_RE = r"([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
VERSION_RE = r"([0-9\.]+)"
USER_RE = r"([a-zA-Z0-9]+)"
LANES_PER_FLOWCELL = 8
LANE_LIST = range(1, LANES_PER_FLOWCELL + 1)
25
26 from htsworkflow.util.alphanum import alphanum
27 from htsworkflow.util.ethelp import indent, flatten
28 from htsworkflow.util.queuecommands import QueueCommands
29
30 from htsworkflow.pipelines import srf
31
class PipelineRun(object):
    """
    Capture "interesting" information about a pipeline run
    """
    # bump this if the saved XML layout changes
    XML_VERSION = 1
    # XML tag names used by get_elements/set_elements
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, flowcell_id=None, xml=None):
        """Initialize a run, optionally restoring state from saved XML.

        :Parameters:
          - `pathname`: runfolder path (normalized when provided)
          - `flowcell_id`: known flowcell id; otherwise detected lazily
          - `xml`: previously saved element tree to restore state from
        """
        if pathname is not None:
          self.pathname = os.path.normpath(pathname)
        else:
          self.pathname = None
        self._name = None                # cached run name, built on demand
        self._flowcell_id = flowcell_id  # cached flowcell id, detected lazily
        self.datadir = None
        self.image_analysis = None       # firecrest or ipar component
        self.bustard = None              # base-calling component
        self.gerald = None               # alignment component

        if xml is not None:
          self.set_elements(xml)

    def _get_flowcell_id(self):
        """Detect the flowcell id, trying several sources in order.

        Order: RunInfo.xml (HiSeq style), Config/FlowcellId.xml, then a
        guess from the runfolder path; falls back to 'unknown'.
        """
        # extract flowcell ID
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_runinfo()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_flowcellid()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_path()
        if self._flowcell_id is None:
            self._flowcell_id = 'unknown'

            LOGGER.warning(
                "Flowcell id was not found, guessing %s" % (
                    self._flowcell_id))

        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def _get_flowcell_id_from_flowcellid(self):
        """Extract flowcell id from a Config/FlowcellId.xml file

        Returns None when the file does not exist.
        """
        config_dir = os.path.join(self.pathname, 'Config')
        flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
        if os.path.exists(flowcell_id_path):
            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
            return flowcell_id_tree.findtext('Text')

    def _get_flowcell_id_from_runinfo(self):
        """Read RunInfo file for flowcell id

        Returns None when RunInfo.xml is missing or does not contain
        exactly one /RunInfo/Run/Flowcell node.
        """
        runinfo = os.path.join(self.pathname, 'RunInfo.xml')
        if os.path.exists(runinfo):
            tree = ElementTree.parse(runinfo)
            root = tree.getroot()
            # xpath() is lxml-specific; stdlib ElementTree lacks it
            fc_nodes = root.xpath('/RunInfo/Run/Flowcell')
            if len(fc_nodes) == 1:
                return fc_nodes[0].text


    def _get_flowcell_id_from_path(self):
        """Guess a flowcell name from the path

        NOTE(review): splits the full pathname on '_', so a path with no
        underscores returns the whole path as the guess.
        """
        path_fields = self.pathname.split('_')
        if len(path_fields) > 0:
            # guessing last element of filename
            return path_fields[-1]

    def _get_runfolder_name(self):
        # runfolder name is delegated to the gerald component when present
        if self.gerald is None:
            return None
        else:
            return self.gerald.runfolder_name
    runfolder_name = property(_get_runfolder_name)

    def get_elements(self):
        """
        make one master xml file from all of our sub-components.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.image_analysis.get_elements())
        root.append(self.bustard.get_elements())
        root.append(self.gerald.get_elements())
        return root

    def set_elements(self, tree):
        """Restore component state from a previously saved element tree.

        :Raises: ValueError when the root tag is not PipelineRun
        """
        # this file gets imported by all the others,
        # so we need to hide the imports to avoid a cyclic imports
        from htsworkflow.pipelines import firecrest
        from htsworkflow.pipelines import ipar
        from htsworkflow.pipelines import bustard
        from htsworkflow.pipelines import gerald

        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
          raise ValueError('Pipeline Run Expecting %s got %s' % (
              PipelineRun.PIPELINE_RUN, tag))
        for element in tree:
          tag = element.tag.lower()
          if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
          #ok the xword.Xword.XWORD pattern for module.class.constant is lame
          # you should only have Firecrest or IPAR, never both of them.
          elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.image_analysis = firecrest.Firecrest(xml=element)
          elif tag == ipar.IPAR.IPAR.lower():
            self.image_analysis = ipar.IPAR(xml=element)
          elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
          elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
          elif tag == gerald.CASAVA.GERALD.lower():
            self.gerald = gerald.CASAVA(xml=element)
          else:
            LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))

    def _get_run_name(self):
        """
        Given a run tuple, find the latest date and use that as our name
        """
        if self._name is None:
          # latest timestamp among the three analysis components
          tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
          self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        return self._name
    name = property(_get_run_name)

    def save(self, destdir=None):
        """Write the combined run XML into destdir (cwd by default)."""
        if destdir is None:
            destdir = ''
        LOGGER.info("Saving run report " + self.name)
        xml = self.get_elements()
        indent(xml)
        dest_pathname = os.path.join(destdir, self.name)
        ElementTree.ElementTree(xml).write(dest_pathname)

    def load(self, filename):
        """Replace this run's state with the contents of a saved XML file."""
        LOGGER.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
176
def load_pipeline_run_xml(pathname):
    """
    Load and instantiate a Pipeline run from a run xml file

    :Parameters:
      - `pathname` : location of an run xml file

    :Returns: initialized PipelineRun object
    """
    return PipelineRun(xml=ElementTree.parse(pathname).getroot())
189
def get_runs(runfolder, flowcell_id=None):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.

    :Parameters:
      - `runfolder`: path of the runfolder to scan
      - `flowcell_id`: optional known flowcell id passed to each PipelineRun

    :Returns: list of PipelineRun objects
    """
    # hide the imports to avoid cyclic imports
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    def build_run(runs, b, image_analysis, gerald_pathname, datadir):
        # assemble one PipelineRun from already-discovered components,
        # logging and skipping unreadable gerald directories
        try:
            g = gerald.gerald(gerald_pathname)
            p = PipelineRun(runfolder, flowcell_id)
            p.datadir = datadir
            p.image_analysis = image_analysis
            p.bustard = b
            p.gerald = g
            runs.append(p)
        except IOError as e:
            LOGGER.error("Ignoring " + str(e))

    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
        LOGGER.info("Looking for bustard directories in %s" % (pathname,))
        bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
        # RTA BaseCalls looks enough like Bustard.
        bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
        for bustard_pathname in bustard_dirs:
            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
            b = bustard.bustard(bustard_pathname)
            build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)
            build_aligned_runs(image_analysis, runs, b, datadir, runfolder)

    def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
        # GERALD directories live underneath the bustard directory
        gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
        LOGGER.info("Looking for gerald directories in %s" % (pathname,))
        for gerald_pathname in glob(gerald_glob):
            LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
            build_run(runs, b, image_analysis, gerald_pathname, datadir)

    def build_aligned_runs(image_analysis, runs, b, datadir, runfolder):
        # CASAVA-style Aligned* directories live at the top of the runfolder
        aligned_glob = os.path.join(runfolder, 'Aligned*')
        for aligned in glob(aligned_glob):
            LOGGER.info("Found aligned directory %s" % (aligned,))
            build_run(runs, b, image_analysis, aligned, datadir)

    datadir = os.path.join(runfolder, 'Data')

    LOGGER.info('Searching for runs in ' + datadir)
    runs = []
    # scan for firecrest directories
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        LOGGER.info('Found firecrest in ' + datadir)
        image_analysis = firecrest.firecrest(firecrest_pathname)
        if image_analysis is None:
            LOGGER.warning(
                "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
            )
        else:
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, firecrest_pathname
            )
    # scan for IPAR directories
    ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
    # The Intensities directory from the RTA software looks a lot like IPAR
    ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
    for ipar_pathname in ipar_dirs:
        LOGGER.info('Found ipar directories in ' + datadir)
        image_analysis = ipar.ipar(ipar_pathname)
        if image_analysis is None:
            LOGGER.warning(
                "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
            )
        else:
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, ipar_pathname
            )

    return runs
281
def get_specific_run(gerald_dir):
    """
    Given a gerald directory, construct a PipelineRun out of its parents

    Basically this allows specifying a particular run instead of the previous
    get_runs which scans a runfolder for various combinations of
    firecrest/ipar/bustard/gerald runs.

    :Parameters:
      - `gerald_dir`: path to the gerald directory of interest

    :Returns: PipelineRun, or None when any component cannot be found
    """
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    gerald_dir = os.path.expanduser(gerald_dir)
    bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
    image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))

    runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))

    LOGGER.info('--- use-run detected options ---')
    LOGGER.info('runfolder: %s' % (runfolder_dir,))
    LOGGER.info('image_dir: %s' % (image_dir,))
    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
    LOGGER.info('gerald_dir: %s' % (gerald_dir,))

    # find our processed image dir
    image_run = None
    # split into parent, and leaf directory
    # leaf directory should be an IPAR or firecrest directory
    data_dir, short_image_dir = os.path.split(image_dir)
    LOGGER.info('data_dir: %s' % (data_dir,))
    # label was previously misspelled 'short_iamge_dir'
    LOGGER.info('short_image_dir: %s' % (short_image_dir,))

    # guess which type of image processing directory we have by looking
    # in the leaf directory name
    if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
        image_run = firecrest.firecrest(image_dir)
    elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)
    elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
        # RTA Intensities directories are parsed with the IPAR code
        image_run = ipar.ipar(image_dir)

    # if we didn't find a run, report the error and return
    if image_run is None:
        msg = '%s does not contain an image processing step' % (image_dir,)
        LOGGER.error(msg)
        return None

    # find our base calling
    base_calling_run = bustard.bustard(bustard_dir)
    if base_calling_run is None:
        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
        return None

    # find alignments
    gerald_run = gerald.gerald(gerald_dir)
    if gerald_run is None:
        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
        return None

    p = PipelineRun(runfolder_dir)
    p.image_analysis = image_run
    p.bustard = base_calling_run
    p.gerald = gerald_run

    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
    return p
349
def extract_run_parameters(runs):
    """
    Save the run parameter xml file for every run in runs.
    """
    for pipeline_run in runs:
        pipeline_run.save()
356
def summarize_mapped_reads(genome_map, mapped_reads):
    """
    Summarize per chromosome reads into a genome count.

    Reads whose path component is not in genome_map (spike-in or
    contamination symlinks) are pooled under that path name; everything
    else is counted per target, and a single pooled entry (defaulting
    to 'unknown' with 0 reads) is always present in the result.
    """
    per_target = {}
    pooled_genome = 'unknown'
    pooled_count = 0
    for pathname, count in mapped_reads.items():
        directory, target = os.path.split(pathname)
        if directory and directory not in genome_map:
            # symlinked spike-in/contamination genome: pool by directory
            pooled_genome = directory
            pooled_count += count
        else:
            per_target[target] = per_target.get(target, 0) + count
    per_target[pooled_genome] = pooled_count
    return per_target
374
def summarize_lane(gerald, lane_id):
    """Build a list of human-readable result lines for one lane/end."""
    per_read_results = gerald.summary.lane_results
    eland = gerald.eland_results[lane_id]
    lines = ["Sample name %s" % (eland.sample_name)]
    lines.append("Lane id %s end %s" % (lane_id.lane, lane_id.read))

    have_summary = lane_id.read < len(per_read_results) \
                   and lane_id.lane in per_read_results[lane_id.read]
    if have_summary:
        cluster = per_read_results[lane_id.read][lane_id.lane].cluster
        lines.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
    lines.append("Total Reads: %d" % (eland.reads))

    if hasattr(eland, 'match_codes'):
        codes = eland.match_codes
        no_match = codes['NM']
        qc_failed = codes['QC']
        lines.append("No Match: %d (%2.2g %%)" % (
            no_match, float(no_match) / eland.reads * 100))
        lines.append("QC Failed: %d (%2.2g %%)" % (
            qc_failed, float(qc_failed) / eland.reads * 100))
        lines.append('Unique (0,1,2 mismatches) %d %d %d' % \
                     (codes['U0'], codes['U1'], codes['U2']))
        lines.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                     (codes['R0'], codes['R1'], codes['R2']))

    if hasattr(eland, 'genome_map'):
        lines.append("Mapped Reads")
        counts = summarize_mapped_reads(eland.genome_map, eland.mapped_reads)
        for genome_name, genome_count in counts.items():
            lines.append("  %s: %d" % (genome_name, genome_count))

        lines.append('')
    return lines
412
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    :Parameters:
      - `runs`: list of PipelineRun objects to report on

    :Returns: one newline-joined report string covering every lane
      of every run
    """
    report = []
    for run in runs:
        # print a run name?
        report.append('Summary for %s' % (run.name,))
        # sort the report
        eland_keys = sorted(run.gerald.eland_results.keys())
        # this loop was previously dedented out of the per-run loop,
        # so only the last run's lanes were summarized and an empty
        # runs list raised NameError on eland_keys
        for lane_id in eland_keys:
            report.extend(summarize_lane(run.gerald, lane_id))
            report.append('---')
            report.append('')
    return os.linesep.join(report)
428
def is_compressed(filename):
    """Return True when filename ends in a recognized compression suffix."""
    extension = os.path.splitext(filename)[1]
    return extension in ('.gz', '.bz2')
436
def save_flowcell_reports(data_dir, cycle_dir):
    """
    Save the flowcell quality reports

    Archives data_dir/reports (plus Status.xml/Status.xsl when present)
    into cycle_dir/flowcell-reports.tar.bz2; does nothing when there is
    no reports directory.
    """
    data_dir = os.path.abspath(data_dir)
    reports_dir = os.path.join(data_dir, 'reports')
    if not os.path.exists(reports_dir):
        return
    reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
    cmd_list = ['tar', 'cjvf', reports_dest, 'reports/']
    if os.path.exists(os.path.join(data_dir, 'Status.xml')):
        cmd_list.extend(['Status.xml', 'Status.xsl'])
    LOGGER.info("Saving reports from " + reports_dir)
    original_cwd = os.getcwd()
    os.chdir(data_dir)
    queue = QueueCommands([" ".join(cmd_list)])
    queue.run()
    os.chdir(original_cwd)
455
456
def save_summary_file(pipeline, cycle_dir):
    """
    Copy the Summary.htm report into cycle_dir.

    Looks first in the gerald directory, then in Data/Status_Files;
    logs a message when neither copy exists.
    """
    gerald_object = pipeline.gerald
    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
    if os.path.exists(gerald_summary):
        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
        shutil.copy(gerald_summary, cycle_dir)
    elif os.path.exists(status_files_summary):
        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
        shutil.copy(status_files_summary, cycle_dir)
    else:
        # the original referenced an undefined name `summary_path` here,
        # raising NameError whenever no summary file was found
        LOGGER.info('Summary file %s was not found' % (gerald_summary,))
470
def save_ivc_plot(bustard_object, cycle_dir):
    """
    Save the IVC page and its supporting images
    """
    source_dir = bustard_object.pathname
    plot_html = os.path.join(source_dir, 'IVC.htm')
    plot_images = os.path.join(source_dir, 'Plots', 's_?_[a-z]*.png')
    destination_plots = os.path.join(cycle_dir, 'Plots')

    if not os.path.exists(plot_html):
        LOGGER.warning('Missing IVC.html file, not archiving')
        return

    LOGGER.debug("Saving %s" % (plot_html,))
    LOGGER.debug("Saving %s" % (plot_images,))
    shutil.copy(plot_html, cycle_dir)
    if not os.path.exists(destination_plots):
        os.mkdir(destination_plots)
    for image in glob(plot_images):
        shutil.copy(image, destination_plots)
491
492
def compress_score_files(bustard_object, cycle_dir):
    """
    Compress score files into our result directory

    Streams the s_*_score.txt files found under the bustard directory
    (or its Temp subdirectory, new in pipeline 1.1rc1) through
    tar | bzip2 into <cycle_dir>/scores.tar.bz2.
    """
    # check for g.pathname/Temp a new feature of 1.1rc1
    scores_path = bustard_object.pathname
    scores_path_temp = os.path.join(scores_path, 'Temp')
    if os.path.isdir(scores_path_temp):
        scores_path = scores_path_temp

    # hopefully we have a directory that contains s_*_score files
    score_files = [f for f in os.listdir(scores_path)
                   if re.match('.*_score.txt', f)]
    if not score_files:
        # tar with no file arguments would fail; nothing to archive
        LOGGER.warning("No score files found in %s" % (scores_path,))
        return

    tar_cmd = ['tar', 'c'] + score_files
    bzip_cmd = ['bzip2', '-9', '-c']
    tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
    LOGGER.info("Compressing score files from %s" % (scores_path,))
    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
    LOGGER.info("Writing to %s" % (tar_dest_name,))

    # extend the existing environment instead of replacing it; the old
    # env={'BZIP': '-9'} wiped PATH and broke executable lookup
    env = dict(os.environ)
    env['BZIP'] = '-9'
    tar_dest = open(tar_dest_name, 'wb')
    try:
        tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False,
                               env=env, cwd=scores_path)
        bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
        # close our copy of the pipe so bzip2 sees EOF when tar exits
        tar.stdout.close()
        tar.wait()
        # previously bzip was never waited on, leaving a zombie and
        # possibly truncated output
        bzip.wait()
    finally:
        tar_dest.close()
523
524
def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
    """
    Compress eland result files into the archive directory

    Already-compressed files are copied as-is; everything else is
    queued as a shell bzip2 command, running up to num_jobs at once.
    """
    # copy & bzip eland files
    bz_commands = []

    for key in gerald_object.eland_results:
        eland_lane = gerald_object.eland_results[key]
        for source_name in eland_lane.pathnames:
            if source_name is None:
                LOGGER.info(
                    "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
                continue
            path, name = os.path.split(source_name)
            dest_name = os.path.join(cycle_dir, name)
            LOGGER.info("Saving eland file %s to %s" % \
                        (source_name, dest_name))

            if is_compressed(name):
                LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
                shutil.copy(source_name, dest_name)
            else:
                # not compressed; the command string is run through a
                # shell by QueueCommands, hence the '>' redirection
                dest_name += '.bz2'
                args = ['bzip2', '-9', '-c', source_name, '>', dest_name]
                bz_commands.append(" ".join(args))

    if len(bz_commands) > 0:
        q = QueueCommands(bz_commands, num_jobs)
        q.run()
561
562
def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'):
    """
    Iterate over runfolders in runs extracting the most useful information.
      * run parameters (in run-*.xml)
      * eland_result files
      * score files
      * Summary.htm
      * srf files (raw sequence & qualities)

    :Parameters:
      - `runs`: list of PipelineRun objects to archive
      - `output_base_dir`: where per-flowcell result directories are
        created (defaults to the current directory)
      - `site`: site name passed to the srf helpers; None skips raw
        base call saving entirely
      - `num_jobs`: number of compression/checksum jobs to run at once
      - `raw_format`: one of 'fastq', 'qseq', or 'srf'

    :Raises: ValueError for an unrecognized raw_format
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    for r in runs:
      # one directory per flowcell, one cycle directory per run inside it
      result_dir = os.path.join(output_base_dir, r.flowcell_id)
      LOGGER.info("Using %s as result directory" % (result_dir,))
      if not os.path.exists(result_dir):
        os.mkdir(result_dir)

      # create cycle_dir
      cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
      LOGGER.info("Filling in %s" % (cycle,))
      cycle_dir = os.path.join(result_dir, cycle)
      cycle_dir = os.path.abspath(cycle_dir)
      if os.path.exists(cycle_dir):
        LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
        continue
      else:
        os.mkdir(cycle_dir)

      # save run file
      r.save(cycle_dir)

      # save illumina flowcell status report
      save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir)

      # save stuff from bustard
      # grab IVC plot
      save_ivc_plot(r.bustard, cycle_dir)

      # build base call saving commands
      if site is not None:
        # only lanes with an analysis other than 'none' get raw data saved
        lanes = []
        for lane in range(1, 9):
          lane_parameters = r.gerald.lanes.get(lane, None)
          if lane_parameters is not None and lane_parameters.analysis != 'none':
            lanes.append(lane)

        run_name = srf.pathname_to_run_name(r.pathname)
        seq_cmds = []
        LOGGER.info("Raw Format is: %s" % (raw_format, ))
        if raw_format == 'fastq':
            # presumably a HiSeq run: copy per-project fastqs directly
            # rather than queueing conversion commands — TODO confirm
            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
            LOGGER.info("raw data = %s" % (rawpath,))
            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
        elif raw_format == 'qseq':
            seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
        elif raw_format == 'srf':
            seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
        else:
            raise ValueError('Unknown --raw-format=%s' % (raw_format))
        srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)

      # save stuff from GERALD
      # copy stuff out of the main run
      g = r.gerald

      # save summary file
      save_summary_file(r, cycle_dir)

      # compress eland result files
      compress_eland_results(g, cycle_dir, num_jobs)

      # md5 all the compressed files once we're done
      md5_commands = srf.make_md5_commands(cycle_dir)
      srf.run_commands(cycle_dir, md5_commands, num_jobs)
638
def rm_list(files, dry_run=True):
    """
    Delete the listed paths, removing directories recursively.

    :Parameters:
      - `files`: iterable of paths to remove
      - `dry_run`: when True only log what would be deleted
    """
    for f in files:
        if os.path.exists(f):
            LOGGER.info('deleting %s' % (f,))
            if not dry_run:
                if os.path.isdir(f):
                    shutil.rmtree(f)
                else:
                    os.unlink(f)
        else:
            # warning() replaces the deprecated warn() alias
            LOGGER.warning("%s doesn't exist." % (f,))
650
def clean_runs(runs, dry_run=True):
    """
    Clean up run folders to optimize for compression.

    :Parameters:
      - `runs`: PipelineRun objects whose runfolders should be cleaned
      - `dry_run`: when True only log what would be deleted
    """
    if dry_run:
        LOGGER.info('In dry-run mode')

    for run in runs:
        # fixed log typo: was 'Cleaninging'
        LOGGER.info('Cleaning %s' % (run.pathname,))
        # rm RunLog*.xml
        runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
        rm_list(runlogs, dry_run)
        # rm pipeline_*.txt
        pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
        rm_list(pipeline_logs, dry_run)
        # rm gclog.txt?
        # rm NetCopy.log? Isn't this robocopy?
        logs = glob(os.path.join(run.pathname, '*.log'))
        rm_list(logs, dry_run)
        # rm nfn.log?
        # Calibration
        calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
        rm_list(calibration_dir, dry_run)
        # rm Images/L*
        LOGGER.info("Cleaning images")
        image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
        rm_list(image_dirs, dry_run)
        # rm ReadPrep*
        LOGGER.info("Cleaning ReadPrep*")
        read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
        rm_list(read_prep_dirs, dry_run)
        # rm Thumbnail_Images (comment previously said ReadPrep;
        # log message previously said 'Thubmnail_images')
        LOGGER.info("Cleaning Thumbnail_Images")
        thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
        rm_list(thumbnail_dirs, dry_run)

        # make clean_intermediate
        # use the module LOGGER for consistency (was the root logging.info)
        LOGGER.info("Cleaning intermediate files")
        if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
            clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                             cwd=run.image_analysis.pathname,)
            clean_process.wait()
693
694
695