1 """
2 Core information needed to inspect a runfolder.
3 """
4 from glob import glob
5 import logging
6 import os
7 import re
8 import shutil
9 import stat
10 import subprocess
11 import sys
12 import tarfile
13 import time
14
15 import lxml.etree as ElementTree
16
17 LOGGER = logging.getLogger(__name__)
18
19 EUROPEAN_STRPTIME = "%d-%m-%Y"
20 EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
21 VERSION_RE = "([0-9\.]+)"
22 USER_RE = "([a-zA-Z0-9]+)"
23 LANES_PER_FLOWCELL = 8
24 LANE_LIST = range(1, LANES_PER_FLOWCELL + 1)
25
26 from htsworkflow.util.alphanum import alphanum
27 from htsworkflow.util.ethelp import indent, flatten
28 from htsworkflow.util.queuecommands import QueueCommands
29
30 from htsworkflow.pipelines import srf
31
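# A rough sketch of how this module is typically used (illustrative only;
# see get_runs(), summary_report() and extract_results() below for details):
#
#   runs = get_runs('/path/to/YYMMDD_MACHINE_RUN_FLOWCELL')  # hypothetical path
#   print summary_report(runs)
#   extract_results(runs, output_base_dir='/path/to/archive')
#
# Each PipelineRun returned by get_runs() ties together one image analysis
# (Firecrest/IPAR), one base-calling (Bustard) and one alignment (GERALD)
# component found under the runfolder.
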
class PipelineRun(object):
    """
    Capture "interesting" information about a pipeline run
    """
    XML_VERSION = 1
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, flowcell_id=None, xml=None):
        if pathname is not None:
          self.pathname = os.path.normpath(pathname)
        else:
          self.pathname = None
        self._name = None
        self._flowcell_id = flowcell_id
        self.datadir = None
        self.image_analysis = None
        self.bustard = None
        self.gerald = None

        if xml is not None:
          self.set_elements(xml)

    def _get_flowcell_id(self):
        # extract flowcell ID
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_runinfo()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_flowcellid()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_path()
        if self._flowcell_id is None:
            self._flowcell_id = 'unknown'

            LOGGER.warning(
                "Flowcell id was not found, guessing %s" % (
                    self._flowcell_id))

        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def _get_flowcell_id_from_flowcellid(self):
        """Extract flowcell id from a Config/FlowcellId.xml file
        """
        config_dir = os.path.join(self.pathname, 'Config')
        flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
        if os.path.exists(flowcell_id_path):
            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
            return flowcell_id_tree.findtext('Text')

    def _get_flowcell_id_from_runinfo(self):
        """Read RunInfo file for flowcell id
        """
        runinfo = os.path.join(self.pathname, 'RunInfo.xml')
        if os.path.exists(runinfo):
            tree = ElementTree.parse(runinfo)
            root = tree.getroot()
            fc_nodes = root.xpath('/RunInfo/Run/Flowcell')
            if len(fc_nodes) == 1:
                return fc_nodes[0].text


    def _get_flowcell_id_from_path(self):
        """Guess a flowcell name from the path
        """
        path_fields = self.pathname.split('_')
        if len(path_fields) > 0:
            # guessing last element of filename
            return path_fields[-1]

    def _get_runfolder_name(self):
        if self.gerald:
            return self.gerald.runfolder_name
        elif hasattr(self.image_analysis, 'runfolder_name'):
            return self.image_analysis.runfolder_name
        else:
            return None
    runfolder_name = property(_get_runfolder_name)

    def get_elements(self):
        """
        make one master xml file from all of our sub-components.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.image_analysis.get_elements())
        root.append(self.bustard.get_elements())
        root.append(self.gerald.get_elements())
        return root

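    # For reference, the tree assembled above (and consumed by set_elements
    # below) has roughly this shape; the exact child tags are defined by the
    # firecrest/ipar, bustard and gerald component classes:
    #
    #   <PipelineRun>
    #     <FlowcellID>...</FlowcellID>
    #     <Firecrest ...> or <IPAR ...>
    #     <Bustard ...>
    #     <Gerald ...> or its CASAVA equivalent
    #   </PipelineRun>
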
    def set_elements(self, tree):
        # this file gets imported by all the others,
        # so we need to hide the imports to avoid cyclic imports
        from htsworkflow.pipelines import firecrest
        from htsworkflow.pipelines import ipar
        from htsworkflow.pipelines import bustard
        from htsworkflow.pipelines import gerald

        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
          raise ValueError('Pipeline Run Expecting %s got %s' % (
              PipelineRun.PIPELINE_RUN, tag))
        for element in tree:
          tag = element.tag.lower()
          if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
          #ok the xword.Xword.XWORD pattern for module.class.constant is lame
          # you should only have Firecrest or IPAR, never both of them.
          elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.image_analysis = firecrest.Firecrest(xml=element)
          elif tag == ipar.IPAR.IPAR.lower():
            self.image_analysis = ipar.IPAR(xml=element)
          elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
          elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
          elif tag == gerald.CASAVA.GERALD.lower():
            self.gerald = gerald.CASAVA(xml=element)
          else:
            LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))

    def _get_run_name(self):
        """
        Find the latest date among our component runs and use that as our name
        """
        if self._name is None:
          tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
          self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        return self._name
    name = property(_get_run_name)

    def save(self, destdir=None):
        if destdir is None:
            destdir = ''
        LOGGER.info("Saving run report " + self.name)
        xml = self.get_elements()
        indent(xml)
        dest_pathname = os.path.join(destdir, self.name)
        ElementTree.ElementTree(xml).write(dest_pathname)

    def load(self, filename):
        LOGGER.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)

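# Illustrative sketch (not executed) of the save/load round trip: a run
# assembled by get_runs() below can be written out and read back, e.g.
#
#   run = get_runs('/path/to/runfolder')[0]   # hypothetical runfolder
#   run.save('/tmp/reports')                  # writes run_<flowcell>_<YYYY-MM-DD>.xml
#   copy = load_pipeline_run_xml('/tmp/reports/' + run.name)
#   assert copy.flowcell_id == run.flowcell_id
#
# load_pipeline_run_xml() below parses the saved file and rebuilds the
# PipelineRun from it via set_elements().
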
def load_pipeline_run_xml(pathname):
    """
    Load and instantiate a Pipeline run from a run xml file

    :Parameters:
      - `pathname` : location of a run xml file

    :Returns: initialized PipelineRun object
    """
    tree = ElementTree.parse(pathname).getroot()
    run = PipelineRun(xml=tree)
    return run

def get_runs(runfolder, flowcell_id=None):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example, if there are two different GERALD runs, this will
    generate two different PipelineRun objects that differ
    in their gerald component.
    """
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis,
                                 pathname):
        added = build_aligned_runs(image_analysis, runs, datadir, runfolder)
        # If we're a multiplexed run, don't look for older run type.
        if added > 0:
            return

        LOGGER.info("Looking for bustard directories in %s" % (pathname,))
        bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
        # RTA BaseCalls looks enough like Bustard.
        bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
        for bustard_pathname in bustard_dirs:
            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
            b = bustard.bustard(bustard_pathname)
            build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder)


    def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder):
        start = len(runs)
        gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
        LOGGER.info("Looking for gerald directories in %s" % (pathname,))
        for gerald_pathname in glob(gerald_glob):
            LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
            try:
                g = gerald.gerald(gerald_pathname)
                p = PipelineRun(runfolder, flowcell_id)
                p.datadir = datadir
                p.image_analysis = image_analysis
                p.bustard = b
                p.gerald = g
                runs.append(p)
            except IOError, e:
                LOGGER.error("Ignoring " + str(e))
        return len(runs) - start


    def build_aligned_runs(image_analysis, runs, datadir, runfolder):
        start = len(runs)
        aligned_glob = os.path.join(runfolder, 'Aligned*')
        for aligned in glob(aligned_glob):
            LOGGER.info("Found aligned directory %s" % (aligned,))
            try:
                g = gerald.gerald(aligned)
                p = PipelineRun(runfolder, flowcell_id)
                bustard_pathname = os.path.join(runfolder, g.runfolder_name)

                p.datadir = datadir
                p.image_analysis = image_analysis
                p.bustard = bustard.bustard(bustard_pathname)
                p.gerald = g
                runs.append(p)
            except IOError, e:
                LOGGER.error("Ignoring " + str(e))
        return len(runs) - start
    datadir = os.path.join(runfolder, 'Data')

    LOGGER.info('Searching for runs in ' + datadir)
    runs = []
    # scan for firecrest directories
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        LOGGER.info('Found firecrest in ' + datadir)
        image_analysis = firecrest.firecrest(firecrest_pathname)
        if image_analysis is None:
            LOGGER.warn(
                "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
            )
        else:
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, firecrest_pathname
            )
    # scan for IPAR directories
    ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
    # The Intensities directory from the RTA software looks a lot like IPAR
    ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
    for ipar_pathname in ipar_dirs:
        LOGGER.info('Found ipar directories in ' + datadir)
        image_analysis = ipar.ipar(ipar_pathname)
        if image_analysis is None:
            LOGGER.warn(
                "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
            )
        else:
            scan_post_image_analysis(
                runs, runfolder, datadir, image_analysis, ipar_pathname
            )

    return runs

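# A sketch of the runfolder layout get_runs() expects (directory names are
# illustrative; the globs above accept any *Firecrest*, IPAR_*, Intensities,
# Bustard*/BaseCalls, GERALD* or Aligned* variants):
#
#   <runfolder>/
#     Aligned*/                                  (multiplexed/CASAVA runs)
#     Data/
#       C1-36_Firecrest...  or  IPAR_*  or  Intensities/
#         Bustard...  or  BaseCalls/
#           GERALD_.../
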
def get_specific_run(gerald_dir):
    """
    Given a gerald directory, construct a PipelineRun out of its parents

    Basically this allows specifying a particular run instead of the previous
    get_runs which scans a runfolder for various combinations of
    firecrest/ipar/bustard/gerald runs.
    """
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    gerald_dir = os.path.expanduser(gerald_dir)
    bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
    image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))

    runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))

    LOGGER.info('--- use-run detected options ---')
    LOGGER.info('runfolder: %s' % (runfolder_dir,))
    LOGGER.info('image_dir: %s' % (image_dir,))
    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
    LOGGER.info('gerald_dir: %s' % (gerald_dir,))

    # find our processed image dir
    image_run = None
    # split into parent, and leaf directory
    # leaf directory should be an IPAR or firecrest directory
    data_dir, short_image_dir = os.path.split(image_dir)
    LOGGER.info('data_dir: %s' % (data_dir,))
    LOGGER.info('short_image_dir: %s' % (short_image_dir,))

    # guess which type of image processing directory we have by looking
    # in the leaf directory name
    if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
        image_run = firecrest.firecrest(image_dir)
    elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)
    elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)

    # if we didn't find a run, report the error and return
    if image_run is None:
        msg = '%s does not contain an image processing step' % (image_dir,)
        LOGGER.error(msg)
        return None

    # find our base calling
    base_calling_run = bustard.bustard(bustard_dir)
    if base_calling_run is None:
        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
        return None

    # find alignments
    gerald_run = gerald.gerald(gerald_dir)
    if gerald_run is None:
        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
        return None

    p = PipelineRun(runfolder_dir)
    p.image_analysis = image_run
    p.bustard = base_calling_run
    p.gerald = gerald_run

    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
    return p

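# get_specific_run() walks up from the gerald directory, so it assumes the
# standard nesting (names purely illustrative):
#
#   <runfolder>/Data/<image analysis dir>/<bustard dir>/<gerald dir>
#
# i.e. the gerald directory sits two levels below the image analysis
# directory and four levels below the runfolder itself.
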
def extract_run_parameters(runs):
    """
    Save the run parameters (run xml file) for each run in runs
    """
    for run in runs:
      run.save()

def summarize_mapped_reads(genome_map, mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.
    """
    summarized_reads = {}
    genome_reads = 0
    genome = 'unknown'
    for k, v in mapped_reads.items():
        path, k = os.path.split(k)
        if len(path) > 0 and path not in genome_map:
            genome = path
            genome_reads += v
        else:
            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
    summarized_reads[genome] = genome_reads
    return summarized_reads

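# A small worked example of summarize_mapped_reads() with hypothetical
# numbers:
#
#   summarize_mapped_reads({}, {'chr1.fa': 100,
#                               'chr2.fa': 50,
#                               'lambda/chrL.fa': 7})
#   => {'chr1.fa': 100, 'chr2.fa': 50, 'lambda': 7}
#
# Keys with a directory component that is not in genome_map are treated as
# spike-in/contamination and collapsed into one count keyed by that
# directory name.
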
def summarize_lane(gerald, lane_id):
    report = []
    lane_results = gerald.summary.lane_results
    eland_result = gerald.eland_results[lane_id]
    report.append("Sample name %s" % (eland_result.sample_name))
    report.append("Lane id %s end %s" % (lane_id.lane, lane_id.read))

    if lane_id.read < len(lane_results) and \
           lane_id.lane in lane_results[lane_id.read]:
        summary_results = lane_results[lane_id.read][lane_id.lane]
        cluster = summary_results.cluster
        report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
    report.append("Total Reads: %d" % (eland_result.reads))

    if hasattr(eland_result, 'match_codes'):
        mc = eland_result.match_codes
        nm = mc['NM']
        nm_percent = float(nm) / eland_result.reads * 100
        qc = mc['QC']
        qc_percent = float(qc) / eland_result.reads * 100

        report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
        report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
        report.append('Unique (0,1,2 mismatches) %d %d %d' % \
                      (mc['U0'], mc['U1'], mc['U2']))
        report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                      (mc['R0'], mc['R1'], mc['R2']))

    if hasattr(eland_result, 'genome_map'):
        report.append("Mapped Reads")
        mapped_reads = summarize_mapped_reads(eland_result.genome_map,
                                              eland_result.mapped_reads)
        for name, counts in mapped_reads.items():
            report.append("  %s: %d" % (name, counts))

        report.append('')
    return report

def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder
    """
    report = []
    for run in runs:
        # print a run name?
        report.append('Summary for %s' % (run.name,))
        # sort the report
        eland_keys = sorted(run.gerald.eland_results.keys())
        for lane_id in eland_keys:
            report.extend(summarize_lane(run.gerald, lane_id))
            report.append('---')
            report.append('')
    return os.linesep.join(report)

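# Roughly what one run's section of the report looks like (the numbers here
# are made up; the exact lines come from summarize_lane() above):
#
#   Summary for run_30DY0AAXX_2009-05-12.xml
#   Sample name s_1
#   Lane id 1 end 1
#   Clusters 123456 +/- 6789
#   Total Reads: 1000000
#   No Match: 100000 (10 %)
#   QC Failed: 50000 (5 %)
#   Unique (0,1,2 mismatches) 600000 100000 50000
#   Repeat (0,1,2 mismatches) 50000 30000 20000
#   Mapped Reads
#     mm9: 700000
#   ---
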
def is_compressed(filename):
    if os.path.splitext(filename)[1] == ".gz":
        return True
    elif os.path.splitext(filename)[1] == '.bz2':
        return True
    else:
        return False

def save_flowcell_reports(data_dir, cycle_dir):
    """
    Save the flowcell quality reports
    """
    data_dir = os.path.abspath(data_dir)
    status_file = os.path.join(data_dir, 'Status.xml')
    reports_dir = os.path.join(data_dir, 'reports')
    reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
    if os.path.exists(reports_dir):
        cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
        if os.path.exists(status_file):
            cmd_list.extend(['Status.xml', 'Status.xsl'])
        LOGGER.info("Saving reports from " + reports_dir)
        cwd = os.getcwd()
        os.chdir(data_dir)
        q = QueueCommands([" ".join(cmd_list)])
        q.run()
        os.chdir(cwd)

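# The QueueCommands call above amounts to running, from data_dir, roughly:
#
#   tar cjvf <cycle_dir>/flowcell-reports.tar.bz2 reports/ Status.xml Status.xsl
#
# (Status.xml/Status.xsl are only included when Status.xml exists.)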

def save_summary_file(pipeline, cycle_dir):
    # Copy Summary.htm
    gerald_object = pipeline.gerald
    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
    if os.path.exists(gerald_summary):
        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
        shutil.copy(gerald_summary, cycle_dir)
    elif os.path.exists(status_files_summary):
        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
        shutil.copy(status_files_summary, cycle_dir)
    else:
        LOGGER.info('Summary file not found in %s or %s' % (
            gerald_summary, status_files_summary))

def save_ivc_plot(bustard_object, cycle_dir):
    """
    Save the IVC page and its supporting images
    """
    plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
    plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
    plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')

    plot_target_path = os.path.join(cycle_dir, 'Plots')

    if os.path.exists(plot_html):
        LOGGER.debug("Saving %s" % (plot_html,))
        LOGGER.debug("Saving %s" % (plot_images,))
        shutil.copy(plot_html, cycle_dir)
        if not os.path.exists(plot_target_path):
            os.mkdir(plot_target_path)
        for plot_file in glob(plot_images):
            shutil.copy(plot_file, plot_target_path)
    else:
        LOGGER.warning('Missing IVC.htm file, not archiving')


def compress_score_files(bustard_object, cycle_dir):
    """
    Compress score files into our result directory
    """
    # check for g.pathname/Temp a new feature of 1.1rc1
    scores_path = bustard_object.pathname
    scores_path_temp = os.path.join(scores_path, 'Temp')
    if os.path.isdir(scores_path_temp):
        scores_path = scores_path_temp

    # hopefully we have a directory that contains s_*_score files
    score_files = []
    for f in os.listdir(scores_path):
        if re.match('.*_score.txt', f):
            score_files.append(f)

    tar_cmd = ['tar', 'c'] + score_files
    bzip_cmd = [ 'bzip2', '-9', '-c' ]
    tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
    tar_dest = open(tar_dest_name, 'w')
    LOGGER.info("Compressing score files from %s" % (scores_path,))
    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
    LOGGER.info("Writing to %s" % (tar_dest_name,))

    env = {'BZIP': '-9'}
    tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
                           cwd=scores_path)
    bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
    tar.wait()

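# The tar/bzip pipeline above is roughly equivalent to running, from
# scores_path:
#
#   tar c <the *_score.txt files> | bzip2 -9 -c > <cycle_dir>/scores.tar.bz2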

def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
    """
    Compress eland result files into the archive directory
    """
    # copy & bzip eland files
    bz_commands = []

    for key in gerald_object.eland_results:
        eland_lane = gerald_object.eland_results[key]
        for source_name in eland_lane.pathnames:
            if source_name is None:
              LOGGER.info(
                "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
            else:
              path, name = os.path.split(source_name)
              dest_name = os.path.join(cycle_dir, name)
              LOGGER.info("Saving eland file %s to %s" % \
                         (source_name, dest_name))

              if is_compressed(name):
                LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
                shutil.copy(source_name, dest_name)
              else:
                # not compressed
                dest_name += '.bz2'
                args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
                bz_commands.append(" ".join(args))
                #LOGGER.info('Running: %s' % ( " ".join(args) ))
                #bzip_dest = open(dest_name, 'w')
                #bzip = subprocess.Popen(args, stdout=bzip_dest)
                #LOGGER.info('Saving to %s' % (dest_name, ))
                #bzip.wait()

    if len(bz_commands) > 0:
      q = QueueCommands(bz_commands, num_jobs)
      q.run()


def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
    """
    Iterate over runfolders in runs extracting the most useful information.
      * run parameters (in run-*.xml)
      * eland_result files
      * score files
      * Summary.htm
      * srf files (raw sequence & qualities)
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    for r in runs:
        result_dir = os.path.join(output_base_dir, r.flowcell_id)
        LOGGER.info("Using %s as result directory" % (result_dir,))
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # create cycle_dir
        cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
        LOGGER.info("Filling in %s" % (cycle,))
        cycle_dir = os.path.join(result_dir, cycle)
        cycle_dir = os.path.abspath(cycle_dir)
        if os.path.exists(cycle_dir):
            LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
            continue
        else:
            os.mkdir(cycle_dir)

        # save run file
        r.save(cycle_dir)

        # save illumina flowcell status report
        save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),
                              cycle_dir)

        # save stuff from bustard
        # grab IVC plot
        save_ivc_plot(r.bustard, cycle_dir)

        # build base call saving commands
        if site is not None:
            save_raw_data(num_jobs, r, site, raw_format, cycle_dir)

        # save stuff from GERALD
        # copy stuff out of the main run
        g = r.gerald

        # save summary file
        save_summary_file(r, cycle_dir)

        # compress eland result files
        compress_eland_results(g, cycle_dir, num_jobs)

        # md5 all the compressed files once we're done
        md5_commands = srf.make_md5_commands(cycle_dir)
        srf.run_commands(cycle_dir, md5_commands, num_jobs)

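# Sketch of the archive layout extract_results() produces (file names vary by
# run; <flowcell> and the cycle numbers come from the run itself):
#
#   <output_base_dir>/<flowcell>/C<start>-<stop>/
#     run_<flowcell>_<YYYY-MM-DD>.xml        (r.save)
#     flowcell-reports.tar.bz2               (save_flowcell_reports)
#     IVC.htm and Plots/                     (save_ivc_plot)
#     Summary.htm                            (save_summary_file)
#     compressed eland result files          (compress_eland_results)
#     srf/qseq/fastq raw data and md5sums    (save_raw_data, srf module)
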
def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
    lanes = []
    for lane in r.gerald.lanes:
        lane_parameters = r.gerald.lanes.get(lane, None)
        if lane_parameters is not None:
            lanes.append(lane)

    run_name = srf.pathname_to_run_name(r.pathname)
    seq_cmds = []
    if raw_format is None:
        raw_format = r.bustard.sequence_format

    LOGGER.info("Raw Format is: %s" % (raw_format, ))
    if raw_format == 'fastq':
        rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
        LOGGER.info("raw data = %s" % (rawpath,))
        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
    elif raw_format == 'qseq':
        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
    elif raw_format == 'srf':
        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
    else:
        raise ValueError('Unknown --raw-format=%s' % (raw_format))
    srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)

def rm_list(files, dry_run=True):
    for f in files:
        if os.path.exists(f):
            LOGGER.info('deleting %s' % (f,))
            if not dry_run:
                if os.path.isdir(f):
                    shutil.rmtree(f)
                else:
                    os.unlink(f)
        else:
            LOGGER.warn("%s doesn't exist." % (f,))

def clean_runs(runs, dry_run=True):
    """
    Clean up run folders to optimize for compression.
    """
    if dry_run:
        LOGGER.info('In dry-run mode')

    for run in runs:
        LOGGER.info('Cleaning %s' % (run.pathname,))
        # rm RunLog*.xml
        runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
        rm_list(runlogs, dry_run)
        # rm pipeline_*.txt
        pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
        rm_list(pipeline_logs, dry_run)
        # rm gclog.txt?
        # rm NetCopy.log? Isn't this robocopy?
        logs = glob(os.path.join(run.pathname, '*.log'))
        rm_list(logs, dry_run)
        # rm nfn.log?
        # Calibration
        calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
        rm_list(calibration_dir, dry_run)
        # rm Images/L*
        LOGGER.info("Cleaning images")
        image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
        rm_list(image_dirs, dry_run)
        # rm ReadPrep
        LOGGER.info("Cleaning ReadPrep*")
        read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
        rm_list(read_prep_dirs, dry_run)
        # rm Thumbnail_Images
        LOGGER.info("Cleaning Thumbnail_Images")
        thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
        rm_list(thumbnail_dirs, dry_run)

        # make clean_intermediate
        LOGGER.info("Cleaning intermediate files")
        if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
            clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                             cwd=run.image_analysis.pathname,)
            clean_process.wait()