1 """Core information needed to inspect a runfolder.
14 LOGGER = logging.getLogger(__name__)
16 from htsworkflow.pipelines import firecrest
17 from htsworkflow.pipelines import ipar
18 from htsworkflow.pipelines import bustard
19 from htsworkflow.pipelines import gerald
20 from htsworkflow.pipelines import ElementTree, \
21 EUROPEAN_STRPTIME, EUROPEAN_DATE_RE, \
22 VERSION_RE, USER_RE, \
23 LANES_PER_FLOWCELL, LANE_LIST
24 from htsworkflow.pipelines.samplekey import LANE_SAMPLE_KEYS
25 from htsworkflow.util.alphanum import alphanum
26 from htsworkflow.util.ethelp import indent, flatten
27 from htsworkflow.util.queuecommands import QueueCommands
29 from htsworkflow.pipelines import srf
class PipelineRun(object):
    """Capture "interesting" information about a pipeline run

    - `pathname` location of the root of this runfolder
    - `serialization_filename` read only property containing name of run xml file
    - `flowcell_id` read-only property containing flowcell id (bar code)
    - `datadir` location of the runfolder data dir.
    - `image_analysis` generic name for Firecrest or IPAR image analysis
    - `bustard` summary base caller
    - `gerald` summary of sequence alignment and quality control metrics
    """
    # XML tag names used by get_elements()/set_elements() serialization.
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, flowcell_id=None, xml=None):
        """Initialize a PipelineRun object

        - `pathname` the root directory of this run folder.
        - `flowcell_id` the flowcell ID in case it can't be determined
        - `xml` Allows initializing an object from a serialized xml file.
        """
        if pathname is not None:
            self.pathname = os.path.normpath(pathname)
        # May be None; the flowcell_id property then tries several
        # discovery mechanisms before falling back to 'unknown'.
        self._flowcell_id = flowcell_id
        self.image_analysis = None
        # NOTE(review): in this excerpt set_elements is called unconditionally;
        # presumably guarded by `xml is not None` in elided lines — confirm.
        self.set_elements(xml)

    def _get_flowcell_id(self):
        """Return the flowcell ID

        Attempts to find the flowcell ID through several mechanisms.
        """
        # Discovery order: RunInfo.xml, then Config/FlowcellId.xml, then the
        # runfolder path itself; finally give up with the literal 'unknown'.
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_runinfo()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_flowcellid()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_path()
        if self._flowcell_id is None:
            self._flowcell_id = 'unknown'
            # NOTE(review): fragment of an elided warning-log call.
            "Flowcell id was not found, guessing %s" % (
        return self._flowcell_id
    # Read-only accessor; lazily resolves and caches the id.
    flowcell_id = property(_get_flowcell_id)

    def _get_flowcell_id_from_flowcellid(self):
        """Extract flowcell id from a Config/FlowcellId.xml file

        :return: flowcell_id or None if not found
        """
        config_dir = os.path.join(self.pathname, 'Config')
        flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
        if os.path.exists(flowcell_id_path):
            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
            # <Text> element holds the flowcell barcode.
            return flowcell_id_tree.findtext('Text')

    def _get_flowcell_id_from_runinfo(self):
        """Read RunInfo file for flowcell id

        :return: flowcell_id or None if not found
        """
        runinfo = os.path.join(self.pathname, 'RunInfo.xml')
        if os.path.exists(runinfo):
            tree = ElementTree.parse(runinfo)
            root = tree.getroot()
            # NOTE(review): xpath() is an lxml API, not stdlib ElementTree;
            # presumably the module's ElementTree import resolves to lxml — confirm.
            fc_nodes = root.xpath('/RunInfo/Run/Flowcell')
            if len(fc_nodes) == 1:
                return fc_nodes[0].text

    def _get_flowcell_id_from_path(self):
        """Guess a flowcell name from the path

        :return: flowcell_id or None if not found
        """
        # Runfolder names conventionally end in _<flowcellid>; take the
        # final underscore-separated field.
        path_fields = self.pathname.split('_')
        if len(path_fields) > 0:
            # guessing last element of filename
            return path_fields[-1]

    def _get_runfolder_name(self):
        # Prefer gerald's notion of the runfolder name, falling back to
        # the image-analysis component.
        # NOTE(review): the guard before this return is elided in this excerpt.
        return self.gerald.runfolder_name
        elif hasattr(self.image_analysis, 'runfolder_name'):
            return self.image_analysis.runfolder_name
    runfolder_name = property(_get_runfolder_name)

    def _get_run_dirname(self):
        """Return name of directory to hold result files from one analysis

        For pre-multiplexing runs this is just the cycle range C1-123
        For post-multiplexing runs the "suffix" that we add to
        differentiate runs will be added to the range.
        E.g. Unaligned_6mm may produce C1-200_6mm
        """
        if self.image_analysis is None:
            raise ValueError("Not initialized yet")
        start = self.image_analysis.start
        stop = self.image_analysis.stop
        cycle_fragment = "C%d-%d" % (start, stop)
        # NOTE(review): presumably guarded by `if self.suffix:` in elided
        # lines; as shown this would fail when suffix is None — confirm.
        cycle_fragment += self.suffix
        return cycle_fragment
    run_dirname = property(_get_run_dirname)

    def get_elements(self):
        """make one master xml file from all of our sub-components.

        :return: an ElementTree containing all available pipeline
            component elements.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.image_analysis.get_elements())
        root.append(self.bustard.get_elements())
        # NOTE(review): `return root` is presumably in elided lines — confirm.
        root.append(self.gerald.get_elements())

    def set_elements(self, tree):
        """Initialize a PipelineRun object from an run.xml ElementTree.

        :param tree: parsed ElementTree
        :type tree: ElementTree
        """
        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
            raise ValueError('Pipeline Run Expecting %s got %s' % (
                PipelineRun.PIPELINE_RUN, tag))
        # NOTE(review): `element` comes from iterating tree's children;
        # the `for element in tree:` header is elided in this excerpt.
        tag = element.tag.lower()
        if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
        #ok the xword.Xword.XWORD pattern for module.class.constant is lame
        # you should only have Firecrest or IPAR, never both of them.
        elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.image_analysis = firecrest.Firecrest(xml=element)
        elif tag == ipar.IPAR.IPAR.lower():
            self.image_analysis = ipar.IPAR(xml=element)
        elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
        elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
        elif tag == gerald.CASAVA.GERALD.lower():
            self.gerald = gerald.CASAVA(xml=element)
        # Fallback for tags this version does not recognize.
        LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))

    def _get_serialization_filename(self):
        """Compute the filename for the run xml file

        Attempts to find the latest date from all of the run
        components.

        :return: filename run_{flowcell id}_{timestamp}.xml
        """
        if self._name is None:
            components = [self.image_analysis, self.bustard, self.gerald]
            # Latest modification time across whichever components exist.
            tmax = max([ c.time for c in components if c ])
            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
            self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        # NOTE(review): `return self._name` is presumably in elided lines.
    serialization_filename = property(_get_serialization_filename)

    def save(self, destdir=None):
        """Save a run xml file.

        :param destdir: Directory name to save too, uses current directory
            if not given.  NOTE(review): a None default is presumably
            normalized in elided lines before the join below — confirm.
        """
        LOGGER.info("Saving run report " + self.serialization_filename)
        xml = self.get_elements()
        dest_pathname = os.path.join(destdir, self.serialization_filename)
        ElementTree.ElementTree(xml).write(dest_pathname)

    def load(self, filename):
        """Load a run xml into this object.

        :Parameters:
          - `filename` location of a run xml file
        """
        LOGGER.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
def load_pipeline_run_xml(pathname):
    """
    Load and instantiate a Pipeline run from a run xml file

    :Parameters:
      - `pathname` location of an run xml file

    :Returns: initialized PipelineRun object
    """
    tree = ElementTree.parse(pathname).getroot()
    run = PipelineRun(xml=tree)
    # Fix: the constructed run must be returned, as the docstring promises;
    # without it callers always received None.
    return run
def get_runs(runfolder, flowcell_id=None):
    """Find all runs associated with a runfolder.

    We end up with multiple analysis runs as we sometimes
    need to try with different parameters. This attempts
    to return a list of all the various runs.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.
    """
    datadir = os.path.join(runfolder, 'Data')

    LOGGER.info('Searching for runs in ' + datadir)
    # NOTE(review): the `runs` accumulator and its return are in lines
    # elided from this excerpt.
    # scan for firecrest directories
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        LOGGER.info('Found firecrest in ' + datadir)
        image_analysis = firecrest.firecrest(firecrest_pathname)
        if image_analysis is None:
            # NOTE(review): fragment of an elided log call.
            "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
        scan_post_image_analysis(
            runs, runfolder, datadir, image_analysis, firecrest_pathname, flowcell_id
    # scan for IPAR directories
    ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
    # The Intensities directory from the RTA software looks a lot like IPAR
    ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
    for ipar_pathname in ipar_dirs:
        LOGGER.info('Found ipar directories in ' + datadir)
        image_analysis = ipar.ipar(ipar_pathname)
        if image_analysis is None:
            # NOTE(review): fragment of an elided log call.
            "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
        scan_post_image_analysis(
            runs, runfolder, datadir, image_analysis, ipar_pathname, flowcell_id
def scan_post_image_analysis(runs, runfolder, datadir, image_analysis,
                             pathname, flowcell_id):
    """Scan below an image-analysis directory for base-calling/alignment runs.

    Results are collected into `runs` via the build_* helpers.
    """
    # Try the newer HiSeq (multiplexed) layout first.
    added = build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id)
    # If we're a multiplexed run, don't look for older run type.
    LOGGER.info("Looking for bustard directories in %s" % (pathname,))
    bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
    # RTA BaseCalls looks enough like Bustard.
    bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
    for bustard_pathname in bustard_dirs:
        LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
        b = bustard.bustard(bustard_pathname)
        build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname,
                          runfolder, flowcell_id)
# Scan a bustard directory for GERALD* result directories, wrapping each in
# a PipelineRun appended to `runs`; returns the number of runs added
# (`len(runs) - start`).  NOTE(review): the tail of the signature
# (flowcell_id) and the `start`/try-except bookkeeping are elided here.
def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder,
    gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
    LOGGER.info("Looking for gerald directories in %s" % (pathname,))
    for gerald_pathname in glob(gerald_glob):
        LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
        g = gerald.gerald(gerald_pathname)
        p = PipelineRun(runfolder, flowcell_id)
        p.image_analysis = image_analysis
        # NOTE(review): `e` is bound by an elided except clause.
        LOGGER.error("Ignoring " + str(e))
    return len(runs) - start
def build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id):
    """Build PipelineRun objects for matched HiSeq Aligned*/Unaligned* pairs.

    Returns the count of runs added (`len(runs) - start`).
    NOTE(review): `start` and the try/append bookkeeping are elided here.
    """
    aligned_glob = os.path.join(runfolder, 'Aligned*')
    unaligned_glob = os.path.join(runfolder, 'Unaligned*')

    aligned_paths = glob(aligned_glob)
    unaligned_paths = glob(unaligned_glob)

    # Pair Aligned/Unaligned directories that share the same suffix.
    matched_paths = hiseq_match_aligned_unaligned(aligned_paths, unaligned_paths)
    LOGGER.debug("Matched HiSeq analysis: %s", str(matched_paths))

    for aligned, unaligned, suffix in matched_paths:
        if unaligned is None:
            # An Aligned directory with no Unaligned partner can't be used.
            LOGGER.warn("Aligned directory %s without matching unalinged, skipping", aligned)
        p = PipelineRun(runfolder, flowcell_id)
        p.image_analysis = image_analysis
        p.bustard = bustard.bustard(unaligned)
        p.gerald = gerald.gerald(aligned)
        # NOTE(review): the matching `try:` for this handler is elided.
        except (IOError, RuntimeError) as e:
            LOGGER.error("Exception %s", str(e))
            LOGGER.error("Skipping run in %s", flowcell_id)
    return len(runs) - start
def hiseq_match_aligned_unaligned(aligned, unaligned):
    """Match aligned and unaligned folders from separate lists

    :param aligned: list of Aligned* directory paths
    :param unaligned: list of Unaligned* directory paths
    :return: list of (aligned, unaligned, suffix) tuples; either directory
        slot is None when only one side of a pair exists.
    """
    # Removed an unused local regex (`unaligned_suffix_re`) that was never
    # referenced; suffix extraction is handled by build_dir_dict_by_suffix.
    aligned_by_suffix = build_dir_dict_by_suffix('Aligned', aligned)
    unaligned_by_suffix = build_dir_dict_by_suffix('Unaligned', unaligned)

    # Union of suffixes so unpaired directories still appear (with None).
    keys = set(aligned_by_suffix.keys()).union(set(unaligned_by_suffix.keys()))

    matches = []
    for key in keys:
        a = aligned_by_suffix.get(key)
        u = unaligned_by_suffix.get(key)
        matches.append((a, u, key))
    return matches
def build_dir_dict_by_suffix(prefix, dirnames):
    """Build a dictionary indexed by suffix of last directory name.

    It assumes a constant prefix

    :param prefix: literal directory-name prefix, e.g. 'Aligned'
    :param dirnames: iterable of directory paths
    :return: dict mapping suffix (possibly '') to the original path
    """
    # Fix: raw string so the \w escape is passed to re verbatim instead of
    # relying on Python's (deprecated) pass-through of unknown escapes.
    regex = re.compile(r'%s(?P<suffix>[\w]*)' % (prefix,))
    by_suffix = {}
    for absname in dirnames:
        basename = os.path.basename(absname)
        match = regex.match(basename)
        if match is not None:
            by_suffix[match.group('suffix')] = absname
    return by_suffix
def get_specific_run(gerald_dir):
    """
    Given a gerald directory, construct a PipelineRun out of its parents

    Basically this allows specifying a particular run instead of the previous
    get_runs which scans a runfolder for various combinations of
    firecrest/ipar/bustard/gerald runs.
    """
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    # Walk up the directory tree: gerald lives under bustard, which lives
    # under the image-analysis directory.
    gerald_dir = os.path.expanduser(gerald_dir)
    bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
    image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))

    runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))

    LOGGER.info('--- use-run detected options ---')
    LOGGER.info('runfolder: %s' % (runfolder_dir,))
    LOGGER.info('image_dir: %s' % (image_dir,))
    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
    LOGGER.info('gerald_dir: %s' % (gerald_dir,))

    # find our processed image dir
    # split into parent, and leaf directory
    # leaf directory should be an IPAR or firecrest directory
    data_dir, short_image_dir = os.path.split(image_dir)
    LOGGER.info('data_dir: %s' % (data_dir,))
    LOGGER.info('short_iamge_dir: %s' % (short_image_dir,))

    # guess which type of image processing directory we have by looking
    # in the leaf directory name
    if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
        image_run = firecrest.firecrest(image_dir)
    elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)
    elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
        # RTA Intensities directories are parsed with the IPAR loader.
        image_run = ipar.ipar(image_dir)
    # NOTE(review): the else branch binding image_run = None is elided here.

    # if we didn't find a run, report the error and return
    if image_run is None:
        msg = '%s does not contain an image processing step' % (image_dir,)

    # find our base calling
    base_calling_run = bustard.bustard(bustard_dir)
    if base_calling_run is None:
        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))

    gerald_run = gerald.gerald(gerald_dir)
    if gerald_run is None:
        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))

    # Assemble the PipelineRun from the three discovered components.
    p = PipelineRun(runfolder_dir)
    p.image_analysis = image_run
    p.bustard = base_calling_run
    p.gerald = gerald_run

    # NOTE(review): `return p` is presumably in elided lines — confirm.
    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
def extract_run_parameters(runs):
    """
    Search through runfolder_path for various runs and grab their parameters
    """
    # NOTE(review): the body of this function is elided from this excerpt.
def summarize_mapped_reads(genome_map, mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.
    """
    summarized_reads = {}
    for k, v in mapped_reads.items():
        # Keys may look like <path>/<chromosome>; strip any leading path.
        path, k = os.path.split(k)
        if len(path) > 0 and path not in genome_map:
        # NOTE(review): the if-body (accumulating into genome/genome_reads)
        # and the else: introducing this line are elided from this excerpt.
        summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
    summarized_reads[genome] = genome_reads
    return summarized_reads
def summarize_lane(gerald, lane_id):
    """Build report lines describing one lane's cluster and alignment stats.

    NOTE(review): `report`, `nm`, and `qc` are bound in lines elided from
    this excerpt (report is the list returned to summary_report).
    """
    lane_results = gerald.summary.lane_results
    eland_result = gerald.eland_results[lane_id]
    report.append("Sample name %s" % (eland_result.sample_name))
    report.append("Lane id %s end %s" % (lane_id.lane, lane_id.read))

    # Cluster statistics are only available when this read/lane appears in
    # the gerald summary.
    if lane_id.read < len(lane_results) and \
       lane_id.lane in lane_results[lane_id.read]:
        summary_results = lane_results[lane_id.read][lane_id.lane]
        cluster = summary_results.cluster
        report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
    report.append("Total Reads: %d" % (eland_result.reads))

    if hasattr(eland_result, 'match_codes'):
        mc = eland_result.match_codes
        nm_percent = float(nm) / eland_result.reads * 100
        qc_percent = float(qc) / eland_result.reads * 100

        report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
        report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
        report.append('Unique (0,1,2 mismatches) %d %d %d' % \
                      (mc['U0'], mc['U1'], mc['U2']))
        report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                      (mc['R0'], mc['R1'], mc['R2']))

    if hasattr(eland_result, 'genome_map'):
        report.append("Mapped Reads")
        # Collapse per-chromosome counts into per-genome counts.
        mapped_reads = summarize_mapped_reads(eland_result.genome_map,
                                              eland_result.mapped_reads)
        for name, counts in mapped_reads.items():
            report.append("  %s: %d" % (name, counts))
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder
    """
    # NOTE(review): `report` (list accumulator) and `run` (loop over `runs`)
    # are bound in lines elided from this excerpt.
    report.append('Summary for %s' % (run.serialization_filename,))
    eland_keys = sorted(run.gerald.eland_results.keys())
    # Fallback message when no eland results exist for this run.
    report.append("Alignment not done, no report possible")
    for lane_id in eland_keys:
        report.extend(summarize_lane(run.gerald, lane_id))
    # Join with the platform line separator into one printable string.
    return os.linesep.join(report)
def is_compressed(filename):
    """Return True when filename carries a recognized compression extension.

    :param filename: file name or path to test
    :return: True for '.gz' or '.bz2' extensions, False otherwise
    """
    # Compute the extension once instead of once per comparison.
    extension = os.path.splitext(filename)[1]
    return extension in ('.gz', '.bz2')
def save_flowcell_reports(data_dir, run_dirname):
    """
    Save the flowcell quality reports
    """
    data_dir = os.path.abspath(data_dir)
    status_file = os.path.join(data_dir, 'Status.xml')
    reports_dir = os.path.join(data_dir, 'reports')
    # Archive destination inside the result directory.
    reports_dest = os.path.join(run_dirname, 'flowcell-reports.tar.bz2')
    if os.path.exists(reports_dir):
        # tar uses relative names, so this presumably executes with
        # data_dir as the working directory (chdir handling elided here).
        cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
        if os.path.exists(status_file):
            # Include the status report and its stylesheet when present.
            cmd_list.extend(['Status.xml', 'Status.xsl'])
        LOGGER.info("Saving reports from " + reports_dir)
        # NOTE(review): q.run() is presumably in elided lines — confirm.
        q = QueueCommands([" ".join(cmd_list)])
def save_summary_file(pipeline, run_dirname):
    """Copy the run's Summary.htm into run_dirname.

    Prefers the copy in the gerald directory, falling back to the
    Status_Files copy under the data directory.
    """
    gerald_object = pipeline.gerald
    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
    if os.path.exists(gerald_summary):
        LOGGER.info('Copying %s to %s' % (gerald_summary, run_dirname))
        shutil.copy(gerald_summary, run_dirname)
    elif os.path.exists(status_files_summary):
        LOGGER.info('Copying %s to %s' % (status_files_summary, run_dirname))
        shutil.copy(status_files_summary, run_dirname)
    # NOTE(review): `summary_path` is not defined anywhere in this function;
    # this (presumably else-branch) log line would raise NameError if
    # reached — confirm and fix the variable name.
    LOGGER.info('Summary file %s was not found' % (summary_path,))
def save_ivc_plot(bustard_object, run_dirname):
    """
    Save the IVC page and its supporting images
    """
    plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
    plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
    # Glob pattern matching the per-lane plot images.
    plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')

    plot_target_path = os.path.join(run_dirname, 'Plots')

    if os.path.exists(plot_html):
        LOGGER.debug("Saving %s" % (plot_html,))
        LOGGER.debug("Saving %s" % (plot_images,))
        shutil.copy(plot_html, run_dirname)
        if not os.path.exists(plot_target_path):
            os.mkdir(plot_target_path)
        for plot_file in glob(plot_images):
            shutil.copy(plot_file, plot_target_path)
    # NOTE(review): presumably the else branch of the IVC.htm check above;
    # the else: line is elided from this excerpt.
    LOGGER.warning('Missing IVC.html file, not archiving')
def compress_score_files(bustard_object, run_dirname):
    """
    Compress score files into our result directory
    """
    # check for g.pathname/Temp a new feature of 1.1rc1
    scores_path = bustard_object.pathname
    scores_path_temp = os.path.join(scores_path, 'Temp')
    if os.path.isdir(scores_path_temp):
        scores_path = scores_path_temp

    # hopefully we have a directory that contains s_*_score files
    # NOTE(review): `score_files` is initialized in elided lines.
    for f in os.listdir(scores_path):
        if re.match('.*_score.txt', f):
            score_files.append(f)

    # tar the score files and pipe through bzip2 into the result dir
    tar_cmd = ['tar', 'c'] + score_files
    bzip_cmd = [ 'bzip2', '-9', '-c' ]
    tar_dest_name = os.path.join(run_dirname, 'scores.tar.bz2')
    # NOTE(review): opened in text mode ('w') for binary bz2 output and not
    # closed in the visible lines — should be 'wb' inside a with-block.
    tar_dest = open(tar_dest_name, 'w')
    LOGGER.info("Compressing score files from %s" % (scores_path,))
    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
    LOGGER.info("Writing to %s" % (tar_dest_name,))

    # NOTE(review): `env` and the closing arguments of this Popen call are
    # in elided lines.
    tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
    bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
def compress_eland_results(gerald_object, run_dirname, num_jobs=1):
    """
    Compress eland result files into the archive directory
    """
    # copy & bzip eland files
    # NOTE(review): the `bz_commands` accumulator is initialized in elided lines.
    for key in gerald_object.eland_results:
        eland_lane = gerald_object.eland_results[key]
        for source_name in eland_lane.pathnames:
            if source_name is None:
                # NOTE(review): fragment of an elided warning-log call.
                "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
            path, name = os.path.split(source_name)
            dest_name = os.path.join(run_dirname, name)
            LOGGER.info("Saving eland file %s to %s" % \
                        (source_name, dest_name))

            if is_compressed(name):
                # Already compressed: plain copy is sufficient.
                LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
                shutil.copy(source_name, dest_name)
            # Otherwise queue a shell bzip2 command for the worker queue.
            args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
            bz_commands.append(" ".join(args))
            #LOGGER.info('Running: %s' % ( " ".join(args) ))
            #bzip_dest = open(dest_name, 'w')
            #bzip = subprocess.Popen(args, stdout=bzip_dest)
            #LOGGER.info('Saving to %s' % (dest_name, ))

    if len(bz_commands) > 0:
        # NOTE(review): q.run() is presumably in elided lines — confirm.
        q = QueueCommands(bz_commands, num_jobs)
def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
    """
    Iterate over runfolders in runs extracting the most useful information.
      * run parameters (in run-*.xml)
      * srf files (raw sequence & qualities)
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    # NOTE(review): `r` is presumably bound by a loop over `runs`, and `g`
    # by r.gerald, in lines elided from this excerpt.
    result_dir = os.path.join(output_base_dir, r.flowcell_id)
    LOGGER.info("Using %s as result directory" % (result_dir,))
    if not os.path.exists(result_dir):
    # create directory to add this runs results to
    LOGGER.info("Filling in %s" % (r.run_dirname,))
    run_dirname = os.path.join(result_dir, r.run_dirname)
    run_dirname = os.path.abspath(run_dirname)
    if os.path.exists(run_dirname):
        # Never overwrite an existing result set.
        LOGGER.error("%s already exists, not overwriting" % (run_dirname,))
    os.mkdir(run_dirname)

    # save illumina flowcell status report
    save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),

    # save stuff from bustard
    save_ivc_plot(r.bustard, run_dirname)

    # build base call saving commands
    save_raw_data(num_jobs, r, site, raw_format, run_dirname)

    # save stuff from GERALD
    # copy stuff out of the main run
    save_summary_file(r, run_dirname)

    # compress eland result files
    compress_eland_results(g, run_dirname, num_jobs)

    # md5 all the compressed files once we're done
    md5_commands = srf.make_md5_commands(run_dirname)
    srf.run_commands(run_dirname, md5_commands, num_jobs)
def save_raw_data(num_jobs, r, site, raw_format, run_dirname):
    """Save raw base-call data for run `r` into run_dirname.

    Dispatches on raw_format: fastq files are copied, while qseq and srf
    formats are converted via queued srf commands.
    """
    # NOTE(review): the body of this lane-scanning loop is elided here.
    for lane in r.gerald.lanes:
        lane_parameters = r.gerald.lanes.get(lane, None)
        if lane_parameters is not None:
    # assume default list of lanes
    lanes = LANE_SAMPLE_KEYS

    run_name = srf.pathname_to_run_name(r.pathname)

    # Default to whatever format bustard produced.
    if raw_format is None:
        raw_format = r.bustard.sequence_format

    LOGGER.info("Raw Format is: %s" % (raw_format, ))
    if raw_format == 'fastq':
        LOGGER.info("Reading fastq files from %s", r.bustard.pathname)
        rawpath = os.path.join(r.pathname, r.bustard.pathname)
        LOGGER.info("raw data = %s" % (rawpath,))
        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, run_dirname)
    elif raw_format == 'qseq':
        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, run_dirname)
    elif raw_format == 'srf':
        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, run_dirname, 0)
    # NOTE(review): the else: introducing this raise is elided here.
        raise ValueError('Unknown --raw-format=%s' % (raw_format))
    srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
def rm_list(files, dry_run=True):
    """Delete each existing path in files, honoring dry_run.

    NOTE(review): the `for f in files:` header, the dry-run branch, and the
    else: before the warning are elided from this excerpt.
    """
    if os.path.exists(f):
        LOGGER.info('deleting %s' % (f,))
        LOGGER.warn("%s doesn't exist." % (f,))
def clean_runs(runs, dry_run=True):
    """
    Clean up run folders to optimize for compression.
    """
    LOGGER.info('In dry-run mode')
    # NOTE(review): the cleanup below presumably runs once per `run` in
    # `runs`; the loop header (and dry-run guard above) are elided here.
    LOGGER.info('Cleaninging %s' % (run.pathname,))
    # Remove instrument RunLog xml files.
    runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
    rm_list(runlogs, dry_run)
    # Remove pipeline status text files.
    pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
    rm_list(pipeline_logs, dry_run)
    # rm NetCopy.log? Isn't this robocopy?
    logs = glob(os.path.join(run.pathname, '*.log'))
    rm_list(logs, dry_run)
    # Remove calibration directories.
    calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
    rm_list(calibration_dir, dry_run)
    LOGGER.info("Cleaning images")
    image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
    rm_list(image_dirs, dry_run)
    LOGGER.info("Cleaning ReadPrep*")
    read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
    rm_list(read_prep_dirs, dry_run)
    LOGGER.info("Cleaning Thubmnail_images")
    thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
    rm_list(thumbnail_dirs, dry_run)
    # make clean_intermediate
    # NOTE(review): uses the module-level `logging` instead of LOGGER here —
    # probably an oversight; every other call in this function uses LOGGER.
    logging.info("Cleaning intermediate files")
    if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
        # Delegate intermediate-file cleanup to the pipeline's own Makefile.
        clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                         cwd=run.image_analysis.pathname,)