1 """Core information needed to inspect a runfolder.
14 LOGGER = logging.getLogger(__name__)
16 from htsworkflow.pipelines import firecrest
17 from htsworkflow.pipelines import ipar
18 from htsworkflow.pipelines import bustard
19 from htsworkflow.pipelines import gerald
20 from htsworkflow.pipelines import ElementTree, \
21 EUROPEAN_STRPTIME, EUROPEAN_DATE_RE, \
22 VERSION_RE, USER_RE, \
23 LANES_PER_FLOWCELL, LANE_LIST
24 from htsworkflow.pipelines.samplekey import LANE_SAMPLE_KEYS
25 from htsworkflow.util.alphanum import alphanum
26 from htsworkflow.util.ethelp import indent, flatten
27 from htsworkflow.util.queuecommands import QueueCommands
29 from htsworkflow.pipelines import srf
class PipelineRun(object):
    """Capture "interesting" information about a pipeline run

    - `pathname` location of the root of this runfolder
    - `serialization_filename` read only property containing name of run xml file
    - `flowcell_id` read-only property containing flowcell id (bar code)
    - `datadir` location of the runfolder data dir.
    - `image_analysis` generic name for Firecrest or IPAR image analysis
    - `bustard` summary base caller
    - `gerald` summary of sequence alignment and quality control metrics
    """
    # XML tag names used by get_elements()/set_elements() serialization
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, flowcell_id=None, xml=None):
        """Initialize a PipelineRun object

        :Parameters:
        - `pathname` the root directory of this run folder.
        - `flowcell_id` the flowcell ID in case it can't be determined
        - `xml` Allows initializing an object from a serialized xml file.
        """
        if pathname is not None:
            self.pathname = os.path.normpath(pathname)
        # cached flowcell id; the flowcell_id property fills this in lazily
        self._flowcell_id = flowcell_id
        self.image_analysis = None
        # restore serialized state from a previously saved run xml tree
        # NOTE(review): presumably guarded by `if xml is not None` in the
        # full source — confirm
        self.set_elements(xml)

    def _get_flowcell_id(self):
        """Return the flowcell ID

        Attempts to find the flowcell ID through several mechanisms.
        """
        # try each discovery mechanism in turn: RunInfo.xml, then
        # Config/FlowcellId.xml, then the runfolder directory name
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_runinfo()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_flowcellid()
        if self._flowcell_id is None:
            self._flowcell_id = self._get_flowcell_id_from_path()
        if self._flowcell_id is None:
            self._flowcell_id = 'unknown'
            # (argument to a logger warning call; call itself elided)
                "Flowcell id was not found, guessing %s" % (
        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def _get_flowcell_id_from_flowcellid(self):
        """Extract flowcell id from a Config/FlowcellId.xml file

        :return: flowcell_id or None if not found
        """
        config_dir = os.path.join(self.pathname, 'Config')
        flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
        if os.path.exists(flowcell_id_path):
            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
            # the flowcell barcode lives in the <Text> element
            return flowcell_id_tree.findtext('Text')

    def _get_flowcell_id_from_runinfo(self):
        """Read RunInfo file for flowcell id

        :return: flowcell_id or None if not found
        """
        runinfo = os.path.join(self.pathname, 'RunInfo.xml')
        if os.path.exists(runinfo):
            tree = ElementTree.parse(runinfo)
            root = tree.getroot()
            # NOTE(review): .xpath() is an lxml API, so ElementTree here is
            # presumably lxml.etree — confirm against the package import
            fc_nodes = root.xpath('/RunInfo/Run/Flowcell')
            if len(fc_nodes) == 1:
                return fc_nodes[0].text

    def _get_flowcell_id_from_path(self):
        """Guess a flowcell name from the path

        :return: flowcell_id or None if not found
        """
        # runfolder names conventionally end with the flowcell barcode,
        # e.g. 081017_HWI-EAS229_0062_30J55AAXX
        path_fields = self.pathname.split('_')
        if len(path_fields) > 0:
            # guessing last element of filename
            return path_fields[-1]

    def _get_runfolder_name(self):
        # the gerald component records the runfolder name
        if self.gerald is None:
            # NOTE(review): the full source presumably returns None on this
            # branch and the line below on the else path — confirm
            return self.gerald.runfolder_name
    runfolder_name = property(_get_runfolder_name)

    def _get_run_dirname(self):
        """Return name of directory to hold result files from one analysis

        For pre-multiplexing runs this is just the cycle range C1-123
        For post-multiplexing runs the "suffix" that we add to
        differentiate runs will be added to the range.
        E.g. Unaligned_6mm may produce C1-200_6mm
        """
        if self.image_analysis is None:
            raise ValueError("Not initialized yet")
        start = self.image_analysis.start
        stop = self.image_analysis.stop
        cycle_fragment = "C%d-%d" % (start, stop)
        # NOTE(review): presumably guarded by an `if self.suffix:` check
        # in the full source — confirm
        cycle_fragment += self.suffix
        return cycle_fragment
    run_dirname = property(_get_run_dirname)

    def get_elements(self):
        """make one master xml file from all of our sub-components.

        :return: an ElementTree containing all available pipeline
                 run information
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.image_analysis.get_elements())
        root.append(self.bustard.get_elements())
        # gerald may be absent when alignment has not been run; the full
        # source presumably guards this append and returns root — confirm
        root.append(self.gerald.get_elements())

    def set_elements(self, tree):
        """Initialize a PipelineRun object from an run.xml ElementTree.

        :param tree: parsed ElementTree
        :type tree: ElementTree
        """
        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
            raise ValueError('Pipeline Run Expecting %s got %s' % (
                PipelineRun.PIPELINE_RUN, tag))
        # dispatch each child element to the matching component class
        # (presumably inside a `for element in tree:` loop — confirm)
        tag = element.tag.lower()
        if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
        #ok the xword.Xword.XWORD pattern for module.class.constant is lame
        # you should only have Firecrest or IPAR, never both of them.
        elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.image_analysis = firecrest.Firecrest(xml=element)
        elif tag == ipar.IPAR.IPAR.lower():
            self.image_analysis = ipar.IPAR(xml=element)
        elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
        elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
        elif tag == gerald.CASAVA.GERALD.lower():
            self.gerald = gerald.CASAVA(xml=element)
        # unrecognized tags are logged, not fatal (else branch elided)
        LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))

    def _get_serialization_filename(self):
        """Compute the filename for the run xml file

        Attempts to find the latest date from all of the run
        components.

        :return: filename run_{flowcell id}_{timestamp}.xml
        """
        if self._name is None:
            components = [self.image_analysis, self.bustard, self.gerald]
            # use the newest component timestamp so the name reflects the
            # most recent analysis step
            tmax = max([ c.time for c in components if c ])
            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
            self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        # (the full source returns the cached self._name here)
    serialization_filename = property(_get_serialization_filename)

    def save(self, destdir=None):
        """Save a run xml file.

        :param destdir: Directory name to save too, uses current directory
                        when not given (default handling elided)
        """
        LOGGER.info("Saving run report " + self.serialization_filename)
        xml = self.get_elements()
        dest_pathname = os.path.join(destdir, self.serialization_filename)
        ElementTree.ElementTree(xml).write(dest_pathname)

    def load(self, filename):
        """Load a run xml into this object.

        :Parameters:
        - `filename` location of a run xml file
        """
        LOGGER.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
def load_pipeline_run_xml(pathname):
    """
    Load and instantiate a Pipeline run from a run xml file

    :Parameters:
    - `pathname` location of an run xml file

    :Returns: initialized PipelineRun object
    """
    tree = ElementTree.parse(pathname).getroot()
    run = PipelineRun(xml=tree)
    # (the full source returns `run` here)
def get_runs(runfolder, flowcell_id=None):
    """Find all runs associated with a runfolder.

    We end up with multiple analysis runs as we sometimes
    need to try with different parameters. This attempts
    to return a list of all the various runs.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.
    """
    datadir = os.path.join(runfolder, 'Data')

    LOGGER.info('Searching for runs in ' + datadir)
    # `runs` accumulates PipelineRun objects (initialization elided)
    # scan for firecrest directories
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        LOGGER.info('Found firecrest in ' + datadir)
        image_analysis = firecrest.firecrest(firecrest_pathname)
        if image_analysis is None:
            # (message is passed to a logger call in the full source)
            "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
        # (call continues with a closing paren in the full source)
        scan_post_image_analysis(
            runs, runfolder, datadir, image_analysis, firecrest_pathname, flowcell_id
    # scan for IPAR directories
    ipar_dirs = glob(os.path.join(datadir, "IPAR_*"))
    # The Intensities directory from the RTA software looks a lot like IPAR
    ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
    for ipar_pathname in ipar_dirs:
        LOGGER.info('Found ipar directories in ' + datadir)
        image_analysis = ipar.ipar(ipar_pathname)
        if image_analysis is None:
            # (message is passed to a logger call in the full source)
            "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
        # (call continues with a closing paren in the full source)
        scan_post_image_analysis(
            runs, runfolder, datadir, image_analysis, ipar_pathname, flowcell_id
def scan_post_image_analysis(runs, runfolder, datadir, image_analysis,
                             pathname, flowcell_id):
    """Search below an image analysis directory for base-calling and
    alignment results, appending a PipelineRun to `runs` for each found.
    """
    added = build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id)
    # If we're a multiplexed run, don't look for older run type.
    # (early-return when `added` is positive is elided here)
    LOGGER.info("Looking for bustard directories in %s" % (pathname,))
    bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
    # RTA BaseCalls looks enough like Bustard.
    bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
    for bustard_pathname in bustard_dirs:
        LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
        b = bustard.bustard(bustard_pathname)
        build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname,
                          runfolder, flowcell_id)
def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder,
    # (signature continues with `flowcell_id):` in the full source;
    # `start` is presumably captured as len(runs) on entry — confirm)
    """Append a PipelineRun to `runs` for each GERALD directory found
    under `bustard_pathname`; returns the number of runs added.
    """
    gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
    LOGGER.info("Looking for gerald directories in %s" % (pathname,))
    for gerald_pathname in glob(gerald_glob):
        LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
        # (construction is wrapped in try/except in the full source)
        g = gerald.gerald(gerald_pathname)
        p = PipelineRun(runfolder, flowcell_id)
        p.image_analysis = image_analysis
        # failure to build a run is logged and skipped, not fatal
        LOGGER.error("Ignoring " + str(e))
    return len(runs) - start
def build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id):
    """Append PipelineRun objects for HiSeq-style Aligned/Unaligned
    directory pairs found in `runfolder`; returns the number added.

    (`start` is presumably captured as len(runs) on entry — confirm)
    """
    aligned_glob = os.path.join(runfolder, 'Aligned*')
    unaligned_glob = os.path.join(runfolder, 'Unaligned*')

    aligned_paths = glob(aligned_glob)
    unaligned_paths = glob(unaligned_glob)

    matched_paths = hiseq_match_aligned_unaligned(aligned_paths, unaligned_paths)
    LOGGER.debug("Matched HiSeq analysis: %s", str(matched_paths))

    for aligned, unaligned, suffix in matched_paths:
        if unaligned is None:
            # an Aligned dir with no Unaligned partner cannot be processed
            LOGGER.warn("Aligned directory %s without matching unalinged, skipping", aligned)
        # (construction is wrapped in try/except in the full source)
        p = PipelineRun(runfolder, flowcell_id)
        p.image_analysis = image_analysis
        p.bustard = bustard.bustard(unaligned)
        p.gerald = gerald.gerald(aligned)
        # failure to build a run is logged and skipped, not fatal
        LOGGER.error("Ignoring " + str(e))
    return len(runs) - start
def hiseq_match_aligned_unaligned(aligned, unaligned):
    """Match aligned and unaligned folders from separate lists.

    Directories are paired by the suffix that follows their
    'Aligned'/'Unaligned' basename prefix (e.g. Aligned_6mm pairs
    with Unaligned_6mm).

    :param aligned: list of Aligned* directory paths
    :param unaligned: list of Unaligned* directory paths
    :return: list of (aligned, unaligned, suffix) tuples; either
             directory entry is None when no partner shares its suffix
    """
    # removed an unused `unaligned_suffix_re` local (dead compile with a
    # non-raw '\w' escape); the suffix extraction lives in
    # build_dir_dict_by_suffix
    aligned_by_suffix = build_dir_dict_by_suffix('Aligned', aligned)
    unaligned_by_suffix = build_dir_dict_by_suffix('Unaligned', unaligned)

    keys = set(aligned_by_suffix.keys()).union(set(unaligned_by_suffix.keys()))
    matches = []
    # sort the suffixes so the result order is deterministic
    for key in sorted(keys):
        a = aligned_by_suffix.get(key)
        u = unaligned_by_suffix.get(key)
        matches.append((a, u, key))
    return matches
def build_dir_dict_by_suffix(prefix, dirnames):
    """Build a dictionary indexed by suffix of last directory name.

    It assumes a constant prefix

    :param prefix: constant leading portion of each directory basename
    :param dirnames: iterable of directory paths
    :return: dict mapping suffix -> full path; basenames that don't
             start with `prefix` are ignored
    """
    # raw string avoids the invalid '\w' escape-sequence warning;
    # re.match anchors the pattern at the start of the basename
    regex = re.compile(r'%s(?P<suffix>[\w]*)' % (prefix,))
    by_suffix = {}
    for absname in dirnames:
        basename = os.path.basename(absname)
        match = regex.match(basename)
        if match is not None:
            by_suffix[match.group('suffix')] = absname
    return by_suffix
def get_specific_run(gerald_dir):
    """
    Given a gerald directory, construct a PipelineRun out of its parents

    Basically this allows specifying a particular run instead of the previous
    get_runs which scans a runfolder for various combinations of
    firecrest/ipar/bustard/gerald runs.
    """
    # local imports duplicate the module-level ones; redundant but harmless
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import ipar
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    # walk up the directory tree: gerald lives inside bustard, which lives
    # inside the image analysis directory
    gerald_dir = os.path.expanduser(gerald_dir)
    bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..'))
    image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..'))
    runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))

    LOGGER.info('--- use-run detected options ---')
    LOGGER.info('runfolder: %s' % (runfolder_dir,))
    LOGGER.info('image_dir: %s' % (image_dir,))
    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
    LOGGER.info('gerald_dir: %s' % (gerald_dir,))

    # find our processed image dir
    # split into parent, and leaf directory
    # leaf directory should be an IPAR or firecrest directory
    data_dir, short_image_dir = os.path.split(image_dir)
    LOGGER.info('data_dir: %s' % (data_dir,))
    # NOTE(review): 'short_iamge_dir' label typo is original log output —
    # left unchanged here since it is a runtime string
    LOGGER.info('short_iamge_dir: %s' % (short_image_dir,))

    # guess which type of image processing directory we have by looking
    # in the leaf directory name
    if re.search('Firecrest', short_image_dir, re.IGNORECASE) is not None:
        image_run = firecrest.firecrest(image_dir)
    elif re.search('IPAR', short_image_dir, re.IGNORECASE) is not None:
        image_run = ipar.ipar(image_dir)
    elif re.search('Intensities', short_image_dir, re.IGNORECASE) is not None:
        # RTA Intensities directories are parsed by the IPAR loader
        image_run = ipar.ipar(image_dir)

    # if we din't find a run, report the error and return
    if image_run is None:
        msg = '%s does not contain an image processing step' % (image_dir,)
        # (the full source logs msg and returns None here)

    # find our base calling
    base_calling_run = bustard.bustard(bustard_dir)
    if base_calling_run is None:
        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))

    # find our alignment step
    gerald_run = gerald.gerald(gerald_dir)
    if gerald_run is None:
        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))

    # assemble the PipelineRun from the three discovered components
    p = PipelineRun(runfolder_dir)
    p.image_analysis = image_run
    p.bustard = base_calling_run
    p.gerald = gerald_run

    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
def extract_run_parameters(runs):
    """
    Search through runfolder_path for various runs and grab their parameters
    """
    # (body not visible in this excerpt)
def summarize_mapped_reads(genome_map, mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.

    :param genome_map: mapping whose keys are path prefixes belonging to
                       the primary genome
    :param mapped_reads: dict of {target name: read count}; targets may be
                         'path/name' style strings
    :return: dict of per-chromosome counts plus one aggregate entry
             (keyed by the off-genome directory name, or 'unknown')
    """
    summarized_reads = {}
    genome_reads = 0
    genome = 'unknown'
    for k, v in mapped_reads.items():
        path, k = os.path.split(k)
        # a path component that is not part of the primary genome marks a
        # spike-in/contamination target: aggregate those under one key
        if len(path) > 0 and path not in genome_map:
            genome = path
            genome_reads += v
        else:
            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
    summarized_reads[genome] = genome_reads
    return summarized_reads
def summarize_lane(gerald, lane_id):
    """Build human-readable summary lines for one lane of a run.

    :param gerald: gerald component of a PipelineRun
    :param lane_id: lane key with .lane and .read attributes
    :return: list of report strings (accumulated in `report`)
    """
    lane_results = gerald.summary.lane_results
    eland_result = gerald.eland_results[lane_id]
    # `report` is the accumulator list (initialization elided)
    report.append("Sample name %s" % (eland_result.sample_name))
    report.append("Lane id %s end %s" % (lane_id.lane, lane_id.read))

    # cluster statistics exist only when the summary covers this read/lane
    if lane_id.read < len(lane_results) and \
       lane_id.lane in lane_results[lane_id.read]:
        summary_results = lane_results[lane_id.read][lane_id.lane]
        cluster = summary_results.cluster
        report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
    report.append("Total Reads: %d" % (eland_result.reads))

    if hasattr(eland_result, 'match_codes'):
        mc = eland_result.match_codes
        # nm/qc: no-match and QC-failed counts taken from mc (extraction
        # lines elided in this excerpt)
        nm_percent = float(nm) / eland_result.reads * 100
        qc_percent = float(qc) / eland_result.reads * 100

        report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
        report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
        report.append('Unique (0,1,2 mismatches) %d %d %d' % \
                      (mc['U0'], mc['U1'], mc['U2']))
        report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                      (mc['R0'], mc['R1'], mc['R2']))

    if hasattr(eland_result, 'genome_map'):
        report.append("Mapped Reads")
        mapped_reads = summarize_mapped_reads(eland_result.genome_map,
                                              eland_result.mapped_reads)
        for name, counts in mapped_reads.items():
            report.append("  %s: %d" % (name, counts))
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    :return: one newline-joined report string covering every run
    """
    # `report` accumulates output lines; the full source iterates
    # `for run in runs:` around the body below
    report.append('Summary for %s' % (run.serialization_filename,))
    # sorted keys give a stable per-lane ordering
    eland_keys = sorted(run.gerald.eland_results.keys())
    # fallback line when the gerald/eland results are unavailable
    report.append("Alignment not done, no report possible")
    for lane_id in eland_keys:
        report.extend(summarize_lane(run.gerald, lane_id))
    return os.linesep.join(report)
def is_compressed(filename):
    """Return True when filename carries a known compression suffix.

    Recognizes the gzip (.gz) and bzip2 (.bz2) extensions used for
    archived result files.
    """
    # collapse the duplicated os.path.splitext if/elif chain into a
    # single extension-membership test
    return os.path.splitext(filename)[1] in ('.gz', '.bz2')
def save_flowcell_reports(data_dir, run_dirname):
    """
    Save the flowcell quality reports

    Archives the runfolder's reports/ directory (plus Status.xml/.xsl
    when present) into run_dirname as flowcell-reports.tar.bz2.
    """
    data_dir = os.path.abspath(data_dir)
    status_file = os.path.join(data_dir, 'Status.xml')
    reports_dir = os.path.join(data_dir, 'reports')
    reports_dest = os.path.join(run_dirname, 'flowcell-reports.tar.bz2')
    if os.path.exists(reports_dir):
        cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
        if os.path.exists(status_file):
            cmd_list.extend(['Status.xml', 'Status.xsl'])
        LOGGER.info("Saving reports from " + reports_dir)
        # QueueCommands runs the assembled shell command; the working
        # directory handling around this call is elided in this excerpt
        q = QueueCommands([" ".join(cmd_list)])
def save_summary_file(pipeline, run_dirname):
    """Copy the Summary.htm report for a run into run_dirname.

    Looks first in the gerald directory, then in the runfolder's
    Status_Files directory; logs when no summary file is found.

    :param pipeline: PipelineRun with .gerald and .datadir attributes
    :param run_dirname: destination result directory
    """
    gerald_object = pipeline.gerald
    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
    if os.path.exists(gerald_summary):
        LOGGER.info('Copying %s to %s' % (gerald_summary, run_dirname))
        shutil.copy(gerald_summary, run_dirname)
    elif os.path.exists(status_files_summary):
        LOGGER.info('Copying %s to %s' % (status_files_summary, run_dirname))
        shutil.copy(status_files_summary, run_dirname)
    else:
        # bug fix: the message previously referenced an undefined
        # `summary_path`, raising NameError whenever no summary existed
        LOGGER.info('Summary file %s was not found' % (gerald_summary,))
def save_ivc_plot(bustard_object, run_dirname):
    """
    Save the IVC page and its supporting images
    """
    plot_html = os.path.join(bustard_object.pathname, 'IVC.htm')
    plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
    # glob pattern matching the per-lane plot PNGs
    plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')

    plot_target_path = os.path.join(run_dirname, 'Plots')

    if os.path.exists(plot_html):
        LOGGER.debug("Saving %s" % (plot_html,))
        LOGGER.debug("Saving %s" % (plot_images,))
        shutil.copy(plot_html, run_dirname)
        if not os.path.exists(plot_target_path):
            os.mkdir(plot_target_path)
        for plot_file in glob(plot_images):
            shutil.copy(plot_file, plot_target_path)
    # (guarded by an `else:` in the full source)
    LOGGER.warning('Missing IVC.html file, not archiving')
def compress_score_files(bustard_object, run_dirname):
    """
    Compress score files into our result directory
    """
    # check for g.pathname/Temp a new feature of 1.1rc1
    scores_path = bustard_object.pathname
    scores_path_temp = os.path.join(scores_path, 'Temp')
    if os.path.isdir(scores_path_temp):
        scores_path = scores_path_temp

    # hopefully we have a directory that contains s_*_score files
    # (`score_files` accumulator initialization elided in this excerpt)
    for f in os.listdir(scores_path):
        if re.match('.*_score.txt', f):
            score_files.append(f)

    # build a `tar | bzip2 > scores.tar.bz2` pipeline
    tar_cmd = ['tar', 'c'] + score_files
    bzip_cmd = [ 'bzip2', '-9', '-c' ]
    tar_dest_name = os.path.join(run_dirname, 'scores.tar.bz2')
    # NOTE(review): opened in text mode 'w' but receives bzip2's binary
    # stdout — presumably should be 'wb' under Python 3; confirm
    tar_dest = open(tar_dest_name, 'w')
    LOGGER.info("Compressing score files from %s" % (scores_path,))
    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
    LOGGER.info("Writing to %s" % (tar_dest_name,))

    # (env setup and the Popen continuation/wait calls are elided here)
    tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
    bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
def compress_eland_results(gerald_object, run_dirname, num_jobs=1):
    """
    Compress eland result files into the archive directory

    Already-compressed files are copied as-is; uncompressed ones are
    queued as bzip2 shell commands run through QueueCommands.
    """
    # copy & bzip eland files
    # (`bz_commands` accumulator initialization elided in this excerpt)
    for key in gerald_object.eland_results:
        eland_lane = gerald_object.eland_results[key]
        for source_name in eland_lane.pathnames:
            if source_name is None:
                # (message is passed to a logger call in the full source)
                "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
            path, name = os.path.split(source_name)
            dest_name = os.path.join(run_dirname, name)
            LOGGER.info("Saving eland file %s to %s" % \
                        (source_name, dest_name))

            if is_compressed(name):
                LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
                shutil.copy(source_name, dest_name)
            # (guarded by an `else:` in the full source: queue a bzip2
            # shell command for the uncompressed file)
            args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
            bz_commands.append(" ".join(args))

    if len(bz_commands) > 0:
        q = QueueCommands(bz_commands, num_jobs)
def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format=None):
    """
    Iterate over runfolders in runs extracting the most useful information.
    * run parameters (in run-*.xml)
    * compressed eland results
    * srf files (raw sequence & qualities)
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    # (the full source iterates `for r in runs:` around the body below)
    result_dir = os.path.join(output_base_dir, r.flowcell_id)
    LOGGER.info("Using %s as result directory" % (result_dir,))
    if not os.path.exists(result_dir):
        # (result_dir creation elided in this excerpt)

    # create directory to add this runs results to
    LOGGER.info("Filling in %s" % (r.run_dirname,))
    run_dirname = os.path.join(result_dir, r.run_dirname)
    run_dirname = os.path.abspath(run_dirname)
    if os.path.exists(run_dirname):
        LOGGER.error("%s already exists, not overwriting" % (run_dirname,))
    os.mkdir(run_dirname)

    # save illumina flowcell status report
    # (call continues with `run_dirname)` in the full source)
    save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),

    # save stuff from bustard
    save_ivc_plot(r.bustard, run_dirname)

    # build base call saving commands
    save_raw_data(num_jobs, r, site, raw_format, run_dirname)

    # save stuff from GERALD
    # copy stuff out of the main run
    # (`g = r.gerald` binding is elided in this excerpt)
    save_summary_file(r, run_dirname)

    # compress eland result files
    compress_eland_results(g, run_dirname, num_jobs)

    # md5 all the compressed files once we're done
    md5_commands = srf.make_md5_commands(run_dirname)
    srf.run_commands(run_dirname, md5_commands, num_jobs)
def save_raw_data(num_jobs, r, site, raw_format, run_dirname):
    """Archive raw base-call data (fastq, qseq, or srf) for one run.

    The format defaults to the bustard object's sequence_format when
    raw_format is None.
    """
    # collect the lane keys configured in gerald (accumulation elided)
    for lane in r.gerald.lanes:
        lane_parameters = r.gerald.lanes.get(lane, None)
        if lane_parameters is not None:
            # (lane accumulation elided in this excerpt)
    # assume default list of lanes
    lanes = LANE_SAMPLE_KEYS

    run_name = srf.pathname_to_run_name(r.pathname)

    if raw_format is None:
        raw_format = r.bustard.sequence_format

    LOGGER.info("Raw Format is: %s" % (raw_format, ))
    if raw_format == 'fastq':
        LOGGER.info("Reading fastq files from %s", r.bustard.pathname)
        rawpath = os.path.join(r.pathname, r.bustard.pathname)
        LOGGER.info("raw data = %s" % (rawpath,))
        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, run_dirname)
    elif raw_format == 'qseq':
        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, run_dirname)
    elif raw_format == 'srf':
        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, run_dirname, 0)
    # (guarded by an `else:` in the full source — unknown formats abort)
    raise ValueError('Unknown --raw-format=%s' % (raw_format))
    srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)
def rm_list(files, dry_run=True):
    """Delete each existing path in `files`, honoring dry_run.

    (the `for f in files:` loop and the actual removal logic are elided
    in this excerpt)
    """
    if os.path.exists(f):
        LOGGER.info('deleting %s' % (f,))
    # (guarded by an `else:` in the full source)
    LOGGER.warn("%s doesn't exist." % (f,))
def clean_runs(runs, dry_run=True):
    """
    Clean up run folders to optimize for compression.

    Removes logs, calibration data, images, and intermediate files from
    each runfolder; with dry_run nothing is actually deleted.
    """
    # (dry_run announcement guard elided in this excerpt)
    LOGGER.info('In dry-run mode')

    # (the full source iterates `for run in runs:` around the body below)
    LOGGER.info('Cleaninging %s' % (run.pathname,))

    # rm RunLog*.xml
    runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
    rm_list(runlogs, dry_run)

    # rm pipeline_*.txt
    pipeline_logs = glob(os.path.join(run.pathname, 'pipeline*.txt'))
    rm_list(pipeline_logs, dry_run)

    # rm NetCopy.log? Isn't this robocopy?
    logs = glob(os.path.join(run.pathname, '*.log'))
    rm_list(logs, dry_run)

    # rm Calibration directories
    calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
    rm_list(calibration_dir, dry_run)

    LOGGER.info("Cleaning images")
    image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
    rm_list(image_dirs, dry_run)

    LOGGER.info("Cleaning ReadPrep*")
    read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
    rm_list(read_prep_dirs, dry_run)

    LOGGER.info("Cleaning Thubmnail_images")
    thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
    rm_list(thumbnail_dirs, dry_run)

    # make clean_intermediate
    # NOTE(review): uses the root `logging` module rather than LOGGER —
    # inconsistent with the rest of the file; confirm and unify
    logging.info("Cleaning intermediate files")
    if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')):
        clean_process = subprocess.Popen(['make', 'clean_intermediate'],
                                         cwd=run.image_analysis.pathname,)