From: Brandon W. King Date: Fri, 10 Jun 2011 22:11:03 +0000 (-0700) Subject: The srf/qseq --raw-format patch. X-Git-Tag: 0.5.2~28^2 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=26373d4810da249edd3b7a37fec514d00dc421e5 The srf/qseq --raw-format patch. Signed-off-by: Diane Trout --- diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 78705e5..e1b474b 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -22,7 +22,7 @@ EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})" VERSION_RE = "([0-9\.]+)" USER_RE = "([a-zA-Z0-9]+)" LANES_PER_FLOWCELL = 8 -LANE_LIST = range(1, LANES_PER_FLOWCELL+1) +LANE_LIST = range(1, LANES_PER_FLOWCELL + 1) from htsworkflow.util.alphanum import alphanum from htsworkflow.util.ethelp import indent, flatten @@ -81,7 +81,7 @@ class PipelineRun(object): else: return self.gerald.runfolder_name runfolder_name = property(_get_runfolder_name) - + def get_elements(self): """ make one master xml file from all of our sub-components. @@ -130,14 +130,14 @@ class PipelineRun(object): if self._name is None: tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time) timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax)) - self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml' + self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml' return self._name name = property(_get_run_name) def save(self, destdir=None): if destdir is None: destdir = '' - logging.info("Saving run report "+ self.name) + logging.info("Saving run report " + self.name) xml = self.get_elements() indent(xml) dest_pathname = os.path.join(destdir, self.name) @@ -202,7 +202,7 @@ def get_runs(runfolder): logging.info('Searching for runs in ' + datadir) runs = [] # scan for firecrest directories - for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): + for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")): logging.info('Found firecrest in ' + datadir) image_analysis = firecrest.firecrest(firecrest_pathname) if image_analysis is None: @@ -222,7 +222,7 @@ def get_runs(runfolder): image_analysis = ipar.ipar(ipar_pathname) if image_analysis is None: logging.warn( - "%s is an empty or invalid IPAR directory" %(ipar_pathname,) + "%s is an empty or invalid IPAR directory" % (ipar_pathname,) ) else: scan_post_image_analysis( @@ -248,8 +248,8 @@ def get_specific_run(gerald_dir): bustard_dir = os.path.abspath(os.path.join(gerald_dir, '..')) image_dir = os.path.abspath(os.path.join(gerald_dir, '..', '..')) - runfolder_dir = os.path.abspath(os.path.join(image_dir, '..','..')) - + runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..')) + logging.info('--- use-run detected options ---') logging.info('runfolder: %s' % (runfolder_dir,)) logging.info('image_dir: %s' % (image_dir,)) @@ -262,7 +262,7 @@ def get_specific_run(gerald_dir): # leaf directory should be an IPAR or firecrest directory data_dir, short_image_dir = os.path.split(image_dir) logging.info('data_dir: %s' % (data_dir,)) - logging.info('short_iamge_dir: %s' %(short_image_dir,)) + logging.info('short_iamge_dir: %s' % (short_image_dir,)) # guess which type of image processing directory we have by looking # in the leaf directory name @@ -295,7 +295,7 @@ def get_specific_run(gerald_dir): p.image_analysis = image_run p.bustard = base_calling_run p.gerald = gerald_run - + logging.info('Constructed PipelineRun from %s' % (gerald_dir,)) return p @@ -327,7 +327,7 @@ def summarize_mapped_reads(genome_map, mapped_reads): def summarize_lane(gerald, lane_id): report = [] summary_results = gerald.summary.lane_results - for end in range(len(summary_results)): + for end in range(len(summary_results)): eland_result = gerald.eland_results.results[end][lane_id] report.append("Sample name %s" % (eland_result.sample_name)) report.append("Lane id %s end %s" % (eland_result.lane_id, end)) @@ -339,9 +339,9 @@ def summarize_lane(gerald, lane_id): if hasattr(eland_result, 'match_codes'): mc = eland_result.match_codes nm = mc['NM'] - nm_percent = float(nm)/eland_result.reads * 100 + nm_percent = float(nm) / eland_result.reads * 100 qc = mc['QC'] - qc_percent = float(qc)/eland_result.reads * 100 + qc_percent = float(qc) / eland_result.reads * 100 report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) @@ -425,11 +425,11 @@ def save_ivc_plot(bustard_object, cycle_dir): plot_target_path = os.path.join(cycle_dir, 'Plots') if os.path.exists(plot_html): - logging.debug("Saving %s" % ( plot_html, )) - logging.debug("Saving %s" % ( plot_images, )) + logging.debug("Saving %s" % (plot_html,)) + logging.debug("Saving %s" % (plot_images,)) shutil.copy(plot_html, cycle_dir) if not os.path.exists(plot_target_path): - os.mkdir(plot_target_path) + os.mkdir(plot_target_path) for plot_file in glob(plot_images): shutil.copy(plot_file, plot_target_path) else: @@ -454,12 +454,12 @@ def compress_score_files(bustard_object, cycle_dir): tar_cmd = ['tar', 'c'] + score_files bzip_cmd = [ 'bzip2', '-9', '-c' ] - tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2') + tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2') tar_dest = open(tar_dest_name, 'w') logging.info("Compressing score files from %s" % (scores_path,)) logging.info("Running tar: " + " ".join(tar_cmd[:10])) logging.info("Running bzip2: " + " ".join(bzip_cmd)) - logging.info("Writing to %s" %(tar_dest_name,)) + logging.info("Writing to %s" % (tar_dest_name,)) env = {'BZIP': '-9'} tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env, @@ -474,7 +474,7 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1): """ # copy & bzip eland files bz_commands = [] - + for lanes_dictionary in gerald_object.eland_results.results: for eland_lane in lanes_dictionary.values(): source_name = eland_lane.pathname @@ -486,9 +486,9 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1): dest_name = os.path.join(cycle_dir, name) logging.info("Saving eland file %s to %s" % \ (source_name, dest_name)) - + if is_compressed(name): - logging.info('Already compressed, Saving to %s' % (dest_name, )) + logging.info('Already compressed, Saving to %s' % (dest_name,)) shutil.copy(source_name, dest_name) else: # not compressed @@ -500,13 +500,13 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1): #bzip = subprocess.Popen(args, stdout=bzip_dest) #logging.info('Saving to %s' % (dest_name, )) #bzip.wait() - + if len(bz_commands) > 0: q = QueueCommands(bz_commands, num_jobs) q.run() - - -def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1): + + +def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, raw_format='qseq'): """ Iterate over runfolders in runs extracting the most useful information. * run parameters (in run-*.xml) @@ -539,8 +539,8 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1): r.save(cycle_dir) # save illumina flowcell status report - save_flowcell_reports( os.path.join(r.image_analysis.pathname, '..'), cycle_dir ) - + save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'), cycle_dir) + # save stuff from bustard # grab IVC plot save_ivc_plot(r.bustard, cycle_dir) @@ -548,13 +548,18 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1): # build base call saving commands if site is not None: lanes = [] - for lane in range(1,9): + for lane in range(1, 9): if r.gerald.lanes[lane].analysis != 'none': lanes.append(lane) run_name = srf.pathname_to_run_name(r.pathname) - srf_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir) - srf.run_commands(r.bustard.pathname, srf_cmds, num_jobs) + if raw_format == 'qseq': + seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir) + elif raw_format == 'srf': + seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0) + else: + raise ValueError('Unknown --raw-format=%s' % (raw_format)) + srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs) # save stuff from GERALD # copy stuff out of the main run @@ -562,14 +567,14 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1): # save summary file save_summary_file(g, cycle_dir) - + # compress eland result files compress_eland_results(g, cycle_dir, num_jobs) - + # md5 all the compressed files once we're done md5_commands = srf.make_md5_commands(cycle_dir) srf.run_commands(cycle_dir, md5_commands, num_jobs) - + def rm_list(files, dry_run=True): for f in files: if os.path.exists(f): @@ -580,7 +585,7 @@ def rm_list(files, dry_run=True): else: os.unlink(f) else: - logging.warn("%s doesn't exist."% (f,)) + logging.warn("%s doesn't exist." % (f,)) def clean_runs(runs, dry_run=True): """ @@ -613,7 +618,7 @@ def clean_runs(runs, dry_run=True): logging.info("Cleaning intermediate files") # make clean_intermediate if os.path.exists(os.path.join(run.image_analysis.pathname, 'Makefile')): - clean_process = subprocess.Popen(['make', 'clean_intermediate'], + clean_process = subprocess.Popen(['make', 'clean_intermediate'], cwd=run.image_analysis.pathname,) clean_process.wait() diff --git a/htsworkflow/pipelines/srf.py b/htsworkflow/pipelines/srf.py index b27c33e..cc18198 100644 --- a/htsworkflow/pipelines/srf.py +++ b/htsworkflow/pipelines/srf.py @@ -46,8 +46,8 @@ def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel destdir - where to write all the srf files """ # clean up pathname - logging.info("run_name %s" % ( run_name, )) - + logging.info("run_name %s" % (run_name,)) + cmd_list = [] for lane in lanes: name_prefix = '%s_%%l_%%t_' % (run_name,) @@ -57,19 +57,19 @@ def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel seq_pattern = 's_%d_*_seq.txt' % (lane,) if cmdlevel == SOLEXA2SRF: - cmd = ['solexa2srf', + cmd = ['solexa2srf', '-N', name_prefix, - '-n', '%3x:%3y', - '-o', dest_path, + '-n', '%t:%3x:%3y', + '-o', dest_path, seq_pattern] elif cmdlevel == ILLUMINA2SRF10: - cmd = ['illumina2srf', + cmd = ['illumina2srf', '-v1.0', '-o', dest_path, seq_pattern] elif cmdlevel == ILLUMINA2SRF11: seq_pattern = 's_%d_*_qseq.txt' % (lane,) - cmd = ['illumina2srf', + cmd = ['illumina2srf', '-o', dest_path, seq_pattern] else: @@ -88,7 +88,7 @@ def create_qseq_patterns(bustard_dir): qseqs = [ os.path.split(x)[-1] for x in qseqs ] if len(qseqs[0].split('_')) == 4: # single ended - return [(None,"s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")] + return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")] elif len(qseqs[0].split('_')) == 5: # more than 1 read # build a dictionary of read numbers by lane @@ -108,7 +108,7 @@ def create_qseq_patterns(bustard_dir): return qseq_patterns else: raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern') - + def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11): """ make a subprocess-friendly list of command line arguments to run solexa2srf @@ -122,8 +122,8 @@ def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdleve destdir - where to write all the srf files """ # clean up pathname - logging.info("run_name %s" % ( run_name, )) - + logging.info("run_name %s" % (run_name,)) + cmd_list = [] for lane in lanes: name_prefix = '%s_%%l_%%t_' % (run_name,) @@ -137,11 +137,11 @@ def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdleve else: destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, lane, read) dest_path = os.path.join(destdir, destname) - + cmd = " ".join(['tar', 'cjf', dest_path, pattern % (lane,) ]) logging.info("Generated command: " + cmd) cmd_list.append(cmd) - + return cmd_list def run_commands(new_dir, cmd_list, num_jobs): @@ -151,22 +151,22 @@ def run_commands(new_dir, cmd_list, num_jobs): q = queuecommands.QueueCommands(cmd_list, num_jobs) q.run() os.chdir(curdir) - + def make_md5_commands(destdir): """ Scan the cycle dir and create md5s for the contents """ cmd_list = [] destdir = os.path.abspath(destdir) - bz2s = glob(os.path.join(destdir,"*.bz2")) - gzs = glob(os.path.join(destdir,"*gz")) - srfs = glob(os.path.join(destdir,"*.srf")) + bz2s = glob(os.path.join(destdir, "*.bz2")) + gzs = glob(os.path.join(destdir, "*gz")) + srfs = glob(os.path.join(destdir, "*.srf")) file_list = bz2s + gzs + srfs for f in file_list: - cmd = " ".join(['md5sum', f, '>', f+'.md5']) - logging.info('generated command: '+cmd) + cmd = " ".join(['md5sum', f, '>', f + '.md5']) + logging.info('generated command: ' + cmd) cmd_list.append(cmd) return cmd_list diff --git a/scripts/htsw-runfolder b/scripts/htsw-runfolder index 145fd7a..7e35c97 100755 --- a/scripts/htsw-runfolder +++ b/scripts/htsw-runfolder @@ -35,7 +35,7 @@ import sys from htsworkflow.pipelines import runfolder from htsworkflow.pipelines.runfolder import ElementTree - + def make_parser(): usage = 'usage: %prog [options] runfolder_root_dir' parser = optparse.OptionParser(usage) @@ -78,6 +78,10 @@ def make_parser(): ' GERALD directory, and it assumes the parent ' 'directories are the bustard and image processing ' 'directories.') + parser.add_option('--raw-format', dest="raw_format", default='qseq', + choices=['qseq', 'srf'], + help='Specify which type of raw format to use. ' + 'Currently supported options: qseq, srf') return parser @@ -123,20 +127,21 @@ def main(cmdlist=None): if opt.extract_results: if opt.dry_run: parser.error("Dry-run is not supported for extract-results") - runfolder.extract_results(runs, - opt.output_dir, - opt.site, - opt.max_jobs) + runfolder.extract_results(runs, + opt.output_dir, + opt.site, + opt.max_jobs, + opt.raw_format) command_run = True if opt.clean: runfolder.clean_runs(runs, opt.dry_run) command_run = True if command_run == False: - print "You need to specify a command."+os.linesep + print "You need to specify a command." + os.linesep parser.print_help() else: - print "You need to specify some run folders to process..."+os.linesep + print "You need to specify some run folders to process..." + os.linesep parser.print_help() return 0