from htsworkflow.util.alphanum import alphanum
from htsworkflow.util.ethelp import indent, flatten
-
class PipelineRun(object):
"""
Capture "interesting" information about a pipeline run
PIPELINE_RUN = 'PipelineRun'
FLOWCELL_ID = 'FlowcellID'
- def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+ def __init__(self, pathname=None, xml=None):
if pathname is not None:
self.pathname = os.path.normpath(pathname)
else:
self.pathname = None
self._name = None
self._flowcell_id = None
- self.firecrest = firecrest
- self.bustard = bustard
- self.gerald = gerald
+ self.image_analysis = None
+ self.bustard = None
+ self.gerald = None
if xml is not None:
self.set_elements(xml)
-
+
def _get_flowcell_id(self):
# extract flowcell ID
if self._flowcell_id is None:
flowcell_id = path_fields[-1]
else:
flowcell_id = 'unknown'
-
+
logging.warning(
"Flowcell id was not found, guessing %s" % (
flowcell_id))
root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
flowcell.text = self.flowcell_id
- root.append(self.firecrest.get_elements())
+ root.append(self.image_analysis.get_elements())
root.append(self.bustard.get_elements())
root.append(self.gerald.get_elements())
return root
# this file gets imported by all the others,
# so we need to hide the imports to avoid a cyclic imports
from htsworkflow.pipelines import firecrest
+ from htsworkflow.pipelines import ipar
from htsworkflow.pipelines import bustard
from htsworkflow.pipelines import gerald
if tag == PipelineRun.FLOWCELL_ID.lower():
self._flowcell_id = element.text
#ok the xword.Xword.XWORD pattern for module.class.constant is lame
+ # you should only have Firecrest or IPAR, never both of them.
elif tag == firecrest.Firecrest.FIRECREST.lower():
- self.firecrest = firecrest.Firecrest(xml=element)
+ self.image_analysis = firecrest.Firecrest(xml=element)
+ elif tag == ipar.IPAR.IPAR.lower():
+ self.image_analysis = ipar.IPAR(xml=element)
elif tag == bustard.Bustard.BUSTARD.lower():
self.bustard = bustard.Bustard(xml=element)
elif tag == gerald.Gerald.GERALD.lower():
Given a run tuple, find the latest date and use that as our name
"""
if self._name is None:
- tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+ tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
return self._name
in there gerald component.
"""
from htsworkflow.pipelines import firecrest
+ from htsworkflow.pipelines import ipar
from htsworkflow.pipelines import bustard
from htsworkflow.pipelines import gerald
- datadir = os.path.join(runfolder, 'Data')
-
- logging.info('Searching for runs in ' + datadir)
- runs = []
- for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
- f = firecrest.firecrest(firecrest_pathname)
- bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+ def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
+ logging.info("Looking for bustard directories in %s" % (pathname,))
+ bustard_glob = os.path.join(pathname, "Bustard*")
for bustard_pathname in glob(bustard_glob):
+ logging.info("Found bustard directory %s" % (bustard_pathname,))
b = bustard.bustard(bustard_pathname)
gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+ logging.info("Looking for gerald directories in %s" % (pathname,))
for gerald_pathname in glob(gerald_glob):
+ logging.info("Found gerald directory %s" % (gerald_pathname,))
try:
g = gerald.gerald(gerald_pathname)
- runs.append(PipelineRun(runfolder, f, b, g))
+ p = PipelineRun(runfolder)
+ p.image_analysis = image_analysis
+ p.bustard = b
+ p.gerald = g
+ runs.append(p)
except IOError, e:
print "Ignoring", str(e)
+
+ datadir = os.path.join(runfolder, 'Data')
+
+ logging.info('Searching for runs in ' + datadir)
+ runs = []
+ # scan for firecrest directories
+ for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+ logging.info('Found firecrest in ' + datadir)
+ image_analysis = firecrest.firecrest(firecrest_pathname)
+ scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname)
+ # scan for IPAR directories
+ for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
+ logging.info('Found ipar directories in ' + datadir)
+ image_analysis = ipar.ipar(ipar_pathname)
+ scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname)
+
return runs
-
-
+
+
def extract_run_parameters(runs):
"""
Search through runfolder_path for various runs and grab their parameters
summarized_reads[genome] = genome_reads
return summarized_reads
+def summarize_lane(gerald, lane_id):
+ report = []
+ summary_results = gerald.summary.lane_results
+ eland_result = gerald.eland_results.results[lane_id]
+ report.append("Sample name %s" % (eland_result.sample_name))
+ report.append("Lane id %s" % (eland_result.lane_id,))
+ cluster = summary_results[eland_result.lane_id].cluster
+ report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+ report.append("Total Reads: %d" % (eland_result.reads))
+ mc = eland_result._match_codes
+ nm = mc['NM']
+ nm_percent = float(nm)/eland_result.reads * 100
+ qc = mc['QC']
+ qc_percent = float(qc)/eland_result.reads * 100
+
+ report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
+ report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
+ report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+ (mc['U0'], mc['U1'], mc['U2']))
+ report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+ (mc['R0'], mc['R1'], mc['R2']))
+ report.append("Mapped Reads")
+ mapped_reads = summarize_mapped_reads(eland_result.mapped_reads)
+ for name, counts in mapped_reads.items():
+ report.append(" %s: %d" % (name, counts))
+ return report
+
def summary_report(runs):
"""
Summarize cluster numbers and mapped read counts for a runfolder
eland_keys = run.gerald.eland_results.results.keys()
eland_keys.sort(alphanum)
- lane_results = run.gerald.summary.lane_results
for lane_id in eland_keys:
- result = run.gerald.eland_results.results[lane_id]
- report.append("Sample name %s" % (result.sample_name))
- report.append("Lane id %s" % (result.lane_id,))
- cluster = lane_results[result.lane_id].cluster
- report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
- report.append("Total Reads: %d" % (result.reads))
- mc = result._match_codes
- nm = mc['NM']
- nm_percent = float(nm)/result.reads * 100
- qc = mc['QC']
- qc_percent = float(qc)/result.reads * 100
-
- report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
- report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
- report.append('Unique (0,1,2 mismatches) %d %d %d' % \
- (mc['U0'], mc['U1'], mc['U2']))
- report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
- (mc['R0'], mc['R1'], mc['R2']))
- report.append("Mapped Reads")
- mapped_reads = summarize_mapped_reads(result.mapped_reads)
- for name, counts in mapped_reads.items():
- report.append(" %s: %d" % (name, counts))
+ report.extend(summarize_lane(run.gerald, lane_id))
report.append('---')
report.append('')
return os.linesep.join(report)
logging.info("Using %s as result directory" % (result_dir,))
if not os.path.exists(result_dir):
os.mkdir(result_dir)
-
+
# create cycle_dir
- cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop)
+ cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
logging.info("Filling in %s" % (cycle,))
cycle_dir = os.path.join(result_dir, cycle)
if os.path.exists(cycle_dir):
logging.info("Running tar: " + " ".join(tar_cmd[:10]))
logging.info("Running bzip2: " + " ".join(bzip_cmd))
logging.info("Writing to %s" %(tar_dest_name))
-
+
tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname)
bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
tar.wait()