From: Diane Trout Date: Thu, 30 Oct 2008 22:03:12 +0000 (+0000) Subject: Add support for scanning for results in the IPAR directory. X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~26 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=e5fd73d7bdcc8bd091898756674b38261ad5803b Add support for scanning for results in the IPAR directory. The field that was the firecrest class in PipelineRun is now the "image_analysis" field and can be either firecrest or ipar. I also extracted some of the common functions out of the runfolder test modules and added them to a seperate "simulate_runfolder" module. --- diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 109c6a8..59608da 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -33,7 +33,7 @@ class Gerald(object): def __init__(self, gerald, key): self._gerald = gerald self._key = key - + def __get_attribute(self, xml_tag): subtree = self._gerald.tree.find('LaneSpecificRunParameters') container = subtree.find(xml_tag) @@ -81,9 +81,9 @@ class Gerald(object): if self._keys is None: tree = self._gerald.tree analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') - # according to the pipeline specs I think their fields + # according to the pipeline specs I think their fields # are sampleName_laneID, with sampleName defaulting to s - # since laneIDs are constant lets just try using + # since laneIDs are constant lets just try using # those consistently. self._keys = [ x.tag.split('_')[1] for x in analysis] return self._keys @@ -138,7 +138,7 @@ class Gerald(object): if self.tree is None or self.summary is None: return None - gerald = ElementTree.Element(Gerald.GERALD, + gerald = ElementTree.Element(Gerald.GERALD, {'version': unicode(Gerald.XML_VERSION)}) gerald.append(self.tree) gerald.append(self.summary.get_elements()) @@ -162,16 +162,18 @@ class Gerald(object): self.eland_results = ELAND(xml=element) else: logging.warn("Unrecognized tag %s" % (element.tag,)) - + def gerald(pathname): g = Gerald() g.pathname = pathname path, name = os.path.split(pathname) + logging.info("Parsing gerald config.xml") config_pathname = os.path.join(pathname, 'config.xml') g.tree = ElementTree.parse(config_pathname).getroot() # parse Summary.htm file + logging.info("Parsing Summary.htm") summary_pathname = os.path.join(pathname, 'Summary.htm') g.summary = Summary(summary_pathname) # parse eland files @@ -213,7 +215,7 @@ def parse_mean_range_element(element): """ Grab mean/deviation out of element """ - return (tonumber(element.attrib['mean']), + return (tonumber(element.attrib['mean']), tonumber(element.attrib['deviation'])) def parse_summary_element(element): @@ -238,7 +240,7 @@ class Summary(object): Mostly for the cluster number """ LANE_RESULT_SUMMARY = 'LaneResultSummary' - TAGS = { + TAGS = { 'LaneYield': 'lane_yield', 'Cluster': 'cluster', # Raw 'ClusterPF': 'cluster_pass_filter', @@ -249,7 +251,7 @@ class Summary(object): 'AverageAlignmentScore': 'average_alignment_score', 'PercentErrorRate': 'percent_error_rate' } - + def __init__(self, html=None, xml=None): self.lane = None self.lane_yield = None @@ -299,7 +301,7 @@ class Summary(object): def get_elements(self): lane_result = ElementTree.Element( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY, + Summary.LaneResultSummary.LANE_RESULT_SUMMARY, {'lane': self.lane}) for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): value = getattr(self, variable_name) @@ -326,7 +328,7 @@ class Summary(object): for element in list(tree): try: variable_name = tags[element.tag] - setattr(self, variable_name, + setattr(self, variable_name, parse_summary_element(element)) except KeyError, e: logging.warn('Unrecognized tag %s' % (element.tag,)) @@ -359,10 +361,10 @@ class Summary(object): flatten the children of a ... """ return [flatten(x) for x in row.getchildren() ] - + def _parse_table(self, table): """ - assumes the first line is the header of a table, + assumes the first line is the header of a table, and that the remaining rows are data """ rows = table.getchildren() @@ -370,12 +372,12 @@ class Summary(object): for r in rows: data.append(self._flattened_row(r)) return data - + def _extract_named_tables(self, pathname): """ extract all the 'named' tables from a Summary.htm file and return as a dictionary - + Named tables are

...

...
pairs The contents of the h2 tag is considered to the name of the table. @@ -420,7 +422,7 @@ class Summary(object): self.lane_results[lrs.lane] = lrs def get_elements(self): - summary = ElementTree.Element(Summary.SUMMARY, + summary = ElementTree.Element(Summary.SUMMARY, {'version': unicode(Summary.XML_VERSION)}) for lane in self.lane_results.values(): summary.append(lane.get_elements()) @@ -446,6 +448,7 @@ class Summary(object): def build_genome_fasta_map(genome_dir): # build fasta to fasta file map + logging.info("Building genome map") genome = genome_dir.split(os.path.sep)[-1] fasta_map = {} for vld_file in glob(os.path.join(genome_dir, '*.vld')): @@ -460,7 +463,7 @@ def build_genome_fasta_map(genome_dir): else: fasta_map[name] = os.path.join(genome, name) return fasta_map - + class ElandLane(object): """ Process an eland result file @@ -487,7 +490,7 @@ class ElandLane(object): if genome_map is None: genome_map = {} self.genome_map = genome_map - + if xml is not None: self.set_elements(xml) @@ -502,10 +505,11 @@ class ElandLane(object): if os.stat(self.pathname)[stat.ST_SIZE] == 0: raise RuntimeError("Eland isn't done, try again later.") + logging.info("summarizing results for %s" % (self.pathname)) reads = 0 mapped_reads = {} - match_codes = {'NM':0, 'QC':0, 'RM':0, + match_codes = {'NM':0, 'QC':0, 'RM':0, 'U0':0, 'U1':0, 'U2':0, 'R0':0, 'R1':0, 'R2':0, } @@ -566,8 +570,8 @@ class ElandLane(object): match_codes = property(_get_match_codes) def get_elements(self): - lane = ElementTree.Element(ElandLane.LANE, - {'version': + lane = ElementTree.Element(ElandLane.LANE, + {'version': unicode(ElandLane.XML_VERSION)}) sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) sample_tag.text = self.sample_name @@ -576,17 +580,17 @@ class ElandLane(object): genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) for k, v in self.genome_map.items(): item = ElementTree.SubElement( - genome_map, ElandLane.GENOME_ITEM, + genome_map, ElandLane.GENOME_ITEM, {'name':k, 'value':unicode(v)}) mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) for k, v in self.mapped_reads.items(): item = ElementTree.SubElement( - mapped_reads, ElandLane.MAPPED_ITEM, + mapped_reads, ElandLane.MAPPED_ITEM, {'name':k, 'value':unicode(v)}) match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) for k, v in self.match_codes.items(): item = ElementTree.SubElement( - match_codes, ElandLane.MATCH_ITEM, + match_codes, ElandLane.MATCH_ITEM, {'name':k, 'value':unicode(v)}) reads = ElementTree.SubElement(lane, ElandLane.READS) reads.text = unicode(self.reads) @@ -600,7 +604,7 @@ class ElandLane(object): # reset dictionaries self._mapped_reads = {} self._match_codes = {} - + for element in tree: tag = element.tag.lower() if tag == ElandLane.SAMPLE_NAME.lower(): @@ -653,7 +657,7 @@ class ELAND(object): def __init__(self, xml=None): # we need information from the gerald config.xml self.results = {} - + if xml is not None: self.set_elements(xml) @@ -662,7 +666,7 @@ class ELAND(object): def keys(self): return self.results.keys() - + def values(self): return self.results.values() @@ -673,7 +677,7 @@ class ELAND(object): return self.results[key] def get_elements(self): - root = ElementTree.Element(ELAND.ELAND, + root = ElementTree.Element(ELAND.ELAND, {'version': unicode(ELAND.XML_VERSION)}) for lane_id, lane in self.results.items(): eland_lane = lane.get_elements() @@ -703,6 +707,7 @@ def eland(basedir, gerald=None, genome_maps=None): # but I needed to persist the sample_name/lane_id for # runfolder summary_report path, name = os.path.split(pathname) + logging.info("Adding eland file %s" %(name,)) split_name = name.split('_') lane_id = split_name[1] diff --git a/htsworkflow/pipelines/ipar.py b/htsworkflow/pipelines/ipar.py new file mode 100644 index 0000000..a559229 --- /dev/null +++ b/htsworkflow/pipelines/ipar.py @@ -0,0 +1,225 @@ +""" +Extract information about the IPAR run + +IPAR - class holding the properties we found +IPAR - IPAR factory function initalized from a directory name +fromxml - IPAR factory function initalized from an xml dump from + the IPAR object. +""" + +import datetime +import logging +import os +import re +import stat +import time + +from htsworkflow.pipelines.runfolder import \ + ElementTree, \ + VERSION_RE, \ + EUROPEAN_STRPTIME + +class Tiles(object): + def __init__(self, tree): + self.tree = tree.find("TileSelection") + + def keys(self): + key_list = [] + for c in self.tree.getchildren(): + k = c.attrib.get('Index', None) + if k is not None: + key_list.append(k) + return key_list + + def values(self): + value_list = [] + for lane in self.tree.getchildren(): + attributes = {} + for child in lane.getchildren(): + if child.tag == "Sample": + attributes['Sample'] = child.text + elif child.tag == 'TileRange': + attributes['TileRange'] = (int(child.attrib['Min']),int(child.attrib['Max'])) + value_list.append(attributes) + return value_list + + def items(self): + return zip(self.keys(), self.values()) + + def __getitem__(self, key): + # FIXME: this is inefficient. building the dictionary be rescanning the xml. + v = dict(self.items()) + return v[key] + +class IPAR(object): + XML_VERSION=1 + + # xml tag names + IPAR = 'IPAR' + TIMESTAMP = 'timestamp' + MATRIX = 'matrix' + RUN = 'Run' + + def __init__(self, xml=None): + self.tree = None + self.date = datetime.datetime.today() + self._tiles = None + if xml is not None: + self.set_elements(xml) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + def _set_time(self, value): + mtime_tuple = time.localtime(value) + self.date = datetime.datetime(*(mtime_tuple[0:7])) + time = property(_get_time, _set_time, + doc='run time as seconds since epoch') + + def _get_cycles(self): + if self.tree is None: + return None + cycles = self.tree.find("Cycles") + if cycles is None: + return None + return cycles.attrib + + def _get_start(self): + """ + return cycle start + """ + cycles = self._get_cycles() + if cycles is not None: + return int(cycles['First']) + else: + return None + start = property(_get_start, doc="get cycle start") + + def _get_stop(self): + """ + return cycle stop + """ + cycles = self._get_cycles() + if cycles is not None: + return int(cycles['Last']) + else: + return None + stop = property(_get_stop, doc="get cycle stop") + + def _get_tiles(self): + if self._tiles is None: + self._tiles = Tiles(self.tree) + return self._tiles + tiles = property(_get_tiles) + + def _get_version(self): + software = self.tree.find('Software') + if software is not None: + return software.attrib['Version'] + version = property(_get_version, "IPAR software version") + + + def file_list(self): + """ + Generate list of all files that should be generated by the IPAR unit + """ + suffix_node = self.tree.find('RunParameters/CompressionSuffix') + if suffix_node is None: + print "find compression suffix failed" + return None + suffix = suffix_node.text + files = [] + format = "%s_%s_%04d_%s.txt%s" + for lane, attrib in self.tiles.items(): + for file_type in ["int","nse"]: + start, stop = attrib['TileRange'] + for tile in range(start, stop+1): + files.append(format % (attrib['Sample'], lane, tile, file_type, suffix)) + return files + + def dump(self): + print "Matrix:", self.matrix + print "Tree:", self.tree + + def get_elements(self): + attribs = {'version': str(IPAR.XML_VERSION) } + root = ElementTree.Element(IPAR.IPAR, attrib=attribs) + timestamp = ElementTree.SubElement(root, IPAR.TIMESTAMP) + timestamp.text = str(int(self.time)) + root.append(self.tree) + matrix = ElementTree.SubElement(root, IPAR.MATRIX) + matrix.text = self.matrix + return root + + def set_elements(self, tree): + if tree.tag != IPAR.IPAR: + raise ValueError('Expected "IPAR" SubElements') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > IPAR.XML_VERSION: + logging.warn('IPAR XML tree is a higher version than this class') + for element in list(tree): + if element.tag == IPAR.RUN: + self.tree = element + elif element.tag == IPAR.TIMESTAMP: + self.time = int(element.text) + elif element.tag == IPAR.MATRIX: + self.matrix = element.text + else: + raise ValueError("Unrecognized tag: %s" % (element.tag,)) + +def load_ipar_param_tree(paramfile): + """ + look for a .param file and load it if it is an IPAR tree + """ + + tree = ElementTree.parse(paramfile).getroot() + run = tree.find('Run') + if run.attrib.has_key('Name') and run.attrib['Name'].startswith("IPAR"): + return run + + return None + +def ipar(pathname): + """ + Examine the directory at pathname and initalize a IPAR object + """ + logging.info("Searching IPAR directory") + i = IPAR() + + # parse firecrest directory name + path, name = os.path.split(pathname) + groups = name.split('_') + if groups[0] != 'IPAR': + raise ValueError('ipar can only process IPAR directories') + + # contents of the matrix file? + matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') + i.matrix = open(matrix_pathname, 'r').read() + + # look for parameter xml file + paramfile = os.path.join(path, '.params') + if os.path.exists(paramfile): + i.tree = load_ipar_param_tree(paramfile) + mtime_local = os.stat(paramfile)[stat.ST_MTIME] + i.time = mtime_local + return i + +def fromxml(tree): + """ + Initialize a IPAR object from an element tree node + """ + f = IPAR() + f.set_elements(tree) + return f + +if __name__ == "__main__": + i = ipar(os.path.expanduser('~/gec/081021_HWI-EAS229_0063_30HKUAAXX/Data/IPAR_1.01')) + x = i.get_elements() + j = fromxml(x) + #ElementTree.dump(x) + print j.date + print j.start + print j.stop + print i.tiles.keys() + print j.tiles.keys() + print j.tiles.items() + print j.file_list() \ No newline at end of file diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index dee3231..fc2beeb 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -25,7 +25,6 @@ LANES_PER_FLOWCELL = 8 from htsworkflow.util.alphanum import alphanum from htsworkflow.util.ethelp import indent, flatten - class PipelineRun(object): """ Capture "interesting" information about a pipeline run @@ -34,20 +33,20 @@ class PipelineRun(object): PIPELINE_RUN = 'PipelineRun' FLOWCELL_ID = 'FlowcellID' - def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None): + def __init__(self, pathname=None, xml=None): if pathname is not None: self.pathname = os.path.normpath(pathname) else: self.pathname = None self._name = None self._flowcell_id = None - self.firecrest = firecrest - self.bustard = bustard - self.gerald = gerald + self.image_analysis = None + self.bustard = None + self.gerald = None if xml is not None: self.set_elements(xml) - + def _get_flowcell_id(self): # extract flowcell ID if self._flowcell_id is None: @@ -63,7 +62,7 @@ class PipelineRun(object): flowcell_id = path_fields[-1] else: flowcell_id = 'unknown' - + logging.warning( "Flowcell id was not found, guessing %s" % ( flowcell_id)) @@ -78,7 +77,7 @@ class PipelineRun(object): root = ElementTree.Element(PipelineRun.PIPELINE_RUN) flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID) flowcell.text = self.flowcell_id - root.append(self.firecrest.get_elements()) + root.append(self.image_analysis.get_elements()) root.append(self.bustard.get_elements()) root.append(self.gerald.get_elements()) return root @@ -87,6 +86,7 @@ class PipelineRun(object): # this file gets imported by all the others, # so we need to hide the imports to avoid a cyclic imports from htsworkflow.pipelines import firecrest + from htsworkflow.pipelines import ipar from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald @@ -99,8 +99,11 @@ class PipelineRun(object): if tag == PipelineRun.FLOWCELL_ID.lower(): self._flowcell_id = element.text #ok the xword.Xword.XWORD pattern for module.class.constant is lame + # you should only have Firecrest or IPAR, never both of them. elif tag == firecrest.Firecrest.FIRECREST.lower(): - self.firecrest = firecrest.Firecrest(xml=element) + self.image_analysis = firecrest.Firecrest(xml=element) + elif tag == ipar.IPAR.IPAR.lower(): + self.image_analysis = ipar.IPAR(xml=element) elif tag == bustard.Bustard.BUSTARD.lower(): self.bustard = bustard.Bustard(xml=element) elif tag == gerald.Gerald.GERALD.lower(): @@ -113,7 +116,7 @@ class PipelineRun(object): Given a run tuple, find the latest date and use that as our name """ if self._name is None: - tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time) + tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time) timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax)) self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml' return self._name @@ -143,28 +146,48 @@ def get_runs(runfolder): in there gerald component. """ from htsworkflow.pipelines import firecrest + from htsworkflow.pipelines import ipar from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald - datadir = os.path.join(runfolder, 'Data') - - logging.info('Searching for runs in ' + datadir) - runs = [] - for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): - f = firecrest.firecrest(firecrest_pathname) - bustard_glob = os.path.join(firecrest_pathname, "Bustard*") + def scan_post_image_analysis(runs, runfolder, image_analysis, pathname): + logging.info("Looking for bustard directories in %s" % (pathname,)) + bustard_glob = os.path.join(pathname, "Bustard*") for bustard_pathname in glob(bustard_glob): + logging.info("Found bustard directory %s" % (bustard_pathname,)) b = bustard.bustard(bustard_pathname) gerald_glob = os.path.join(bustard_pathname, 'GERALD*') + logging.info("Looking for gerald directories in %s" % (pathname,)) for gerald_pathname in glob(gerald_glob): + logging.info("Found gerald directory %s" % (gerald_pathname,)) try: g = gerald.gerald(gerald_pathname) - runs.append(PipelineRun(runfolder, f, b, g)) + p = PipelineRun(runfolder) + p.image_analysis = image_analysis + p.bustard = b + p.gerald = g + runs.append(p) except IOError, e: print "Ignoring", str(e) + + datadir = os.path.join(runfolder, 'Data') + + logging.info('Searching for runs in ' + datadir) + runs = [] + # scan for firecrest directories + for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): + logging.info('Found firecrest in ' + datadir) + image_analysis = firecrest.firecrest(firecrest_pathname) + scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname) + # scan for IPAR directories + for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")): + logging.info('Found ipar directories in ' + datadir) + image_analysis = ipar.ipar(ipar_pathname) + scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname) + return runs - - + + def extract_run_parameters(runs): """ Search through runfolder_path for various runs and grab their parameters @@ -190,6 +213,33 @@ def summarize_mapped_reads(mapped_reads): summarized_reads[genome] = genome_reads return summarized_reads +def summarize_lane(gerald, lane_id): + report = [] + summary_results = gerald.summary.lane_results + eland_result = gerald.eland_results.results[lane_id] + report.append("Sample name %s" % (eland_result.sample_name)) + report.append("Lane id %s" % (eland_result.lane_id,)) + cluster = summary_results[eland_result.lane_id].cluster + report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) + report.append("Total Reads: %d" % (eland_result.reads)) + mc = eland_result._match_codes + nm = mc['NM'] + nm_percent = float(nm)/eland_result.reads * 100 + qc = mc['QC'] + qc_percent = float(qc)/eland_result.reads * 100 + + report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) + report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) + report.append('Unique (0,1,2 mismatches) %d %d %d' % \ + (mc['U0'], mc['U1'], mc['U2'])) + report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ + (mc['R0'], mc['R1'], mc['R2'])) + report.append("Mapped Reads") + mapped_reads = summarize_mapped_reads(eland_result.mapped_reads) + for name, counts in mapped_reads.items(): + report.append(" %s: %d" % (name, counts)) + return report + def summary_report(runs): """ Summarize cluster numbers and mapped read counts for a runfolder @@ -202,30 +252,8 @@ def summary_report(runs): eland_keys = run.gerald.eland_results.results.keys() eland_keys.sort(alphanum) - lane_results = run.gerald.summary.lane_results for lane_id in eland_keys: - result = run.gerald.eland_results.results[lane_id] - report.append("Sample name %s" % (result.sample_name)) - report.append("Lane id %s" % (result.lane_id,)) - cluster = lane_results[result.lane_id].cluster - report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) - report.append("Total Reads: %d" % (result.reads)) - mc = result._match_codes - nm = mc['NM'] - nm_percent = float(nm)/result.reads * 100 - qc = mc['QC'] - qc_percent = float(qc)/result.reads * 100 - - report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) - report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) - report.append('Unique (0,1,2 mismatches) %d %d %d' % \ - (mc['U0'], mc['U1'], mc['U2'])) - report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ - (mc['R0'], mc['R1'], mc['R2'])) - report.append("Mapped Reads") - mapped_reads = summarize_mapped_reads(result.mapped_reads) - for name, counts in mapped_reads.items(): - report.append(" %s: %d" % (name, counts)) + report.extend(summarize_lane(run.gerald, lane_id)) report.append('---') report.append('') return os.linesep.join(report) @@ -239,9 +267,9 @@ def extract_results(runs, output_base_dir=None): logging.info("Using %s as result directory" % (result_dir,)) if not os.path.exists(result_dir): os.mkdir(result_dir) - + # create cycle_dir - cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop) + cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop) logging.info("Filling in %s" % (cycle,)) cycle_dir = os.path.join(result_dir, cycle) if os.path.exists(cycle_dir): @@ -278,7 +306,7 @@ def extract_results(runs, output_base_dir=None): logging.info("Running tar: " + " ".join(tar_cmd[:10])) logging.info("Running bzip2: " + " ".join(bzip_cmd)) logging.info("Writing to %s" %(tar_dest_name)) - + tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname) bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) tar.wait() diff --git a/htsworkflow/pipelines/test/__init__.py b/htsworkflow/pipelines/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py new file mode 100644 index 0000000..c7a9961 --- /dev/null +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -0,0 +1,833 @@ +""" +Create simulated solexa/illumina runfolders for testing +""" + +import os + + +def make_ipar_dir(data_dir): + """ + Construct an artificial ipar parameter file and directory + """ + params = """ + + + + + + + 1 + 37 + 081021_HWI-EAS229_0063_30HKUAAXX + + + 1 + 37 + 081021_HWI-EAS229_0063_30HKUAAXX + + gzip + .p.gz + HWI-EAS229 + 081021_HWI-EAS229_0063_30HKUAAXX + + + 1 + 2.7 + 1.5 + 4 + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + s + + + + + +""" + f = open(os.path.join(data_dir, '.params'),'w') + f.write(params) + f.close() + ipar_dir = os.path.join(data_dir, 'IPAR_1.01') + if not os.path.exists(ipar_dir): + os.mkdir(ipar_dir) + return ipar_dir + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + +def make_summary100_htm(gerald_dir): + summary_htm=""" + + + + +

080627_HWI-EAS229_0036_3055HAXX Summary

+

Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder080627_HWI-EAS229_0036_3055HAXX
Chip IDunknown
+



Chip Results Summary

+ + + + + + + + + + +
ClustersClusters (PF)Yield (kbases)
80933224435778031133022
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterNum TilesTiles
1unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 1
2unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 2
3unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 3
4unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 4
5unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 5
6unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 6
7unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 7
8unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoTile Mean +/- SD for Lane
Lane Lane Yield (kbases) Clusters (raw)Clusters (PF) 1st Cycle Int (PF) % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Alignment Score (PF) % Error Rate (PF)
115804696483 +/- 907460787 +/- 4240329 +/- 35101.88 +/- 6.0363.21 +/- 3.2970.33 +/- 0.249054.08 +/- 59.160.46 +/- 0.18
2156564133738 +/- 793860217 +/- 1926444 +/- 3992.62 +/- 7.5845.20 +/- 3.3151.98 +/- 0.746692.04 +/- 92.490.46 +/- 0.09
3185818152142 +/- 1000271468 +/- 2827366 +/- 3691.53 +/- 8.6647.19 +/- 3.8082.24 +/- 0.4410598.68 +/- 64.130.41 +/- 0.04
43495315784 +/- 216213443 +/- 1728328 +/- 4097.53 +/- 9.8785.29 +/- 1.9180.02 +/- 0.5310368.82 +/- 71.080.15 +/- 0.05
5167936119735 +/- 846564590 +/- 2529417 +/- 3788.69 +/- 14.7954.10 +/- 2.5976.95 +/- 0.329936.47 +/- 65.750.28 +/- 0.02
6173463152177 +/- 814666716 +/- 2493372 +/- 3987.06 +/- 9.8643.98 +/- 3.1278.80 +/- 0.4310162.28 +/- 49.650.38 +/- 0.03
714928784649 +/- 732557418 +/- 3617295 +/- 2889.40 +/- 8.2367.97 +/- 1.8233.38 +/- 0.254247.92 +/- 32.371.00 +/- 0.03
810695354622 +/- 481241136 +/- 3309284 +/- 3790.21 +/- 9.1075.39 +/- 2.2748.33 +/- 0.296169.21 +/- 169.500.86 +/- 1.22
Tile mean across chip
Av.1011665447235492.3660.2965.258403.690.50
+



Expanded Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoPhasing InfoRaw Data (tile mean)Filtered Data (tile mean)
Lane Clusters (tile mean) (raw)% Phasing % Prephasing % Error Rate (raw) Equiv Perfect Clusters (raw) % retained Cycle 2-4 Av Int (PF) Cycle 2-10 Av % Loss (PF) Cycle 10-20 Av % Loss (PF) % Align (PF) % Error Rate (PF) Equiv Perfect Clusters (PF)
1964830.77000.31001.004967663.21317 +/- 320.13 +/- 0.44-1.14 +/- 0.3470.330.4641758
21337380.77000.31001.224046745.20415 +/- 330.29 +/- 0.40-0.79 +/- 0.3551.980.4630615
31521420.77000.31001.307858847.19344 +/- 260.68 +/- 0.51-0.77 +/- 0.4282.240.4157552
4157840.77000.31000.291109585.29306 +/- 340.20 +/- 0.69-1.28 +/- 0.6680.020.1510671
51197350.77000.31000.856033554.10380 +/- 320.34 +/- 0.49-1.55 +/- 4.6976.950.2849015
61521770.77000.31001.217090543.98333 +/- 270.57 +/- 0.50-0.91 +/- 0.3978.800.3851663
7846490.77000.31001.382106967.97272 +/- 201.15 +/- 0.52-0.84 +/- 0.5833.381.0018265
8546220.77000.31001.172133575.39262 +/- 311.10 +/- 0.59-1.01 +/- 0.4748.330.8619104
+

IVC Plots
+

IVC.htm +

+

All Intensity Plots
+

All.htm +

+

Error graphs:
+

Error.htm +

+Back to top +



Lane 1

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
10001114972326.4894.3957.4470.29038.60.44
+Back to top +



Lane 2

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
20001147793448.1283.6838.5753.76905.40.54
+Back to top +



Lane 3

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
30001167904374.0586.9140.3681.310465.00.47
+Back to top +



Lane 4

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
4000120308276.8592.8784.2680.410413.80.16
+Back to top +



Lane 5

+ + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
+Back to top +



Lane 6

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
60001166844348.1277.5938.1379.710264.40.44
+Back to top +



Lane 7

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
7000198913269.9086.6664.5533.24217.51.02
+Back to top +



Lane 8

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
8000164972243.6089.4073.1748.36182.80.71
+Back to top + + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() diff --git a/htsworkflow/pipelines/test/test_runfolder026.py b/htsworkflow/pipelines/test/test_runfolder026.py index 40d29cf..7b15381 100644 --- a/htsworkflow/pipelines/test/test_runfolder026.py +++ b/htsworkflow/pipelines/test/test_runfolder026.py @@ -12,140 +12,8 @@ from htsworkflow.pipelines import gerald from htsworkflow.pipelines import runfolder from htsworkflow.pipelines.runfolder import ElementTree +from htsworkflow.pipelines.test.simulate_runfolder import * -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() - def make_summary_htm(gerald_dir): summary_htm = """ @@ -588,7 +456,7 @@ class RunfolderTests(unittest.TestCase): r2 = runfolder.PipelineRun(xml=xml) self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.image_analysis, None) self.failIfEqual(r2.bustard, None) self.failIfEqual(r2.gerald, None) diff --git a/htsworkflow/pipelines/test/test_runfolder030.py b/htsworkflow/pipelines/test/test_runfolder030.py index 7457b12..6922764 100644 --- a/htsworkflow/pipelines/test/test_runfolder030.py +++ b/htsworkflow/pipelines/test/test_runfolder030.py @@ -12,139 +12,8 @@ from htsworkflow.pipelines import gerald from htsworkflow.pipelines import runfolder from htsworkflow.pipelines.runfolder import ElementTree +from htsworkflow.pipelines.test.simulate_runfolder import * -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() def make_summary_htm(gerald_dir): summary_htm=""" @@ -1011,7 +880,7 @@ class RunfolderTests(unittest.TestCase): r2 = runfolder.PipelineRun(xml=xml) self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.image_analysis, None) self.failIfEqual(r2.bustard, None) self.failIfEqual(r2.gerald, None) diff --git a/htsworkflow/pipelines/test/test_runfolder_ipar100.py b/htsworkflow/pipelines/test/test_runfolder_ipar100.py new file mode 100644 index 0000000..8be0db1 --- /dev/null +++ b/htsworkflow/pipelines/test/test_runfolder_ipar100.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipelines import ipar +from htsworkflow.pipelines import bustard +from htsworkflow.pipelines import gerald +from htsworkflow.pipelines import runfolder +from htsworkflow.pipelines.runfolder import ElementTree + +from htsworkflow.pipelines.test.simulate_runfolder import * + + +def make_runfolder(obj=None): + """ + Make a fake runfolder, attach all the directories to obj if defined + """ + # make a fake runfolder directory + temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + runfolder_dir = os.path.join(temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(runfolder_dir) + + data_dir = os.path.join(runfolder_dir, 'Data') + os.mkdir(data_dir) + + ipar_dir = make_ipar_dir(data_dir) + + matrix_dir = os.path.join(ipar_dir, 'Matrix') + os.mkdir(matrix_dir) + make_matrix(matrix_dir) + + bustard_dir = os.path.join(ipar_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(bustard_dir) + make_phasing_params(bustard_dir) + + gerald_dir = os.path.join(bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(gerald_dir) + make_gerald_config(gerald_dir) + make_summary100_htm(gerald_dir) + make_eland_results(gerald_dir) + + if obj is not None: + obj.temp_dir = temp_dir + obj.runfolder_dir = runfolder_dir + obj.data_dir = data_dir + obj.image_analysis_dir = ipar_dir + obj.matrix_dir = matrix_dir + obj.bustard_dir = bustard_dir + obj.gerald_dir = gerald_dir + + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # attaches all the directories to the object passed in + make_runfolder(self) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_ipar(self): + """ + Construct a firecrest object + """ + i = ipar.ipar(self.image_analysis_dir) + self.failUnlessEqual(i.version, '2.01.192.0') + self.failUnlessEqual(i.start, 1) + self.failUnlessEqual(i.stop, 37) + + xml = i.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + i2 = ipar.IPAR(xml=xml) + self.failUnlessEqual(i.version, i2.version) + self.failUnlessEqual(i.start, i2.start) + self.failUnlessEqual(i.stop, i2.stop) + self.failUnlessEqual(i.date, i2.date) + self.failUnlessEqual(i.file_list(), i2.file_list()) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (96483, 9074), (133738, 7938), + (152142, 10002), (15784, 2162), + (119735, 8465), (152177, 8146), + (84649, 7325), (54622, 4812),] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-10-30.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-10-30.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.image_analysis, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") +