From: Diane Trout Date: Tue, 5 May 2009 18:02:20 +0000 (+0000) Subject: Handle lanes that were only sequenced. X-Git-Tag: 0.2.0.5~13 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=c553e004ef9974a82b7d4fcd5e995d898e92ed67 Handle lanes that were only sequenced. So the report needs to be a bit smaller, and I need to archive a different file Also this version only counts the number of records in the "sequence" file which is raw sequence that passed the QC filter, I don't have a uniform way of determining how many total sequences thre were. --- diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py index f9adcd8..bd6864e 100644 --- a/htsworkflow/pipelines/eland.py +++ b/htsworkflow/pipelines/eland.py @@ -12,34 +12,80 @@ from htsworkflow.pipelines.runfolder import ElementTree from htsworkflow.util.ethelp import indent, flatten from htsworkflow.util.opener import autoopen -class ElandLane(object): +SAMPLE_NAME = 'SampleName' +LANE_ID = 'LaneID' +END = 'End' +READS = 'Reads' + +GENOME_MAP = 'GenomeMap' +GENOME_ITEM = 'GenomeItem' +MAPPED_READS = 'MappedReads' +MAPPED_ITEM = 'MappedItem' +MATCH_CODES = 'MatchCodes' +MATCH_ITEM = 'Code' +READS = 'Reads' + +ELAND_SINGLE = 0 +ELAND_MULTI = 1 +ELAND_EXTENDED = 2 +ELAND_EXPORT = 3 + + +class ResultLane(object): """ - Process an eland result file + Base class for result lanes """ XML_VERSION = 2 - LANE = 'ElandLane' - SAMPLE_NAME = 'SampleName' - LANE_ID = 'LaneID' - END = 'End' - GENOME_MAP = 'GenomeMap' - GENOME_ITEM = 'GenomeItem' - MAPPED_READS = 'MappedReads' - MAPPED_ITEM = 'MappedItem' - MATCH_CODES = 'MatchCodes' - MATCH_ITEM = 'Code' - READS = 'Reads' - - ELAND_SINGLE = 0 - ELAND_MULTI = 1 - ELAND_EXTENDED = 2 - ELAND_EXPORT = 3 + LANE = 'ResultLane' - def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None): + def __init__(self, pathname=None, lane_id=None, end=None, xml=None): self.pathname = pathname self._sample_name = None self.lane_id = lane_id self.end = end self._reads = None + + if xml is not None: + self.set_elements(xml) + + def _update(self): + """ + Actually read the file and actually count the reads + """ + raise NotImplementedError("Can't count abstract classes") + + def _update_name(self): + # extract the sample name + if self.pathname is None: + return + + path, name = os.path.split(self.pathname) + split_name = name.split('_') + self._sample_name = split_name[0] + + def _get_sample_name(self): + if self._sample_name is None: + self._update_name() + return self._sample_name + sample_name = property(_get_sample_name) + + def _get_reads(self): + if self._reads is None: + self._update() + return self._reads + reads = property(_get_reads) + + +class ElandLane(ResultLane): + """ + Process an eland result file + """ + XML_VERSION = 2 + LANE = "ElandLane" + + def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None): + super(ElandLane, self).__init__(pathname, lane_id, xml) + self._mapped_reads = None self._match_codes = None if genome_map is None: @@ -55,15 +101,15 @@ class ElandLane(object): # attempt autodetect eland file type pathn, name = os.path.split(pathname) if re.search('result', name): - self.eland_type = ElandLane.ELAND_SINGLE + self.eland_type = ELAND_SINGLE elif re.search('multi', name): - self.eland_type = ElandLane.ELAND_MULTI + self.eland_type = ELAND_MULTI elif re.search('extended', name): - self.eland_type = ElandLane.ELAND_EXTENDED + self.eland_type = ELAND_EXTENDED elif re.search('export', name): - self.eland_type = ElandLane.ELAND_EXPORT + self.eland_type = ELAND_EXPORT else: - self.eland_type = ElandLane.ELAND_SINGLE + self.eland_type = ELAND_SINGLE def _update(self): """ @@ -79,10 +125,10 @@ class ElandLane(object): logging.info("summarizing results for %s" % (self.pathname)) - if self.eland_type == ElandLane.ELAND_SINGLE: + if self.eland_type == ELAND_SINGLE: result = self._update_eland_result(self.pathname) - elif self.eland_type == ElandLane.ELAND_MULTI or \ - self.eland_type == ElandLane.ELAND_EXTENDED: + elif self.eland_type == ELAND_MULTI or \ + self.eland_type == ELAND_EXTENDED: result = self._update_eland_multi(self.pathname) else: raise NotImplementedError("Only support single/multi/extended eland files") @@ -163,27 +209,6 @@ class ElandLane(object): mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 return match_codes, mapped_reads, reads - def _update_name(self): - # extract the sample name - if self.pathname is None: - return - - path, name = os.path.split(self.pathname) - split_name = name.split('_') - self._sample_name = split_name[0] - - def _get_sample_name(self): - if self._sample_name is None: - self._update_name() - return self._sample_name - sample_name = property(_get_sample_name) - - def _get_reads(self): - if self._reads is None: - self._update() - return self._reads - reads = property(_get_reads) - def _get_mapped_reads(self): if self._mapped_reads is None: self._update() @@ -244,29 +269,29 @@ class ElandLane(object): lane = ElementTree.Element(ElandLane.LANE, {'version': unicode(ElandLane.XML_VERSION)}) - sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) + sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME) sample_tag.text = self.sample_name - lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) + lane_tag = ElementTree.SubElement(lane, LANE_ID) lane_tag.text = str(self.lane_id) if self.end is not None: - end_tag = ElementTree.SubElement(lane, ElandLane.END) + end_tag = ElementTree.SubElement(lane, END) end_tag.text = str(self.end) - genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) + genome_map = ElementTree.SubElement(lane, GENOME_MAP) for k, v in self.genome_map.items(): item = ElementTree.SubElement( - genome_map, ElandLane.GENOME_ITEM, + genome_map, GENOME_ITEM, {'name':k, 'value':unicode(v)}) - mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) + mapped_reads = ElementTree.SubElement(lane, MAPPED_READS) for k, v in self.mapped_reads.items(): item = ElementTree.SubElement( - mapped_reads, ElandLane.MAPPED_ITEM, + mapped_reads, MAPPED_ITEM, {'name':k, 'value':unicode(v)}) - match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) + match_codes = ElementTree.SubElement(lane, MATCH_CODES) for k, v in self.match_codes.items(): item = ElementTree.SubElement( - match_codes, ElandLane.MATCH_ITEM, + match_codes, MATCH_ITEM, {'name':k, 'value':unicode(v)}) - reads = ElementTree.SubElement(lane, ElandLane.READS) + reads = ElementTree.SubElement(lane, READS) reads.text = unicode(self.reads) return lane @@ -281,37 +306,132 @@ class ElandLane(object): for element in tree: tag = element.tag.lower() - if tag == ElandLane.SAMPLE_NAME.lower(): + if tag == SAMPLE_NAME.lower(): self._sample_name = element.text - elif tag == ElandLane.LANE_ID.lower(): + elif tag == LANE_ID.lower(): self.lane_id = int(element.text) - elif tag == ElandLane.END.lower(): + elif tag == END.lower(): self.end = int(element.text) - elif tag == ElandLane.GENOME_MAP.lower(): + elif tag == GENOME_MAP.lower(): for child in element: name = child.attrib['name'] value = child.attrib['value'] self.genome_map[name] = value - elif tag == ElandLane.MAPPED_READS.lower(): + elif tag == MAPPED_READS.lower(): for child in element: name = child.attrib['name'] value = child.attrib['value'] self._mapped_reads[name] = int(value) - elif tag == ElandLane.MATCH_CODES.lower(): + elif tag == MATCH_CODES.lower(): for child in element: name = child.attrib['name'] value = int(child.attrib['value']) self._match_codes[name] = value - elif tag == ElandLane.READS.lower(): + elif tag == READS.lower(): self._reads = int(element.text) else: logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) +class SequenceLane(ResultLane): + XML_VERSION=1 + LANE = 'SequenceLane' + SEQUENCE_TYPE = 'SequenceType' + + NONE_TYPE = None + SCARF_TYPE = 1 + FASTQ_TYPE = 2 + SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' } + + def __init__(self, pathname=None, lane_id=None, end=None, xml=None): + self.sequence_type = None + super(SequenceLane, self).__init__(pathname, lane_id, end, xml) + + def _guess_sequence_type(self, pathname): + """ + Determine if we have a scarf or fastq sequence file + """ + f = open(pathname,'r') + l = f.readline() + f.close() + + if l[0] == '@': + # fastq starts with a @ + self.sequence_type = SequenceLane.FASTQ_TYPE + else: + self.sequence_type = SequenceLane.SCARF_TYPE + return self.sequence_type + + def _update(self): + """ + Actually read the file and actually count the reads + """ + # can't do anything if we don't have a file to process + if self.pathname is None: + return + + if os.stat(self.pathname)[stat.ST_SIZE] == 0: + raise RuntimeError("Sequencing isn't done, try again later.") + + self._guess_sequence_type(self.pathname) + + logging.info("summarizing results for %s" % (self.pathname)) + lines = 0 + f = open(self.pathname) + for l in f.xreadlines(): + lines += 1 + f.close() + + if self.sequence_type == SequenceLane.SCARF_TYPE: + self._reads = lines + elif self.sequence_type == SequenceLane.FASTQ_TYPE: + self._reads = lines / 4 + else: + raise NotImplementedError("This only supports scarf or fastq squence files") + + def get_elements(self): + lane = ElementTree.Element(SequenceLane.LANE, + {'version': + unicode(SequenceLane.XML_VERSION)}) + sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME) + sample_tag.text = self.sample_name + lane_tag = ElementTree.SubElement(lane, LANE_ID) + lane_tag.text = str(self.lane_id) + if self.end is not None: + end_tag = ElementTree.SubElement(lane, END) + end_tag.text = str(self.end) + reads = ElementTree.SubElement(lane, READS) + reads.text = unicode(self.reads) + sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE) + sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type]) + + return lane + + def set_elements(self, tree): + if tree.tag != SequenceLane.LANE: + raise ValueError('Exptecting %s' % (SequenceLane.LANE,)) + lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()]) + + for element in tree: + tag = element.tag.lower() + if tag == SAMPLE_NAME.lower(): + self._sample_name = element.text + elif tag == LANE_ID.lower(): + self.lane_id = int(element.text) + elif tag == END.lower(): + self.end = int(element.text) + elif tag == READS.lower(): + self._reads = int(element.text) + elif tag == SequenceLane.SEQUENCE_TYPE.lower(): + self.sequence_type = lookup_sequence_type.get(element.text, None) + print self.sequence_type + else: + logging.warn("SequenceLane unrecognized tag %s" % (element.tag,)) + class ELAND(object): """ Summarize information from eland files """ - XML_VERSION = 2 + XML_VERSION = 3 ELAND = 'ElandCollection' LANE = 'Lane' @@ -343,7 +463,11 @@ class ELAND(object): for element in list(tree): lane_id = int(element.attrib[ELAND.LANE_ID]) end = int(element.attrib.get(ELAND.END, 0)) - lane = ElandLane(xml=element) + if element.tag.lower() == ElandLane.LANE.lower(): + lane = ElandLane(xml=element) + elif element.tag.lower() == SequenceLane.LANE.lower(): + lane = SequenceLane(xml=element) + self.results[end][lane_id] = lane def check_for_eland_file(basedir, pattern, lane_id, end): @@ -360,6 +484,44 @@ def check_for_eland_file(basedir, pattern, lane_id, end): else: return None +def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps): + # yes the lane_id is also being computed in ElandLane._update + # I didn't want to clutter up my constructor + # but I needed to persist the sample_name/lane_id for + # runfolder summary_report + path, name = os.path.split(pathname) + logging.info("Adding eland file %s" %(name,)) + # split_name = name.split('_') + # lane_id = int(split_name[1]) + + if genome_maps is not None: + genome_map = genome_maps[lane_id] + elif gerald is not None: + genome_dir = gerald.lanes[lane_id].eland_genome + genome_map = build_genome_fasta_map(genome_dir) + else: + genome_map = {} + + lane = ElandLane(pathname, lane_id, end, genome_map) + + if end is None: + effective_end = 0 + else: + effective_end = end - 1 + + results[effective_end][lane_id] = lane + +def update_result_with_sequence(gerald, results, lane_id, end, pathname): + result = SequenceLane(pathname, lane_id, end) + + if end is None: + effective_end = 0 + else: + effective_end = end - 1 + + results[effective_end][lane_id] = result + + def eland(gerald_dir, gerald=None, genome_maps=None): e = ELAND() @@ -380,50 +542,34 @@ def eland(gerald_dir, gerald=None, genome_maps=None): # the order in patterns determines the preference for what # will be found. - patterns = ['s_%s_eland_result.txt', - 's_%s_eland_result.txt.bz2', - 's_%s_eland_result.txt.gz', - 's_%s_eland_extended.txt', - 's_%s_eland_extended.txt.bz2', - 's_%s_eland_extended.txt.gz', - 's_%s_eland_multi.txt', - 's_%s_eland_multi.txt.bz2', - 's_%s_eland_multi.txt.gz',] + MAPPED_ELAND = 0 + SEQUENCE = 1 + patterns = [('s_%s_eland_result.txt', MAPPED_ELAND), + ('s_%s_eland_result.txt.bz2', MAPPED_ELAND), + ('s_%s_eland_result.txt.gz', MAPPED_ELAND), + ('s_%s_eland_extended.txt', MAPPED_ELAND), + ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND), + ('s_%s_eland_extended.txt.gz', MAPPED_ELAND), + ('s_%s_eland_multi.txt', MAPPED_ELAND), + ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND), + ('s_%s_eland_multi.txt.gz', MAPPED_ELAND), + ('s_%s_sequence.txt', SEQUENCE),] for basedir in basedirs: for end in ends: for lane_id in lane_ids: for p in patterns: - pathname = check_for_eland_file(basedir, p, lane_id, end) + pathname = check_for_eland_file(basedir, p[0], lane_id, end) if pathname is not None: - break + if p[1] == MAPPED_ELAND: + update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps) + elif p[1] == SEQUENCE: + update_result_with_sequence(gerald, e.results, lane_id, end, pathname) + break else: logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end)) continue - # yes the lane_id is also being computed in ElandLane._update - # I didn't want to clutter up my constructor - # but I needed to persist the sample_name/lane_id for - # runfolder summary_report - path, name = os.path.split(pathname) - logging.info("Adding eland file %s" %(name,)) - # split_name = name.split('_') - # lane_id = int(split_name[1]) - - if genome_maps is not None: - genome_map = genome_maps[lane_id] - elif gerald is not None: - genome_dir = gerald.lanes[lane_id].eland_genome - genome_map = build_genome_fasta_map(genome_dir) - else: - genome_map = {} - - eland_result = ElandLane(pathname, lane_id, end, genome_map) - if end is None: - effective_end = 0 - else: - effective_end = end - 1 - e.results[effective_end][lane_id] = eland_result return e def build_genome_fasta_map(genome_dir): diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 56453af..11795c2 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -313,22 +313,27 @@ def summarize_lane(gerald, lane_id): cluster = summary_results[end][eland_result.lane_id].cluster report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) report.append("Total Reads: %d" % (eland_result.reads)) - mc = eland_result._match_codes - nm = mc['NM'] - nm_percent = float(nm)/eland_result.reads * 100 - qc = mc['QC'] - qc_percent = float(qc)/eland_result.reads * 100 - - report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) - report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) - report.append('Unique (0,1,2 mismatches) %d %d %d' % \ - (mc['U0'], mc['U1'], mc['U2'])) - report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ - (mc['R0'], mc['R1'], mc['R2'])) - report.append("Mapped Reads") - mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads) - for name, counts in mapped_reads.items(): - report.append(" %s: %d" % (name, counts)) + + if hasattr(eland_result, 'match_codes'): + mc = eland_result.match_codes + nm = mc['NM'] + nm_percent = float(nm)/eland_result.reads * 100 + qc = mc['QC'] + qc_percent = float(qc)/eland_result.reads * 100 + + report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) + report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) + report.append('Unique (0,1,2 mismatches) %d %d %d' % \ + (mc['U0'], mc['U1'], mc['U2'])) + report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ + (mc['R0'], mc['R1'], mc['R2'])) + + if hasattr(eland_result, 'genome_map'): + report.append("Mapped Reads") + mapped_reads = summarize_mapped_reads(eland_result.genome_map, eland_result.mapped_reads) + for name, counts in mapped_reads.items(): + report.append(" %s: %d" % (name, counts)) + report.append('') return report @@ -415,7 +420,8 @@ def extract_results(runs, output_base_dir=None): logging.info("Running bzip2: " + " ".join(bzip_cmd)) logging.info("Writing to %s" %(tar_dest_name)) - tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, + env = {'BZIP': '-9'} + tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env, cwd=scores_path) bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) tar.wait() diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index 455d8c1..2e340d2 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -7,7 +7,8 @@ import shutil TEST_CODE_DIR = os.path.split(__file__)[0] TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata') - +LANE_LIST = range(1,9) + def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37): firecrest_dir = os.path.join(data_dir, 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version) @@ -118,7 +119,7 @@ def make_eland_results(gerald_dir): f.write(eland_result) f.close() -def make_eland_multi(gerald_dir, paired=False): +def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST): eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788 AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT NM >HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:2 chr5.fa:55269838R0 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379 AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG 2:1:0 chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0 @@ -135,16 +136,51 @@ def make_eland_multi(gerald_dir, paired=False): """] if paired: for e in [1,2]: - for i in range(1,9): + for i in lane_list: pathname = os.path.join(gerald_dir, 's_%d_%d_eland_multi.txt' % (i,e)) f = open(pathname, 'w') f.write(eland_multi[e-1]) f.close() else: - for i in range(1,9): + for i in lane_list: pathname = os.path.join(gerald_dir, 's_%d_eland_multi.txt' % (i,)) f = open(pathname, 'w') f.write(eland_multi[0]) f.close() + +def make_scarf(gerald_dir, lane_list=LANE_LIST): + seq = """HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER +HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a +HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V +HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^ +HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE +""" + for l in lane_list: + pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,)) + f = open(pathname,'w') + f.write(seq) + f.close() + +def make_fastq(gerald_dir, lane_list=LANE_LIST): + seq = """@HWI-EAS229:1:2:182:712#0/1 +AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA ++HWI-EAS229:1:2:182:712#0/1 +\bab_bbaabbababbaaa]]D]bb_baabbab\baa +@HWI-EAS229:1:2:198:621#0/1 +CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC ++HWI-EAS229:1:2:198:621#0/1 +[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa +@HWI-EAS229:1:2:209:1321#0/1 +AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA ++HWI-EAS229:1:2:209:1321#0/1 +_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa +""" + for l in lane_list: + pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,)) + f = open(pathname,'w') + f.write(seq) + f.close() + + diff --git a/htsworkflow/pipelines/test/test_runfolder_ipar130.py b/htsworkflow/pipelines/test/test_runfolder_ipar130.py index c782e4f..9e6ac22 100644 --- a/htsworkflow/pipelines/test/test_runfolder_ipar130.py +++ b/htsworkflow/pipelines/test/test_runfolder_ipar130.py @@ -6,6 +6,7 @@ import tempfile import shutil import unittest +from htsworkflow.pipelines import eland from htsworkflow.pipelines import ipar from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald @@ -42,7 +43,9 @@ def make_runfolder(obj=None): os.mkdir(gerald_dir) make_gerald_config_100(gerald_dir) make_summary_ipar130_htm(gerald_dir) - make_eland_multi(gerald_dir) + make_eland_multi(gerald_dir, lane_list=[1,2,3,4,5,6,]) + make_scarf(gerald_dir, lane_list=[7,]) + make_fastq(gerald_dir, lane_list=[8,]) if obj is not None: obj.temp_dir = temp_dir @@ -222,17 +225,18 @@ class RunfolderTests(unittest.TestCase): g2_results = g2_eland.results[0][lane] self.failUnlessEqual(g_results.reads, g2_results.reads) - self.failUnlessEqual(len(g_results.mapped_reads), - len(g2_results.mapped_reads)) - for k in g_results.mapped_reads.keys(): - self.failUnlessEqual(g_results.mapped_reads[k], - g2_results.mapped_reads[k]) + if isinstance(g_results, eland.ElandLane): + self.failUnlessEqual(len(g_results.mapped_reads), + len(g2_results.mapped_reads)) + for k in g_results.mapped_reads.keys(): + self.failUnlessEqual(g_results.mapped_reads[k], + g2_results.mapped_reads[k]) - self.failUnlessEqual(len(g_results.match_codes), - len(g2_results.match_codes)) - for k in g_results.match_codes.keys(): - self.failUnlessEqual(g_results.match_codes[k], - g2_results.match_codes[k]) + self.failUnlessEqual(len(g_results.match_codes), + len(g2_results.match_codes)) + for k in g_results.match_codes.keys(): + self.failUnlessEqual(g_results.match_codes[k], + g2_results.match_codes[k]) def test_eland(self): @@ -244,10 +248,11 @@ class RunfolderTests(unittest.TestCase): genome_maps = { 1:hg_map, 2:hg_map, 3:hg_map, 4:hg_map, 5:hg_map, 6:hg_map, 7:hg_map, 8:hg_map } - eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + eland_container = gerald.eland(self.gerald_dir, genome_maps=genome_maps) - for i in range(1,9): - lane = eland.results[0][i] + # I added sequence lanes to the last 2 lanes of this test case + for i in range(1,7): + lane = eland_container.results[0][i] self.failUnlessEqual(lane.reads, 6) self.failUnlessEqual(lane.sample_name, "s") self.failUnlessEqual(lane.lane_id, i) @@ -262,28 +267,47 @@ class RunfolderTests(unittest.TestCase): self.failUnlessEqual(lane.match_codes['NM'], 1) self.failUnlessEqual(lane.match_codes['QC'], 0) - xml = eland.get_elements() + # test scarf + lane = eland_container.results[0][7] + self.failUnlessEqual(lane.reads, 5) + self.failUnlessEqual(lane.sample_name, 's') + self.failUnlessEqual(lane.lane_id, 7) + self.failUnlessEqual(lane.sequence_type, eland.SequenceLane.SCARF_TYPE) + + # test fastq + lane = eland_container.results[0][8] + self.failUnlessEqual(lane.reads, 3) + self.failUnlessEqual(lane.sample_name, 's') + self.failUnlessEqual(lane.lane_id, 8) + self.failUnlessEqual(lane.sequence_type, eland.SequenceLane.FASTQ_TYPE) + + xml = eland_container.get_elements() # just make sure that element tree can serialize the tree xml_str = ElementTree.tostring(xml) e2 = gerald.ELAND(xml=xml) for i in range(1,9): - l1 = eland.results[0][i] + l1 = eland_container.results[0][i] l2 = e2.results[0][i] self.failUnlessEqual(l1.reads, l2.reads) self.failUnlessEqual(l1.sample_name, l2.sample_name) self.failUnlessEqual(l1.lane_id, l2.lane_id) - self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 17) - for k in l1.mapped_reads.keys(): - self.failUnlessEqual(l1.mapped_reads[k], - l2.mapped_reads[k]) - - self.failUnlessEqual(len(l1.match_codes), 9) - self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) - for k in l1.match_codes.keys(): - self.failUnlessEqual(l1.match_codes[k], - l2.match_codes[k]) + if isinstance(l1, eland.ElandLane): + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 17) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + elif isinstance(l1, eland.SequenceLane): + print 'l1', l1.__dict__ + print 'l2', l2.__dict__ + self.failUnlessEqual(l1.sequence_type, l2.sequence_type) def test_runfolder(self): runs = runfolder.get_runs(self.runfolder_dir)