From a644279b03e32ecea147badd8def47a7b0b16f32 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Tue, 9 Dec 2008 01:19:23 +0000 Subject: [PATCH] Test 1.1rc1 style runs, which unfortunately require a hack for parsing the summary.htm files since illumina's html is invalid. They forgot to use &lt; when writing <=. Most web browsers will ignore it, but ElementTree is pickier. Also as of this commit the summary parsing code still doesn't understand paired end runs so the paired end summary file parsing tests still fail. --- htsworkflow/pipelines/summary.py | 6 +- .../pipelines/test/simulate_runfolder.py | 406 ++++++++++++++++++ .../pipelines/test/test_runfolder110.py | 292 +++++++++++++ .../pipelines/test/test_runfolder_pair.py | 33 +- 4 files changed, 721 insertions(+), 16 deletions(-) create mode 100644 htsworkflow/pipelines/test/test_runfolder110.py diff --git a/htsworkflow/pipelines/summary.py b/htsworkflow/pipelines/summary.py index 8830177..ad07d06 100644 --- a/htsworkflow/pipelines/summary.py +++ b/htsworkflow/pipelines/summary.py @@ -161,7 +161,11 @@ class Summary(object): The contents of the h2 tag is considered to the name of the table. """ - tree = ElementTree.parse(pathname).getroot() + # tree = ElementTree.parse(pathname).getroot() + # hack for 1.1rc1, this should be removed when possible. + file_body = open(pathname).read() + file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=') + tree = ElementTree.fromstring(file_body) body = tree.find('body') tables = {} for i in range(len(body)): diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index f527a7b..112201a 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -825,6 +825,412 @@ def make_summary100_htm(gerald_dir): f.write(summary_htm) f.close() +def make_summary_htm_110(gerald_dir): + summary_htm = """ + + + + +

081017_HWI-EAS229_0062_30J55AAXX Summary

+

Summary Information For Experiment 081017_HWI-EAS229_0062_30J55AAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder081017_HWI-EAS229_0062_30J55AAXX
Chip IDunknown
+



Chip Results Summary

+ + + + + + + + + + +
ClustersClusters (PF)Yield (kbases)
162491175996221593686019
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterChast. Thresh.Num TilesTiles
1unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 1
2unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 2
3unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 3
4unknownhg18ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 4
5unknownhg18ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 5
6unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 6
7unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 7
8unknownmm9ELAND37'((FAILED_CHASTITY<=1))'0.6100Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoTile Mean +/- SD for Lane
Lane Lane Yield (kbases) Clusters (raw)Clusters (PF) 1st Cycle Int (PF) % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Alignment Score (PF) % Error Rate (PF)
1435340190220 +/- 15118117659 +/- 8144273 +/- 1680.02 +/- 2.5262.15 +/- 5.5477.18 +/- 0.2213447.28 +/- 43.352.78 +/- 0.13
2462364190560 +/- 14399124963 +/- 5687271 +/- 1675.73 +/- 2.4665.83 +/- 4.1270.06 +/- 0.3912082.95 +/- 64.813.22 +/- 0.09
3468929187597 +/- 12369126737 +/- 5549274 +/- 1672.61 +/- 2.6767.69 +/- 2.7274.03 +/- 0.2212470.18 +/- 50.024.27 +/- 0.08
4491642204142 +/- 16877132876 +/- 4023253 +/- 1680.43 +/- 3.1065.39 +/- 3.8472.95 +/- 0.1513273.80 +/- 39.750.78 +/- 0.10
5433033247308 +/- 11600117036 +/- 4489273 +/- 1168.60 +/- 2.4047.48 +/- 3.6366.91 +/- 0.5411700.08 +/- 66.332.62 +/- 0.13
6483012204298 +/- 15640130543 +/- 6972254 +/- 1181.35 +/- 1.9664.14 +/- 4.4077.28 +/- 0.1114084.01 +/- 23.090.71 +/- 0.03
7474325202707 +/- 15404128196 +/- 9745255 +/- 1379.95 +/- 2.0863.48 +/- 5.6375.78 +/- 0.1813758.74 +/- 60.860.88 +/- 0.12
8437372198075 +/- 14702118208 +/- 14798259 +/- 1481.80 +/- 2.5359.85 +/- 7.6774.55 +/- 0.3613586.07 +/- 103.970.71 +/- 0.15
Tile mean across chip
Av.20311312452726477.5662.0073.5913050.392.00
+



Expanded Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoPhasing InfoRaw Data (tile mean)Filtered Data (tile mean)
Lane Clusters (tile mean) (raw)% Phasing % Prephasing % Error Rate (raw) Equiv Perfect Clusters (raw) % retained Cycle 2-4 Av Int (PF) Cycle 2-10 Av % Loss (PF) Cycle 10-20 Av % Loss (PF) % Align (PF) % Error Rate (PF) Equiv Perfect Clusters (PF)
11902200.68000.28003.1710726262.15241 +/- 130.56 +/- 0.220.29 +/- 0.1477.182.7886184
21905600.68000.28003.539867865.83238 +/- 140.78 +/- 0.150.53 +/- 0.1570.063.2283090
31875970.68000.28004.4410400867.69233 +/- 140.56 +/- 0.170.59 +/- 0.2674.034.2789278
42041420.68000.28001.3811576565.39239 +/- 141.28 +/- 0.210.77 +/- 0.2172.950.7893475
52473080.68000.28003.4010300647.48242 +/- 101.61 +/- 0.391.21 +/- 0.2166.912.6273768
62042980.68000.28001.3312223364.14242 +/- 121.30 +/- 0.110.73 +/- 0.2277.280.7197646
72027070.68000.28001.5111751363.48238 +/- 131.27 +/- 0.380.66 +/- 0.2275.780.8893659
81980750.68000.28001.4111111559.85244 +/- 121.19 +/- 0.160.65 +/- 0.2974.550.7185327
+ +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + def make_summary_paired_htm(gerald_dir): summary_htm = """ diff --git a/htsworkflow/pipelines/test/test_runfolder110.py b/htsworkflow/pipelines/test/test_runfolder110.py new file mode 100644 index 0000000..e1f3fdd --- /dev/null +++ b/htsworkflow/pipelines/test/test_runfolder110.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipelines import firecrest +from htsworkflow.pipelines import bustard +from htsworkflow.pipelines import gerald +from htsworkflow.pipelines import runfolder +from htsworkflow.pipelines.runfolder import ElementTree + +from htsworkflow.pipelines.test.simulate_runfolder import * + + +def make_runfolder(obj=None): + """ + Make a fake runfolder, attach all the directories to obj if defined + """ + # make a fake runfolder directory + temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + runfolder_dir = os.path.join(temp_dir, + '081017_HWI-EAS229_0062_30J55AAXX') + os.mkdir(runfolder_dir) + + data_dir = os.path.join(runfolder_dir, 'Data') + os.mkdir(data_dir) + + firecrest_dir = os.path.join(data_dir, + 'C1-37_Firecrest1.9.6_20-10-2008_diane') + os.mkdir(firecrest_dir) + + matrix_dir = os.path.join(firecrest_dir, 'Matrix') + os.mkdir(matrix_dir) + make_matrix(matrix_dir) + + bustard_dir = os.path.join(firecrest_dir, + 'Bustard1.9.6_20-10-2008_diane') + os.mkdir(bustard_dir) + make_phasing_params(bustard_dir) + + gerald_dir = os.path.join(bustard_dir, + 'GERALD_20-10-2008_diane') + os.mkdir(gerald_dir) + make_gerald_config(gerald_dir) + make_summary_htm_110(gerald_dir) + make_eland_multi(gerald_dir) + + if obj is not None: + obj.temp_dir = temp_dir + obj.runfolder_dir = runfolder_dir + obj.data_dir = data_dir + obj.image_analysis_dir = firecrest_dir + obj.matrix_dir = matrix_dir + obj.bustard_dir = bustard_dir + 
obj.gerald_dir = gerald_dir + + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # attaches all the directories to the object passed in + make_runfolder(self) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.image_analysis_dir) + self.failUnlessEqual(f.version, '1.9.6') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 37) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,10,20)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.9.6') + self.failUnlessEqual(b.date, date(2008,10,20)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: 
GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # I want to be able to use a simple iterator + for l in g.lanes.values(): + self.failUnlessEqual(l.analysis, 'eland') + self.failUnlessEqual(l.read_length, '32') + self.failUnlessEqual(l.use_bases, 'Y'*32) + + # raw cluster numbers extracted from summary file + # its the first +/- value in the lane results summary + # section + clusters = [None, + (190220, 15118), (190560, 14399), + (187597, 12369), (204142, 16877), + (247308, 11600), (204298, 15640), + (202707, 15404), (198075, 14702),] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config 
file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + hg_map = {'Lambda.fa': 'Lambda.fa'} + for i in range(1,22): + short_name = 'chr%d.fa' % (i,) + long_name = 'hg18/chr%d.fa' % (i,) + hg_map[short_name] = long_name + + genome_maps = { '1':hg_map, '2':hg_map, '3':hg_map, '4':hg_map, + '5':hg_map, '6':hg_map, '7':hg_map, '8':hg_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 15) + self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4) + self.failUnlessEqual(lane.match_codes['U0'], 1) + self.failUnlessEqual(lane.match_codes['R0'], 2) + self.failUnlessEqual(lane.match_codes['U1'], 
1) + self.failUnlessEqual(lane.match_codes['R1'], 9) + self.failUnlessEqual(lane.match_codes['U2'], 0) + self.failUnlessEqual(lane.match_codes['R2'], 12) + self.failUnlessEqual(lane.match_codes['NM'], 1) + self.failUnlessEqual(lane.match_codes['QC'], 0) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 15) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? 
+ self.failUnlessEqual(len(runs), 1) + name = 'run_30J55AAXX_2008-10-20.xml' + self.failUnlessEqual(runs[0].name, name) + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '30J55AAXX') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + name = 'run_30J55AAXX_2008-10-20.xml' + self.failUnlessEqual(runs[0].name, name) + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.image_analysis, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htsworkflow/pipelines/test/test_runfolder_pair.py b/htsworkflow/pipelines/test/test_runfolder_pair.py index e5997b7..20dc0d7 100644 --- a/htsworkflow/pipelines/test/test_runfolder_pair.py +++ b/htsworkflow/pipelines/test/test_runfolder_pair.py @@ -6,7 +6,7 @@ import tempfile import shutil import unittest -from htsworkflow.pipelines import ipar +from htsworkflow.pipelines import firecrest from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald from htsworkflow.pipelines import runfolder @@ -29,7 +29,7 @@ def make_runfolder(obj=None): data_dir = os.path.join(runfolder_dir, 'Data') os.mkdir(data_dir) - ipar_dir = make_firecrest_dir(data_dir, "1.9.2", 1, 74) + ipar_dir = make_firecrest_dir(data_dir, "1.9.6", 1, 152) matrix_dir = os.path.join(ipar_dir, 'Matrix') os.mkdir(matrix_dir) @@ -69,25 +69,28 @@ class RunfolderTests(unittest.TestCase): def tearDown(self): shutil.rmtree(self.temp_dir) - def test_ipar(self): + def test_firecrest(self): """ Construct a firecrest object """ - i = ipar.ipar(self.image_analysis_dir) - self.failUnlessEqual(i.version, '2.01.192.0') - self.failUnlessEqual(i.start, 1) - self.failUnlessEqual(i.stop, 
37) - - xml = i.get_elements() + f = firecrest.firecrest(self.image_analysis_dir) + self.failUnlessEqual(f.version, '1.9.6') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 152) + self.failUnlessEqual(f.user, 'diane') + # As of 2008-12-8, the date was being set in + # simulate_runfolder.make_firecrest_dir + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() # just make sure that element tree can serialize the tree xml_str = ElementTree.tostring(xml) - i2 = ipar.IPAR(xml=xml) - self.failUnlessEqual(i.version, i2.version) - self.failUnlessEqual(i.start, i2.start) - self.failUnlessEqual(i.stop, i2.stop) - self.failUnlessEqual(i.date, i2.date) - self.failUnlessEqual(i.file_list(), i2.file_list()) + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) def test_bustard(self): """ -- 2.30.2