The summary parsing code now seems to handle paired end runs
[htsworkflow.git] / htsworkflow / pipelines / test / test_runfolder026.py
1 #!/usr/bin/env python
2
3 from datetime import datetime, date
4 import os
5 import tempfile
6 import shutil
7 import unittest
8
9 from htsworkflow.pipelines import firecrest
10 from htsworkflow.pipelines import bustard
11 from htsworkflow.pipelines import gerald
12 from htsworkflow.pipelines import runfolder
13 from htsworkflow.pipelines.runfolder import ElementTree
14
15 from htsworkflow.pipelines.test.simulate_runfolder import *
16
17
18 def make_summary_htm(gerald_dir):
19     summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
20 <!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
21 <html>
22 <body>
23
24 <a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
25 <h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
26 <h2><br></br>Chip Summary<br></br></h2>
27 <table border="1" cellpadding="5">
28 <tr><td>Machine</td><td>HWI-EAS229</td></tr>
29 <tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
30 <tr><td>Chip ID</td><td>unknown</td></tr>
31 </table>
32 <h2><br></br>Lane Parameter Summary<br></br></h2>
33 <table border="1" cellpadding="5">
34 <tr>
35 <td>Lane</td>
36 <td>Sample ID</td>
37 <td>Sample Target</td>
38 <td>Sample Type</td>
39 <td>Length</td>
40 <td>Filter</td>
41 <td>Tiles</td>
42 </tr>
43 <tr>
44 <td>1</td>
45 <td>unknown</td>
46 <td>dm3</td>
47 <td>ELAND</td>
48 <td>32</td>
49 <td>'((CHASTITY>=0.6))'</td>
50 <td><a href="#Lane1">Lane 1</a></td>
51 </tr>
52 <tr>
53 <td>2</td>
54 <td>unknown</td>
55 <td>equcab1</td>
56 <td>ELAND</td>
57 <td>32</td>
58 <td>'((CHASTITY>=0.6))'</td>
59 <td><a href="#Lane2">Lane 2</a></td>
60 </tr>
61 <tr>
62 <td>3</td>
63 <td>unknown</td>
64 <td>equcab1</td>
65 <td>ELAND</td>
66 <td>32</td>
67 <td>'((CHASTITY>=0.6))'</td>
68 <td><a href="#Lane3">Lane 3</a></td>
69 </tr>
70 <tr>
71 <td>4</td>
72 <td>unknown</td>
73 <td>canfam2</td>
74 <td>ELAND</td>
75 <td>32</td>
76 <td>'((CHASTITY>=0.6))'</td>
77 <td><a href="#Lane4">Lane 4</a></td>
78 </tr>
79 <tr>
80 <td>5</td>
81 <td>unknown</td>
82 <td>hg18</td>
83 <td>ELAND</td>
84 <td>32</td>
85 <td>'((CHASTITY>=0.6))'</td>
86 <td><a href="#Lane5">Lane 5</a></td>
87 </tr>
88 <tr>
89 <td>6</td>
90 <td>unknown</td>
91 <td>hg18</td>
92 <td>ELAND</td>
93 <td>32</td>
94 <td>'((CHASTITY>=0.6))'</td>
95 <td><a href="#Lane6">Lane 6</a></td>
96 </tr>
97 <tr>
98 <td>7</td>
99 <td>unknown</td>
100 <td>hg18</td>
101 <td>ELAND</td>
102 <td>32</td>
103 <td>'((CHASTITY>=0.6))'</td>
104 <td><a href="#Lane7">Lane 7</a></td>
105 </tr>
106 <tr>
107 <td>8</td>
108 <td>unknown</td>
109 <td>hg18</td>
110 <td>ELAND</td>
111 <td>32</td>
112 <td>'((CHASTITY>=0.6))'</td>
113 <td><a href="#Lane8">Lane 8</a></td>
114 </tr>
115 </table>
116 <h2><br></br>Lane Results Summary<br></br></h2>
117 <table border="1" cellpadding="5">
118 <tr>
119
120 <td>Lane </td>
121 <td>Clusters </td>
122 <td>Av 1st Cycle Int </td>
123 <td>% intensity after 20 cycles </td>
124 <td>% PF Clusters </td>
125 <td>% Align (PF) </td>
126 <td>Av Alignment Score (PF) </td>
127 <td> % Error Rate (PF) </td>
128 </tr>
129 <tr>
130 <td>1</td>
131 <td>17421 +/- 2139</td>
132 <td>7230 +/- 801</td>
133 <td>23.73 +/- 10.79</td>
134 <td>13.00 +/- 22.91</td>
135 <td>32.03 +/- 18.45</td>
136 <td>6703.57 +/- 3753.85</td>
137 <td>4.55 +/- 4.81</td>
138 </tr>
139 <tr>
140 <td>2</td>
141 <td>20311 +/- 2402</td>
142 <td>7660 +/- 678</td>
143 <td>17.03 +/- 4.40</td>
144 <td>40.74 +/- 30.33</td>
145 <td>29.54 +/- 9.03</td>
146 <td>5184.02 +/- 1631.54</td>
147 <td>3.27 +/- 3.94</td>
148 </tr>
149 <tr>
150 <td>3</td>
151 <td>20193 +/- 2399</td>
152 <td>7700 +/- 797</td>
153 <td>15.75 +/- 3.30</td>
154 <td>56.56 +/- 17.16</td>
155 <td>27.33 +/- 7.48</td>
156 <td>4803.49 +/- 1313.31</td>
157 <td>3.07 +/- 2.86</td>
158 </tr>
159 <tr>
160 <td>4</td>
161 <td>15537 +/- 2531</td>
162 <td>7620 +/- 1392</td>
163 <td>15.37 +/- 3.79</td>
164 <td>63.05 +/- 18.30</td>
165 <td>15.88 +/- 4.99</td>
166 <td>3162.13 +/- 962.59</td>
167 <td>3.11 +/- 2.22</td>
168 </tr>
169 <tr>
170 <td>5</td>
171 <td>32047 +/- 3356</td>
172 <td>8093 +/- 831</td>
173 <td>23.79 +/- 6.18</td>
174 <td>53.36 +/- 18.06</td>
175 <td>48.04 +/- 13.77</td>
176 <td>9866.23 +/- 2877.30</td>
177 <td>2.26 +/- 1.16</td>
178 </tr>
179 <tr>
180 <td>6</td>
181 <td>32946 +/- 4753</td>
182 <td>8227 +/- 736</td>
183 <td>24.07 +/- 4.69</td>
184 <td>54.65 +/- 12.57</td>
185 <td>50.98 +/- 10.54</td>
186 <td>10468.86 +/- 2228.53</td>
187 <td>2.21 +/- 2.33</td>
188 </tr>
189 <tr>
190 <td>7</td>
191 <td>39504 +/- 4171</td>
192 <td>8401 +/- 785</td>
193 <td>22.55 +/- 4.56</td>
194 <td>45.22 +/- 10.34</td>
195 <td>48.41 +/- 9.67</td>
196 <td>9829.40 +/- 1993.20</td>
197 <td>2.26 +/- 1.11</td>
198 </tr>
199 <tr>
200 <td>8</td>
201 <td>37998 +/- 3792</td>
202 <td>8443 +/- 1211</td>
203 <td>39.03 +/- 7.52</td>
204 <td>42.16 +/- 12.35</td>
205 <td>40.98 +/- 14.89</td>
206 <td>8128.87 +/- 3055.34</td>
207 <td>3.57 +/- 2.77</td>
208 </tr>
209 </table>
210 </body>
211 </html>
212 """
213     pathname = os.path.join(gerald_dir, 'Summary.htm')
214     f = open(pathname, 'w')
215     f.write(summary_htm)
216     f.close()
217
218 def make_eland_results(gerald_dir):
219     eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
220 >HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
221 >HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
222 >HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
223 """
224     for i in range(1,9):
225         pathname = os.path.join(gerald_dir, 
226                                 's_%d_eland_result.txt' % (i,))
227         f = open(pathname, 'w')
228         f.write(eland_result)
229         f.close()
230                      
231 class RunfolderTests(unittest.TestCase):
232     """
233     Test components of the runfolder processing code
234     which includes firecrest, bustard, and gerald
235     """
236     def setUp(self):
237         # make a fake runfolder directory
238         self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
239
240         self.runfolder_dir = os.path.join(self.temp_dir, 
241                                           '080102_HWI-EAS229_0010_207BTAAXX')
242         os.mkdir(self.runfolder_dir)
243
244         self.data_dir = os.path.join(self.runfolder_dir, 'Data')
245         os.mkdir(self.data_dir)
246
247         self.firecrest_dir = os.path.join(self.data_dir, 
248                                'C1-33_Firecrest1.8.28_12-04-2008_diane'
249                              )
250         os.mkdir(self.firecrest_dir)
251         self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
252         os.mkdir(self.matrix_dir)
253         make_matrix(self.matrix_dir)
254
255         self.bustard_dir = os.path.join(self.firecrest_dir, 
256                                         'Bustard1.8.28_12-04-2008_diane')
257         os.mkdir(self.bustard_dir)
258         make_phasing_params(self.bustard_dir)
259         
260         self.gerald_dir = os.path.join(self.bustard_dir,
261                                        'GERALD_12-04-2008_diane')
262         os.mkdir(self.gerald_dir)
263         make_gerald_config(self.gerald_dir)
264         make_summary_htm(self.gerald_dir)
265         make_eland_results(self.gerald_dir)
266
267     def tearDown(self):
268         shutil.rmtree(self.temp_dir)
269
270     def test_firecrest(self):
271         """
272         Construct a firecrest object
273         """
274         f = firecrest.firecrest(self.firecrest_dir)
275         self.failUnlessEqual(f.version, '1.8.28')
276         self.failUnlessEqual(f.start, 1)
277         self.failUnlessEqual(f.stop, 33)
278         self.failUnlessEqual(f.user, 'diane')
279         self.failUnlessEqual(f.date, date(2008,4,12))
280
281         xml = f.get_elements()
282         # just make sure that element tree can serialize the tree
283         xml_str = ElementTree.tostring(xml)
284
285         f2 = firecrest.Firecrest(xml=xml)
286         self.failUnlessEqual(f.version, f2.version)
287         self.failUnlessEqual(f.start,   f2.start)
288         self.failUnlessEqual(f.stop,    f2.stop)
289         self.failUnlessEqual(f.user,    f2.user)
290         self.failUnlessEqual(f.date,    f2.date)
291
292     def test_bustard(self):
293         """
294         construct a bustard object
295         """
296         b = bustard.bustard(self.bustard_dir)
297         self.failUnlessEqual(b.version, '1.8.28')
298         self.failUnlessEqual(b.date,    date(2008,4,12))
299         self.failUnlessEqual(b.user,    'diane')
300         self.failUnlessEqual(len(b.phasing), 8)
301         self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
302         
303         xml = b.get_elements()
304         b2 = bustard.Bustard(xml=xml)
305         self.failUnlessEqual(b.version, b2.version)
306         self.failUnlessEqual(b.date,    b2.date )
307         self.failUnlessEqual(b.user,    b2.user)
308         self.failUnlessEqual(len(b.phasing), len(b2.phasing))
309         for key in b.phasing.keys():
310             self.failUnlessEqual(b.phasing[key].lane, 
311                                  b2.phasing[key].lane)
312             self.failUnlessEqual(b.phasing[key].phasing, 
313                                  b2.phasing[key].phasing)
314             self.failUnlessEqual(b.phasing[key].prephasing, 
315                                  b2.phasing[key].prephasing)
316
317     def test_gerald(self):
318         # need to update gerald and make tests for it
319         g = gerald.gerald(self.gerald_dir) 
320
321         self.failUnlessEqual(g.version, 
322             '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
323         self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
324         self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
325         self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
326
327         
328         # list of genomes, matches what was defined up in 
329         # make_gerald_config.
330         # the first None is to offset the genomes list to be 1..9
331         # instead of pythons default 0..8
332         genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
333                          '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
334
335         # test lane specific parameters from gerald config file
336         for i in range(1,9):
337             cur_lane = g.lanes[i]
338             self.failUnlessEqual(cur_lane.analysis, 'eland')
339             self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
340             self.failUnlessEqual(cur_lane.read_length, '32')
341             self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
342
343         # test data extracted from summary file
344         clusters = [None, 
345                     (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
346                     (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]
347
348         self.failUnlessEqual(len(g.summary), 1)
349         for i in range(1,9):
350             summary_lane = g.summary[0][i]
351             self.failUnlessEqual(summary_lane.cluster, clusters[i])
352             self.failUnlessEqual(summary_lane.lane, i)
353
354         xml = g.get_elements()
355         # just make sure that element tree can serialize the tree
356         xml_str = ElementTree.tostring(xml)
357         g2 = gerald.Gerald(xml=xml)
358
359         # do it all again after extracting from the xml file
360         self.failUnlessEqual(g.version, g2.version)
361         self.failUnlessEqual(g.date, g2.date)
362         self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
363         self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
364
365         # test lane specific parameters from gerald config file
366         for i in range(1,9):
367             g_lane = g.lanes[i]
368             g2_lane = g2.lanes[i]
369             self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
370             self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
371             self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
372             self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
373
374         self.failUnlessEqual(len(g.summary), 1)
375         # test (some) summary elements
376         for i in range(1,9):
377             g_summary = g.summary[0][i]
378             g2_summary = g2.summary[0][i]
379             self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
380             self.failUnlessEqual(g_summary.lane, g2_summary.lane)
381
382             g_eland = g.eland_results
383             g2_eland = g2.eland_results
384             for lane in g_eland.keys():
385                 self.failUnlessEqual(g_eland[lane].reads, 
386                                      g2_eland[lane].reads)
387                 self.failUnlessEqual(len(g_eland[lane].mapped_reads), 
388                                      len(g2_eland[lane].mapped_reads))
389                 for k in g_eland[lane].mapped_reads.keys():
390                     self.failUnlessEqual(g_eland[lane].mapped_reads[k],
391                                          g2_eland[lane].mapped_reads[k])
392
393                 self.failUnlessEqual(len(g_eland[lane].match_codes), 
394                                      len(g2_eland[lane].match_codes))
395                 for k in g_eland[lane].match_codes.keys():
396                     self.failUnlessEqual(g_eland[lane].match_codes[k],
397                                          g2_eland[lane].match_codes[k])
398
399
400     def test_eland(self):
401         dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
402                     'chr2L.fa': 'dm3/chr2L.fa',
403                     'Lambda.fa': 'Lambda.fa'}
404         genome_maps = { 1:dm3_map, 2:dm3_map, 3:dm3_map, 4:dm3_map,
405                         5:dm3_map, 6:dm3_map, 7:dm3_map, 8:dm3_map }
406         eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
407         
408         for i in range(1,9):
409             lane = eland[i]
410             self.failUnlessEqual(lane.reads, 4)
411             self.failUnlessEqual(lane.sample_name, "s")
412             self.failUnlessEqual(lane.lane_id, i)
413             self.failUnlessEqual(len(lane.mapped_reads), 3)
414             self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
415             self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
416             self.failUnlessEqual(lane.match_codes['U1'], 2)
417             self.failUnlessEqual(lane.match_codes['NM'], 1)
418
419         xml = eland.get_elements()
420         # just make sure that element tree can serialize the tree
421         xml_str = ElementTree.tostring(xml)
422         e2 = gerald.ELAND(xml=xml)
423
424         for i in range(1,9):
425             l1 = eland[i]
426             l2 = e2[i]
427             self.failUnlessEqual(l1.reads, l2.reads)
428             self.failUnlessEqual(l1.sample_name, l2.sample_name)
429             self.failUnlessEqual(l1.lane_id, l2.lane_id)
430             self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
431             self.failUnlessEqual(len(l1.mapped_reads), 3)
432             for k in l1.mapped_reads.keys():
433                 self.failUnlessEqual(l1.mapped_reads[k],
434                                      l2.mapped_reads[k])
435
436             self.failUnlessEqual(len(l1.match_codes), 9)
437             self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
438             for k in l1.match_codes.keys():
439                 self.failUnlessEqual(l1.match_codes[k], 
440                                      l2.match_codes[k])
441
442     def test_runfolder(self):
443         runs = runfolder.get_runs(self.runfolder_dir)
444         
445         # do we get the flowcell id from the filename?
446         self.failUnlessEqual(len(runs), 1)
447         self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
448
449         # do we get the flowcell id from the FlowcellId.xml file
450         make_flowcell_id(self.runfolder_dir, '207BTAAXY')
451         runs = runfolder.get_runs(self.runfolder_dir)
452         self.failUnlessEqual(len(runs), 1)
453         self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
454         
455         r1 = runs[0]
456         xml = r1.get_elements()
457         xml_str = ElementTree.tostring(xml)
458
459         r2 = runfolder.PipelineRun(xml=xml)
460         self.failUnlessEqual(r1.name, r2.name)
461         self.failIfEqual(r2.image_analysis, None)
462         self.failIfEqual(r2.bustard, None)
463         self.failIfEqual(r2.gerald, None)
464         
465
466 def suite():
467     return unittest.makeSuite(RunfolderTests,'test')
468
469 if __name__ == "__main__":
470     unittest.main(defaultTest="suite")
471