Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
[htsworkflow.git] / htsworkflow / pipelines / test / test_sequences.py
1 #!/usr/bin/env python
2 import os
3 import shutil
4 import tempfile
5 import unittest
6
7 from htsworkflow.pipelines import sequences
8
9
10 class SequenceFileTests(unittest.TestCase):
11     """
12     Make sure the sequence archive class works
13     """
14     def test_get_flowcell_cycle(self):
15         tests = [
16             ('/root/42BW9AAXX/C1-152',
17              sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
18             ('/root/42BW9AAXX/C1-152/',
19              sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
20             ('/root/42BW9AAXX/C1-152/Project_12345',
21              sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
22             ('/root/42BW9AAXX/C1-152/Project_12345/',
23              sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
24         ]
25
26         for t in tests:
27             path = sequences.get_flowcell_cycle(t[0])
28             self.assertEqual(path, t[1])
29
30     def test_flowcell_cycle(self):
31         """
32         Make sure code to parse directory heirarchy works
33         """
34         path = '/root/42BW9AAXX/C1-152'
35         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
36
37         self.assertEqual(flowcell, '42BW9AAXX')
38         self.assertEqual(start, 1)
39         self.assertEqual(stop, 152)
40         self.assertEqual(project, None)
41
42         path = '/root/42BW9AAXX/other'
43         self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
44
45     def test_flowcell_project_cycle(self):
46         """
47         Make sure code to parse directory heirarchy works
48         """
49         path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
50         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
51
52         self.assertEqual(flowcell, '42BW9AAXX')
53         self.assertEqual(start, 1)
54         self.assertEqual(stop, 152)
55         self.assertEqual(project, 'Project_12345_Index1')
56
57         path = '/root/42BW9AAXX/other'
58         self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
59
60     def test_srf(self):
61         path = '/root/42BW9AAXX/C1-38'
62         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
63         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_5.srf'
64         pathname = os.path.join(path,name)
65         f0 = sequences.parse_srf(path, name)
66         f1 = sequences.parse_srf(path, name)
67         fother = sequences.parse_srf(path, other)
68
69         self.assertEqual(f0.filetype, 'srf')
70         self.assertEqual(f0.path, pathname)
71         self.assertEqual(unicode(f0), unicode(pathname))
72         self.assertEqual(repr(f0), "<srf 42BW9AAXX 4 %s>" % (pathname,))
73         self.assertEqual(f0.flowcell, '42BW9AAXX')
74         self.assertEqual(f0.lane, 4)
75         self.assertEqual(f0.read, None)
76         self.assertEqual(f0.pf, None)
77         self.assertEqual(f0.cycle, 38)
78         self.assertEqual(f0.make_target_name('/tmp'),
79                          os.path.join('/tmp', name))
80
81         self.assertEqual(f0, f1)
82         self.assertNotEqual(f0, fother)
83
84
85     def test_qseq(self):
86         path = '/root/42BW9AAXX/C1-36'
87         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
88         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1.tar.bz2'
89         pathname = os.path.join(path,name)
90         f0 = sequences.parse_qseq(path, name)
91         f1 = sequences.parse_qseq(path, name)
92         fother = sequences.parse_qseq(path, other)
93
94         self.assertEqual(f0.filetype, 'qseq')
95         self.assertEqual(f0.path, pathname)
96         self.assertEqual(unicode(f0), unicode(pathname))
97         self.assertEqual(repr(f0), "<qseq 42BW9AAXX 4 %s>" %(pathname,))
98         self.assertEqual(f0.flowcell, '42BW9AAXX')
99         self.assertEqual(f0.lane, 4)
100         self.assertEqual(f0.read, 1)
101         self.assertEqual(f0.pf, None)
102         self.assertEqual(f0.cycle, 36)
103         self.assertEqual(f0.make_target_name('/tmp'),
104                          os.path.join('/tmp', name))
105
106         self.assertEqual(f0, f1)
107         self.assertNotEqual(f0, fother)
108
109         path = '/root/ilmn200901/C1-202'
110         name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
111         other = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r2.tar.bz2'
112         pathname = os.path.join(path, name)
113         f0 = sequences.parse_qseq(path, name)
114         f1 = sequences.parse_qseq(path, name)
115         fother = sequences.parse_qseq(path, other)
116
117         self.assertEqual(f0.filetype, 'qseq')
118         self.assertEqual(f0.path, pathname)
119         self.assertEqual(unicode(f0), unicode(pathname))
120         self.assertEqual(repr(f0), "<qseq ilmn200901 1 %s>" %(pathname,))
121         self.assertEqual(f0.lane, 1)
122         self.assertEqual(f0.read, 1)
123         self.assertEqual(f0.pf, None)
124         self.assertEqual(f0.cycle, 202)
125         self.assertEqual(f0.make_target_name('/tmp'),
126                          os.path.join('/tmp', name))
127
128         self.assertEqual(f0, f1)
129         self.assertNotEqual(f0, fother)
130
131     def test_fastq(self):
132         path = '/root/42BW9AAXX/C1-38'
133         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
134         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1_pass.fastq.bz2'
135         pathname = os.path.join(path,name)
136         f0 = sequences.parse_fastq(path, name)
137         f1 = sequences.parse_fastq(path, name)
138         fother = sequences.parse_fastq(path, other)
139
140         self.assertEqual(f0.filetype, 'fastq')
141         self.assertEqual(f0.path, pathname)
142         self.assertEqual(unicode(f0), unicode(pathname))
143         self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
144         self.assertEqual(f0.flowcell, '42BW9AAXX')
145         self.assertEqual(f0.lane, 4)
146         self.assertEqual(f0.read, 1)
147         self.assertEqual(f0.pf, True)
148         self.assertEqual(f0.cycle, 38)
149         self.assertEqual(f0.make_target_name('/tmp'),
150                          os.path.join('/tmp', name))
151
152         self.assertEqual(f0, f1)
153         self.assertNotEqual(f0, fother)
154
155         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
156         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
157         pathname = os.path.join(path,name)
158         f0 = sequences.parse_fastq(path, name)
159         f1 = sequences.parse_fastq(path, name)
160         fother = sequences.parse_fastq(path, other)
161
162         self.assertEqual(f0.filetype, 'fastq')
163         self.assertEqual(f0.path, pathname)
164         self.assertEqual(unicode(f0), unicode(pathname))
165         self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" %(pathname,))
166         self.assertEqual(f0.flowcell, '42BW9AAXX')
167         self.assertEqual(f0.lane, 4)
168         self.assertEqual(f0.read, 2)
169         self.assertEqual(f0.pf, False)
170         self.assertEqual(f0.cycle, 38)
171         self.assertEqual(f0.make_target_name('/tmp'),
172                          os.path.join('/tmp', name))
173
174         self.assertEqual(f0, f1)
175         self.assertNotEqual(f0, fother)
176
177     def test_project_fastq(self):
178         path = '/root/42BW9AAXX/C1-38/Project_12345'
179         name = '11111_NoIndex_L001_R1_001.fastq.gz'
180         other = '22222_NoIndex_L001_R1_001.fastq.gz'
181         pathname = os.path.join(path,name)
182         f0 = sequences.parse_fastq(path, name)
183         f1 = sequences.parse_fastq(path, name)
184         fother = sequences.parse_fastq(path, other)
185
186         self.assertEqual(f0.filetype, 'split_fastq')
187         self.assertEqual(f0.path, pathname)
188         self.assertEqual(unicode(f0), unicode(pathname))
189         self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" %(pathname,))
190         self.assertEqual(f0.flowcell, '42BW9AAXX')
191         self.assertEqual(f0.lane, 1)
192         self.assertEqual(f0.read, 1)
193         self.assertEqual(f0.pf, True)
194         self.assertEqual(f0.project, '11111')
195         self.assertEqual(f0.index, 'NoIndex')
196         self.assertEqual(f0.cycle, 38)
197         self.assertEqual(f0.make_target_name('/tmp'),
198                          os.path.join('/tmp', name))
199
200         self.assertEqual(f0, f1)
201         self.assertNotEqual(f0, fother)
202
203         name = '11112_AAATTT_L001_R2_003.fastq.gz'
204         other = '11112_AAATTT_L002_R2_003.fastq.gz'
205         pathname = os.path.join(path,name)
206         f0 = sequences.parse_fastq(path, name)
207         f1 = sequences.parse_fastq(path, name)
208         fother = sequences.parse_fastq(path, other)
209
210         self.assertEqual(f0.filetype, 'split_fastq')
211         self.assertEqual(f0.path, pathname)
212         self.assertEqual(unicode(f0), unicode(pathname))
213         self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
214         self.assertEqual(f0.flowcell, '42BW9AAXX')
215         self.assertEqual(f0.lane, 1)
216         self.assertEqual(f0.read, 2)
217         self.assertEqual(f0.pf, True)
218         self.assertEqual(f0.project, '11112')
219         self.assertEqual(f0.index, 'AAATTT')
220         self.assertEqual(f0.cycle, 38)
221         self.assertEqual(f0.make_target_name('/tmp'),
222                          os.path.join('/tmp', name))
223
224         self.assertEqual(f0, f1)
225         self.assertNotEqual(f0, fother)
226
227     def test_parse_fastq_pf_flag(self):
228         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
229         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
230                 'l1', 'r2', 'nopass']
231         self.assertEqual(sequences.parse_fastq_pf_flag(data), False)
232
233         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
234                 'l1', 'r2', 'pass']
235         self.assertEqual(sequences.parse_fastq_pf_flag(data), True)
236
237         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
238                 'l1', 'r2', 'all']
239         self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
240
241         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
242                 'l1', 'r2']
243         self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
244
245         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
246                 'l1', 'r2', 'all', 'newthing']
247         self.assertRaises(ValueError, sequences.parse_fastq_pf_flag, data)
248
249
250     def test_project_fastq_hashing(self):
251         """Can we tell the difference between sequence files?
252         """
253         path = '/root/42BW9AAXX/C1-38/Project_12345'
254         names = [('11111_NoIndex_L001_R1_001.fastq.gz',
255                   '11111_NoIndex_L001_R2_001.fastq.gz'),
256                  ('11112_NoIndex_L001_R1_001.fastq.gz',
257                   '11112_NoIndex_L001_R1_002.fastq.gz')
258                  ]
259         for a_name, b_name in names:
260             a = sequences.parse_fastq(path, a_name)
261             b = sequences.parse_fastq(path, b_name)
262             self.assertNotEqual(a, b)
263             self.assertNotEqual(a.key(), b.key())
264             self.assertNotEqual(hash(a), hash(b))
265
266     def test_eland(self):
267         path = '/root/42BW9AAXX/C1-38'
268         name = 's_4_eland_extended.txt.bz2'
269         pathname = os.path.join(path,name)
270         f = sequences.parse_eland(path, name)
271
272         self.assertEqual(f.filetype, 'eland')
273         self.assertEqual(f.path, pathname)
274         self.assertEqual(f.flowcell, '42BW9AAXX')
275         self.assertEqual(f.lane, 4)
276         self.assertEqual(f.read, None)
277         self.assertEqual(f.pf, None)
278         self.assertEqual(f.cycle, 38)
279         self.assertEqual(f.make_target_name('/tmp'),
280                          '/tmp/42BW9AAXX_38_s_4_eland_extended.txt.bz2')
281
282         path = '/root/42BW9AAXX/C1-152'
283         name = 's_4_1_eland_extended.txt.bz2'
284         pathname = os.path.join(path,name)
285         f = sequences.parse_eland(path, name)
286
287         self.assertEqual(f.filetype, 'eland')
288         self.assertEqual(f.path, pathname)
289         self.assertEqual(f.flowcell, '42BW9AAXX')
290         self.assertEqual(f.lane, 4)
291         self.assertEqual(f.read, 1)
292         self.assertEqual(f.pf, None)
293         self.assertEqual(f.cycle, 152)
294         self.assertEqual(f.make_target_name('/tmp'),
295                          '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')
296
297     def test_sql(self):
298         """
299         Make sure that the quick and dirty sql interface in sequences works
300         """
301         import sqlite3
302         db = sqlite3.connect(":memory:")
303         c = db.cursor()
304         sequences.create_sequence_table(c)
305
306         data = [('/root/42BW9AAXX/C1-152',
307                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
308                 ('/root/42BW9AAXX/C1-152',
309                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'),
310                 ('/root/42BW9AAXX/C1-152',
311                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'),
312                 ('/root/42BW9AAXX/C1-152',
313                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),]
314
315         for path, name in data:
316             seq = sequences.parse_qseq(path, name)
317             seq.save(c)
318
319         count = c.execute("select count(*) from sequences")
320         row = count.fetchone()
321         self.assertEqual(row[0], 4)
322
323     def test_scan_for_sequences(self):
324         # simulate tree
325         file_types_seen = set()
326         file_types_to_see = set(['fastq', 'srf', 'eland', 'qseq'])
327         lanes = set()
328         lanes_to_see = set((1,2,3))
329         with SimulateSimpleTree() as tree:
330             seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
331             for s in seqs:
332                 self.assertEquals(s.flowcell, '42BW9AAXX')
333                 self.assertEquals(s.cycle, 33)
334                 self.assertEquals(s.project, None)
335                 lanes.add(s.lane)
336                 file_types_seen.add(s.filetype)
337
338             self.assertEquals(len(seqs), 8)
339
340         self.assertEqual(lanes, lanes_to_see)
341         self.assertEqual(file_types_to_see, file_types_seen)
342         self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
343
344     def test_scan_for_hiseq_sequences(self):
345         # simulate tree
346         file_types_seen = set()
347         file_types_to_see = set(['split_fastq'])
348         lanes = set()
349         lanes_to_see = set((1,2))
350         projects_seen = set()
351         projects_to_see = set(('11111', '21111', '31111'))
352         with SimulateHiSeqTree() as tree:
353             seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
354             for s in seqs:
355                 self.assertEquals(s.flowcell, 'C02AAACXX')
356                 self.assertEquals(s.cycle, 101)
357                 lanes.add(s.lane)
358                 file_types_seen.add(s.filetype)
359                 projects_seen.add(s.project)
360
361             self.assertEquals(len(seqs), 12)
362
363         self.assertEqual(lanes, lanes_to_see)
364         self.assertEqual(file_types_to_see, file_types_seen)
365         self.assertEqual(projects_to_see, projects_seen)
366         # make sure we require a list, and not the confusing iterating over
367         # a string
368         self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
369
370 class SimulateTree(object):
371     def __enter__(self):
372         return self
373
374     def __exit__(self, exc_type, exc_val, exc_tb):
375         shutil.rmtree(self.root)
376
377     def mkflowcell(self, *components):
378         head = self.root
379         for c in components:
380             head = os.path.join(head, c)
381             if not os.path.exists(head):
382                 os.mkdir(head)
383         return head
384
385     def mkfile(self, flowcell, filename):
386         pathname = os.path.join(flowcell, filename)
387         stream = open(pathname,'w')
388         stream.write(pathname)
389         stream.write(os.linesep)
390         stream.close()
391
392 class SimulateHiSeqTree(SimulateTree):
393     def __init__(self):
394         self.root = tempfile.mkdtemp(prefix='sequences_')
395
396         files = [
397             ('Project_11111', '11111_AAGGCC_L001_R1_001.fastq.gz',),
398             ('Project_11111', '11111_AAGGCC_L001_R1_002.fastq.gz',),
399             ('Project_11111', '11111_AAGGCC_L001_R2_001.fastq.gz',),
400             ('Project_11111', '11111_AAGGCC_L001_R2_002.fastq.gz',),
401             ('Project_21111', '21111_TTTTTT_L001_R1_001.fastq.gz',),
402             ('Project_21111', '21111_TTTTTT_L001_R1_002.fastq.gz',),
403             ('Project_21111', '21111_TTTTTT_L001_R2_001.fastq.gz',),
404             ('Project_21111', '21111_TTTTTT_L001_R2_002.fastq.gz',),
405             ('Project_31111', '31111_NoIndex_L002_R1_001.fastq.gz',),
406             ('Project_31111', '31111_NoIndex_L002_R1_002.fastq.gz',),
407             ('Project_31111', '31111_NoIndex_L002_R2_001.fastq.gz',),
408             ('Project_31111', '31111_NoIndex_L002_R2_002.fastq.gz',),
409             ('.', '11111_AAGGCC_L001_R1_001_export.txt.gz'),
410             ('.', '11111_AAGGCC_L001_R1_002_export.txt.gz'),
411             ('.', '11111_AAGGCC_L001_R2_001_export.txt.gz'),
412             ('.', '11111_AAGGCC_L001_R2_002_export.txt.gz'),
413             ('.', '21111_AAGGCC_L001_R1_001_export.txt.gz'),
414             ('.', '21111_AAGGCC_L001_R1_002_export.txt.gz'),
415             ('.', '21111_AAGGCC_L001_R2_001_export.txt.gz'),
416             ('.', '21111_AAGGCC_L001_R2_002_export.txt.gz'),
417             ('.', '31111_NoIndex_L002_R1_001_export.txt.gz'),
418             ('.', '31111_NoIndex_L002_R1_002_export.txt.gz'),
419             ('.', '31111_NoIndex_L002_R2_001_export.txt.gz'),
420             ('.', '31111_NoIndex_L002_R2_002_export.txt.gz'),
421             ]
422         for d, f in files:
423             fc = self.mkflowcell(self.root, 'C02AAACXX', 'C1-101', d)
424             self.mkfile(fc, f)
425
426 class SimulateSimpleTree(SimulateTree):
427     def __init__(self):
428         self.root = tempfile.mkdtemp(prefix='sequences_')
429
430         fc = self.mkflowcell(self.root, '42BW9AAXX', 'C1-33')
431         files = [
432             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2',
433             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2',
434             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2.md5',
435             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2.md5',
436             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_2.srf',
437             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_pass.fastq.bz2',
438             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_pass.fastq.bz2',
439             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_nopass.fastq.bz2',
440             'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_nopass.fastq.bz2',
441             's_1_eland_extended.txt.bz2',
442             's_1_eland_extended.txt.bz2.md5',
443             ]
444         for f in files:
445             self.mkfile(fc, f)
446
447
448 def suite():
449     return unittest.makeSuite(SequenceFileTests,'test')
450
451 if __name__ == "__main__":
452     unittest.main(defaultTest="suite")