7 from htsworkflow.pipelines import sequences
10 class SequenceFileTests(unittest.TestCase):
12 Make sure the sequence archive class works
14 def test_get_flowcell_cycle(self):
16 ('/root/42BW9AAXX/C1-152',
17 sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
18 ('/root/42BW9AAXX/C1-152/',
19 sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
20 ('/root/42BW9AAXX/C1-152/Project_12345',
21 sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
22 ('/root/42BW9AAXX/C1-152/Project_12345/',
23 sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
27 path = sequences.get_flowcell_cycle(t[0])
28 self.assertEqual(path, t[1])
30 def test_flowcell_cycle(self):
32 Make sure code to parse directory heirarchy works
34 path = '/root/42BW9AAXX/C1-152'
35 flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
37 self.assertEqual(flowcell, '42BW9AAXX')
38 self.assertEqual(start, 1)
39 self.assertEqual(stop, 152)
40 self.assertEqual(project, None)
42 path = '/root/42BW9AAXX/other'
43 self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
45 def test_flowcell_project_cycle(self):
47 Make sure code to parse directory heirarchy works
49 path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
50 flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
52 self.assertEqual(flowcell, '42BW9AAXX')
53 self.assertEqual(start, 1)
54 self.assertEqual(stop, 152)
55 self.assertEqual(project, 'Project_12345_Index1')
57 path = '/root/42BW9AAXX/other'
58 self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
61 path = '/root/42BW9AAXX/C1-38'
62 name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
63 other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_5.srf'
64 pathname = os.path.join(path,name)
65 f0 = sequences.parse_srf(path, name)
66 f1 = sequences.parse_srf(path, name)
67 fother = sequences.parse_srf(path, other)
69 self.assertEqual(f0.filetype, 'srf')
70 self.assertEqual(f0.path, pathname)
71 self.assertEqual(unicode(f0), unicode(pathname))
72 self.assertEqual(repr(f0), "<srf 42BW9AAXX 4 %s>" % (pathname,))
73 self.assertEqual(f0.flowcell, '42BW9AAXX')
74 self.assertEqual(f0.lane, 4)
75 self.assertEqual(f0.read, None)
76 self.assertEqual(f0.pf, None)
77 self.assertEqual(f0.cycle, 38)
78 self.assertEqual(f0.make_target_name('/tmp'),
79 os.path.join('/tmp', name))
81 self.assertEqual(f0, f1)
82 self.assertNotEqual(f0, fother)
86 path = '/root/42BW9AAXX/C1-36'
87 name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
88 other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1.tar.bz2'
89 pathname = os.path.join(path,name)
90 f0 = sequences.parse_qseq(path, name)
91 f1 = sequences.parse_qseq(path, name)
92 fother = sequences.parse_qseq(path, other)
94 self.assertEqual(f0.filetype, 'qseq')
95 self.assertEqual(f0.path, pathname)
96 self.assertEqual(unicode(f0), unicode(pathname))
97 self.assertEqual(repr(f0), "<qseq 42BW9AAXX 4 %s>" %(pathname,))
98 self.assertEqual(f0.flowcell, '42BW9AAXX')
99 self.assertEqual(f0.lane, 4)
100 self.assertEqual(f0.read, 1)
101 self.assertEqual(f0.pf, None)
102 self.assertEqual(f0.cycle, 36)
103 self.assertEqual(f0.make_target_name('/tmp'),
104 os.path.join('/tmp', name))
106 self.assertEqual(f0, f1)
107 self.assertNotEqual(f0, fother)
109 path = '/root/ilmn200901/C1-202'
110 name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
111 other = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r2.tar.bz2'
112 pathname = os.path.join(path, name)
113 f0 = sequences.parse_qseq(path, name)
114 f1 = sequences.parse_qseq(path, name)
115 fother = sequences.parse_qseq(path, other)
117 self.assertEqual(f0.filetype, 'qseq')
118 self.assertEqual(f0.path, pathname)
119 self.assertEqual(unicode(f0), unicode(pathname))
120 self.assertEqual(repr(f0), "<qseq ilmn200901 1 %s>" %(pathname,))
121 self.assertEqual(f0.lane, 1)
122 self.assertEqual(f0.read, 1)
123 self.assertEqual(f0.pf, None)
124 self.assertEqual(f0.cycle, 202)
125 self.assertEqual(f0.make_target_name('/tmp'),
126 os.path.join('/tmp', name))
128 self.assertEqual(f0, f1)
129 self.assertNotEqual(f0, fother)
131 def test_fastq(self):
132 path = '/root/42BW9AAXX/C1-38'
133 name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
134 other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1_pass.fastq.bz2'
135 pathname = os.path.join(path,name)
136 f0 = sequences.parse_fastq(path, name)
137 f1 = sequences.parse_fastq(path, name)
138 fother = sequences.parse_fastq(path, other)
140 self.assertEqual(f0.filetype, 'fastq')
141 self.assertEqual(f0.path, pathname)
142 self.assertEqual(unicode(f0), unicode(pathname))
143 self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
144 self.assertEqual(f0.flowcell, '42BW9AAXX')
145 self.assertEqual(f0.lane, 4)
146 self.assertEqual(f0.read, 1)
147 self.assertEqual(f0.pf, True)
148 self.assertEqual(f0.cycle, 38)
149 self.assertEqual(f0.make_target_name('/tmp'),
150 os.path.join('/tmp', name))
152 self.assertEqual(f0, f1)
153 self.assertNotEqual(f0, fother)
155 name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
156 other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
157 pathname = os.path.join(path,name)
158 f0 = sequences.parse_fastq(path, name)
159 f1 = sequences.parse_fastq(path, name)
160 fother = sequences.parse_fastq(path, other)
162 self.assertEqual(f0.filetype, 'fastq')
163 self.assertEqual(f0.path, pathname)
164 self.assertEqual(unicode(f0), unicode(pathname))
165 self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" %(pathname,))
166 self.assertEqual(f0.flowcell, '42BW9AAXX')
167 self.assertEqual(f0.lane, 4)
168 self.assertEqual(f0.read, 2)
169 self.assertEqual(f0.pf, False)
170 self.assertEqual(f0.cycle, 38)
171 self.assertEqual(f0.make_target_name('/tmp'),
172 os.path.join('/tmp', name))
174 self.assertEqual(f0, f1)
175 self.assertNotEqual(f0, fother)
177 def test_project_fastq(self):
178 path = '/root/42BW9AAXX/C1-38/Project_12345'
179 name = '11111_NoIndex_L001_R1_001.fastq.gz'
180 other = '22222_NoIndex_L001_R1_001.fastq.gz'
181 pathname = os.path.join(path,name)
182 f0 = sequences.parse_fastq(path, name)
183 f1 = sequences.parse_fastq(path, name)
184 fother = sequences.parse_fastq(path, other)
186 self.assertEqual(f0.filetype, 'split_fastq')
187 self.assertEqual(f0.path, pathname)
188 self.assertEqual(unicode(f0), unicode(pathname))
189 self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" %(pathname,))
190 self.assertEqual(f0.flowcell, '42BW9AAXX')
191 self.assertEqual(f0.lane, 1)
192 self.assertEqual(f0.read, 1)
193 self.assertEqual(f0.pf, True)
194 self.assertEqual(f0.project, '11111')
195 self.assertEqual(f0.index, 'NoIndex')
196 self.assertEqual(f0.cycle, 38)
197 self.assertEqual(f0.make_target_name('/tmp'),
198 os.path.join('/tmp', name))
200 self.assertEqual(f0, f1)
201 self.assertNotEqual(f0, fother)
203 name = '11112_AAATTT_L001_R2_003.fastq.gz'
204 other = '11112_AAATTT_L002_R2_003.fastq.gz'
205 pathname = os.path.join(path,name)
206 f0 = sequences.parse_fastq(path, name)
207 f1 = sequences.parse_fastq(path, name)
208 fother = sequences.parse_fastq(path, other)
210 self.assertEqual(f0.filetype, 'split_fastq')
211 self.assertEqual(f0.path, pathname)
212 self.assertEqual(unicode(f0), unicode(pathname))
213 self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
214 self.assertEqual(f0.flowcell, '42BW9AAXX')
215 self.assertEqual(f0.lane, 1)
216 self.assertEqual(f0.read, 2)
217 self.assertEqual(f0.pf, True)
218 self.assertEqual(f0.project, '11112')
219 self.assertEqual(f0.index, 'AAATTT')
220 self.assertEqual(f0.cycle, 38)
221 self.assertEqual(f0.make_target_name('/tmp'),
222 os.path.join('/tmp', name))
224 self.assertEqual(f0, f1)
225 self.assertNotEqual(f0, fother)
227 def test_parse_fastq_pf_flag(self):
228 other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
229 data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
230 'l1', 'r2', 'nopass']
231 self.assertEqual(sequences.parse_fastq_pf_flag(data), False)
233 data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
235 self.assertEqual(sequences.parse_fastq_pf_flag(data), True)
237 data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
239 self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
241 data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
243 self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
245 data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
246 'l1', 'r2', 'all', 'newthing']
247 self.assertRaises(ValueError, sequences.parse_fastq_pf_flag, data)
250 def test_project_fastq_hashing(self):
251 """Can we tell the difference between sequence files?
253 path = '/root/42BW9AAXX/C1-38/Project_12345'
254 names = [('11111_NoIndex_L001_R1_001.fastq.gz',
255 '11111_NoIndex_L001_R2_001.fastq.gz'),
256 ('11112_NoIndex_L001_R1_001.fastq.gz',
257 '11112_NoIndex_L001_R1_002.fastq.gz')
259 for a_name, b_name in names:
260 a = sequences.parse_fastq(path, a_name)
261 b = sequences.parse_fastq(path, b_name)
262 self.assertNotEqual(a, b)
263 self.assertNotEqual(a.key(), b.key())
264 self.assertNotEqual(hash(a), hash(b))
266 def test_eland(self):
267 path = '/root/42BW9AAXX/C1-38'
268 name = 's_4_eland_extended.txt.bz2'
269 pathname = os.path.join(path,name)
270 f = sequences.parse_eland(path, name)
272 self.assertEqual(f.filetype, 'eland')
273 self.assertEqual(f.path, pathname)
274 self.assertEqual(f.flowcell, '42BW9AAXX')
275 self.assertEqual(f.lane, 4)
276 self.assertEqual(f.read, None)
277 self.assertEqual(f.pf, None)
278 self.assertEqual(f.cycle, 38)
279 self.assertEqual(f.make_target_name('/tmp'),
280 '/tmp/42BW9AAXX_38_s_4_eland_extended.txt.bz2')
282 path = '/root/42BW9AAXX/C1-152'
283 name = 's_4_1_eland_extended.txt.bz2'
284 pathname = os.path.join(path,name)
285 f = sequences.parse_eland(path, name)
287 self.assertEqual(f.filetype, 'eland')
288 self.assertEqual(f.path, pathname)
289 self.assertEqual(f.flowcell, '42BW9AAXX')
290 self.assertEqual(f.lane, 4)
291 self.assertEqual(f.read, 1)
292 self.assertEqual(f.pf, None)
293 self.assertEqual(f.cycle, 152)
294 self.assertEqual(f.make_target_name('/tmp'),
295 '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')
299 Make sure that the quick and dirty sql interface in sequences works
302 db = sqlite3.connect(":memory:")
304 sequences.create_sequence_table(c)
306 data = [('/root/42BW9AAXX/C1-152',
307 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
308 ('/root/42BW9AAXX/C1-152',
309 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'),
310 ('/root/42BW9AAXX/C1-152',
311 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'),
312 ('/root/42BW9AAXX/C1-152',
313 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),]
315 for path, name in data:
316 seq = sequences.parse_qseq(path, name)
319 count = c.execute("select count(*) from sequences")
320 row = count.fetchone()
321 self.assertEqual(row[0], 4)
323 def test_scan_for_sequences(self):
325 file_types_seen = set()
326 file_types_to_see = set(['fastq', 'srf', 'eland', 'qseq'])
328 lanes_to_see = set((1,2,3))
329 with SimulateSimpleTree() as tree:
330 seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
332 self.assertEquals(s.flowcell, '42BW9AAXX')
333 self.assertEquals(s.cycle, 33)
334 self.assertEquals(s.project, None)
336 file_types_seen.add(s.filetype)
338 self.assertEquals(len(seqs), 8)
340 self.assertEqual(lanes, lanes_to_see)
341 self.assertEqual(file_types_to_see, file_types_seen)
342 self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
344 def test_scan_for_hiseq_sequences(self):
346 file_types_seen = set()
347 file_types_to_see = set(['split_fastq'])
349 lanes_to_see = set((1,2))
350 projects_seen = set()
351 projects_to_see = set(('11111', '21111', '31111'))
352 with SimulateHiSeqTree() as tree:
353 seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
355 self.assertEquals(s.flowcell, 'C02AAACXX')
356 self.assertEquals(s.cycle, 101)
358 file_types_seen.add(s.filetype)
359 projects_seen.add(s.project)
361 self.assertEquals(len(seqs), 12)
363 self.assertEqual(lanes, lanes_to_see)
364 self.assertEqual(file_types_to_see, file_types_seen)
365 self.assertEqual(projects_to_see, projects_seen)
366 # make sure we require a list, and not the confusing iterating over
368 self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
370 class SimulateTree(object):
374 def __exit__(self, exc_type, exc_val, exc_tb):
375 shutil.rmtree(self.root)
377 def mkflowcell(self, *components):
380 head = os.path.join(head, c)
381 if not os.path.exists(head):
385 def mkfile(self, flowcell, filename):
386 pathname = os.path.join(flowcell, filename)
387 stream = open(pathname,'w')
388 stream.write(pathname)
389 stream.write(os.linesep)
392 class SimulateHiSeqTree(SimulateTree):
394 self.root = tempfile.mkdtemp(prefix='sequences_')
397 ('Project_11111', '11111_AAGGCC_L001_R1_001.fastq.gz',),
398 ('Project_11111', '11111_AAGGCC_L001_R1_002.fastq.gz',),
399 ('Project_11111', '11111_AAGGCC_L001_R2_001.fastq.gz',),
400 ('Project_11111', '11111_AAGGCC_L001_R2_002.fastq.gz',),
401 ('Project_21111', '21111_TTTTTT_L001_R1_001.fastq.gz',),
402 ('Project_21111', '21111_TTTTTT_L001_R1_002.fastq.gz',),
403 ('Project_21111', '21111_TTTTTT_L001_R2_001.fastq.gz',),
404 ('Project_21111', '21111_TTTTTT_L001_R2_002.fastq.gz',),
405 ('Project_31111', '31111_NoIndex_L002_R1_001.fastq.gz',),
406 ('Project_31111', '31111_NoIndex_L002_R1_002.fastq.gz',),
407 ('Project_31111', '31111_NoIndex_L002_R2_001.fastq.gz',),
408 ('Project_31111', '31111_NoIndex_L002_R2_002.fastq.gz',),
409 ('.', '11111_AAGGCC_L001_R1_001_export.txt.gz'),
410 ('.', '11111_AAGGCC_L001_R1_002_export.txt.gz'),
411 ('.', '11111_AAGGCC_L001_R2_001_export.txt.gz'),
412 ('.', '11111_AAGGCC_L001_R2_002_export.txt.gz'),
413 ('.', '21111_AAGGCC_L001_R1_001_export.txt.gz'),
414 ('.', '21111_AAGGCC_L001_R1_002_export.txt.gz'),
415 ('.', '21111_AAGGCC_L001_R2_001_export.txt.gz'),
416 ('.', '21111_AAGGCC_L001_R2_002_export.txt.gz'),
417 ('.', '31111_NoIndex_L002_R1_001_export.txt.gz'),
418 ('.', '31111_NoIndex_L002_R1_002_export.txt.gz'),
419 ('.', '31111_NoIndex_L002_R2_001_export.txt.gz'),
420 ('.', '31111_NoIndex_L002_R2_002_export.txt.gz'),
423 fc = self.mkflowcell(self.root, 'C02AAACXX', 'C1-101', d)
426 class SimulateSimpleTree(SimulateTree):
428 self.root = tempfile.mkdtemp(prefix='sequences_')
430 fc = self.mkflowcell(self.root, '42BW9AAXX', 'C1-33')
432 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2',
433 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2',
434 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2.md5',
435 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2.md5',
436 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_2.srf',
437 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_pass.fastq.bz2',
438 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_pass.fastq.bz2',
439 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_nopass.fastq.bz2',
440 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_nopass.fastq.bz2',
441 's_1_eland_extended.txt.bz2',
442 's_1_eland_extended.txt.bz2.md5',
449 return unittest.makeSuite(SequenceFileTests,'test')
451 if __name__ == "__main__":
452 unittest.main(defaultTest="suite")