Port pipelines.sequences to rdflib
[htsworkflow.git] / htsworkflow / pipelines / test / test_sequences.py
1 #!/usr/bin/env python
2 import os
3 import shutil
4 import tempfile
5 from unittest import TestCase
6
7 from rdflib import Graph, Namespace, URIRef
8 from rdflib.namespace import RDF
9
10 from htsworkflow.pipelines import sequences
11 from htsworkflow.util.rdfns import libraryOntology
12
13 class SequenceFileTests(TestCase):
14     """
15     Make sure the sequence archive class works
16     """
17     def test_get_flowcell_cycle(self):
18         tests = [
19             ('/root/42BW9AAXX/C1-152',
20              sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
21             ('/root/42BW9AAXX/C1-152/',
22              sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
23             ('/root/42BW9AAXX/C1-152/Project_12345',
24              sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
25             ('/root/42BW9AAXX/C1-152/Project_12345/',
26              sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
27         ]
28
29         for t in tests:
30             path = sequences.get_flowcell_cycle(t[0])
31             self.assertEqual(path, t[1])
32
33     def test_flowcell_cycle(self):
34         """
35         Make sure code to parse directory heirarchy works
36         """
37         path = '/root/42BW9AAXX/C1-152'
38         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
39
40         self.assertEqual(flowcell, '42BW9AAXX')
41         self.assertEqual(start, 1)
42         self.assertEqual(stop, 152)
43         self.assertEqual(project, None)
44
45         path = '/root/42BW9AAXX/other'
46         self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
47
48     def test_flowcell_project_cycle(self):
49         """
50         Make sure code to parse directory heirarchy works
51         """
52         path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
53         flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
54
55         self.assertEqual(flowcell, '42BW9AAXX')
56         self.assertEqual(start, 1)
57         self.assertEqual(stop, 152)
58         self.assertEqual(project, 'Project_12345_Index1')
59
60         path = '/root/42BW9AAXX/other'
61         self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)
62
63     def test_srf(self):
64         path = '/root/42BW9AAXX/C1-38'
65         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
66         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_5.srf'
67         pathname = os.path.join(path,name)
68         f0 = sequences.parse_srf(path, name)
69         f1 = sequences.parse_srf(path, name)
70         fother = sequences.parse_srf(path, other)
71
72         self.assertEqual(f0.filetype, 'srf')
73         self.assertEqual(f0.path, pathname)
74         self.assertEqual(str(f0), str(pathname))
75         self.assertEqual(repr(f0), "<srf 42BW9AAXX 4 %s>" % (pathname,))
76         self.assertEqual(f0.flowcell, '42BW9AAXX')
77         self.assertEqual(f0.lane, '4')
78         self.assertEqual(f0.read, None)
79         self.assertEqual(f0.pf, None)
80         self.assertEqual(f0.cycle, 38)
81         self.assertEqual(f0.make_target_name('/tmp'),
82                          os.path.join('/tmp', name))
83
84         self.assertEqual(f0, f1)
85         self.assertNotEqual(f0, fother)
86
87
88     def test_qseq(self):
89         path = '/root/42BW9AAXX/C1-36'
90         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
91         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1.tar.bz2'
92         pathname = os.path.join(path,name)
93         f0 = sequences.parse_qseq(path, name)
94         f1 = sequences.parse_qseq(path, name)
95         fother = sequences.parse_qseq(path, other)
96
97         self.assertEqual(f0.filetype, 'qseq')
98         self.assertEqual(f0.path, pathname)
99         self.assertEqual(str(f0), str(pathname))
100         self.assertEqual(repr(f0), "<qseq 42BW9AAXX 4 %s>" %(pathname,))
101         self.assertEqual(f0.flowcell, '42BW9AAXX')
102         self.assertEqual(f0.lane, '4')
103         self.assertEqual(f0.read, 1)
104         self.assertEqual(f0.pf, None)
105         self.assertEqual(f0.cycle, 36)
106         self.assertEqual(f0.make_target_name('/tmp'),
107                          os.path.join('/tmp', name))
108
109         self.assertEqual(f0, f1)
110         self.assertNotEqual(f0, fother)
111
112         path = '/root/ilmn200901/C1-202'
113         name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
114         other = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r2.tar.bz2'
115         pathname = os.path.join(path, name)
116         f0 = sequences.parse_qseq(path, name)
117         f1 = sequences.parse_qseq(path, name)
118         fother = sequences.parse_qseq(path, other)
119
120         self.assertEqual(f0.filetype, 'qseq')
121         self.assertEqual(f0.path, pathname)
122         self.assertEqual(str(f0), str(pathname))
123         self.assertEqual(repr(f0), "<qseq ilmn200901 1 %s>" %(pathname,))
124         self.assertEqual(f0.lane, '1')
125         self.assertEqual(f0.read, 1)
126         self.assertEqual(f0.pf, None)
127         self.assertEqual(f0.cycle, 202)
128         self.assertEqual(f0.make_target_name('/tmp'),
129                          os.path.join('/tmp', name))
130
131         self.assertEqual(f0, f1)
132         self.assertNotEqual(f0, fother)
133
134     def test_fastq(self):
135         path = '/root/42BW9AAXX/C1-38'
136         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
137         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1_pass.fastq.bz2'
138         pathname = os.path.join(path,name)
139         f0 = sequences.parse_fastq(path, name)
140         f1 = sequences.parse_fastq(path, name)
141         fother = sequences.parse_fastq(path, other)
142
143         self.assertEqual(f0.filetype, 'fastq')
144         self.assertEqual(f0.path, pathname)
145         self.assertEqual(str(f0), str(pathname))
146         self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
147         self.assertEqual(f0.flowcell, '42BW9AAXX')
148         self.assertEqual(f0.lane, '4')
149         self.assertEqual(f0.read, 1)
150         self.assertEqual(f0.pf, True)
151         self.assertEqual(f0.cycle, 38)
152         self.assertEqual(f0.make_target_name('/tmp'),
153                          os.path.join('/tmp', name))
154
155         self.assertEqual(f0, f1)
156         self.assertNotEqual(f0, fother)
157
158         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
159         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
160         pathname = os.path.join(path,name)
161         f0 = sequences.parse_fastq(path, name)
162         f1 = sequences.parse_fastq(path, name)
163         fother = sequences.parse_fastq(path, other)
164
165         self.assertEqual(f0.filetype, 'fastq')
166         self.assertEqual(f0.path, pathname)
167         self.assertEqual(str(f0), str(pathname))
168         self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" %(pathname,))
169         self.assertEqual(f0.flowcell, '42BW9AAXX')
170         self.assertEqual(f0.lane, '4')
171         self.assertEqual(f0.read, 2)
172         self.assertEqual(f0.pf, False)
173         self.assertEqual(f0.cycle, 38)
174         self.assertEqual(f0.make_target_name('/tmp'),
175                          os.path.join('/tmp', name))
176
177         self.assertEqual(f0, f1)
178         self.assertNotEqual(f0, fother)
179
180     def test_project_fastq(self):
181         path = '/root/42BW9AAXX/C1-38/Project_12345'
182         name = '11111_NoIndex_L001_R1_001.fastq.gz'
183         other = '22222_NoIndex_L001_R1_001.fastq.gz'
184         pathname = os.path.join(path,name)
185         f0 = sequences.parse_fastq(path, name)
186         f1 = sequences.parse_fastq(path, name)
187         fother = sequences.parse_fastq(path, other)
188
189         self.assertEqual(f0.filetype, 'split_fastq')
190         self.assertEqual(f0.path, pathname)
191         self.assertEqual(str(f0), str(pathname))
192         self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" %(pathname,))
193         self.assertEqual(f0.flowcell, '42BW9AAXX')
194         self.assertEqual(f0.lane, '1')
195         self.assertEqual(f0.read, 1)
196         self.assertEqual(f0.pf, True)
197         self.assertEqual(f0.project, '11111')
198         self.assertEqual(f0.index, 'NoIndex')
199         self.assertEqual(f0.cycle, 38)
200         self.assertEqual(f0.make_target_name('/tmp'),
201                          os.path.join('/tmp', name))
202
203         self.assertEqual(f0, f1)
204         self.assertNotEqual(f0, fother)
205
206         name = '11112_AAATTT_L001_R2_003.fastq.gz'
207         other = '11112_AAATTT_L002_R2_003.fastq.gz'
208         pathname = os.path.join(path,name)
209         f0 = sequences.parse_fastq(path, name)
210         f1 = sequences.parse_fastq(path, name)
211         fother = sequences.parse_fastq(path, other)
212
213         self.assertEqual(f0.filetype, 'split_fastq')
214         self.assertEqual(f0.path, pathname)
215         self.assertEqual(str(f0), str(pathname))
216         self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
217         self.assertEqual(f0.flowcell, '42BW9AAXX')
218         self.assertEqual(f0.lane, '1')
219         self.assertEqual(f0.read, 2)
220         self.assertEqual(f0.pf, True)
221         self.assertEqual(f0.project, '11112')
222         self.assertEqual(f0.index, 'AAATTT')
223         self.assertEqual(f0.cycle, 38)
224         self.assertEqual(f0.make_target_name('/tmp'),
225                          os.path.join('/tmp', name))
226
227         self.assertEqual(f0, f1)
228         self.assertNotEqual(f0, fother)
229
230     def test_parse_fastq_pf_flag(self):
231         other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
232         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
233                 'l1', 'r2', 'nopass']
234         self.assertEqual(sequences.parse_fastq_pf_flag(data), False)
235
236         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
237                 'l1', 'r2', 'pass']
238         self.assertEqual(sequences.parse_fastq_pf_flag(data), True)
239
240         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
241                 'l1', 'r2', 'all']
242         self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
243
244         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
245                 'l1', 'r2']
246         self.assertEqual(sequences.parse_fastq_pf_flag(data), None)
247
248         data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
249                 'l1', 'r2', 'all', 'newthing']
250         self.assertRaises(ValueError, sequences.parse_fastq_pf_flag, data)
251
252
253     def test_project_fastq_hashing(self):
254         """Can we tell the difference between sequence files?
255         """
256         path = '/root/42BW9AAXX/C1-38/Project_12345'
257         names = [('11111_NoIndex_L001_R1_001.fastq.gz',
258                   '11111_NoIndex_L001_R2_001.fastq.gz'),
259                  ('11112_NoIndex_L001_R1_001.fastq.gz',
260                   '11112_NoIndex_L001_R1_002.fastq.gz')
261                  ]
262         for a_name, b_name in names:
263             a = sequences.parse_fastq(path, a_name)
264             b = sequences.parse_fastq(path, b_name)
265             self.assertNotEqual(a, b)
266             self.assertNotEqual(a.key(), b.key())
267             self.assertNotEqual(hash(a), hash(b))
268
269     def test_eland(self):
270         path = '/root/42BW9AAXX/C1-38'
271         name = 's_4_eland_extended.txt.bz2'
272         pathname = os.path.join(path,name)
273         f = sequences.parse_eland(path, name)
274
275         self.assertEqual(f.filetype, 'eland')
276         self.assertEqual(f.path, pathname)
277         self.assertEqual(f.flowcell, '42BW9AAXX')
278         self.assertEqual(f.lane, '4')
279         self.assertEqual(f.read, None)
280         self.assertEqual(f.pf, None)
281         self.assertEqual(f.cycle, 38)
282         self.assertEqual(f.make_target_name('/tmp'),
283                          '/tmp/42BW9AAXX_38_s_4_eland_extended.txt.bz2')
284
285         path = '/root/42BW9AAXX/C1-152'
286         name = 's_4_1_eland_extended.txt.bz2'
287         pathname = os.path.join(path,name)
288         f = sequences.parse_eland(path, name)
289
290         self.assertEqual(f.filetype, 'eland')
291         self.assertEqual(f.path, pathname)
292         self.assertEqual(f.flowcell, '42BW9AAXX')
293         self.assertEqual(f.lane, '4')
294         self.assertEqual(f.read, 1)
295         self.assertEqual(f.pf, None)
296         self.assertEqual(f.cycle, 152)
297         self.assertEqual(f.make_target_name('/tmp'),
298                          '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')
299
300     def _generate_sequences(self):
301         seqs = []
302         data = [('/root/42BW9AAXX/C1-152',
303                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
304                 ('/root/42BW9AAXX/C1-152',
305                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'),
306                 ('/root/42BW9AAXX/C1-152',
307                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'),
308                 ('/root/42BW9AAXX/C1-152',
309                 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),]
310
311         for path, name in data:
312             seqs.append(sequences.parse_qseq(path, name))
313
314         path = '/root/42BW9AAXX/C1-38/Project_12345'
315         name = '12345_AAATTT_L003_R1_001.fastq.gz'
316         pathname = os.path.join(path,name)
317         seqs.append(sequences.parse_fastq(path, name))
318         self.assertEqual(len(seqs), 5)
319         return seqs
320
321
322     def test_sql(self):
323         """
324         Make sure that the quick and dirty sql interface in sequences works
325         """
326         import sqlite3
327         db = sqlite3.connect(":memory:")
328         c = db.cursor()
329         sequences.create_sequence_table(c)
330
331         for seq in self._generate_sequences():
332             seq.save_to_sql(c)
333
334         count = c.execute("select count(*) from sequences")
335         row = count.fetchone()
336         self.assertEqual(row[0], 5)
337
338     def test_basic_rdf_scan(self):
339         """Make sure we can save to RDF model"""
340         model = Graph()
341
342         for seq in self._generate_sequences():
343             seq.save_to_model(model)
344
345         files = list(model.triples((None,
346                                     RDF['type'],
347                                     libraryOntology['IlluminaResult'])))
348         self.assertEqual(len(files), 5)
349         files = list(model.triples((None,
350                                     libraryOntology['file_type'],
351                                     libraryOntology['qseq'])))
352         self.assertEqual(len(files), 4)
353         files = list(model.triples((None,
354                                     libraryOntology['file_type'],
355                                     libraryOntology['split_fastq'])))
356         self.assertEqual(len(files), 1)
357
358         files = list(model.triples((None, libraryOntology['library_id'], None)))
359         self.assertEqual(len(files), 1)
360
361         files = list(model.triples((None, libraryOntology['flowcell_id'], None)))
362         self.assertEqual(len(files), 5)
363
364         files = list(model.triples((None, libraryOntology['flowcell'], None)))
365         self.assertEqual(len(files), 0)
366
367         files = list(model.triples((None, libraryOntology['library'], None)))
368         self.assertEqual(len(files), 0)
369
370     def test_rdf_scan_with_url(self):
371         """Make sure we can save to RDF model"""
372         model = Graph()
373         base_url = 'http://localhost'
374         for seq in self._generate_sequences():
375             seq.save_to_model(model, base_url=base_url)
376         localFC = Namespace(base_url + '/flowcell/')
377         localLibrary = Namespace(base_url + '/library/')
378
379         files = list(model.triples((None, libraryOntology['flowcell'], None)))
380         self.assertEqual(len(files), 5)
381         for f in files:
382             # object is index 2 in the tuple
383             self.assertEqual(f[2], localFC['42BW9AAXX/'])
384
385         files = list(model.triples((None, libraryOntology['library'], None)))
386         self.assertEqual(len(files), 1)
387         self.assertEqual(files[0][2], localLibrary['12345'])
388
389     def test_rdf_fixup_library(self):
390         """Make sure we can save to RDF model"""
391         base_url = 'http://localhost'
392         localLibrary = Namespace(base_url + '/library/')
393
394         flowcellInfo = """@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
395
396 <{base}/flowcell/42BW9AAXX/>
397     libns:flowcell_id "42BW9AXX"@en ;
398     libns:has_lane <{base}/lane/1169>, <{base}/lane/1170>,
399                    <{base}/lane/1171>, <{base}/lane/1172> ;
400     libns:read_length 75 ;
401     a libns:IlluminaFlowcell .
402
403 <{base}/lane/1169>
404     libns:lane_number "1" ; libns:library <{base}/library/10923/> .
405 <{base}/lane/1170>
406     libns:lane_number "2" ; libns:library <{base}/library/10924/> .
407 <{base}/lane/1171>
408     libns:lane_number "3" ; libns:library <{base}/library/12345/> .
409 <{base}/lane/1172>
410     libns:lane_number "3" ; libns:library <{base}/library/10930/> .
411 """.format(base=base_url)
412         model = Graph()
413         model.parse(data=flowcellInfo, format='turtle')
414         for seq in self._generate_sequences():
415             seq.save_to_model(model)
416         f = sequences.update_model_sequence_library(model, base_url=base_url)
417
418         libTerm = libraryOntology['library']
419         libIdTerm = libraryOntology['library_id']
420
421         url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'
422         nodes = list(model.objects(URIRef(url), libTerm))
423         self.assertEqual(len(nodes), 1)
424         self.assertEqual(nodes[0], localLibrary['10923/'])
425         nodes = list(model.objects(URIRef(url), libIdTerm))
426         self.assertEqual(len(nodes), 1)
427         self.assertEqual(nodes[0].toPython(), '10923')
428
429         url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'
430         nodes = list(model.objects(URIRef(url), libTerm))
431         self.assertEqual(len(nodes), 1)
432         self.assertEqual(nodes[0], localLibrary['10924/'])
433         nodes = list(model.objects(URIRef(url), libIdTerm))
434         self.assertEqual(len(nodes), 1)
435         self.assertEqual(nodes[0].toPython(), '10924')
436
437         url = 'file:///root/42BW9AAXX/C1-38/Project_12345/12345_AAATTT_L003_R1_001.fastq.gz'
438         nodes = list(model.objects(URIRef(url), libTerm))
439         self.assertEqual(len(nodes), 1)
440         self.assertEqual(nodes[0], localLibrary['12345/'])
441         nodes = list(model.objects(URIRef(url), libIdTerm))
442         self.assertEqual(len(nodes), 1)
443         self.assertEqual(nodes[0].toPython(), '12345')
444
445     def test_load_from_model(self):
446         """Can we round trip through a RDF model"""
447         model = Graph()
448         path = '/root/42BW9AAXX/C1-38/Project_12345/'
449         filename = '12345_AAATTT_L003_R1_001.fastq.gz'
450         seq = sequences.parse_fastq(path, filename)
451         seq.save_to_model(model)
452
453         seq_id = 'file://'+path+filename
454         seqNode = URIRef(seq_id)
455         libNode = URIRef('http://localhost/library/12345')
456         model.add((seqNode, libraryOntology['library'], libNode))
457         seq2 = sequences.SequenceFile.load_from_model(model, seq_id)
458
459         self.assertEqual(seq.flowcell, seq2.flowcell)
460         self.assertEqual(seq.flowcell, '42BW9AAXX')
461         self.assertEqual(seq.filetype, seq2.filetype)
462         self.assertEqual(seq2.filetype, 'split_fastq')
463         self.assertEqual(seq.lane, seq2.lane)
464         self.assertEqual(seq2.lane, '3')
465         self.assertEqual(seq.read, seq2.read)
466         self.assertEqual(seq2.read, 1)
467         self.assertEqual(seq.project, seq2.project)
468         self.assertEqual(seq2.project, '12345')
469         self.assertEqual(seq.index, seq2.index)
470         self.assertEqual(seq2.index, 'AAATTT')
471         self.assertEqual(seq.split, seq2.split)
472         self.assertEqual(seq2.split, '001')
473         self.assertEqual(seq.cycle, seq2.cycle)
474         self.assertEqual(seq.pf, seq2.pf)
475         self.assertEqual(seq2.libraryNode, libNode)
476         self.assertEqual(seq.path, seq2.path)
477
478     def test_scan_for_sequences(self):
479         # simulate tree
480         file_types_seen = set()
481         file_types_to_see = set(['fastq', 'srf', 'eland', 'qseq'])
482         lanes = set()
483         lanes_to_see = set(('1','2','3'))
484         with SimulateSimpleTree() as tree:
485             seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
486             for s in seqs:
487                 self.assertEquals(s.flowcell, '42BW9AAXX')
488                 self.assertEquals(s.cycle, 33)
489                 self.assertEquals(s.project, None)
490                 lanes.add(s.lane)
491                 file_types_seen.add(s.filetype)
492
493             self.assertEquals(len(seqs), 8)
494
495         self.assertEqual(lanes, lanes_to_see)
496         self.assertEqual(file_types_to_see, file_types_seen)
497         self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
498
499     def test_scan_for_hiseq_sequences(self):
500         # simulate tree
501         file_types_seen = set()
502         file_types_to_see = set(['split_fastq'])
503         lanes = set()
504         lanes_to_see = set(('1','2'))
505         projects_seen = set()
506         projects_to_see = set(('11111', '21111', '31111'))
507         with SimulateHiSeqTree() as tree:
508             seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
509             for s in seqs:
510                 self.assertEquals(s.flowcell, 'C02AAACXX')
511                 self.assertEquals(s.cycle, 101)
512                 lanes.add(s.lane)
513                 file_types_seen.add(s.filetype)
514                 projects_seen.add(s.project)
515
516             self.assertEquals(len(seqs), 12)
517
518         self.assertEqual(lanes, lanes_to_see)
519         self.assertEqual(file_types_to_see, file_types_seen)
520         self.assertEqual(projects_to_see, projects_seen)
521         # make sure we require a list, and not the confusing iterating over
522         # a string
523         self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
524
class SimulateTree(object):
    """Context manager base for a throw-away directory tree of fake files.

    Code using this class must set ``self.root`` to the directory that
    holds the tree; the subclasses in this file do so in ``__init__``.
    Leaving the ``with`` block deletes ``self.root`` recursively.
    """
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # returns None, so an exception raised in the with-body propagates
        shutil.rmtree(self.root)

    def mkflowcell(self, *components):
        """Create nested directories below self.root and return the last one.

        Each component is joined onto the path built so far; directories
        that already exist are left alone.
        """
        head = self.root
        for c in components:
            head = os.path.join(head, c)
            if not os.path.exists(head):
                os.mkdir(head)
        return head

    def mkfile(self, flowcell, filename):
        """Create ``filename`` inside directory ``flowcell``.

        The file content is its own pathname followed by a line separator.
        """
        pathname = os.path.join(flowcell, filename)
        # the with-block closes the handle even if a write fails
        with open(pathname, 'w') as stream:
            stream.write(pathname)
            stream.write(os.linesep)
546
class SimulateHiSeqTree(SimulateTree):
    """Temporary tree shaped like a HiSeq run for flowcell C02AAACXX.

    Creates three Project_* directories with split fastq files under
    C02AAACXX/C1-101, plus export files directly in the cycle directory.
    """
    def __init__(self):
        self.root = tempfile.mkdtemp(prefix='sequences_')

        # (directory below the cycle directory, file name)
        project_files = [
            ('Project_11111', '11111_AAGGCC_L001_R1_001.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R1_002.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R2_001.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R2_002.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R1_001.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R1_002.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R2_001.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R2_002.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R1_001.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R1_002.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R2_001.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R2_002.fastq.gz'),
        ]
        # export files sit in the cycle directory itself ('.')
        export_files = [
            ('.', '11111_AAGGCC_L001_R1_001_export.txt.gz'),
            ('.', '11111_AAGGCC_L001_R1_002_export.txt.gz'),
            ('.', '11111_AAGGCC_L001_R2_001_export.txt.gz'),
            ('.', '11111_AAGGCC_L001_R2_002_export.txt.gz'),
            ('.', '21111_AAGGCC_L001_R1_001_export.txt.gz'),
            ('.', '21111_AAGGCC_L001_R1_002_export.txt.gz'),
            ('.', '21111_AAGGCC_L001_R2_001_export.txt.gz'),
            ('.', '21111_AAGGCC_L001_R2_002_export.txt.gz'),
            ('.', '31111_NoIndex_L002_R1_001_export.txt.gz'),
            ('.', '31111_NoIndex_L002_R1_002_export.txt.gz'),
            ('.', '31111_NoIndex_L002_R2_001_export.txt.gz'),
            ('.', '31111_NoIndex_L002_R2_002_export.txt.gz'),
        ]

        for subdir, filename in project_files + export_files:
            # mkflowcell skips directories that already exist
            target_dir = self.mkflowcell(
                self.root, 'C02AAACXX', 'C1-101', subdir)
            self.mkfile(target_dir, filename)
580
class SimulateSimpleTree(SimulateTree):
    """Temporary tree with one flowcell directory, 42BW9AAXX/C1-33.

    The directory is filled with qseq, srf, fastq and eland file names,
    some accompanied by .md5 files.
    """
    def __init__(self):
        self.root = tempfile.mkdtemp(prefix='sequences_')

        flowcell_dir = self.mkflowcell(self.root, '42BW9AAXX', 'C1-33')
        filenames = (
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2.md5',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2.md5',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_2.srf',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_pass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_pass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_nopass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_nopass.fastq.bz2',
            's_1_eland_extended.txt.bz2',
            's_1_eland_extended.txt.bz2.md5',
        )
        for filename in filenames:
            self.mkfile(flowcell_dir, filename)
601
602
def suite():
    """Return a TestSuite holding every test of SequenceFileTests."""
    from unittest import TestSuite, defaultTestLoader
    collected = TestSuite()
    cases = defaultTestLoader.loadTestsFromTestCase(SequenceFileTests)
    collected.addTests(cases)
    return collected
608
609
if __name__ == "__main__":
    # when run as a script, execute the tests returned by suite()
    import unittest
    unittest.main(defaultTest="suite")