d051c036ca2eddda6ffbbc1cc3f1c1f31bb5fcea
[htsworkflow.git] / htsworkflow / pipelines / test / test_sequences.py
1 #!/usr/bin/env python
2 import os
3 import shutil
4 import tempfile
5 import unittest
6
7 import RDF
8
9 from htsworkflow.pipelines import sequences
10 from htsworkflow.util.rdfhelp import get_model, load_string_into_model, \
11      rdfNS, libraryOntology, dump_model, fromTypedNode
12
class SequenceFileTests(unittest.TestCase):
    """
    Make sure the sequence archive class works
    """
    def test_get_flowcell_cycle(self):
        """
        Paths parse to FlowcellPath tuples with or without a trailing
        slash and with or without a project directory
        """
        tests = [
            ('/root/42BW9AAXX/C1-152',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
            ('/root/42BW9AAXX/C1-152/',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
            ('/root/42BW9AAXX/C1-152/Project_12345',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
            ('/root/42BW9AAXX/C1-152/Project_12345/',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
        ]

        for pathname, expected in tests:
            self.assertEqual(sequences.get_flowcell_cycle(pathname), expected)

    def test_flowcell_cycle(self):
        """
        Make sure code to parse directory hierarchy works
        """
        path = '/root/42BW9AAXX/C1-152'
        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)

        self.assertEqual(flowcell, '42BW9AAXX')
        self.assertEqual(start, 1)
        self.assertEqual(stop, 152)
        self.assertEqual(project, None)

        # a directory that is not a cycle range must be rejected
        path = '/root/42BW9AAXX/other'
        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)

    def test_flowcell_project_cycle(self):
        """
        Make sure code to parse directory hierarchy works when the path
        also contains a project directory
        """
        path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)

        self.assertEqual(flowcell, '42BW9AAXX')
        self.assertEqual(start, 1)
        self.assertEqual(stop, 152)
        self.assertEqual(project, 'Project_12345_Index1')

        # a directory that is not a cycle range must be rejected
        path = '/root/42BW9AAXX/other'
        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)

    def test_srf(self):
        """Check the attributes parse_srf extracts from an srf filename"""
        path = '/root/42BW9AAXX/C1-38'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_5.srf'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_srf(path, name)
        f1 = sequences.parse_srf(path, name)
        fother = sequences.parse_srf(path, other)

        self.assertEqual(f0.filetype, 'srf')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<srf 42BW9AAXX 4 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 4)
        self.assertEqual(f0.read, None)
        self.assertEqual(f0.pf, None)
        self.assertEqual(f0.cycle, 38)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        # same file parsed twice compares equal, a different lane does not
        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

    def test_qseq(self):
        """Check the attributes parse_qseq extracts from qseq tar names"""
        path = '/root/42BW9AAXX/C1-36'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1.tar.bz2'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_qseq(path, name)
        f1 = sequences.parse_qseq(path, name)
        fother = sequences.parse_qseq(path, other)

        self.assertEqual(f0.filetype, 'qseq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<qseq 42BW9AAXX 4 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 4)
        self.assertEqual(f0.read, 1)
        self.assertEqual(f0.pf, None)
        self.assertEqual(f0.cycle, 36)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

        # second case: a different flowcell name, files differ only by read
        path = '/root/ilmn200901/C1-202'
        name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
        other = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r2.tar.bz2'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_qseq(path, name)
        f1 = sequences.parse_qseq(path, name)
        fother = sequences.parse_qseq(path, other)

        self.assertEqual(f0.filetype, 'qseq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<qseq ilmn200901 1 %s>" % (pathname,))
        self.assertEqual(f0.lane, 1)
        self.assertEqual(f0.read, 1)
        self.assertEqual(f0.pf, None)
        self.assertEqual(f0.cycle, 202)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

    def test_fastq(self):
        """Check parse_fastq on pass and nopass single-file fastq names"""
        path = '/root/42BW9AAXX/C1-38'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l5_r1_pass.fastq.bz2'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_fastq(path, name)
        f1 = sequences.parse_fastq(path, name)
        fother = sequences.parse_fastq(path, other)

        self.assertEqual(f0.filetype, 'fastq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 4)
        self.assertEqual(f0.read, 1)
        self.assertEqual(f0.pf, True)
        self.assertEqual(f0.cycle, 38)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

        # second case: read 2, flagged nopass
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
        other = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2_nopass.fastq.bz2'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_fastq(path, name)
        f1 = sequences.parse_fastq(path, name)
        fother = sequences.parse_fastq(path, other)

        self.assertEqual(f0.filetype, 'fastq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<fastq 42BW9AAXX 4 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 4)
        self.assertEqual(f0.read, 2)
        self.assertEqual(f0.pf, False)
        self.assertEqual(f0.cycle, 38)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

    def test_project_fastq(self):
        """Check parse_fastq on split fastq names inside a project directory"""
        path = '/root/42BW9AAXX/C1-38/Project_12345'
        name = '11111_NoIndex_L001_R1_001.fastq.gz'
        other = '22222_NoIndex_L001_R1_001.fastq.gz'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_fastq(path, name)
        f1 = sequences.parse_fastq(path, name)
        fother = sequences.parse_fastq(path, other)

        self.assertEqual(f0.filetype, 'split_fastq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 1)
        self.assertEqual(f0.read, 1)
        self.assertEqual(f0.pf, True)
        self.assertEqual(f0.project, '11111')
        self.assertEqual(f0.index, 'NoIndex')
        self.assertEqual(f0.cycle, 38)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

        # second case: an index sequence, read 2, files differ only by lane
        name = '11112_AAATTT_L001_R2_003.fastq.gz'
        other = '11112_AAATTT_L002_R2_003.fastq.gz'
        pathname = os.path.join(path, name)
        f0 = sequences.parse_fastq(path, name)
        f1 = sequences.parse_fastq(path, name)
        fother = sequences.parse_fastq(path, other)

        self.assertEqual(f0.filetype, 'split_fastq')
        self.assertEqual(f0.path, pathname)
        self.assertEqual(unicode(f0), unicode(pathname))
        self.assertEqual(repr(f0), "<split_fastq 42BW9AAXX 1 %s>" % (pathname,))
        self.assertEqual(f0.flowcell, '42BW9AAXX')
        self.assertEqual(f0.lane, 1)
        self.assertEqual(f0.read, 2)
        self.assertEqual(f0.pf, True)
        self.assertEqual(f0.project, '11112')
        self.assertEqual(f0.index, 'AAATTT')
        self.assertEqual(f0.cycle, 38)
        self.assertEqual(f0.make_target_name('/tmp'),
                         os.path.join('/tmp', name))

        self.assertEqual(f0, f1)
        self.assertNotEqual(f0, fother)

    def test_parse_fastq_pf_flag(self):
        """Check how the trailing filename component maps to the pf flag"""
        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
                'l1', 'r2', 'nopass']
        self.assertEqual(sequences.parse_fastq_pf_flag(data), False)

        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
                'l1', 'r2', 'pass']
        self.assertEqual(sequences.parse_fastq_pf_flag(data), True)

        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
                'l1', 'r2', 'all']
        self.assertEqual(sequences.parse_fastq_pf_flag(data), None)

        # no flag component at all
        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
                'l1', 'r2']
        self.assertEqual(sequences.parse_fastq_pf_flag(data), None)

        # an extra trailing component is an error
        data = ['woldlab', '090622', 'HWI-EAS229', '0120', '42BW9AAXX',
                'l1', 'r2', 'all', 'newthing']
        self.assertRaises(ValueError, sequences.parse_fastq_pf_flag, data)

    def test_project_fastq_hashing(self):
        """Can we tell the difference between sequence files?
        """
        path = '/root/42BW9AAXX/C1-38/Project_12345'
        # first pair differs by read, second pair by split number
        names = [('11111_NoIndex_L001_R1_001.fastq.gz',
                  '11111_NoIndex_L001_R2_001.fastq.gz'),
                 ('11112_NoIndex_L001_R1_001.fastq.gz',
                  '11112_NoIndex_L001_R1_002.fastq.gz')
                 ]
        for a_name, b_name in names:
            a = sequences.parse_fastq(path, a_name)
            b = sequences.parse_fastq(path, b_name)
            self.assertNotEqual(a, b)
            self.assertNotEqual(a.key(), b.key())
            self.assertNotEqual(hash(a), hash(b))

    def test_eland(self):
        """Check parse_eland on eland names with and without a read number"""
        path = '/root/42BW9AAXX/C1-38'
        name = 's_4_eland_extended.txt.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_eland(path, name)

        self.assertEqual(f.filetype, 'eland')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, None)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 38)
        self.assertEqual(f.make_target_name('/tmp'),
                         '/tmp/42BW9AAXX_38_s_4_eland_extended.txt.bz2')

        path = '/root/42BW9AAXX/C1-152'
        name = 's_4_1_eland_extended.txt.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_eland(path, name)

        self.assertEqual(f.filetype, 'eland')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 152)
        self.assertEqual(f.make_target_name('/tmp'),
                         '/tmp/42BW9AAXX_152_s_4_1_eland_extended.txt.bz2')

    def _generate_sequences(self):
        """Return five parsed sequence files: four qseq and one split fastq

        Shared fixture for the SQL and RDF tests.
        """
        seqs = []
        data = [('/root/42BW9AAXX/C1-152',
                'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
                ('/root/42BW9AAXX/C1-152',
                'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'),
                ('/root/42BW9AAXX/C1-152',
                'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'),
                ('/root/42BW9AAXX/C1-152',
                'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),]

        for path, name in data:
            seqs.append(sequences.parse_qseq(path, name))

        path = '/root/42BW9AAXX/C1-38/Project_12345'
        name = '12345_AAATTT_L003_R1_001.fastq.gz'
        seqs.append(sequences.parse_fastq(path, name))
        self.assertEqual(len(seqs), 5)
        return seqs

    def test_sql(self):
        """
        Make sure that the quick and dirty sql interface in sequences works
        """
        import sqlite3
        db = sqlite3.connect(":memory:")
        try:
            c = db.cursor()
            sequences.create_sequence_table(c)

            for seq in self._generate_sequences():
                seq.save_to_sql(c)

            count = c.execute("select count(*) from sequences")
            row = count.fetchone()
            self.assertEqual(row[0], 5)
        finally:
            # release the connection even when an assertion fails
            db.close()

    def test_basic_rdf_scan(self):
        """Make sure we can save to RDF model"""
        model = get_model()

        for seq in self._generate_sequences():
            seq.save_to_model(model)

        files = list(model.find_statements(
            RDF.Statement(None, rdfNS['type'], libraryOntology['raw_file'])))
        self.assertEqual(len(files), 5)
        files = list(model.find_statements(
            RDF.Statement(None, rdfNS['type'], libraryOntology['qseq'])))
        self.assertEqual(len(files), 4)
        files = list(model.find_statements(
            RDF.Statement(None, rdfNS['type'], libraryOntology['split_fastq'])))
        self.assertEqual(len(files), 1)

        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['library_id'], None)))
        self.assertEqual(len(files), 1)

        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['flowcell_id'], None)))
        self.assertEqual(len(files), 5)

        # saved without a base_url, so no flowcell/library links are expected
        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['flowcell'], None)))
        self.assertEqual(len(files), 0)

        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['library'], None)))
        self.assertEqual(len(files), 0)

    def test_rdf_scan_with_url(self):
        """Make sure saving with a base_url adds flowcell and library links"""
        model = get_model()
        base_url = 'http://localhost'
        for seq in self._generate_sequences():
            seq.save_to_model(model, base_url=base_url)
        localFC = RDF.NS(base_url + '/flowcell/')
        localLibrary = RDF.NS(base_url + '/library/')

        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['flowcell'], None)))
        self.assertEqual(len(files), 5)
        for f in files:
            self.assertEqual(f.object, localFC['42BW9AAXX/'])

        files = list(model.find_statements(
            RDF.Statement(None, libraryOntology['library'], None)))
        self.assertEqual(len(files), 1)
        self.assertEqual(files[0].object, localLibrary['12345'])

    def test_rdf_fixup_library(self):
        """Make sure update_model_sequence_library links files to libraries"""
        base_url = 'http://localhost'
        localLibrary = RDF.NS(base_url + '/library/')

        flowcellInfo = """@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .

<{base}/flowcell/42BW9AAXX/>
    libns:flowcell_id "42BW9AXX"@en ;
    libns:has_lane <{base}/lane/1169>, <{base}/lane/1170>,
                   <{base}/lane/1171>, <{base}/lane/1172> ;
    libns:read_length 75 ;
    a libns:illumina_flowcell .

<{base}/lane/1169>
    libns:lane_number 1 ; libns:library <{base}/library/10923/> .
<{base}/lane/1170>
    libns:lane_number 2 ; libns:library <{base}/library/10924/> .
<{base}/lane/1171>
    libns:lane_number 3 ; libns:library <{base}/library/12345/> .
<{base}/lane/1172>
    libns:lane_number 3 ; libns:library <{base}/library/10930/> .
""".format(base=base_url)
        model = get_model()
        load_string_into_model(model, 'turtle', flowcellInfo)
        for seq in self._generate_sequences():
            seq.save_to_model(model)
        sequences.update_model_sequence_library(model, base_url=base_url)

        libTerm = libraryOntology['library']
        libIdTerm = libraryOntology['library_id']

        url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'
        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], localLibrary['10923/'])
        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(fromTypedNode(nodes[0]), '10923')

        url = 'file:///root/42BW9AAXX/C1-152/woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'
        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], localLibrary['10924/'])
        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(fromTypedNode(nodes[0]), '10924')

        url = 'file:///root/42BW9AAXX/C1-38/Project_12345/12345_AAATTT_L003_R1_001.fastq.gz'
        nodes = list(model.get_targets(RDF.Uri(url), libTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], localLibrary['12345/'])
        nodes = list(model.get_targets(RDF.Uri(url), libIdTerm))
        self.assertEqual(len(nodes), 1)
        self.assertEqual(fromTypedNode(nodes[0]), '12345')

    def test_load_from_model(self):
        """Can we round trip through a RDF model"""
        model = get_model()
        path = '/root/42BW9AAXX/C1-38/Project_12345/'
        filename = '12345_AAATTT_L003_R1_001.fastq.gz'
        seq = sequences.parse_fastq(path, filename)
        seq.save_to_model(model)

        seq_id = 'file://' + path + filename
        seqNode = RDF.Node(RDF.Uri(seq_id))
        libNode = RDF.Node(RDF.Uri('http://localhost/library/12345'))
        model.add_statement(
            RDF.Statement(seqNode, libraryOntology['library'], libNode))
        seq2 = sequences.SequenceFile.load_from_model(model, seq_id)

        self.assertEqual(seq.flowcell, seq2.flowcell)
        self.assertEqual(seq.flowcell, '42BW9AAXX')
        self.assertEqual(seq.filetype, seq2.filetype)
        self.assertEqual(seq2.filetype, 'split_fastq')
        self.assertEqual(seq.lane, seq2.lane)
        self.assertEqual(seq2.lane, 3)
        self.assertEqual(seq.read, seq2.read)
        self.assertEqual(seq2.read, 1)
        self.assertEqual(seq.project, seq2.project)
        self.assertEqual(seq2.project, '12345')
        self.assertEqual(seq.index, seq2.index)
        self.assertEqual(seq2.index, 'AAATTT')
        self.assertEqual(seq.split, seq2.split)
        self.assertEqual(seq2.split, '001')
        self.assertEqual(seq.cycle, seq2.cycle)
        self.assertEqual(seq.pf, seq2.pf)
        self.assertEqual(seq2.libraryNode, libNode)
        self.assertEqual(seq.path, seq2.path)

    def test_scan_for_sequences(self):
        """Scan a simulated flowcell directory and check what is found"""
        # simulate tree
        file_types_seen = set()
        file_types_to_see = set(['fastq', 'srf', 'eland', 'qseq'])
        lanes = set()
        lanes_to_see = set((1,2,3))
        with SimulateSimpleTree() as tree:
            seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
            for s in seqs:
                self.assertEqual(s.flowcell, '42BW9AAXX')
                self.assertEqual(s.cycle, 33)
                self.assertEqual(s.project, None)
                lanes.add(s.lane)
                file_types_seen.add(s.filetype)

            self.assertEqual(len(seqs), 8)

        self.assertEqual(lanes, lanes_to_see)
        self.assertEqual(file_types_to_see, file_types_seen)
        # a bare string instead of a list of directories must be rejected
        self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')

    def test_scan_for_hiseq_sequences(self):
        """Scan a simulated HiSeq project tree and check what is found"""
        # simulate tree
        file_types_seen = set()
        file_types_to_see = set(['split_fastq'])
        lanes = set()
        lanes_to_see = set((1,2))
        projects_seen = set()
        projects_to_see = set(('11111', '21111', '31111'))
        with SimulateHiSeqTree() as tree:
            seqs = sequences.scan_for_sequences([tree.root, '/a/b/c/98345'])
            for s in seqs:
                self.assertEqual(s.flowcell, 'C02AAACXX')
                self.assertEqual(s.cycle, 101)
                lanes.add(s.lane)
                file_types_seen.add(s.filetype)
                projects_seen.add(s.project)

            self.assertEqual(len(seqs), 12)

        self.assertEqual(lanes, lanes_to_see)
        self.assertEqual(file_types_to_see, file_types_seen)
        self.assertEqual(projects_to_see, projects_seen)
        # make sure we require a list, and not the confusing iterating over
        # a string
        self.assertRaises(ValueError, sequences.scan_for_sequences, '/tmp')
529
class SimulateTree(object):
    """Context manager base for temporary directory trees used by the tests.

    Subclasses are expected to set ``self.root`` to a directory they created;
    leaving the ``with`` block removes that directory and everything in it.
    """
    def __enter__(self):
        """Return the tree itself so callers can read ``tree.root``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the whole tree; exceptions from the block still propagate."""
        shutil.rmtree(self.root)

    def mkflowcell(self, *components):
        """Create nested directories below ``self.root`` and return the path.

        Each component is joined onto the path built so far and created if
        missing. Because os.path.join restarts at an absolute component,
        passing ``self.root`` as the first component is harmless.
        """
        head = self.root
        for c in components:
            head = os.path.join(head, c)
            if not os.path.exists(head):
                os.mkdir(head)
        return head

    def mkfile(self, flowcell, filename):
        """Create ``filename`` in directory ``flowcell``.

        The file holds its own pathname followed by os.linesep.
        """
        pathname = os.path.join(flowcell, filename)
        # the context manager closes the handle even if a write fails
        with open(pathname, 'w') as stream:
            stream.write(pathname)
            stream.write(os.linesep)
551
class SimulateHiSeqTree(SimulateTree):
    """Temporary tree with a C02AAACXX/C1-101 flowcell directory.

    Three Project_* sub-directories receive split fastq files, and the
    export files are written into the C1-101 directory itself.
    """
    def __init__(self):
        self.root = tempfile.mkdtemp(prefix='sequences_')

        # (sub-directory, filename) pairs for the per-project fastq files
        project_fastqs = [
            ('Project_11111', '11111_AAGGCC_L001_R1_001.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R1_002.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R2_001.fastq.gz'),
            ('Project_11111', '11111_AAGGCC_L001_R2_002.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R1_001.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R1_002.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R2_001.fastq.gz'),
            ('Project_21111', '21111_TTTTTT_L001_R2_002.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R1_001.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R1_002.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R2_001.fastq.gz'),
            ('Project_31111', '31111_NoIndex_L002_R2_002.fastq.gz'),
        ]
        # export files sit directly in the cycle directory ('.')
        export_names = [
            '11111_AAGGCC_L001_R1_001_export.txt.gz',
            '11111_AAGGCC_L001_R1_002_export.txt.gz',
            '11111_AAGGCC_L001_R2_001_export.txt.gz',
            '11111_AAGGCC_L001_R2_002_export.txt.gz',
            '21111_AAGGCC_L001_R1_001_export.txt.gz',
            '21111_AAGGCC_L001_R1_002_export.txt.gz',
            '21111_AAGGCC_L001_R2_001_export.txt.gz',
            '21111_AAGGCC_L001_R2_002_export.txt.gz',
            '31111_NoIndex_L002_R1_001_export.txt.gz',
            '31111_NoIndex_L002_R1_002_export.txt.gz',
            '31111_NoIndex_L002_R2_001_export.txt.gz',
            '31111_NoIndex_L002_R2_002_export.txt.gz',
        ]
        layout = project_fastqs + [('.', name) for name in export_names]

        for subdir, filename in layout:
            target = self.mkflowcell(self.root, 'C02AAACXX', 'C1-101', subdir)
            self.mkfile(target, filename)
585
class SimulateSimpleTree(SimulateTree):
    """Temporary tree with a single 42BW9AAXX/C1-33 flowcell directory.

    The directory holds qseq, srf, fastq and eland files, plus .md5 files
    next to some of them.
    """
    def __init__(self):
        self.root = tempfile.mkdtemp(prefix='sequences_')
        cycle_dir = self.mkflowcell(self.root, '42BW9AAXX', 'C1-33')

        filenames = (
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2.md5',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2.md5',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_2.srf',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_pass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_pass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r1_nopass.fastq.bz2',
            'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l3_r2_nopass.fastq.bz2',
            's_1_eland_extended.txt.bz2',
            's_1_eland_extended.txt.bz2.md5',
        )
        for filename in filenames:
            self.mkfile(cycle_dir, filename)
606
607
def suite():
    """Return a TestSuite holding every ``test*`` method of SequenceFileTests.

    Uses the default loader directly; unittest.makeSuite did the same thing
    but is deprecated and has been removed from recent Python releases.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(SequenceFileTests)
610
# Executed as a script: let unittest build and run the suite() defined above.
if __name__ == "__main__":
    unittest.main(
        defaultTest="suite",
    )