Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
[htsworkflow.git] / htsworkflow / pipelines / test / test_sequences.py
1 #!/usr/bin/env python
2 import os
3 import unittest
4
5 from htsworkflow.pipelines import sequences
6
7
class SequenceFileTests(unittest.TestCase):
    """
    Make sure the sequence archive class works

    Each test hands a directory path and/or a file name to one of the
    parsers in the ``sequences`` module and checks the attributes of
    the object that comes back.
    """
    # The assert* methods are used throughout instead of the fail*
    # aliases (failUnlessEqual, failIfEqual, failUnlessRaises): those
    # aliases are deprecated and no longer exist in Python 3.12+.

    def test_get_flowcell_cycle(self):
        """
        Make sure get_flowcell_cycle returns the expected FlowcellPath
        with and without a project directory or a trailing slash
        """
        tests = [
            ('/root/42BW9AAXX/C1-152',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
            ('/root/42BW9AAXX/C1-152/',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
            ('/root/42BW9AAXX/C1-152/Project_12345',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
            ('/root/42BW9AAXX/C1-152/Project_12345/',
             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
        ]

        for pathname, expected in tests:
            path = sequences.get_flowcell_cycle(pathname)
            self.assertEqual(path, expected)

    def test_flowcell_cycle(self):
        """
        Make sure code to parse directory hierarchy works
        """
        path = '/root/42BW9AAXX/C1-152'
        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)

        self.assertEqual(flowcell, '42BW9AAXX')
        self.assertEqual(start, 1)
        self.assertEqual(stop, 152)
        self.assertEqual(project, None)

        # A directory that is not a cycle range must be rejected.
        path = '/root/42BW9AAXX/other'
        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)

    def test_flowcell_project_cycle(self):
        """
        Make sure code to parse directory hierarchy works
        when a project directory follows the cycle directory
        """
        path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)

        self.assertEqual(flowcell, '42BW9AAXX')
        self.assertEqual(start, 1)
        self.assertEqual(stop, 152)
        self.assertEqual(project, 'Project_12345_Index1')

        # A directory that is not a cycle range must be rejected.
        path = '/root/42BW9AAXX/other'
        self.assertRaises(ValueError, sequences.get_flowcell_cycle, path)

    def test_srf(self):
        """
        Make sure parse_srf fills in the attributes of an srf file
        """
        path = '/root/42BW9AAXX/C1-38'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'
        pathname = os.path.join(path, name)
        f = sequences.parse_srf(path, name)

        self.assertEqual(f.filetype, 'srf')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, None)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 38)

    def test_qseq(self):
        """
        Make sure parse_qseq fills in the attributes of qseq archives
        """
        path = '/root/42BW9AAXX/C1-36'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_qseq(path, name)

        self.assertEqual(f.filetype, 'qseq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 36)

        # Second archive: a different flowcell directory and cycle range.
        path = '/root/ilmn200901/C1-202'
        name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_qseq(path, name)

        self.assertEqual(f.filetype, 'qseq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.lane, 1)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 202)

    def test_fastq(self):
        """
        Make sure parse_fastq handles both _pass and _nopass fastq files
        """
        path = '/root/42BW9AAXX/C1-38'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1_pass.fastq.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_fastq(path, name)

        self.assertEqual(f.filetype, 'fastq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, True)
        self.assertEqual(f.cycle, 38)

        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_fastq(path, name)

        self.assertEqual(f.filetype, 'fastq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, 2)
        self.assertEqual(f.pf, False)
        self.assertEqual(f.cycle, 38)

    def test_project_fastq(self):
        """
        Make sure parse_fastq handles fastq files inside a project directory
        """
        path = '/root/42BW9AAXX/C1-38/Project_12345'
        name = '11111_NoIndex_L001_R1_001.fastq.gz'
        pathname = os.path.join(path, name)
        f = sequences.parse_fastq(path, name)

        self.assertEqual(f.filetype, 'split_fastq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 1)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, True)
        self.assertEqual(f.project, '11111')
        self.assertEqual(f.index, 'NoIndex')
        self.assertEqual(f.cycle, 38)

        name = '11112_AAATTT_L001_R2_003.fastq.gz'
        pathname = os.path.join(path, name)
        f = sequences.parse_fastq(path, name)

        self.assertEqual(f.filetype, 'split_fastq')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 1)
        self.assertEqual(f.read, 2)
        self.assertEqual(f.pf, True)
        self.assertEqual(f.project, '11112')
        self.assertEqual(f.index, 'AAATTT')
        self.assertEqual(f.cycle, 38)

    def test_project_fastq_hashing(self):
        """Can we tell the difference between sequence files?
        """
        path = '/root/42BW9AAXX/C1-38/Project_12345'
        # Each pair differs in exactly one filename field: the first in
        # the R1/R2 field, the second in the trailing 001/002 field.
        names = [('11111_NoIndex_L001_R1_001.fastq.gz',
                  '11111_NoIndex_L001_R2_001.fastq.gz'),
                 ('11112_NoIndex_L001_R1_001.fastq.gz',
                  '11112_NoIndex_L001_R1_002.fastq.gz')
                 ]
        for a_name, b_name in names:
            a = sequences.parse_fastq(path, a_name)
            b = sequences.parse_fastq(path, b_name)
            self.assertNotEqual(a, b)
            self.assertNotEqual(a.key(), b.key())
            self.assertNotEqual(hash(a), hash(b))

    def test_eland(self):
        """
        Make sure parse_eland handles eland files with and without
        a read number in the file name
        """
        path = '/root/42BW9AAXX/C1-38'
        name = 's_4_eland_extended.txt.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_eland(path, name)

        self.assertEqual(f.filetype, 'eland')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, None)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 38)

        path = '/root/42BW9AAXX/C1-152'
        name = 's_4_1_eland_extended.txt.bz2'
        pathname = os.path.join(path, name)
        f = sequences.parse_eland(path, name)

        self.assertEqual(f.filetype, 'eland')
        self.assertEqual(f.path, pathname)
        self.assertEqual(f.flowcell, '42BW9AAXX')
        self.assertEqual(f.lane, 4)
        self.assertEqual(f.read, 1)
        self.assertEqual(f.pf, None)
        self.assertEqual(f.cycle, 152)

    def test_sequence_file_equality(self):
        """
        Make sure parsing the same path and name twice gives equal objects
        """
        path = '/root/42BW9AAXX/C1-38'
        name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r1.tar.bz2'

        f1_qseq = sequences.parse_qseq(path, name)
        f2_qseq = sequences.parse_qseq(path, name)

        self.assertEqual(f1_qseq, f2_qseq)

    def test_sql(self):
        """
        Make sure that the quick and dirty sql interface in sequences works
        """
        import sqlite3
        db = sqlite3.connect(":memory:")
        # Close the connection even when an assertion or a parser call
        # fails, so the test never leaks an open database handle.
        try:
            c = db.cursor()
            sequences.create_sequence_table(c)

            # NOTE(review): the last name ends in 'r21' where the others
            # use r1/r2 -- possibly a typo for 'r2'; confirm it is intended.
            data = [('/root/42BW9AAXX/C1-152',
                     'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
                    ('/root/42BW9AAXX/C1-152',
                     'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'),
                    ('/root/42BW9AAXX/C1-152',
                     'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'),
                    ('/root/42BW9AAXX/C1-152',
                     'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),
                    ]

            for path, name in data:
                seq = sequences.parse_qseq(path, name)
                seq.save(c)

            # One row is expected for every file that was saved.
            count = c.execute("select count(*) from sequences")
            row = count.fetchone()
            self.assertEqual(row[0], 4)
        finally:
            db.close()
232
233
def suite():
    """
    Return a test suite holding every test method of SequenceFileTests
    """
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13. The default loader collects the same 'test'-prefixed methods.
    return unittest.defaultTestLoader.loadTestsFromTestCase(SequenceFileTests)
236
if __name__ == "__main__":
    # Executed as a script: let unittest run the collection built by suite().
    unittest.main(defaultTest="suite")