Progress using rdf model to link fastqs with flowcell/lib metadata.
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission.condorfastq import CondorFastqExtract
11 from htsworkflow.submission.results import ResultMap
12 from htsworkflow.util.rdfhelp import load_string_into_model, dump_model
13
# Directory skeleton created beneath the temporary flowcell root by
# TestCondorFastq.setUp.  Each entry is created with a plain os.mkdir,
# so a parent directory must always be listed before its children.
FCDIRS = [
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    'C02F9ACXX/C1-202/Project_12345',
    '42JUYAAXX',
    '42JUYAAXX/C1-76',
    '30221AAXX',
    '30221AAXX/C1-33',
    '30DY0AAXX',
    '30DY0AAXX/C1-151',
    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]
30
# Placeholder sequence files written beneath the temporary flowcell root
# by TestCondorFastq.setUp.  Paths are relative to that root and every
# directory used here must appear in FCDIRS.  Each path is listed exactly
# once.
DATAFILES = [
    # C02F9ACXX: split, gzipped fastq files grouped by project directory
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
    # 42JUYAAXX: tarballs for lanes 1-8, read 1 then read 2
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: one srf file per lane
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 30DY0AAXX: one srf file per lane
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    # 61MJTAAXX: read 1 tarballs only, lanes 1-8
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
90
# RDF (Turtle) fixture that TestCondorFastq.setUp loads into the
# extractor's model.  It describes five flowcells (30221AAXX, 42JUYAAXX,
# 61MJTAAXX, 30DY0AAXX, C02F9ACXX) with their lanes, plus the two
# libraries (11154 and 12345) that the tests build a ResultMap for.
# All subjects live under http://localhost, matching HOST.
lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .

<http://localhost/flowcell/30221AAXX/>
        a libns:illumina_flowcell ;
        libns:read_length 33 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3401> ;
        libns:has_lane <http://localhost/lane/3402> ;
        libns:has_lane <http://localhost/lane/3403> ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/3405> ;
        libns:has_lane <http://localhost/lane/3406> ;
        libns:has_lane <http://localhost/lane/3407> ;
        libns:has_lane <http://localhost/lane/3408> ;
        libns:flowcell_id "30221AAXX"@en .

<http://localhost/lane/3401>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 1 .
<http://localhost/lane/3402>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 2 .
<http://localhost/lane/3403>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 3 .
<http://localhost/lane/3404>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 4 .
        # paired_end 1;
        # read_length 33;
        # status "Unknown"@en .
<http://localhost/lane/3405>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 5 .
<http://localhost/lane/3406>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 6 .
<http://localhost/lane/3407>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 7 .
<http://localhost/lane/3408>
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/42JUYAAXX/>
        a libns:illumina_flowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/4201> ;
        libns:has_lane <http://localhost/lane/4202> ;
        libns:has_lane <http://localhost/lane/4203> ;
        libns:has_lane <http://localhost/lane/4204> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/4206> ;
        libns:has_lane <http://localhost/lane/4207> ;
        libns:has_lane <http://localhost/lane/4208> ;
        libns:flowcell_id "42JUYAAXX"@en .

<http://localhost/lane/4201>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 1 .
<http://localhost/lane/4202>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 2 .
<http://localhost/lane/4203>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 3 .
<http://localhost/lane/4204>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 4 .
<http://localhost/lane/4205>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 5 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/4206>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 6 .
<http://localhost/lane/4207>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 7 .
<http://localhost/lane/4208>
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/61MJTAAXX/>
        a libns:illumina_flowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/6601> ;
        libns:has_lane <http://localhost/lane/6602> ;
        libns:has_lane <http://localhost/lane/6603> ;
        libns:has_lane <http://localhost/lane/6604> ;
        libns:has_lane <http://localhost/lane/6605> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/6607> ;
        libns:has_lane <http://localhost/lane/6608> ;
        libns:flowcell_id "61MJTAAXX"@en .

<http://localhost/lane/6601>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 1 .
<http://localhost/lane/6602>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 2 .
<http://localhost/lane/6603>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 3 .
<http://localhost/lane/6604>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 4 .
<http://localhost/lane/6605>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 5 .
<http://localhost/lane/6606>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 6 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/6607>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 7 .
<http://localhost/lane/6608>
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/30DY0AAXX/>
        a libns:illumina_flowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3801> ;
        libns:has_lane <http://localhost/lane/3802> ;
        libns:has_lane <http://localhost/lane/3803> ;
        libns:has_lane <http://localhost/lane/3804> ;
        libns:has_lane <http://localhost/lane/3805> ;
        libns:has_lane <http://localhost/lane/3806> ;
        libns:has_lane <http://localhost/lane/3807> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:flowcell_id "30DY0AAXX"@en .

<http://localhost/lane/3801>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 1 .
<http://localhost/lane/3802>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 2 .
<http://localhost/lane/3803>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 3 .
<http://localhost/lane/3804>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 4 .
<http://localhost/lane/3805>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 5 .
<http://localhost/lane/3806>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 6 .
<http://localhost/lane/3807>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 7 .
<http://localhost/lane/3808>
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 8 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .

<http://localhost/flowcell/C02F9ACXX/>
        a libns:illumina_flowcell ;
        libns:read_length 101 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:flowcell_id "C02F9ACXX"@en .

<http://localhost/lane/12300>
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/12345/> ;
        libns:lane_number 3 .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/lane/12500>
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 3 .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/library/11154/>
        a libns:library ;
        libns:affiliation "TSR"@en;
        libns:concentration "29.7";
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:insert_size 2000 ;
        libns:library_id "11154"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends ASDF"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line


<http://localhost/library/12345/>
        a libns:library ;
        libns:affiliation "TSR"@en;
        libns:concentration "12.345";
        libns:cell_line "Unknown"@en ;
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:insert_size 2000 ;
        libns:library_id "12345"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends THING"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line
"""
# Host URL handed to CondorFastqExtract in setUp; the subjects in
# lib_turtle all live under this same http://localhost prefix.
HOST = "http://localhost"
373
class TestCondorFastq(unittest.TestCase):
    """Exercise CondorFastqExtract against a throwaway flowcell archive.

    Every test runs against a fresh temporary directory that holds the
    FCDIRS/DATAFILES placeholder archive, a log directory, one submission
    directory per library, and an extractor whose RDF model has been
    loaded from lib_turtle.
    """

    def setUp(self):
        """Create the temporary archive, the result map and the extractor."""
        # Remembered so tearDown can return here after a test changes
        # directory.
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        # FCDIRS lists parents before children, so plain mkdir is enough.
        for dirname in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, dirname))

        # The sequence files are placeholders with dummy content.
        for datafile in DATAFILES:
            filename = os.path.join(self.flowcelldir, datafile)
            with open(filename, 'w') as stream:
                stream.write('testfile')

        # One submission directory ("sub-<library id>") per library.
        self.result_map = ResultMap()
        for lib_id in [u'11154', u'12345']:
            sub_dir = os.path.join(self.tempdir, 'sub-%s' % (lib_id,))
            os.mkdir(sub_dir)
            self.result_map[lib_id] = sub_dir

        self.extract = CondorFastqExtract(HOST,
                                          self.flowcelldir,
                                          self.logdir)
        load_string_into_model(self.extract.model, 'turtle', lib_turtle)

    def tearDown(self):
        """Return to the original directory and delete the temporary tree."""
        # Leave the temporary directory before deleting it:
        # test_create_scripts works inside it, and removing the current
        # working directory is not possible on every platform.
        os.chdir(self.cwd)
        shutil.rmtree(self.tempdir)

    def _argument_lines(self, filename):
        """Return the sorted lines of filename that start with 'argument'."""
        with open(filename, 'r') as stream:
            lines = [line for line in stream if line.startswith('argument')]
        lines.sort()
        return lines

    def test_find_relavant_flowcell_ids(self):
        """All five flowcells described in lib_turtle are reported."""
        expected = set(('30221AAXX',
                        '42JUYAAXX',
                        '61MJTAAXX',
                        '30DY0AAXX',
                        'C02F9ACXX'))
        flowcell_ids = self.extract.find_relavant_flowcell_ids()
        self.assertEqual(flowcell_ids, expected)

    def test_find_archive_sequence(self):
        """Sequence files found for the mapped libraries have the expected
        library, flowcell, lane, read, cycle, pairing and file type."""
        seqs = self.extract.find_archive_sequence_files(self.result_map)

        # Compared as a set, so each distinct tuple is listed only once.
        expected = set([
            (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
            (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
            (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
            (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
            (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf'),
        ])
        found = set([
            (seq.library_id, seq.flowcell_id, seq.lane_number, seq.read,
             seq.cycle, seq.ispaired, seq.filetype)
            for seq in seqs])
        self.assertEqual(expected, found)

    def test_find_needed_targets(self):
        """Each fastq target is mapped to the right number of sources."""
        lib_db = self.extract.find_archive_sequence_files(self.result_map)

        needed_targets = self.extract.update_fastq_targets(self.result_map,
                                                           lib_db)
        self.assertEqual(len(needed_targets), 9)

        # Targets are keyed by "<submission dir>/<fastq name>".
        prefix = self.result_map['11154']
        srf_30221 = needed_targets[
            prefix + u'/11154_30221AAXX_c33_l4.fastq']
        qseq_42JUY_r1 = needed_targets[
            prefix + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
        qseq_42JUY_r2 = needed_targets[
            prefix + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
        qseq_61MJT = needed_targets[
            prefix + u'/11154_61MJTAAXX_c76_l6.fastq']
        split_C02F9_r1 = needed_targets[
            prefix + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
        split_C02F9_r2 = needed_targets[
            prefix + u'/11154_C02F9ACXX_c202_l3_r2.fastq']

        self.assertEqual(len(srf_30221['srf']), 1)
        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
        self.assertEqual(len(qseq_61MJT['qseq']), 1)
        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)

    def test_generate_fastqs(self):
        """Condor arguments are produced for srf, qseq and split fastq."""
        commands = self.extract.build_condor_arguments(self.result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 4)

        lib_dir = self.result_map['11154']

        # --- srf: expectations keyed by the full target path ---
        single_target = os.path.join(lib_dir,
                                     u'11154_30221AAXX_c33_l4.fastq')
        paired_target = os.path.join(lib_dir,
                                     u'11154_30DY0AAXX_c151_l8_r1.fastq')
        paired_target_right = os.path.join(
            lib_dir, u'11154_30DY0AAXX_c151_l8_r2.fastq')
        srf_data = {
            single_target: {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': single_target,
            },
            paired_target: {
                # The original literal also carried a dead 'mid': None
                # entry that this value silently overrode.
                'mid': 76,
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'target': paired_target,
                'target_right': paired_target_right,
            },
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            source_filename = os.path.basename(args['sources'][0])
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        # --- qseq: expectations keyed by the full target path ---
        qseq_data = {
            os.path.join(lib_dir, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'],
            },
            os.path.join(lib_dir, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'],
            },
            os.path.join(lib_dir, '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            # Every expected source must be present at the same position.
            for index, expected_name in enumerate(expected['sources']):
                filename = os.path.basename(args['sources'][index])
                self.assertEqual(filename, expected_name)

        # --- split fastq: expectations keyed by the target's basename ---
        split_expectations = [
            {'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                         u'11154_NoIndex_L003_R1_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                         u'11154_NoIndex_L003_R2_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
                         u'12345_CGATGT_L003_R1_002.fastq.gz',
                         u'12345_CGATGT_L003_R1_003.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
                         u'12345_CGATGT_L003_R2_002.fastq.gz',
                         u'12345_CGATGT_L003_R2_003.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'},
        ]
        split_test = dict((entry['target'], entry)
                          for entry in split_expectations)
        for arg in split:
            target = os.path.basename(arg['target'])
            expected = split_test[target]
            self.assertTrue(arg['pyscript'].endswith(expected['pyscript']))
            self.assertTrue(arg['target'].endswith(expected['target']))
            # Every generated source must match the expected one at the
            # same position.
            for index, source in enumerate(arg['sources']):
                self.assertTrue(source.endswith(expected['sources'][index]))

    def test_create_scripts(self):
        """create_scripts writes the three condor submit files."""
        # The scripts are looked up by bare filename below, i.e. relative
        # to the current directory.  Work inside the temporary directory
        # so they are removed with it instead of being left behind in
        # whatever directory the tests were started from.
        os.chdir(self.tempdir)
        self.extract.create_scripts(self.result_map)

        self.assertTrue(os.path.exists('srf.condor'))
        arguments = self._argument_lines('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                        in arguments[0])
        self.assertTrue('sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq'
                        in arguments[1])

        self.assertTrue(os.path.exists('qseq.condor'))
        arguments = self._argument_lines('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq '
                        in arguments[0])
        self.assertTrue(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'
            in arguments[1])
        self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX'
                        in arguments[2])

        self.assertTrue(os.path.exists('split_fastq.condor'))
        arguments = self._argument_lines('split_fastq.condor')
        self.assertEqual(len(arguments), 4)
        # Library 11154, lane 3, read 1
        self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in arguments[0])
        # Library 11154, lane 3, read 2
        self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in arguments[1])
        # Library 12345, lane 3, read 1
        self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
        # Library 12345, lane 3, read 2
        self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
631
632
def suite():
    """Return a TestSuite holding every test method of TestCondorFastq."""
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13.
    # The default loader collects the same 'test'-prefixed methods and is
    # available on every unittest version.
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)


if __name__ == "__main__":
    # Run the suite() above when the module is executed as a script.
    unittest.main(defaultTest='suite')
639