Use the htsworkflow ontology to validate various RDF-using components.
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission.condorfastq import CondorFastqExtract
11 from htsworkflow.submission.results import ResultMap
12 from htsworkflow.util.rdfhelp import \
13      add_default_schemas, load_string_into_model, dump_model
14 from htsworkflow.util.rdfinfer import Infer
15
# Directories created (relative to the temporary flowcell root) by
# TestCondorFastq.setUp.  Order matters: setUp uses a plain os.mkdir for
# each entry, so every parent has to be listed before its children.
FCDIRS = """
C02F9ACXX
C02F9ACXX/C1-202
C02F9ACXX/C1-202/Project_11154
C02F9ACXX/C1-202/Project_12342_Index1
C02F9ACXX/C1-202/Project_12342_Index2
C02F9ACXX/C1-202/Project_12345
42JUYAAXX
42JUYAAXX/C1-76
30221AAXX
30221AAXX/C1-33
30DY0AAXX
30DY0AAXX/C1-151
61MJTAAXX
61MJTAAXX/C1-76
""".split()
32
# Placeholder sequence files created (relative to the temporary flowcell
# root) by TestCondorFastq.setUp.  Every path must live in one of the
# directories listed in FCDIRS, and each path is listed exactly once.
DATAFILES = [
    # C02F9ACXX: gzipped fastq files split into numbered chunks,
    # one directory per project.
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
    # 42JUYAAXX: tar.bz2 archives, lanes 1-8, read 1 ...
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    # ... and read 2.
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: one .srf file per lane.
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 30DY0AAXX: one .srf file per lane.
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    # 61MJTAAXX: tar.bz2 archives, lanes 1-8, read 1 only.
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
92
# RDF fixture, in Turtle syntax, loaded into the extractor's model by
# TestCondorFastq.setUp.  It declares:
#   * four bare libraries (10000, 1331, 1421, 1661),
#   * five flowcells (30221AAXX, 42JUYAAXX, 61MJTAAXX, 30DY0AAXX,
#     C02F9ACXX) together with their lanes,
#   * two fully described libraries (11154 and 12345); these are the
#     only library ids placed in the result map by setUp.
# The '#' lines inside the string are Turtle comments, not Python ones.
lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .

<http://localhost/library/10000/> a libns:Library .
<http://localhost/library/1331/> a libns:Library .
<http://localhost/library/1421/> a libns:Library .
<http://localhost/library/1661/> a libns:Library .

<http://localhost/flowcell/30221AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 33 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3401> ;
        libns:has_lane <http://localhost/lane/3402> ;
        libns:has_lane <http://localhost/lane/3403> ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/3405> ;
        libns:has_lane <http://localhost/lane/3406> ;
        libns:has_lane <http://localhost/lane/3407> ;
        libns:has_lane <http://localhost/lane/3408> ;
        libns:flowcell_id "30221AAXX"@en .

<http://localhost/lane/3401>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 1 .
<http://localhost/lane/3402>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 2 .
<http://localhost/lane/3403>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 3 .
<http://localhost/lane/3404>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 4 .
        # paired_end 1;
        # read_length 33;
        # status "Unknown"@en .
<http://localhost/lane/3405>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 5 .
<http://localhost/lane/3406>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 6 .
<http://localhost/lane/3407>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 7 .
<http://localhost/lane/3408>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/42JUYAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/4201> ;
        libns:has_lane <http://localhost/lane/4202> ;
        libns:has_lane <http://localhost/lane/4203> ;
        libns:has_lane <http://localhost/lane/4204> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/4206> ;
        libns:has_lane <http://localhost/lane/4207> ;
        libns:has_lane <http://localhost/lane/4208> ;
        libns:flowcell_id "42JUYAAXX"@en .

<http://localhost/lane/4201>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 1 .
<http://localhost/lane/4202>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 2 .
<http://localhost/lane/4203>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 3 .
<http://localhost/lane/4204>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 4 .
<http://localhost/lane/4205>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 5 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/4206>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 6 .
<http://localhost/lane/4207>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 7 .
<http://localhost/lane/4208>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/61MJTAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/6601> ;
        libns:has_lane <http://localhost/lane/6602> ;
        libns:has_lane <http://localhost/lane/6603> ;
        libns:has_lane <http://localhost/lane/6604> ;
        libns:has_lane <http://localhost/lane/6605> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/6607> ;
        libns:has_lane <http://localhost/lane/6608> ;
        libns:flowcell_id "61MJTAAXX"@en .

<http://localhost/lane/6601>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 1 .
<http://localhost/lane/6602>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 2 .
<http://localhost/lane/6603>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 3 .
<http://localhost/lane/6604>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 4 .
<http://localhost/lane/6605>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 5 .
<http://localhost/lane/6606>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 6 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/6607>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 7 .
<http://localhost/lane/6608>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number 8 .

<http://localhost/flowcell/30DY0AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3801> ;
        libns:has_lane <http://localhost/lane/3802> ;
        libns:has_lane <http://localhost/lane/3803> ;
        libns:has_lane <http://localhost/lane/3804> ;
        libns:has_lane <http://localhost/lane/3805> ;
        libns:has_lane <http://localhost/lane/3806> ;
        libns:has_lane <http://localhost/lane/3807> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:flowcell_id "30DY0AAXX"@en .

<http://localhost/lane/3801>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 1 .
<http://localhost/lane/3802>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 2 .
<http://localhost/lane/3803>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 3 .
<http://localhost/lane/3804>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 4 .
<http://localhost/lane/3805>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 5 .
<http://localhost/lane/3806>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 6 .
<http://localhost/lane/3807>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number 7 .
<http://localhost/lane/3808>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 8 .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .

<http://localhost/flowcell/C02F9ACXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 101 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:flowcell_id "C02F9ACXX"@en .

<http://localhost/lane/12300>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/12345/> ;
        libns:lane_number 3 .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/lane/12500>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number 3 .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/library/11154/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "29.7";
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:insert_size 2000 ;
        libns:library_id "11154"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends ASDF"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line


<http://localhost/library/12345/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "12.345";
        libns:cell_line "Unknown"@en ;
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:insert_size 2000 ;
        libns:library_id "12345"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends THING"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line
"""
# Host URL handed to CondorFastqExtract in setUp; it matches the
# http://localhost/... subject URLs used throughout lib_turtle.
HOST = "http://localhost"
414
class TestCondorFastq(unittest.TestCase):
    """Exercise CondorFastqExtract against a temporary flowcell archive.

    Each test gets a fresh directory tree built from FCDIRS and DATAFILES
    and an extractor whose RDF model is loaded from ``lib_turtle``.
    """

    def setUp(self):
        """Create the temporary archive, the result map and the extractor."""
        # Remembered so tearDown can undo any chdir done by a test.
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        # FCDIRS lists parents before children, so plain mkdir suffices.
        for dirname in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, dirname))

        # Every data file gets the same placeholder content.
        for datafile in DATAFILES:
            filename = os.path.join(self.flowcelldir, datafile)
            with open(filename, 'w') as stream:
                stream.write('testfile')

        # One submission directory per library under test.
        self.result_map = ResultMap()
        for lib_id in [u'11154', u'12345']:
            subname = 'sub-%s' % (lib_id,)
            sub_dir = os.path.join(self.tempdir, subname)
            os.mkdir(sub_dir)
            self.result_map[lib_id] = sub_dir

        self.extract = CondorFastqExtract(HOST,
                                          self.flowcelldir,
                                          self.logdir)
        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
        add_default_schemas(self.extract.model)
        inference = Infer(self.extract.model)
        errmsgs = list(inference.run_validation())
        # Compare against [] rather than a length so that a failure
        # prints the validation messages themselves.
        self.assertEqual(errmsgs, [])

    def tearDown(self):
        """Restore the working directory, then delete the temporary tree."""
        # Leave the temporary tree before deleting it; removing the
        # current working directory is not possible on every platform.
        os.chdir(self.cwd)
        shutil.rmtree(self.tempdir)

    def _read_condor_arguments(self, filename):
        """Return the sorted ``argument`` lines of a condor submit file.

        Fails the test if ``filename`` does not exist.
        """
        self.assertTrue(os.path.exists(filename))
        with open(filename, 'r') as stream:
            arguments = [line for line in stream
                         if line.startswith('argument')]
        arguments.sort()
        return arguments

    def test_find_relavant_flowcell_ids(self):
        """All five flowcells described in the model should be reported."""
        expected = set(('30221AAXX',
                        '42JUYAAXX',
                        '61MJTAAXX',
                        '30DY0AAXX',
                        'C02F9ACXX'))
        flowcell_ids = self.extract.find_relavant_flowcell_ids()
        self.assertEqual(flowcell_ids, expected)

    def test_find_archive_sequence(self):
        """Check the attributes of the sequence files found on disk."""
        seqs = self.extract.find_archive_sequence_files(self.result_map)

        # Tuples are (library_id, flowcell_id, lane_number, read, cycle,
        # ispaired, filetype).  The split_fastq chunks of one read all
        # reduce to the same tuple, so each combination is listed once.
        expected = set([
            (u'11154', u'42JUYAAXX', 5, 1, 76, True, 'qseq'),
            (u'11154', u'42JUYAAXX', 5, 2, 76, True, 'qseq'),
            (u'11154', u'61MJTAAXX', 6, 1, 76, False, 'qseq'),
            (u'11154', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
            (u'11154', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', 3, 1, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', 3, 2, 202, True, 'split_fastq'),
            (u'11154', u'30221AAXX', 4, 1, 33, False, 'srf'),
            (u'11154', u'30DY0AAXX', 8, 1, 151, True, 'srf'),
        ])
        found = set([(seq.library_id, seq.flowcell_id, seq.lane_number,
                      seq.read, seq.cycle, seq.ispaired, seq.filetype)
                     for seq in seqs])
        self.assertEqual(expected, found)

    def test_find_needed_targets(self):
        """Check which fastq targets are needed and how many sources each has."""
        lib_db = self.extract.find_archive_sequence_files(self.result_map)

        needed_targets = self.extract.update_fastq_targets(self.result_map,
                                                           lib_db)
        self.assertEqual(len(needed_targets), 9)
        srf_30221 = needed_targets[
            self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
        qseq_42JUY_r1 = needed_targets[
            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
        qseq_42JUY_r2 = needed_targets[
            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
        qseq_61MJT = needed_targets[
            self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
        split_C02F9_r1 = needed_targets[
            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
        split_C02F9_r2 = needed_targets[
            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']

        self.assertEqual(len(srf_30221['srf']), 1)
        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
        self.assertEqual(len(qseq_61MJT['qseq']), 1)
        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)

    def test_generate_fastqs(self):
        """Check the argument dictionaries built for each file type."""
        commands = self.extract.build_condor_arguments(self.result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 4)

        # Expected srf arguments, keyed by target path.
        srf_data = {
            os.path.join(self.result_map['11154'],
                         '11154_30221AAXX_c33_l4.fastq'): {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': os.path.join(self.result_map['11154'],
                                       u'11154_30221AAXX_c33_l4.fastq'),
            },
            os.path.join(self.result_map['11154'],
                         '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                # A dict literal keeps only the last value of a repeated
                # key, so 'mid' is given once, with the value compared.
                'mid': 76,
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'target':
                    os.path.join(self.result_map['11154'],
                                 u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                'target_right':
                    os.path.join(self.result_map['11154'],
                                 u'11154_30DY0AAXX_c151_l8_r2.fastq'),
            },
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            # Sources are compared by file name only.
            _, source_filename = os.path.split(args['sources'][0])
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        # Expected qseq arguments, keyed by target path.
        qseq_data = {
            os.path.join(self.result_map['11154'],
                         '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
            },
            os.path.join(self.result_map['11154'],
                         '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
            },
            os.path.join(self.result_map['11154'],
                         '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            for i, expected_source in enumerate(expected['sources']):
                _, filename = os.path.split(args['sources'][i])
                self.assertEqual(filename, expected_source)

        # Expected split_fastq arguments; keyed below by bare target name.
        split_expected = [
            {'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                         u'11154_NoIndex_L003_R1_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                         u'11154_NoIndex_L003_R2_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
                         u'12345_CGATGT_L003_R1_002.fastq.gz',
                         u'12345_CGATGT_L003_R1_003.fastq.gz',
                         ],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
                         u'12345_CGATGT_L003_R2_002.fastq.gz',
                         u'12345_CGATGT_L003_R2_003.fastq.gz',
                         ],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'},
        ]
        split_test = dict((x['target'], x) for x in split_expected)
        for arg in split:
            _, target = os.path.split(arg['target'])
            expected = split_test[target]
            self.assertTrue(arg['pyscript'].endswith(expected['pyscript']))
            self.assertTrue(arg['target'].endswith(expected['target']))
            for s_index, s1 in enumerate(arg['sources']):
                s2 = expected['sources'][s_index]
                self.assertTrue(s1.endswith(s2))

    def test_create_scripts(self):
        """Check the condor submit files written by create_scripts."""
        # The submit files are looked up by relative name below, so run
        # from inside the temporary tree: that keeps them out of the
        # caller's working directory and lets tearDown delete them.
        os.chdir(self.tempdir)
        self.extract.create_scripts(self.result_map)

        arguments = self._read_condor_arguments('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                        in arguments[0])
        self.assertTrue(
            'sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
            arguments[1])

        arguments = self._read_condor_arguments('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
                        arguments[0])
        self.assertTrue(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
            arguments[1])
        self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
                        arguments[2])

        arguments = self._read_condor_arguments('split_fastq.condor')
        self.assertEqual(len(arguments), 4)
        # Lane 3 Read 1
        self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in
                        arguments[0])
        # Lane 3 Read 2
        self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in
                        arguments[1])
        # Lane 3 Read 1
        self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])

        # Lane 3 Read 2
        self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
676
677
def suite():
    """Return a test suite holding every TestCondorFastq test method."""
    # unittest.makeSuite is deprecated and was removed in Python 3.13;
    # the default loader collects the same 'test*' methods.
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)


if __name__ == "__main__":
    # Run the suite built above when the module is executed directly.
    unittest.main(defaultTest='suite')
684