Merge changing lane_number to string and sequence finding code changes
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission.condorfastq import CondorFastqExtract
11 from htsworkflow.submission.results import ResultMap
12 from htsworkflow.util.rdfhelp import \
13      add_default_schemas, load_string_into_model, dump_model
14 from htsworkflow.util.rdfinfer import Infer
15
# Directories created, in this order, beneath the temporary flowcell root by
# TestCondorFastq.setUp.  setUp uses os.mkdir (which does not create missing
# parents), so each parent directory is listed before its children.
FCDIRS = [
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    'C02F9ACXX/C1-202/Project_12345',
    '42JUYAAXX',
    '42JUYAAXX/C1-76',
    '30221AAXX',
    '30221AAXX/C1-33',
    '30DY0AAXX',
    '30DY0AAXX/C1-151',
    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]
32
# Placeholder sequence files written by TestCondorFastq.setUp, given as paths
# relative to the temporary flowcell root.  Every directory used here must
# appear in FCDIRS.  Each path is listed exactly once.
DATAFILES = [
    # C02F9ACXX: split fastq files
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
    # 42JUYAAXX: qseq tarballs, reads 1 and 2 for lanes 1-8
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: srf files, lanes 1-8
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 30DY0AAXX: srf files, lanes 1-8
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    # 61MJTAAXX: qseq tarballs, read 1 only, lanes 1-8
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
92
# Turtle (RDF) fixture that TestCondorFastq.setUp loads into the extractor's
# model.  It declares five flowcells (30221AAXX, 42JUYAAXX, 61MJTAAXX,
# 30DY0AAXX, C02F9ACXX), their lanes, and full records for libraries 11154
# and 12345; libraries 10000, 1331, 1421 and 1661 are only declared by type.
# The text is parsed at runtime, so it must not be reformatted casually.
lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .

<http://localhost/library/10000/> a libns:Library .
<http://localhost/library/1331/> a libns:Library .
<http://localhost/library/1421/> a libns:Library .
<http://localhost/library/1661/> a libns:Library .

<http://localhost/flowcell/30221AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 33 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3401> ;
        libns:has_lane <http://localhost/lane/3402> ;
        libns:has_lane <http://localhost/lane/3403> ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/3405> ;
        libns:has_lane <http://localhost/lane/3406> ;
        libns:has_lane <http://localhost/lane/3407> ;
        libns:has_lane <http://localhost/lane/3408> ;
        libns:flowcell_id "30221AAXX"@en .

<http://localhost/lane/3401>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "1" .
<http://localhost/lane/3402>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "2" .
<http://localhost/lane/3403>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "3" .
<http://localhost/lane/3404>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "4" .
        # paired_end 1;
        # read_length 33;
        # status "Unknown"@en .
<http://localhost/lane/3405>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "5" .
<http://localhost/lane/3406>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "6" .
<http://localhost/lane/3407>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "7" .
<http://localhost/lane/3408>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/42JUYAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/4201> ;
        libns:has_lane <http://localhost/lane/4202> ;
        libns:has_lane <http://localhost/lane/4203> ;
        libns:has_lane <http://localhost/lane/4204> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/4206> ;
        libns:has_lane <http://localhost/lane/4207> ;
        libns:has_lane <http://localhost/lane/4208> ;
        libns:flowcell_id "42JUYAAXX"@en .

<http://localhost/lane/4201>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "1" .
<http://localhost/lane/4202>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "2" .
<http://localhost/lane/4203>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "3" .
<http://localhost/lane/4204>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "4" .
<http://localhost/lane/4205>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "5" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/4206>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "6" .
<http://localhost/lane/4207>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "7" .
<http://localhost/lane/4208>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/61MJTAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/6601> ;
        libns:has_lane <http://localhost/lane/6602> ;
        libns:has_lane <http://localhost/lane/6603> ;
        libns:has_lane <http://localhost/lane/6604> ;
        libns:has_lane <http://localhost/lane/6605> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/6607> ;
        libns:has_lane <http://localhost/lane/6608> ;
        libns:flowcell_id "61MJTAAXX"@en .

<http://localhost/lane/6601>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "1" .
<http://localhost/lane/6602>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "2" .
<http://localhost/lane/6603>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "3" .
<http://localhost/lane/6604>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "4" .
<http://localhost/lane/6605>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "5" .
<http://localhost/lane/6606>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "6" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/6607>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "7" .
<http://localhost/lane/6608>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/30DY0AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3801> ;
        libns:has_lane <http://localhost/lane/3802> ;
        libns:has_lane <http://localhost/lane/3803> ;
        libns:has_lane <http://localhost/lane/3804> ;
        libns:has_lane <http://localhost/lane/3805> ;
        libns:has_lane <http://localhost/lane/3806> ;
        libns:has_lane <http://localhost/lane/3807> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:flowcell_id "30DY0AAXX"@en .

<http://localhost/lane/3801>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "1" .
<http://localhost/lane/3802>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "2" .
<http://localhost/lane/3803>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "3" .
<http://localhost/lane/3804>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "4" .
<http://localhost/lane/3805>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "5" .
<http://localhost/lane/3806>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "6" .
<http://localhost/lane/3807>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "7" .
<http://localhost/lane/3808>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "8" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .

<http://localhost/flowcell/C02F9ACXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 101 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:flowcell_id "C02F9ACXX"@en .

<http://localhost/lane/12300>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/12345/> ;
        libns:lane_number "3" .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/lane/12500>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "3" .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/library/11154/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "29.7";
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:insert_size 2000 ;
        libns:library_id "11154"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends ASDF"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line


<http://localhost/library/12345/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "12.345";
        libns:cell_line "Unknown"@en ;
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:insert_size 2000 ;
        libns:library_id "12345"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends THING"@en ;
        libns:replicate "1"@en;
        libns:species "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line
"""
# Base URL handed to CondorFastqExtract in setUp; the lib_turtle fixture uses
# the same http://localhost prefix for its flowcell, lane and library URIs.
HOST = "http://localhost"
414
class TestCondorFastq(unittest.TestCase):
    """Exercise CondorFastqExtract against a fake flowcell archive.

    setUp builds a throw-away directory tree from FCDIRS / DATAFILES, loads
    the lib_turtle RDF fixture into the extractor's model and changes into
    the temporary directory, so generated condor files land there.
    """

    def setUp(self):
        """Create the temporary archive, the result map and the extractor."""
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        # Registered immediately so the tree is removed even when a later
        # setUp step fails (tearDown is skipped in that case).  Cleanups run
        # after tearDown, i.e. after the working directory has been restored.
        self.addCleanup(shutil.rmtree, self.tempdir)

        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        for dirname in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, dirname))

        for relpath in DATAFILES:
            with open(os.path.join(self.flowcelldir, relpath), 'w') as stream:
                stream.write('testfile')

        # One submission directory per library of interest.
        self.result_map = ResultMap()
        for lib_id in [u'11154', u'12345']:
            sub_dir = os.path.join(self.tempdir, 'sub-%s' % (lib_id,))
            os.mkdir(sub_dir)
            self.result_map[lib_id] = sub_dir

        self.extract = CondorFastqExtract(HOST,
                                          self.flowcelldir,
                                          self.logdir)
        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
        add_default_schemas(self.extract.model)
        inference = Infer(self.extract.model)
        errmsgs = list(inference.run_validation())
        self.assertEqual(len(errmsgs), 0)
        os.chdir(self.tempdir)

    def tearDown(self):
        """Leave the temporary directory before the cleanup deletes it."""
        # Restoring the cwd first matters: removing the directory a process
        # is currently standing in fails on some platforms.
        os.chdir(self.cwd)

    def _condor_arguments(self, filename):
        """Return the sorted 'argument' lines of a generated condor file."""
        self.assertTrue(os.path.exists(filename))
        with open(filename, 'r') as stream:
            return sorted(line for line in stream
                          if line.startswith('argument'))

    def test_find_relevant_flowcell_ids(self):
        """All five fixture flowcells are reported as relevant."""
        expected = set(('30221AAXX',
                        '42JUYAAXX',
                        '61MJTAAXX',
                        '30DY0AAXX',
                        'C02F9ACXX'))
        flowcell_ids = self.extract.find_relevant_flowcell_ids()
        self.assertEqual(flowcell_ids, expected)

    def test_find_archive_sequence(self):
        """Sequence files found for the result map carry the right metadata."""
        seqs = self.extract.find_archive_sequence_files(self.result_map)

        # Compared as a set of attribute tuples, so several split fastq
        # files of the same library/lane/read collapse into one entry.
        expected = set([
            (u'11154', u'42JUYAAXX', '5', 1, 76, True, 'qseq'),
            (u'11154', u'42JUYAAXX', '5', 2, 76, True, 'qseq'),
            (u'11154', u'61MJTAAXX', '6', 1, 76, False, 'qseq'),
            (u'11154', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'),
            (u'11154', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'),
            (u'11154', u'30221AAXX', '4', 1, 33, False, 'srf'),
            (u'11154', u'30DY0AAXX', '8', 1, 151, True, 'srf'),
        ])
        found = set((seq.library_id, seq.flowcell_id, seq.lane_number,
                     seq.read, seq.cycle, seq.ispaired, seq.filetype)
                    for seq in seqs)
        self.assertEqual(expected, found)

    def test_find_needed_targets(self):
        """update_fastq_targets maps each fastq target to its sources."""
        lib_db = self.extract.find_archive_sequence_files(self.result_map)

        needed_targets = self.extract.update_fastq_targets(self.result_map,
                                                           lib_db)
        self.assertEqual(len(needed_targets), 9)

        prefix = self.result_map['11154']
        srf_30221 = needed_targets[
            prefix + u'/11154_30221AAXX_c33_l4.fastq']
        qseq_42JUY_r1 = needed_targets[
            prefix + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
        qseq_42JUY_r2 = needed_targets[
            prefix + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
        qseq_61MJT = needed_targets[
            prefix + u'/11154_61MJTAAXX_c76_l6.fastq']
        split_C02F9_r1 = needed_targets[
            prefix + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
        split_C02F9_r2 = needed_targets[
            prefix + u'/11154_C02F9ACXX_c202_l3_r2.fastq']

        self.assertEqual(len(srf_30221['srf']), 1)
        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
        self.assertEqual(len(qseq_61MJT['qseq']), 1)
        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)

    def test_generate_fastqs(self):
        """build_condor_arguments yields the expected per-type job arguments."""
        commands = self.extract.build_condor_arguments(self.result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 4)

        sub_11154 = self.result_map['11154']

        srf_data = {
            os.path.join(sub_11154, '11154_30221AAXX_c33_l4.fastq'): {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': os.path.join(sub_11154,
                                       u'11154_30221AAXX_c33_l4.fastq'),
            },
            os.path.join(sub_11154, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'mid': 76,
                'target': os.path.join(sub_11154,
                                       u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                'target_right': os.path.join(
                    sub_11154, u'11154_30DY0AAXX_c151_l8_r2.fastq'),
            },
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            source_filename = os.path.basename(args['sources'][0])
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        qseq_data = {
            os.path.join(sub_11154, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'],
            },
            os.path.join(sub_11154, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'],
            },
            os.path.join(sub_11154, '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            # Compare whole lists so extra or missing sources are detected.
            found_sources = [os.path.basename(path)
                             for path in args['sources']]
            self.assertEqual(found_sources, expected['sources'])

        split_expected = [
            {'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                         u'11154_NoIndex_L003_R1_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                         u'11154_NoIndex_L003_R2_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
                         u'12345_CGATGT_L003_R1_002.fastq.gz',
                         u'12345_CGATGT_L003_R1_003.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
                         u'12345_CGATGT_L003_R2_002.fastq.gz',
                         u'12345_CGATGT_L003_R2_003.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'},
        ]
        # Index the expectations by the target's file name.
        split_test = dict((entry['target'], entry)
                          for entry in split_expected)
        for arg in split:
            wanted = split_test[os.path.basename(arg['target'])]
            self.assertTrue(arg['pyscript'].endswith(wanted['pyscript']))
            self.assertTrue(arg['target'].endswith(wanted['target']))
            # Lengths are checked explicitly; zip alone would silently
            # ignore a surplus on either side.
            self.assertEqual(len(arg['sources']), len(wanted['sources']))
            for actual, suffix in zip(arg['sources'], wanted['sources']):
                self.assertTrue(actual.endswith(suffix))

    def test_create_scripts(self):
        """create_scripts writes the three condor files into the cwd."""
        self.extract.create_scripts(self.result_map)

        arguments = self._condor_arguments('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertTrue('sub-11154/11154_30221AAXX_c33_l4.fastq'
                        in arguments[0])
        self.assertTrue('sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq'
                        in arguments[1])

        arguments = self._condor_arguments('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertTrue('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq '
                        in arguments[0])
        self.assertTrue(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'
            in arguments[1])
        self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX'
                        in arguments[2])

        arguments = self._condor_arguments('split_fastq.condor')
        self.assertEqual(len(arguments), 4)
        # Library 11154, lane 3, read 1
        self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in arguments[0])
        # Library 11154, lane 3, read 2
        self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in arguments[1])
        # Library 12345, lane 3, read 1
        self.assertTrue('12345_CGATGT_L003_R1_001.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_002.fastq.gz' in arguments[2])
        self.assertTrue('12345_CGATGT_L003_R1_003.fastq.gz' in arguments[2])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r1.fastq' in arguments[2])
        # Library 12345, lane 3, read 2
        self.assertTrue('12345_CGATGT_L003_R2_001.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_002.fastq.gz' in arguments[3])
        self.assertTrue('12345_CGATGT_L003_R2_003.fastq.gz' in arguments[3])
        self.assertTrue('12345_C02F9ACXX_c202_l3_r2.fastq' in arguments[3])
677
678
def suite():
    """Return a TestSuite holding every test method of TestCondorFastq.

    Built with the default loader instead of unittest.makeSuite, which was
    deprecated in Python 3.11 and removed in Python 3.13.  The default
    loader uses the same 'test' method-name prefix.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)
682
# When executed as a script, run the tests collected by suite().
if __name__ == "__main__":
    unittest.main(defaultTest='suite')
685