Use regex to make sure scripts end with the right command
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 import re
6 from pprint import pprint
7 import shutil
8 import tempfile
9
10 from django.test import TestCase
11 from django.test.utils import setup_test_environment, \
12      teardown_test_environment
13 from django.db import connection
14 from django.conf import settings
15
16 from htsworkflow.submission.condorfastq import CondorFastqExtract
17 from htsworkflow.submission.results import ResultMap
18 from htsworkflow.util.rdfhelp import \
19      add_default_schemas, load_string_into_model, dump_model
20 from htsworkflow.util.rdfinfer import Infer
21
# Directories created beneath the temporary flowcell root by
# TestCondorFastq.setUp.  They are created one at a time with os.mkdir,
# so every parent directory must be listed before its children.
FCDIRS = [
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    'C02F9ACXX/C1-202/Project_12345',
    '42JUYAAXX',
    '42JUYAAXX/C1-76',
    '30221AAXX',
    '30221AAXX/C1-33',
    '30DY0AAXX',
    '30DY0AAXX/C1-151',
    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]
38
# Placeholder sequence files written by TestCondorFastq.setUp.  Each path is
# relative to the temporary flowcell root and must live in one of the
# directories listed in FCDIRS.  Every path appears exactly once.
DATAFILES = [
    # C02F9ACXX: split fastq files
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/12342_GCCAAT_L004_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L007_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/12342_CGATGT_L005_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R1_003.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12345/12345_CGATGT_L003_R2_003.fastq.gz',
    # 42JUYAAXX: tar.bz2 archives, reads 1 and 2 for lanes 1-8
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: one srf file per lane
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 30DY0AAXX: one srf file per lane
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    # 61MJTAAXX: tar.bz2 archives, read 1 only, lanes 1-8
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
98
# Turtle (RDF) fixture loaded into the extractor's model by
# TestCondorFastq.setUp.  It declares five flowcells (30221AAXX, 42JUYAAXX,
# 61MJTAAXX, 30DY0AAXX, C02F9ACXX) with their lanes, and gives full
# descriptions for libraries 11154 and 12345; the other libraries are only
# declared with a type.
lib_turtle = """@prefix : <http://www.w3.org/1999/xhtml> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix seqns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#> .
@prefix invns: <http://jumpgate.caltech.edu/wiki/InventoryOntology#> .

<http://localhost/library/10000/> a libns:Library .
<http://localhost/library/1331/> a libns:Library .
<http://localhost/library/1421/> a libns:Library .
<http://localhost/library/1661/> a libns:Library .

<http://localhost/flowcell/30221AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 33 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3401> ;
        libns:has_lane <http://localhost/lane/3402> ;
        libns:has_lane <http://localhost/lane/3403> ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/3405> ;
        libns:has_lane <http://localhost/lane/3406> ;
        libns:has_lane <http://localhost/lane/3407> ;
        libns:has_lane <http://localhost/lane/3408> ;
        libns:flowcell_id "30221AAXX"@en .

<http://localhost/lane/3401>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "1" .
<http://localhost/lane/3402>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "2" .
<http://localhost/lane/3403>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "3" .
<http://localhost/lane/3404>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "4" .
        # paired_end 1;
        # read_length 33;
        # status "Unknown"@en .
<http://localhost/lane/3405>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "5" .
<http://localhost/lane/3406>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "6" .
<http://localhost/lane/3407>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "7" .
<http://localhost/lane/3408>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30221AAXX/> ;
        libns:library <http://localhost/library/10000/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/42JUYAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/4201> ;
        libns:has_lane <http://localhost/lane/4202> ;
        libns:has_lane <http://localhost/lane/4203> ;
        libns:has_lane <http://localhost/lane/4204> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/4206> ;
        libns:has_lane <http://localhost/lane/4207> ;
        libns:has_lane <http://localhost/lane/4208> ;
        libns:flowcell_id "42JUYAAXX"@en .

<http://localhost/lane/4201>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "1" .
<http://localhost/lane/4202>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "2" .
<http://localhost/lane/4203>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "3" .
<http://localhost/lane/4204>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "4" .
<http://localhost/lane/4205>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "5" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/4206>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "6" .
<http://localhost/lane/4207>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "7" .
<http://localhost/lane/4208>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/42JUYAAXX/> ;
        libns:library <http://localhost/library/1421/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/61MJTAAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Single"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/6601> ;
        libns:has_lane <http://localhost/lane/6602> ;
        libns:has_lane <http://localhost/lane/6603> ;
        libns:has_lane <http://localhost/lane/6604> ;
        libns:has_lane <http://localhost/lane/6605> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/6607> ;
        libns:has_lane <http://localhost/lane/6608> ;
        libns:flowcell_id "61MJTAAXX"@en .

<http://localhost/lane/6601>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "1" .
<http://localhost/lane/6602>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "2" .
<http://localhost/lane/6603>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "3" .
<http://localhost/lane/6604>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "4" .
<http://localhost/lane/6605>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "5" .
<http://localhost/lane/6606>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "6" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .
<http://localhost/lane/6607>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "7" .
<http://localhost/lane/6608>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/61MJTAAXX/> ;
        libns:library <http://localhost/library/1661/> ;
        libns:lane_number "8" .

<http://localhost/flowcell/30DY0AAXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 76 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/3801> ;
        libns:has_lane <http://localhost/lane/3802> ;
        libns:has_lane <http://localhost/lane/3803> ;
        libns:has_lane <http://localhost/lane/3804> ;
        libns:has_lane <http://localhost/lane/3805> ;
        libns:has_lane <http://localhost/lane/3806> ;
        libns:has_lane <http://localhost/lane/3807> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:flowcell_id "30DY0AAXX"@en .

<http://localhost/lane/3801>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "1" .
<http://localhost/lane/3802>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "2" .
<http://localhost/lane/3803>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "3" .
<http://localhost/lane/3804>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "4" .
<http://localhost/lane/3805>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "5" .
<http://localhost/lane/3806>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "6" .
<http://localhost/lane/3807>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/1331/> ;
        libns:lane_number "7" .
<http://localhost/lane/3808>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/30DY0AAXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "8" .
        # paired_end 1;
        # read_length 76;
        # status "Unknown"@en .

<http://localhost/flowcell/C02F9ACXX/>
        a libns:IlluminaFlowcell ;
        libns:read_length 101 ;
        libns:flowcell_type "Paired"@en ;
        libns:date "2012-01-19T20:23:26"^^xsd:dateTime;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:flowcell_id "C02F9ACXX"@en .

<http://localhost/lane/12300>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/12345/> ;
        libns:lane_number "3" .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/lane/12500>
        a libns:IlluminaLane ;
        libns:flowcell <http://localhost/flowcell/C02F9ACXX/> ;
        libns:library <http://localhost/library/11154/> ;
        libns:lane_number "3" .
        # paired_end 1;
        # read_length 101;
        # status "Unknown"@en .

<http://localhost/library/11154/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "29.7";
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/3404> ;
        libns:has_lane <http://localhost/lane/4205> ;
        libns:has_lane <http://localhost/lane/6606> ;
        libns:has_lane <http://localhost/lane/3808> ;
        libns:has_lane <http://localhost/lane/12500> ;
        libns:insert_size 2000 ;
        libns:library_id "11154"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends ASDF"@en ;
        libns:replicate "1"@en;
        libns:species_name "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line

<http://localhost/library/12345/>
        a libns:Library ;
        libns:affiliation "TSR"@en;
        libns:concentration "12.345";
        libns:cell_line "Unknown"@en ;
        libns:date "2012-12-28T00:00:00"^^xsd:dateTime ;
        libns:experiment_type "RNA-seq"@en ;
        libns:gel_cut 300 ;
        libns:has_lane <http://localhost/lane/12300> ;
        libns:insert_size 2000 ;
        libns:library_id "12345"@en ;
        libns:library_type "Paired End (Multiplexed)"@en ;
        libns:made_by "Gary Gygax"@en ;
        libns:name "Paired Ends THING"@en ;
        libns:replicate "1"@en;
        libns:species_name "Mus musculus"@en ;
        libns:stopping_point "Completed"@en ;
        libns:total_unique_locations 8841201 .
        # cell_line
"""
# First argument passed to CondorFastqExtract in setUp; it has the same
# origin as the http://localhost/... URIs used throughout lib_turtle.
HOST = "http://localhost"
419
class TestCondorFastq(TestCase):
    """Tests for CondorFastqExtract run against a throw-away flowcell tree.

    setUp creates a temporary directory holding the directories in FCDIRS,
    a placeholder file for every entry in DATAFILES, and one result
    directory per library in the ResultMap.  The extractor's model is
    populated from the lib_turtle fixture.
    """

    def setUp(self):
        """Build the temporary archive, load the fixture and chdir into it."""
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        # os.mkdir does not create parents; FCDIRS lists parents first.
        for d in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, d))

        for f in DATAFILES:
            filename = os.path.join(self.flowcelldir, f)
            with open(filename, 'w') as stream:
                stream.write('testfile')

        self.result_map = ResultMap()
        for lib_id in [u'11154', u'12345']:
            subname = 'sub-%s' % (lib_id,)
            sub_dir = os.path.join(self.tempdir, subname)
            os.mkdir(sub_dir)
            self.result_map[lib_id] = sub_dir

        self.extract = CondorFastqExtract(HOST,
                                          self.flowcelldir,
                                          self.logdir)
        load_string_into_model(self.extract.model, 'turtle', lib_turtle)
        add_default_schemas(self.extract.model)
        inference = Infer(self.extract.model)
        errmsgs = list(inference.run_validation())
        # Compare against [] so a failure prints the validation messages.
        self.assertEqual(errmsgs, [])
        # The create_scripts test looks for the condor files by relative
        # name, so run every test from inside the temporary directory.
        os.chdir(self.tempdir)

    def tearDown(self):
        """Restore the working directory, then delete the temporary tree."""
        # Leave the temporary directory before deleting it: removing the
        # current working directory fails on some platforms, and the
        # original cwd must be restored even if the removal raises.
        os.chdir(self.cwd)
        shutil.rmtree(self.tempdir)

    def _condor_arguments(self, script):
        """Return the sorted lines of ``script`` that start with 'argument'.

        Fails the test if the script file does not exist.
        """
        self.assertTrue(os.path.exists(script),
                        '%s was not created' % (script,))
        with open(script, 'r') as stream:
            return sorted(l for l in stream if l.startswith('argument'))

    def test_find_relevant_flowcell_ids(self):
        """All five fixture flowcells should be reported."""
        expected = set(('30221AAXX',
                        '42JUYAAXX',
                        '61MJTAAXX',
                        '30DY0AAXX',
                        'C02F9ACXX'))
        flowcell_ids = self.extract.find_relevant_flowcell_ids()
        self.assertEqual(flowcell_ids, expected)

    def test_find_archive_sequence(self):
        """Check the attributes of the sequence files found for the libraries."""
        seqs = self.extract.find_archive_sequence_files(self.result_map)

        # Several split_fastq files share the same attributes; because the
        # comparison is between sets each distinct tuple is listed once.
        expected = set([
            (u'11154', u'42JUYAAXX', '5', 1, 76, True, 'qseq'),
            (u'11154', u'42JUYAAXX', '5', 2, 76, True, 'qseq'),
            (u'11154', u'61MJTAAXX', '6', 1, 76, False, 'qseq'),
            (u'11154', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'),
            (u'11154', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', '3', 1, 202, True, 'split_fastq'),
            (u'12345', u'C02F9ACXX', '3', 2, 202, True, 'split_fastq'),
            (u'11154', u'30221AAXX', '4', 1, 33, False, 'srf'),
            (u'11154', u'30DY0AAXX', '8', 1, 151, True, 'srf'),
        ])
        found = set(
            (l.library_id, l.flowcell_id, l.lane_number, l.read, l.cycle,
             l.ispaired, l.filetype)
            for l in seqs)
        self.assertEqual(expected, found)

    def test_find_needed_targets(self):
        """Check which fastq targets are needed and how many sources each has."""
        lib_db = self.extract.find_archive_sequence_files(self.result_map)

        needed_targets = self.extract.update_fastq_targets(self.result_map,
                                                           lib_db)
        self.assertEqual(len(needed_targets), 9)
        srf_30221 = needed_targets[
            self.result_map['11154'] + u'/11154_30221AAXX_c33_l4.fastq']
        qseq_42JUY_r1 = needed_targets[
            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
        qseq_42JUY_r2 = needed_targets[
            self.result_map['11154'] + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
        qseq_61MJT = needed_targets[
            self.result_map['11154'] + u'/11154_61MJTAAXX_c76_l6.fastq']
        split_C02F9_r1 = needed_targets[
            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
        split_C02F9_r2 = needed_targets[
            self.result_map['11154'] + u'/11154_C02F9ACXX_c202_l3_r2.fastq']

        self.assertEqual(len(srf_30221['srf']), 1)
        self.assertEqual(len(qseq_42JUY_r1['qseq']), 1)
        self.assertEqual(len(qseq_42JUY_r2['qseq']), 1)
        self.assertEqual(len(qseq_61MJT['qseq']), 1)
        self.assertEqual(len(split_C02F9_r1['split_fastq']), 2)
        self.assertEqual(len(split_C02F9_r2['split_fastq']), 2)

    def test_generate_fastqs(self):
        """Check the argument dictionaries built for each source file type."""
        commands = self.extract.build_condor_arguments(self.result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 4)

        # Expected values are keyed by the target path each job writes.
        srf_data = {
            os.path.join(self.result_map['11154'],
                         '11154_30221AAXX_c33_l4.fastq'): {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': os.path.join(self.result_map['11154'],
                                       u'11154_30221AAXX_c33_l4.fastq'),
            },
            os.path.join(self.result_map['11154'],
                         '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'mid': 76,
                'target':
                    os.path.join(self.result_map['11154'],
                                 u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                'target_right':
                    os.path.join(self.result_map['11154'],
                                 u'11154_30DY0AAXX_c151_l8_r2.fastq'),
            },
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            _, source_filename = os.path.split(args['sources'][0])
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        qseq_data = {
            os.path.join(self.result_map['11154'],
                         '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'],
            },
            os.path.join(self.result_map['11154'],
                         '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'],
            },
            os.path.join(self.result_map['11154'],
                         '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            # Compare lengths first so extra or missing sources are reported
            # instead of being skipped or raising IndexError.
            self.assertEqual(len(args['sources']), len(expected['sources']))
            for source, expected_name in zip(args['sources'],
                                             expected['sources']):
                _, filename = os.path.split(source)
                self.assertEqual(filename, expected_name)

        # 'pyscript' is a regular expression: the script path must end in
        # desplit_fastq.py or desplit_fastq.pyc.
        split_expected = [
            {'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                         u'11154_NoIndex_L003_R1_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc?$',
             'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                         u'11154_NoIndex_L003_R2_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc?$',
             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
            {'sources': [u'12345_CGATGT_L003_R1_001.fastq.gz',
                         u'12345_CGATGT_L003_R1_002.fastq.gz',
                         u'12345_CGATGT_L003_R1_003.fastq.gz',
                         ],
             'pyscript': 'desplit_fastq.pyc?$',
             'target': u'12345_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'12345_CGATGT_L003_R2_001.fastq.gz',
                         u'12345_CGATGT_L003_R2_002.fastq.gz',
                         u'12345_CGATGT_L003_R2_003.fastq.gz',
                         ],
             'pyscript': 'desplit_fastq.pyc?$',
             'target': u'12345_C02F9ACXX_c202_l3_r2.fastq'},
        ]
        split_test = dict((x['target'], x) for x in split_expected)
        for arg in split:
            _, target = os.path.split(arg['target'])
            expected = split_test[target]
            self.assertTrue(re.search(expected['pyscript'], arg['pyscript']))
            self.assertTrue(arg['target'].endswith(expected['target']))
            self.assertEqual(len(arg['sources']), len(expected['sources']))
            for s1, s2 in zip(arg['sources'], expected['sources']):
                self.assertTrue(s1.endswith(s2))

    def test_create_scripts(self):
        """Check the 'argument' lines of the three generated condor scripts."""
        self.extract.create_scripts(self.result_map)

        arguments = self._condor_arguments('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertIn('sub-11154/11154_30221AAXX_c33_l4.fastq',
                      arguments[0])
        self.assertIn('sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq',
                      arguments[1])

        arguments = self._condor_arguments('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertIn('sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ',
                      arguments[0])
        self.assertIn(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
            arguments[1])
        self.assertIn('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX',
                      arguments[2])

        arguments = self._condor_arguments('split_fastq.condor')
        self.assertEqual(len(arguments), 4)
        # Library 11154, lane 3, read 1
        self.assertIn('11154_NoIndex_L003_R1_001.fastq.gz', arguments[0])
        # Library 11154, lane 3, read 2
        self.assertIn('11154_NoIndex_L003_R2_002.fastq.gz', arguments[1])
        # Library 12345, lane 3, read 1
        self.assertIn('12345_CGATGT_L003_R1_001.fastq.gz', arguments[2])
        self.assertIn('12345_CGATGT_L003_R1_002.fastq.gz', arguments[2])
        self.assertIn('12345_CGATGT_L003_R1_003.fastq.gz', arguments[2])
        self.assertIn('12345_C02F9ACXX_c202_l3_r1.fastq', arguments[2])
        # Library 12345, lane 3, read 2
        self.assertIn('12345_CGATGT_L003_R2_001.fastq.gz', arguments[3])
        self.assertIn('12345_CGATGT_L003_R2_002.fastq.gz', arguments[3])
        self.assertIn('12345_CGATGT_L003_R2_003.fastq.gz', arguments[3])
        self.assertIn('12345_C02F9ACXX_c202_l3_r2.fastq', arguments[3])
682
683
def suite():
    """Return a TestSuite holding every test defined on TestCondorFastq."""
    from unittest import TestSuite, defaultTestLoader
    condor_tests = defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)
    collected = TestSuite()
    collected.addTests(condor_tests)
    return collected
689
if __name__ == "__main__":
    # When run as a script, execute the tests collected by suite().
    import unittest
    unittest.main(defaultTest='suite')