Convert the dictionary comprehension to dict(generator) so that it works with Python 2.6
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission import condorfastq
11 from htsworkflow.submission.results import ResultMap
12
# Directories created under the temporary flowcell root by
# TestCondorFastq.setUp.  Every parent is listed before its children because
# setUp creates them one at a time with os.mkdir (not os.makedirs).
FCDIRS = [
    # flowcell with per-project subdirectories
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',

    # flowcells holding a single cycle directory
    '42JUYAAXX',
    '42JUYAAXX/C1-76',

    '30221AAXX',
    '30221AAXX/C1-33',

    '30DY0AAXX',
    '30DY0AAXX/C1-151',

    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]
28
# Placeholder sequence files (paths relative to the temporary flowcell root)
# that TestCondorFastq.setUp creates.  Each path must live in a directory
# named in FCDIRS.  Every path appears exactly once; an earlier revision
# listed the 42JUYAAXX l1_r2 archive twice, which only made setUp rewrite
# the same file.
DATAFILES = [
    # C02F9ACXX: split, gzipped fastq files
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',

    # 42JUYAAXX: tar.bz2 archives, reads 1 and 2 for lanes 1-8
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',

    # 30221AAXX: one srf file per lane
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',

    # 30DY0AAXX: one srf file per lane
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',

    # 61MJTAAXX: tar.bz2 archives, read 1 only, lanes 1-8
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
79
def _lane_entry(flowcell, lane_number, paired_end, read_length):
    """Build one lane_set record; the status fields are equal for all lanes."""
    return {
        u'flowcell': flowcell,
        u'lane_number': lane_number,
        u'paired_end': paired_end,
        u'read_length': read_length,
        u'status': u'Unknown',
        u'status_code': None,
    }


# Canned library records, keyed by library id, that FakeApi.get_library
# hands out in place of a real lookup.
LIBDATA = {
    '11154': {
        u'antibody_id': None,
        u'cell_line': u'Unknown',
        u'cell_line_id': 1,
        u'experiment_type': u'RNA-seq',
        u'experiment_type_id': 4,
        u'gel_cut_size': 300,
        u'hidden': False,
        u'id': u'11154',
        u'insert_size': 200,
        u'lane_set': [
            _lane_entry(u'30221AAXX', 4, False, 33),
            _lane_entry(u'42JUYAAXX', 5, True, 76),
            _lane_entry(u'61MJTAAXX', 6, False, 76),
            _lane_entry(u'30DY0AAXX', 8, True, 76),
            _lane_entry(u'C02F9ACXX', 3, True, 101),
        ],
        u'library_id': u'11154',
        u'library_name': u'Paired ends ASDF ',
        u'library_species': u'Mus musculus',
        u'library_species_id': 9,
        u'library_type': u'Paired End (non-multiplexed)',
        u'library_type_id': 2,
        u'made_by': u'Gary Gygax',
        u'made_for': u'TSR',
        u'notes': u'300 bp gel fragment',
        u'replicate': 1,
        u'stopping_point': u'1Aa',
        u'successful_pM': None,
        u'undiluted_concentration': u'29.7',
    },
}
134
# Placeholder API credentials passed to CondorFastqExtract by the tests.
FAKE_APIDATA = {
    'apiid': 0,
    'apikey': 'foo',
}
136
class FakeApi(object):
    """Stand-in API object that answers library lookups from LIBDATA."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""

    def get_library(self, libid):
        """Return an independent deep copy of the LIBDATA record for *libid*.

        Raises KeyError when *libid* is not a key of LIBDATA.  A copy is
        returned so callers cannot modify the shared fixture.
        """
        return copy.deepcopy(LIBDATA[libid])
144
class TestCondorFastq(unittest.TestCase):
    """Run CondorFastqExtract against a temporary tree of placeholder files.

    Each test gets a fresh temporary directory holding the FCDIRS/DATAFILES
    skeleton under ``flowcells``, a ``log`` directory, and a submission
    directory for library 11154.  Library metadata is served by FakeApi.
    """

    def setUp(self):
        """Create the temporary flowcell tree, log dir and result map."""
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        # FCDIRS lists parents before children, so plain mkdir suffices.
        for dirname in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, dirname))

        # File contents are never parsed here; any non-empty marker will do.
        for datafile in DATAFILES:
            pathname = os.path.join(self.flowcelldir, datafile)
            with open(pathname, 'w') as stream:
                stream.write('testfile')

        # u'' literal instead of unicode(): same value on Python 2, and the
        # name ``unicode`` does not exist on Python 3.
        self.subname = u'sub-11154'
        self.subdir = os.path.join(self.tempdir, self.subname)
        os.mkdir(self.subdir)

        self.result_map = ResultMap()
        self.result_map.add_result('11154', self.subname)

    def tearDown(self):
        """Restore the working directory, then delete the temporary tree."""
        # test_create_scripts chdirs into tempdir.  Leave it *before* removing
        # it so the process is never left inside a deleted directory and
        # rmtree cannot fail because the directory is still in use.
        os.chdir(self.cwd)
        shutil.rmtree(self.tempdir)

    def _make_extract(self):
        """Return a CondorFastqExtract whose ``api`` is replaced by FakeApi."""
        extract = condorfastq.CondorFastqExtract('host',
                                                 FAKE_APIDATA,
                                                 self.tempdir,
                                                 self.logdir)
        extract.api = FakeApi()
        return extract

    def _sorted_arguments(self, script):
        """Assert *script* exists; return its sorted 'argument...' lines."""
        self.assertTrue(os.path.exists(script))
        with open(script, 'r') as stream:
            return sorted(line for line in stream
                          if line.startswith('argument'))

    def test_find_archive_sequence(self):
        """Every lane of library 11154 maps to the expected number of files."""
        extract = self._make_extract()

        lib_db = extract.find_archive_sequence_files(self.result_map)

        lanes = lib_db['11154']['lanes']
        self.assertEqual(len(lanes), 5)

        # ((flowcell, lane number), number of entries expected for the lane)
        expected_counts = [
            ((u'30221AAXX', 4), 1),
            ((u'42JUYAAXX', 5), 2),
            ((u'61MJTAAXX', 6), 1),
            ((u'30DY0AAXX', 8), 1),
            ((u'C02F9ACXX', 3), 4),
        ]
        for lane_key, count in expected_counts:
            self.assertEqual(len(lanes[lane_key]), count)

    def test_find_needed_targets(self):
        """Missing fastq targets are reported with the right source counts."""
        extract = self._make_extract()
        lib_db = extract.find_archive_sequence_files(self.result_map)

        needed_targets = extract.find_missing_targets(self.result_map,
                                                      lib_db)
        self.assertEqual(len(needed_targets), 7)

        # (target suffix, source type key, number of sources expected)
        expected = [
            (u'/11154_30221AAXX_c33_l4.fastq', 'srf', 1),
            (u'/11154_42JUYAAXX_c76_l5_r1.fastq', 'qseq', 1),
            (u'/11154_42JUYAAXX_c76_l5_r2.fastq', 'qseq', 1),
            (u'/11154_61MJTAAXX_c76_l6.fastq', 'qseq', 1),
            (u'/11154_C02F9ACXX_c202_l3_r1.fastq', 'split_fastq', 2),
            (u'/11154_C02F9ACXX_c202_l3_r2.fastq', 'split_fastq', 2),
        ]
        for suffix, source_type, count in expected:
            target = needed_targets[self.subname + suffix]
            self.assertEqual(len(target[source_type]), count)

    def test_generate_fastqs(self):
        """build_condor_arguments yields the expected per-type job arguments."""
        extract = self._make_extract()
        commands = extract.build_condor_arguments(self.result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 2)

        # Expected srf jobs, keyed by target path.
        srf_data = {
            os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': os.path.join(self.subname,
                                       u'11154_30221AAXX_c33_l4.fastq'),
            },
            os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                # The original literal also carried a shadowed 'mid': None
                # entry; only this value ever took effect.
                'mid': 76,
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'target':
                    os.path.join(self.subname,
                                 u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                'target_right':
                    os.path.join(self.subname,
                                 u'11154_30DY0AAXX_c151_l8_r2.fastq'),
            },
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            # Sources are compared by file name only.
            source_filename = os.path.split(args['sources'][0])[1]
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        # Expected qseq jobs, keyed by target path.
        qseq_data = {
            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'],
            },
            os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'],
            },
            os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            for i, expected_name in enumerate(expected['sources']):
                filename = os.path.split(args['sources'][i])[1]
                self.assertEqual(filename, expected_name)

        # Expected split_fastq jobs.  Indexed with dict(generator) rather than
        # a dict comprehension so the module still runs on Python 2.6.
        split_expected = [
            {'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                         u'11154_NoIndex_L003_R1_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
            {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                         u'11154_NoIndex_L003_R2_002.fastq.gz'],
             'pyscript': 'desplit_fastq.pyc',
             'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'},
        ]
        split_test = dict((x['target'], x) for x in split_expected)
        for arg in split:
            target = os.path.split(arg['target'])[1]
            expected = split_test[target]
            self.assertTrue(arg['pyscript'].endswith(expected['pyscript']))
            self.assertTrue(arg['target'].endswith(expected['target']))
            for s_index, source in enumerate(arg['sources']):
                self.assertTrue(
                    source.endswith(expected['sources'][s_index]))

    def test_create_scripts(self):
        """create_scripts writes one condor file per source type in the cwd."""
        # The .condor files are looked up by relative name below, so run from
        # inside the temporary directory (tearDown restores the old cwd).
        os.chdir(self.tempdir)
        extract = self._make_extract()
        extract.create_scripts(self.result_map)

        arguments = self._sorted_arguments('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertTrue('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
                        in arguments[0])
        self.assertTrue(
            '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
            arguments[1])

        arguments = self._sorted_arguments('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertTrue('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
                        arguments[0])
        self.assertTrue(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
            arguments[1])
        self.assertTrue('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
                        arguments[2])

        arguments = self._sorted_arguments('split_fastq.condor')
        self.assertEqual(len(arguments), 2)
        self.assertTrue('11154_NoIndex_L003_R1_001.fastq.gz' in
                        arguments[0])
        self.assertTrue('11154_NoIndex_L003_R2_002.fastq.gz' in
                        arguments[1])
380
381
def suite():
    """Return a TestSuite containing every ``test*`` method of TestCondorFastq.

    Uses TestLoader.loadTestsFromTestCase instead of unittest.makeSuite,
    which is deprecated and removed in Python 3.13; a fresh TestLoader uses
    the same 'test' method prefix that was previously passed explicitly.
    """
    return unittest.TestLoader().loadTestsFromTestCase(TestCondorFastq)
385
# Direct execution: let unittest resolve and run the module-level suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')
388