Create a lane to file name turtle data file
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission import condorfastq
11 from htsworkflow.submission.results import ResultMap
12
# Directory skeleton that setUp creates beneath <tempdir>/flowcells.
# One flowcell per row group; every parent is listed before its children
# because setUp creates them with os.mkdir, which does not make
# intermediate directories.
FCDIRS = [
    'C02F9ACXX', 'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    '42JUYAAXX', '42JUYAAXX/C1-76',
    '30221AAXX', '30221AAXX/C1-33',
    '30DY0AAXX', '30DY0AAXX/C1-151',
    '61MJTAAXX', '61MJTAAXX/C1-76',
]
28
# Stub sequence files, relative to <tempdir>/flowcells.  setUp writes the
# text 'testfile' into each one, so only the names matter.  Every path must
# be listed exactly once.
DATAFILES = [
    # C02F9ACXX: gzipped fastq files split into numbered chunks
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
    # 42JUYAAXX: tar.bz2 archives, lanes 1-8, reads r1 and r2
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: one .srf file per lane
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 30DY0AAXX: one .srf file per lane
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    # 61MJTAAXX: tar.bz2 archives, lanes 1-8, read r1 only
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
79
def _lane_record(flowcell, lane_number, lane_id, paired_end, read_length):
    """Return one lane_set entry; the status fields are the same for all."""
    return {
        u'flowcell': flowcell,
        u'lane_number': lane_number,
        u'lane_id': lane_id,
        u'paired_end': paired_end,
        u'read_length': read_length,
        u'status': u'Unknown',
        u'status_code': None,
    }


# Canned library records keyed by library id; FakeApi.get_library hands out
# deep copies of these.
LIBDATA = {
    '11154': {
        u'antibody_id': None,
        u'cell_line': u'Unknown',
        u'cell_line_id': 1,
        u'experiment_type': u'RNA-seq',
        u'experiment_type_id': 4,
        u'gel_cut_size': 300,
        u'hidden': False,
        u'id': u'11154',
        u'insert_size': 200,
        u'lane_set': [
            _lane_record(u'30221AAXX', 4, 3400, False, 33),
            _lane_record(u'42JUYAAXX', 5, 4200, True, 76),
            _lane_record(u'61MJTAAXX', 6, 6600, False, 76),
            _lane_record(u'30DY0AAXX', 8, 3800, True, 76),
            _lane_record(u'C02F9ACXX', 3, 12300, True, 101),
        ],
        u'library_id': u'11154',
        u'library_name': u'Paired ends ASDF ',
        u'library_species': u'Mus musculus',
        u'library_species_id': 9,
        u'library_type': u'Paired End (non-multiplexed)',
        u'library_type_id': 2,
        u'made_by': u'Gary Gygax',
        u'made_for': u'TSR',
        u'notes': u'300 bp gel fragment',
        u'replicate': 1,
        u'stopping_point': u'1Aa',
        u'successful_pM': None,
        u'undiluted_concentration': u'29.7',
    },
}
139
# Placeholder API credentials; the tests pass this as the second
# CondorFastqExtract argument and then swap in a FakeApi instance.
FAKE_APIDATA = {
    'apiid': 0,
    'apikey': 'foo',
}
141
class FakeApi(object):
    """In-memory replacement for the object the tests assign to extract.api.

    Library lookups are answered from the module-level LIBDATA table.
    """

    def __init__(self, *args, **kwargs):
        # Any constructor arguments are accepted and ignored.
        self.root_url = 'http://localhost'

    def get_library(self, libid):
        """Return an independent deep copy of the LIBDATA record for libid.

        Raises KeyError when libid is not present in LIBDATA.
        """
        return copy.deepcopy(LIBDATA[libid])
149
150
151
152 class TestCondorFastq(unittest.TestCase):
153     def setUp(self):
154         self.cwd = os.getcwd()
155
156         self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
157         self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
158         os.mkdir(self.flowcelldir)
159
160         self.logdir = os.path.join(self.tempdir, 'log')
161         os.mkdir(self.logdir)
162
163         for d in FCDIRS:
164             os.mkdir(os.path.join(self.flowcelldir, d))
165
166         for f in DATAFILES:
167             filename = os.path.join(self.flowcelldir, f)
168             with open(filename, 'w') as stream:
169                 stream.write('testfile')
170
171         self.subname = unicode('sub-11154')
172         self.subdir = os.path.join(self.tempdir, self.subname)
173         os.mkdir(self.subdir)
174
175         self.result_map = ResultMap()
176         self.result_map.add_result('11154', self.subname)
177
178     def tearDown(self):
179         shutil.rmtree(self.tempdir)
180         os.chdir(self.cwd)
181
182     def test_find_archive_sequence(self):
183         extract = condorfastq.CondorFastqExtract('host',
184                                                  FAKE_APIDATA,
185                                                  self.tempdir,
186                                                  self.logdir)
187         extract.api = FakeApi()
188
189         lib_db = extract.find_archive_sequence_files(self.result_map)
190
191         self.failUnlessEqual(len(lib_db['11154']['lanes']), 5)
192         lanes = [
193             lib_db['11154']['lanes'][(u'30221AAXX', 4)],
194             lib_db['11154']['lanes'][(u'42JUYAAXX', 5)],
195             lib_db['11154']['lanes'][(u'61MJTAAXX', 6)],
196             lib_db['11154']['lanes'][(u'30DY0AAXX', 8)],
197             lib_db['11154']['lanes'][(u'C02F9ACXX', 3)],
198         ]
199         self.failUnlessEqual(len(lanes[0]), 1)
200         self.failUnlessEqual(len(lanes[1]), 2)
201         self.failUnlessEqual(len(lanes[2]), 1)
202         self.failUnlessEqual(len(lanes[3]), 1)
203         self.failUnlessEqual(len(lanes[4]), 4)
204
205     def test_find_needed_targets(self):
206
207         extract = condorfastq.CondorFastqExtract('host',
208                                                  FAKE_APIDATA,
209                                                  self.tempdir,
210                                                  self.logdir)
211         extract.api = FakeApi()
212         lib_db = extract.find_archive_sequence_files(self.result_map)
213
214         needed_targets = extract.find_missing_targets(self.result_map,
215                                                       lib_db)
216         self.failUnlessEqual(len(needed_targets), 7)
217         srf_30221 = needed_targets[
218             self.subname + u'/11154_30221AAXX_c33_l4.fastq']
219         qseq_42JUY_r1 = needed_targets[
220             self.subname + u'/11154_42JUYAAXX_c76_l5_r1.fastq']
221         qseq_42JUY_r2 = needed_targets[
222             self.subname + u'/11154_42JUYAAXX_c76_l5_r2.fastq']
223         qseq_61MJT = needed_targets[
224             self.subname + u'/11154_61MJTAAXX_c76_l6.fastq']
225         split_C02F9_r1 = needed_targets[
226             self.subname + u'/11154_C02F9ACXX_c202_l3_r1.fastq']
227         split_C02F9_r2 = needed_targets[
228             self.subname + u'/11154_C02F9ACXX_c202_l3_r2.fastq']
229
230         self.failUnlessEqual(len(srf_30221['srf']), 1)
231         self.failUnlessEqual(len(qseq_42JUY_r1['qseq']), 1)
232         self.failUnlessEqual(len(qseq_42JUY_r2['qseq']), 1)
233         self.failUnlessEqual(len(qseq_61MJT['qseq']), 1)
234         self.failUnlessEqual(len(split_C02F9_r1['split_fastq']), 2)
235         self.failUnlessEqual(len(split_C02F9_r2['split_fastq']), 2)
236
237         #print '-------needed targets---------'
238         #pprint(needed_targets)
239
240     def test_generate_fastqs(self):
241         extract = condorfastq.CondorFastqExtract('host',
242                                                  FAKE_APIDATA,
243                                                  self.tempdir,
244                                                  self.logdir)
245         extract.api = FakeApi()
246         commands = extract.build_condor_arguments(self.result_map)
247
248         srf = commands['srf']
249         qseq = commands['qseq']
250         split = commands['split_fastq']
251
252         self.failUnlessEqual(len(srf), 2)
253         self.failUnlessEqual(len(qseq), 3)
254         self.failUnlessEqual(len(split), 2)
255
256         srf_data = {
257             os.path.join(self.subname, '11154_30221AAXX_c33_l4.fastq'): {
258                 'mid': None,
259                 'ispaired': False,
260                 'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
261                 'flowcell': u'30221AAXX',
262                 'target': os.path.join(self.subname,
263                                        u'11154_30221AAXX_c33_l4.fastq'),
264             },
265             os.path.join(self.subname, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
266                 'mid': None,
267                 'ispaired': True,
268                 'flowcell': u'30DY0AAXX',
269                 'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
270                 'mid': 76,
271                 'target':
272                     os.path.join(self.subname,
273                                  u'11154_30DY0AAXX_c151_l8_r1.fastq'),
274                 'target_right':
275                     os.path.join(self.subname,
276                                  u'11154_30DY0AAXX_c151_l8_r2.fastq'),
277             }
278         }
279         for args in srf:
280             expected = srf_data[args['target']]
281             self.failUnlessEqual(args['ispaired'], expected['ispaired'])
282             self.failUnlessEqual(len(args['sources']), 1)
283             _, source_filename = os.path.split(args['sources'][0])
284             self.failUnlessEqual(source_filename, expected['sources'][0])
285             self.failUnlessEqual(args['target'], expected['target'])
286             if args['ispaired']:
287                 self.failUnlessEqual(args['target_right'],
288                                      expected['target_right'])
289             if 'mid' in expected:
290                 self.failUnlessEqual(args['mid'], expected['mid'])
291
292         qseq_data = {
293             os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
294                 'istar': True,
295                 'ispaired': True,
296                 'sources': [
297                     u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
298             },
299             os.path.join(self.subname, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
300                 'istar': True,
301                 'ispaired': True,
302                 'sources': [
303                     u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
304             },
305             os.path.join(self.subname, '11154_61MJTAAXX_c76_l6.fastq'): {
306                 'istar': True,
307                 'ispaired': False,
308                 'sources': [
309                     u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
310             },
311         }
312         for args in qseq:
313             expected = qseq_data[args['target']]
314             self.failUnlessEqual(args['istar'], expected['istar'])
315             self.failUnlessEqual(args['ispaired'], expected['ispaired'])
316             for i in range(len(expected['sources'])):
317                 _, filename = os.path.split(args['sources'][i])
318                 self.failUnlessEqual(filename, expected['sources'][i])
319
320
321         split_test = dict((( x['target'], x) for x in
322             [{'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
323                          u'11154_NoIndex_L003_R1_002.fastq.gz'],
324              'pyscript': 'desplit_fastq.pyc',
325              'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
326             {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
327                          u'11154_NoIndex_L003_R2_002.fastq.gz'],
328              'pyscript': 'desplit_fastq.pyc',
329              'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
330          ))
331         for arg in split:
332             _, target = os.path.split(arg['target'])
333             pyscript = split_test[target]['pyscript']
334             self.failUnless(arg['pyscript'].endswith(pyscript))
335             filename = split_test[target]['target']
336             self.failUnless(arg['target'].endswith(filename))
337             for s_index in range(len(arg['sources'])):
338                 s1 = arg['sources'][s_index]
339                 s2 = split_test[target]['sources'][s_index]
340                 self.failUnless(s1.endswith(s2))
341
342         #print '-------commands---------'
343         #pprint (commands)
344
345     def test_create_scripts(self):
346         os.chdir(self.tempdir)
347         extract = condorfastq.CondorFastqExtract('host',
348                                                  FAKE_APIDATA,
349                                                  self.tempdir,
350                                                  self.logdir)
351         extract.api = FakeApi()
352         extract.create_scripts(self.result_map)
353
354         self.failUnless(os.path.exists('srf.condor'))
355         with open('srf.condor', 'r') as srf:
356             arguments = [ l for l in srf if l.startswith('argument') ]
357             arguments.sort()
358             self.failUnlessEqual(len(arguments), 2)
359             self.failUnless('--single sub-11154/11154_30221AAXX_c33_l4.fastq'
360                             in arguments[0])
361             self.failUnless(
362                 '--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq' in
363                 arguments[1])
364
365         self.failUnless(os.path.exists('qseq.condor'))
366         with open('qseq.condor', 'r') as srf:
367             arguments = [ l for l in srf if l.startswith('argument') ]
368             arguments.sort()
369             self.failUnlessEqual(len(arguments), 3)
370             self.failUnless('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ' in
371                             arguments[0])
372             self.failUnless(
373                 'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2' in
374                 arguments[1])
375             self.failUnless('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX' in
376                             arguments[2])
377
378         self.failUnless(os.path.exists('split_fastq.condor'))
379         with open('split_fastq.condor', 'r') as split:
380             arguments = [ l for l in split if l.startswith('argument') ]
381             arguments.sort()
382             self.failUnlessEqual(len(arguments), 2)
383             self.failUnless('11154_NoIndex_L003_R1_001.fastq.gz' in \
384                             arguments[0])
385             self.failUnless('11154_NoIndex_L003_R2_002.fastq.gz' in \
386                             arguments[1])
387
def suite():
    """Return a test suite holding every test method of TestCondorFastq."""
    # unittest.makeSuite is deprecated; the default loader collects the
    # same methods because its method prefix is also 'test'.
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)
391
# Run as a script: let unittest build its run from this module's suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')
394