Change to using django templates for generating condor fastq scripts
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission import condorfastq
11
# Directories created (with os.mkdir) beneath the temporary flowcell
# directory by TestCondorFastq.setUp.  Order matters: os.mkdir does not
# create intermediate directories, so every parent is listed before its
# children.
FCDIRS = [
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    '42JUYAAXX',
    '42JUYAAXX/C1-76',
    '30221AAXX',
    '30221AAXX/C1-33',
    '30DY0AAXX',
    '30DY0AAXX/C1-151',
    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]

# Placeholder data files written into the directories above by
# TestCondorFastq.setUp.  Only the file names matter; every file gets the
# same dummy content.  Each path is listed exactly once.
DATAFILES = [
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_1.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_2.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_3.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_4.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_5.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_6.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_7.srf',
    '30DY0AAXX/C1-151/woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]

# Canned library records, keyed by library id, that FakeApi.get_library
# hands back in place of a real server response.  The single library here
# has one lane on each of the five flowcells laid out in FCDIRS.
LIBDATA = {
    '11154':{u'antibody_id': None,
             u'cell_line': u'Unknown',
             u'cell_line_id': 1,
             u'experiment_type': u'RNA-seq',
             u'experiment_type_id': 4,
             u'gel_cut_size': 300,
             u'hidden': False,
             u'id': u'11154',
             u'insert_size': 200,
             u'lane_set': [{u'flowcell': u'30221AAXX',
                            u'lane_number': 4,
                            u'paired_end': False,
                            u'read_length': 33,
                            u'status': u'Unknown',
                            u'status_code': None},
                           {u'flowcell': u'42JUYAAXX',
                            u'lane_number': 5,
                            u'paired_end': True,
                            u'read_length': 76,
                            u'status': u'Unknown',
                            u'status_code': None},
                           {u'flowcell': u'61MJTAAXX',
                            u'lane_number': 6,
                            u'paired_end': False,
                            u'read_length': 76,
                            u'status': u'Unknown',
                            u'status_code': None},
                           {u'flowcell': u'30DY0AAXX',
                            u'lane_number': 8,
                            u'paired_end': True,
                            u'read_length': 76,
                            u'status': u'Unknown',
                            u'status_code': None},
                           {u'flowcell': u'C02F9ACXX',
                            u'lane_number': 3,
                            u'paired_end': True,
                            u'read_length': 101,
                            u'status': u'Unknown',
                            u'status_code': None}],
             u'library_id': u'11154',
             u'library_name': u'Paired ends ASDF ',
             u'library_species': u'Mus musculus',
             u'library_species_id': 9,
             u'library_type': u'Paired End (non-multiplexed)',
             u'library_type_id': 2,
             u'made_by': u'Gary Gygax',
             u'made_for': u'TSR',
             u'notes': u'300 bp gel fragment',
             u'replicate': 1,
             u'stopping_point': u'1Aa',
             u'successful_pM': None,
             u'undiluted_concentration': u'29.7'}
    }

# Dummy credentials passed as the second argument when the tests construct
# condorfastq.CondorFastqExtract; the real API object is replaced by
# FakeApi afterwards.
FAKE_APIDATA = {'apiid':0, 'apikey': 'foo'}
135
class FakeApi(object):
    """Offline stand-in for the API object used by CondorFastqExtract.

    Library lookups are answered from the module-level LIBDATA table.
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""

    def get_library(self, libid):
        """Return an independent deep copy of the LIBDATA record for libid.

        Raises KeyError when libid is not present in LIBDATA.
        """
        return copy.deepcopy(LIBDATA[libid])
143
class TestCondorFastq(unittest.TestCase):
    """Tests for condorfastq.CondorFastqExtract against a fake archive.

    Each test runs against a freshly created temporary directory holding
    the FCDIRS / DATAFILES layout, with the extractor's ``api`` attribute
    replaced by FakeApi.
    """

    def setUp(self):
        """Create the temporary flowcell tree, log dir and submission dir."""
        self.cwd = os.getcwd()

        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
        os.mkdir(self.flowcelldir)

        self.logdir = os.path.join(self.tempdir, 'log')
        os.mkdir(self.logdir)

        for d in FCDIRS:
            os.mkdir(os.path.join(self.flowcelldir, d))

        for f in DATAFILES:
            with open(os.path.join(self.flowcelldir, f), 'w') as stream:
                stream.write('testfile')

        # A unicode literal instead of unicode(): same value on Python 2,
        # and still valid on Python 3 where the unicode builtin is gone.
        self.subname = u'sub-11154'
        self.subdir = os.path.join(self.tempdir, self.subname)
        os.mkdir(self.subdir)

    def tearDown(self):
        """Restore the working directory, then delete the temporary tree."""
        # test_create_scripts chdirs into tempdir; step back out first so
        # the directory being removed is never the current directory.
        os.chdir(self.cwd)
        shutil.rmtree(self.tempdir)

    def _make_extract(self):
        """Return a CondorFastqExtract whose api attribute is a FakeApi."""
        extract = condorfastq.CondorFastqExtract('host',
                                                 FAKE_APIDATA,
                                                 self.tempdir,
                                                 self.logdir)
        extract.api = FakeApi()
        return extract

    @staticmethod
    def _sorted_argument_lines(filename):
        """Return the sorted lines of filename that start with 'argument'."""
        with open(filename, 'r') as stream:
            return sorted(l for l in stream if l.startswith('argument'))

    def test_find_archive_sequence(self):
        """Every lane of library 11154 is found with the expected size."""
        extract = self._make_extract()
        result_map = [('11154', self.subname)]
        lib_db = extract.find_archive_sequence_files(result_map)

        lanes = lib_db['11154']['lanes']
        self.assertEqual(len(lanes), 5)
        # ((flowcell, lane number), expected length of that lane's entry)
        expected_counts = [
            ((u'30221AAXX', 4), 1),
            ((u'42JUYAAXX', 5), 2),
            ((u'61MJTAAXX', 6), 1),
            ((u'30DY0AAXX', 8), 1),
            ((u'C02F9ACXX', 3), 4),
        ]
        for lane_key, count in expected_counts:
            self.assertEqual(len(lanes[lane_key]), count)

    def test_find_needed_targets(self):
        """Missing fastq targets are keyed by path and grouped by kind."""
        extract = self._make_extract()
        result_map = [('11154', self.subname)]
        lib_db = extract.find_archive_sequence_files(result_map)

        needed_targets = extract.find_missing_targets(result_map,
                                                      lib_db)
        self.assertEqual(len(needed_targets), 7)

        # (target name below the submission dir, source kind, source count)
        expected = [
            (u'/11154_30221AAXX_c33_l4.fastq', 'srf', 1),
            (u'/11154_42JUYAAXX_c76_l5_r1.fastq', 'qseq', 1),
            (u'/11154_42JUYAAXX_c76_l5_r2.fastq', 'qseq', 1),
            (u'/11154_61MJTAAXX_c76_l6.fastq', 'qseq', 1),
            (u'/11154_C02F9ACXX_c202_l3_r1.fastq', 'split_fastq', 2),
            (u'/11154_C02F9ACXX_c202_l3_r2.fastq', 'split_fastq', 2),
        ]
        for suffix, kind, count in expected:
            target = needed_targets[self.subname + suffix]
            self.assertEqual(len(target[kind]), count)

    def test_generate_fastqs(self):
        """build_condor_arguments yields the expected per-kind arguments.

        Unlike the other tests this one maps the library to the absolute
        submission directory (self.subdir), so targets are full paths.
        """
        extract = self._make_extract()
        result_map = [('11154', self.subdir)]
        commands = extract.build_condor_arguments(result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 2)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 2)

        srf_data = {
            os.path.join(self.subdir, '11154_30221AAXX_c33_l4.fastq'): {
                'mid': None,
                'ispaired': False,
                'sources': [u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                'flowcell': u'30221AAXX',
                'target': os.path.join(self.subdir,
                                       u'11154_30221AAXX_c33_l4.fastq'),
            },
            os.path.join(self.subdir, '11154_30DY0AAXX_c151_l8_r1.fastq'): {
                'ispaired': True,
                'flowcell': u'30DY0AAXX',
                'sources': [u'woldlab_090725_HWI-EAS229_0110_30DY0AAXX_8.srf'],
                'mid': 76,
                'target':
                    os.path.join(self.subdir,
                                 u'11154_30DY0AAXX_c151_l8_r1.fastq'),
                'target_right':
                    os.path.join(self.subdir,
                                 u'11154_30DY0AAXX_c151_l8_r2.fastq'),
            }
        }
        for args in srf:
            expected = srf_data[args['target']]
            self.assertEqual(args['ispaired'], expected['ispaired'])
            self.assertEqual(len(args['sources']), 1)
            _, source_filename = os.path.split(args['sources'][0])
            self.assertEqual(source_filename, expected['sources'][0])
            self.assertEqual(args['target'], expected['target'])
            if args['ispaired']:
                self.assertEqual(args['target_right'],
                                 expected['target_right'])
            if 'mid' in expected:
                self.assertEqual(args['mid'], expected['mid'])

        qseq_data = {
            os.path.join(self.subdir, '11154_42JUYAAXX_c76_l5_r1.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2']
            },
            os.path.join(self.subdir, '11154_42JUYAAXX_c76_l5_r2.fastq'): {
                'istar': True,
                'ispaired': True,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2']
            },
            os.path.join(self.subdir, '11154_61MJTAAXX_c76_l6.fastq'): {
                'istar': True,
                'ispaired': False,
                'sources': [
                    u'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
            },
        }
        for args in qseq:
            expected = qseq_data[args['target']]
            self.assertEqual(args['istar'], expected['istar'])
            self.assertEqual(args['ispaired'], expected['ispaired'])
            # Sources are reported with their archive path; compare only
            # the file name component.
            for i, expected_source in enumerate(expected['sources']):
                _, filename = os.path.split(args['sources'][i])
                self.assertEqual(filename, expected_source)

        split_test = { x['target']: x for x in
            [{'sources': [u'11154_NoIndex_L003_R1_001.fastq.gz',
                          u'11154_NoIndex_L003_R1_002.fastq.gz'],
              'pyscript': 'desplit_fastq.pyc',
              'target': u'11154_C02F9ACXX_c202_l3_r1.fastq'},
             {'sources': [u'11154_NoIndex_L003_R2_001.fastq.gz',
                          u'11154_NoIndex_L003_R2_002.fastq.gz'],
              'pyscript': 'desplit_fastq.pyc',
              'target': u'11154_C02F9ACXX_c202_l3_r2.fastq'}]
         }
        for arg in split:
            _, target = os.path.split(arg['target'])
            expected = split_test[target]
            self.assertTrue(arg['pyscript'].endswith(expected['pyscript']))
            self.assertTrue(arg['target'].endswith(expected['target']))
            for i, source in enumerate(arg['sources']):
                self.assertTrue(source.endswith(expected['sources'][i]))

    def test_create_scripts(self):
        """create_scripts writes one condor file per conversion kind."""
        # The condor files are checked via relative paths, so run from
        # inside the temporary directory; tearDown restores the cwd.
        os.chdir(self.tempdir)
        extract = self._make_extract()
        result_map = [('11154', self.subname)]
        extract.create_scripts(result_map)

        self.assertTrue(os.path.exists('srf.condor'))
        arguments = self._sorted_argument_lines('srf.condor')
        self.assertEqual(len(arguments), 2)
        self.assertIn('--single sub-11154/11154_30221AAXX_c33_l4.fastq',
                      arguments[0])
        self.assertIn('--right sub-11154/11154_30DY0AAXX_c151_l8_r2.fastq',
                      arguments[1])

        self.assertTrue(os.path.exists('qseq.condor'))
        arguments = self._sorted_argument_lines('qseq.condor')
        self.assertEqual(len(arguments), 3)
        self.assertIn('-o sub-11154/11154_42JUYAAXX_c76_l5_r1.fastq ',
                      arguments[0])
        self.assertIn(
            'C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
            arguments[1])
        self.assertIn('61MJTAAXX_c76_l6.fastq -f 61MJTAAXX',
                      arguments[2])

        self.assertTrue(os.path.exists('split_fastq.condor'))
        arguments = self._sorted_argument_lines('split_fastq.condor')
        self.assertEqual(len(arguments), 2)
        self.assertIn('11154_NoIndex_L003_R1_001.fastq.gz', arguments[0])
        self.assertIn('11154_NoIndex_L003_R2_002.fastq.gz', arguments[1])
379
380
def suite():
    """Return a TestSuite containing every test method of TestCondorFastq.

    Uses the default test loader rather than the deprecated
    unittest.makeSuite helper; the loader's default 'test' method prefix
    selects the same methods.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)
384
if __name__ == "__main__":
    # When run as a script, let unittest resolve and run the module-level
    # suite() callable by name.
    unittest.main(defaultTest='suite')
387