Add extraction of HiSeq project-based split fastq files to ucsc_gather
[htsworkflow.git] / htsworkflow / submission / test / test_condorfastq.py
1 #!/usr/bin/env python
2
3 import copy
4 import os
5 from pprint import pprint
6 import shutil
7 import tempfile
8 import unittest
9
10 from htsworkflow.submission import condorfastq
11
# Directories created under the temporary flowcell root by setUp().
# Order matters: each parent directory is listed before its children,
# because the directories are created one at a time with os.mkdir().
FCDIRS = [
    # flowcell with per-project split fastq directories
    'C02F9ACXX',
    'C02F9ACXX/C1-202',
    'C02F9ACXX/C1-202/Project_11154',
    'C02F9ACXX/C1-202/Project_12342_Index1',
    'C02F9ACXX/C1-202/Project_12342_Index2',
    # flowcells holding .tar.bz2 archives
    '42JUYAAXX',
    '42JUYAAXX/C1-76',
    # flowcell holding .srf files
    '30221AAXX',
    '30221AAXX/C1-33',
    # flowcell holding .tar.bz2 archives (read 1 only)
    '61MJTAAXX',
    '61MJTAAXX/C1-76',
]
25
# Placeholder sequence files written under the temporary flowcell root by
# setUp().  Every path is relative to that root and lives inside one of the
# directories named in FCDIRS.  Each file appears exactly once.
DATAFILES = [
    # C02F9ACXX: split fastq files, grouped by project directory
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R1_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_11154/11154_NoIndex_L003_R2_002.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index1/11114_GCCAAT_L004_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L007_R1_001.fastq.gz',
    'C02F9ACXX/C1-202/Project_12342_Index2/11119_CGATGT_L005_R1_001.fastq.gz',
    # 42JUYAAXX: read 1 archives, lanes 1-8
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r1.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r1.tar.bz2',
    # 42JUYAAXX: read 2 archives, lanes 1-8
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l1_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l2_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l3_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l4_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l6_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l7_r2.tar.bz2',
    '42JUYAAXX/C1-76/woldlab_100826_HSI-123_0001_42JUYAAXX_l8_r2.tar.bz2',
    # 30221AAXX: one srf file per lane, lanes 1-8
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_1.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_2.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_3.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_5.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_6.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_7.srf',
    '30221AAXX/C1-33/woldlab_090425_HWI-EAS229_0110_30221AAXX_8.srf',
    # 61MJTAAXX: read 1 archives only, lanes 1-8
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l1_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l2_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l3_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l4_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l5_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l7_r1.tar.bz2',
    '61MJTAAXX/C1-76/woldlab_100826_HSI-123_0001_61MJTAAXX_l8_r1.tar.bz2',
]
68
# Canned library records keyed by library id, served by FakeApi.get_library().
# Library 11154 has one lane on each of the four flowcells in FCDIRS.
LIBDATA = {
    '11154': {
        u'antibody_id': None,
        u'cell_line': u'Unknown',
        u'cell_line_id': 1,
        u'experiment_type': u'RNA-seq',
        u'experiment_type_id': 4,
        u'gel_cut_size': 300,
        u'hidden': False,
        u'id': u'11154',
        u'insert_size': 200,
        u'lane_set': [
            {
                u'flowcell': u'30221AAXX',
                u'lane_number': 4,
                u'paired_end': False,
                u'read_length': 33,
                u'status': u'Unknown',
                u'status_code': None,
            },
            {
                u'flowcell': u'42JUYAAXX',
                u'lane_number': 5,
                u'paired_end': True,
                u'read_length': 76,
                u'status': u'Unknown',
                u'status_code': None,
            },
            {
                u'flowcell': u'61MJTAAXX',
                u'lane_number': 6,
                u'paired_end': False,
                u'read_length': 76,
                u'status': u'Unknown',
                u'status_code': None,
            },
            {
                u'flowcell': u'C02F9ACXX',
                u'lane_number': 3,
                u'paired_end': True,
                u'read_length': 101,
                u'status': u'Unknown',
                u'status_code': None,
            },
        ],
        u'library_id': u'11154',
        u'library_name': u'Paired ends ASDF ',
        u'library_species': u'Mus musculus',
        u'library_species_id': 9,
        u'library_type': u'Paired End (non-multiplexed)',
        u'library_type_id': 2,
        u'made_by': u'Gary Gygax',
        u'made_for': u'TSR',
        u'notes': u'300 bp gel fragment',
        u'replicate': 1,
        u'stopping_point': u'1Aa',
        u'successful_pM': None,
        u'undiluted_concentration': u'29.7',
    },
}
117
# Dummy API credentials handed to CondorFastqExtract by the tests.
FAKE_APIDATA = {
    'apiid': 0,
    'apikey': 'foo',
}
119
class FakeApi(object):
    """Test double that answers library lookups from the LIBDATA table."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""

    def get_library(self, libid):
        """Return a deep copy of the LIBDATA record stored under libid.

        A copy is returned so callers can modify the result without
        altering the shared LIBDATA table.  Raises KeyError when libid
        is not a key of LIBDATA.
        """
        return copy.deepcopy(LIBDATA[libid])
127
class TestCondorFastq(unittest.TestCase):
    """Exercise CondorFastqExtract against a throw-away flowcell tree.

    Every test runs against a fresh temporary directory that contains the
    directories listed in FCDIRS and a placeholder file for each entry of
    DATAFILES.  Library metadata comes from FakeApi rather than a server.
    """

    def setUp(self):
        """Build the temporary flowcell tree and log directory."""
        self.tempdir = tempfile.mkdtemp(prefix='condorfastq_test')
        try:
            self.flowcelldir = os.path.join(self.tempdir, 'flowcells')
            os.mkdir(self.flowcelldir)

            self.logdir = os.path.join(self.tempdir, 'log')
            os.mkdir(self.logdir)

            # FCDIRS lists parents before children, so plain mkdir suffices.
            for d in FCDIRS:
                os.mkdir(os.path.join(self.flowcelldir, d))

            for f in DATAFILES:
                filename = os.path.join(self.flowcelldir, f)
                with open(filename, 'w') as stream:
                    stream.write('testfile')
        except Exception:
            # unittest skips tearDown() when setUp() raises, so remove the
            # partially built tree here to avoid leaking it.
            shutil.rmtree(self.tempdir)
            raise

    def tearDown(self):
        """Remove the temporary tree created by setUp()."""
        shutil.rmtree(self.tempdir)

    def _make_extract(self):
        """Return a CondorFastqExtract whose api attribute is a FakeApi."""
        extract = condorfastq.CondorFastqExtract('host',
                                                 FAKE_APIDATA,
                                                 self.tempdir,
                                                 self.logdir)
        extract.api = FakeApi()
        return extract

    def test_find_archive_sequence(self):
        """find_archive_sequence_files reports the files for each lane."""
        extract = self._make_extract()
        result_map = [('11154', '/notarealplace')]
        lib_db = extract.find_archive_sequence_files(result_map)

        lanes = lib_db['11154']['lanes']
        self.assertEqual(len(lanes), 4)

        # (flowcell, lane number) -> number of entries expected for the lane
        expected_counts = [
            ((u'30221AAXX', 4), 1),
            ((u'42JUYAAXX', 5), 2),
            ((u'61MJTAAXX', 6), 1),
            ((u'C02F9ACXX', 3), 4),
        ]
        for lane_key, count in expected_counts:
            self.assertEqual(len(lanes[lane_key]), count)

    def test_find_needed_targets(self):
        """find_missing_targets lists each fastq target with its sources."""
        extract = self._make_extract()
        result_map = [('11154', '/notarealplace')]
        lib_db = extract.find_archive_sequence_files(result_map)

        needed_targets = extract.find_missing_targets(result_map,
                                                      lib_db)
        self.assertEqual(len(needed_targets), 6)

        # (target path, source type key, number of sources expected)
        expected_sources = [
            (u'/notarealplace/11154_30221AAXX_c33_l4.fastq', 'srf', 1),
            (u'/notarealplace/11154_42JUYAAXX_c76_l5_r1.fastq', 'qseq', 1),
            (u'/notarealplace/11154_42JUYAAXX_c76_l5_r2.fastq', 'qseq', 1),
            (u'/notarealplace/11154_61MJTAAXX_c76_l6.fastq', 'qseq', 1),
            (u'/notarealplace/11154_C02F9ACXX_c202_l3_r1.fastq',
             'split_fastq', 2),
            (u'/notarealplace/11154_C02F9ACXX_c202_l3_r2.fastq',
             'split_fastq', 2),
        ]
        for target, source_type, count in expected_sources:
            self.assertEqual(len(needed_targets[target][source_type]), count)

    def test_generate_fastqs(self):
        """build_condor_arguments emits one command line per fastq target."""
        extract = self._make_extract()
        result_map = [('11154', '/notarealplace')]
        commands = extract.build_condor_arguments(result_map)

        srf = commands['srf']
        qseq = commands['qseq']
        split = commands['split_fastq']

        self.assertEqual(len(srf), 1)
        self.assertEqual(len(qseq), 3)
        self.assertEqual(len(split), 2)

        # Each table maps a target fastq path to substrings that must
        # appear in particular positions of that target's argument list.
        srf_data = {u'/notarealplace/11154_30221AAXX_c33_l4.fastq':
                     [u'30221AAXX',
                      u'woldlab_090425_HWI-EAS229_0110_30221AAXX_4.srf'],
                     }
        for args in srf:
            args = extract_argument_list(args)
            # argument 3 is the target path used to look up expectations
            expected = srf_data[args[3]]
            self.assertTrue(expected[0] in args[5])
            self.assertTrue(expected[1] in args[0])

        qseq_data = {u'/notarealplace/11154_42JUYAAXX_c76_l5_r1.fastq':
                     [u'42JUYAAXX',
                      u'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r1.tar.bz2'],
                     u'/notarealplace/11154_61MJTAAXX_c76_l6.fastq':
                     ['61MJTAAXX',
                      'woldlab_100826_HSI-123_0001_61MJTAAXX_l6_r1.tar.bz2'],
                     u'/notarealplace/11154_42JUYAAXX_c76_l5_r2.fastq':
                     ['42JUYAAXX',
                      'woldlab_100826_HSI-123_0001_42JUYAAXX_l5_r2.tar.bz2'],
                     }
        for args in qseq:
            args = extract_argument_list(args)
            # argument 1 is the target path used to look up expectations
            expected = qseq_data[args[1]]
            self.assertTrue(expected[0] in args[3])
            self.assertTrue(expected[1] in args[5])

        split_data = {u'/notarealplace/11154_C02F9ACXX_c202_l3_r2.fastq':
                      [u'11154_NoIndex_L003_R2_001.fastq.gz',
                       u'11154_NoIndex_L003_R2_002.fastq.gz'],
                      u'/notarealplace/11154_C02F9ACXX_c202_l3_r1.fastq':
                      [u'11154_NoIndex_L003_R1_001.fastq.gz',
                       u'11154_NoIndex_L003_R1_002.fastq.gz'],
                      }
        for args in split:
            args = extract_argument_list(args)
            # argument 1 is the target path; 2 and 3 are the source files
            expected = split_data[args[1]]
            self.assertTrue(expected[0] in args[2])
            self.assertTrue(expected[1] in args[3])
263
def extract_argument_list(condor_argument):
    """Return the whitespace-separated tokens between the first and last.

    The first token (the command name) and the final token (the trailing
    queue) are discarded; an input with two or fewer tokens yields [].
    """
    tokens = condor_argument.split()
    del tokens[:1]   # drop the command name
    del tokens[-1:]  # drop the trailing queue
    return tokens
268
def suite():
    """Return a TestSuite of every TestCondorFastq method named test*.

    Uses the default TestLoader rather than unittest.makeSuite(), which is
    deprecated and absent from newer Python releases; the loader's default
    method prefix is 'test', matching the prefix previously passed.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCondorFastq)
272
# When executed as a script, run the tests collected by suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')
275