c07783ef3f8f7323b2e73ea6687d9d4846852eaa
[htsworkflow.git] / htsworkflow / pipelines / test / simulate_runfolder.py
1 """
2 Create simulated solexa/illumina runfolders for testing
3 """
4
5 import os
6 import shutil
7
# Directory holding this module; sample input files live in 'testdata' below it.
TEST_CODE_DIR = os.path.dirname(__file__)
TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')

# Lane and tile numbers used when generating per-lane / per-tile mock files.
LANE_LIST = range(1, 9)
TILE_LIST = range(1, 101)

# Tile numbers used for the HiSeq mock data (see ABXX_BASE_CALL_INFO).
HISEQ_TILE_LIST = [
    1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108,
    1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
    2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108,
    2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,
]
16
def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
    """
    Create a simulated Firecrest directory inside data_dir

    The directory is named
    C<start>-<stop>_Firecrest<version>_12-04-2008_diane; os.mkdir raises
    if it already exists. Returns the path of the new directory.
    """
    dir_name = 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
    new_dir = os.path.join(data_dir, dir_name)
    os.mkdir(new_dir)
    return new_dir
23
def make_ipar_dir(data_dir, version='1.01'):
    """
    Build a simulated IPAR directory and parameter file

    Copies the 'IPAR1.01.params' sample to <data_dir>/.params (the same
    sample is used whatever version is given), then makes sure
    <data_dir>/IPAR_<version> exists. Returns the IPAR directory path.
    """
    params_source = os.path.join(TESTDATA_DIR, 'IPAR1.01.params')
    params_target = os.path.join(data_dir, '.params')
    shutil.copy(params_source, params_target)

    target_dir = os.path.join(data_dir, 'IPAR_%s' % (version,))
    if os.path.exists(target_dir):
        return target_dir
    os.mkdir(target_dir)
    return target_dir
35
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """
    Write <runfolder_dir>/Config/FlowcellId.xml naming the flowcell

    runfolder_dir must already exist; the Config subdirectory is created
    when missing. flowcell_id defaults to '207BTAAXY' when None.
    Returns None.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
  <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)
    config_dir = os.path.join(runfolder_dir, 'Config')

    if not os.path.exists(config_dir):
        os.mkdir(config_dir)
    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # The context manager closes the file even if the write fails.
    with open(pathname, 'w') as f:
        f.write(config)
52
def make_bustard_config132(image_dir):
    """
    Copy the 'bustard-config132.xml' sample to <image_dir>/config.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'bustard-config132.xml'),
                os.path.join(image_dir, 'config.xml'))
57
def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
    """
    Build a simulated RTA Intensities directory and config file

    Makes sure <data_dir>/Intensities exists and copies the
    'rta_intensities_config.xml' sample into it as config.xml.
    The version argument is accepted but not used.
    Returns the Intensities directory path.
    """
    target_dir = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml'),
                os.path.join(target_dir, 'config.xml'))
    return target_dir
70
def make_rta_basecalls_1460(intensities_dir):
    """
    Build a simulated RTA BaseCalls directory and config file

    Makes sure <intensities_dir>/BaseCalls exists and copies the
    'rta_basecalls_config.xml' sample into it as config.xml.
    Returns the BaseCalls directory path.
    """
    target_dir = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml'),
                os.path.join(target_dir, 'config.xml'))
    return target_dir
83
def make_rta_intensities_1870(data_dir, version='1.8.70.0'):
    """
    Build a simulated RTA Intensities directory and config file

    Makes sure <data_dir>/Intensities exists and copies the
    'rta_intensities_config_1870.xml' sample into it as config.xml.
    The version argument is accepted but not used.
    Returns the Intensities directory path.
    """
    target_dir = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    config_source = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml')
    config_target = os.path.join(target_dir, 'config.xml')
    shutil.copy(config_source, config_target)
    return target_dir
96
def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
    """
    Build a simulated RTA Intensities directory and config file

    Makes sure <data_dir>/Intensities exists and copies the
    'rta_intensities_config_1.10.xml' sample into it as config.xml.
    The version argument is accepted but not used.
    Returns the Intensities directory path.
    """
    target_dir = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    config_source = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml')
    config_target = os.path.join(target_dir, 'config.xml')
    shutil.copy(config_source, config_target)
    return target_dir
109
def make_rta_basecalls_1870(intensities_dir):
    """
    Build a simulated RTA BaseCalls directory and config file

    Makes sure <intensities_dir>/BaseCalls exists and copies the
    'rta_basecalls_config_1870.xml' sample into it as config.xml.
    Returns the BaseCalls directory path.
    """
    target_dir = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    config_source = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml')
    config_target = os.path.join(target_dir, 'config.xml')
    shutil.copy(config_source, config_target)
    return target_dir
122
def make_rta_basecalls_1_10(intensities_dir):
    """
    Build a simulated RTA BaseCalls directory with qseq files and config

    Makes sure <intensities_dir>/BaseCalls exists, fills it via
    make_qseqs using ABXX_BASE_CALL_INFO, then copies the
    'rta_basecalls_config_1.10.xml' sample into it as config.xml.
    Returns the BaseCalls directory path.
    """
    target_dir = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    make_qseqs(target_dir, basecall_info=ABXX_BASE_CALL_INFO)
    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml'),
                os.path.join(target_dir, 'config.xml'))
    return target_dir
136
def make_qseqs(bustard_dir, basecall_info=None):
    """
    Fill bustard_dir with qseq files plus matrix, phasing and summary data

    basecall_info, when given, supplies the qseq sample file name, the
    tile list and the summary file name through its qseq_file, tile_list
    and basecall_summary attributes; otherwise the 42BRJAAXX samples and
    TILE_LIST are used. One copy of the qseq sample is written per
    lane/tile as s_<lane>_1_<tile>_qseq.txt. The directory is created if
    it does not exist. Returns bustard_dir.
    """
    if basecall_info is not None:
        qseq_name = basecall_info.qseq_file
        tiles = basecall_info.tile_list
        summary_name = basecall_info.basecall_summary
    else:
        # 42BRJ 8 1 0039 happened to be a better than usual tile, in that
        # there was actually sequence at the start
        qseq_name = '42BRJAAXX_8_1_0039_qseq.txt'
        tiles = TILE_LIST
        summary_name = '42BRJAAXX_BustardSummary.xml'

    template = os.path.join(TESTDATA_DIR, qseq_name)
    if not os.path.isdir(bustard_dir):
        os.mkdir(bustard_dir)

    for lane in LANE_LIST:
        for tile in tiles:
            qseq_name_out = 's_%d_1_%04d_qseq.txt' % (lane, tile)
            shutil.copy(template, os.path.join(bustard_dir, qseq_name_out))

    make_matrix_dir(bustard_dir)
    make_phasing_dir(bustard_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, summary_name),
                os.path.join(bustard_dir, 'BustardSummary.xml'))

    return bustard_dir
170
def make_scores(gerald_dir, in_temp=True):
    """
    Fill a gerald directory with per-tile score files

    Copies the 's_1_0001_score.txt' sample to s_<lane>_<tile>_score.txt
    for every lane in LANE_LIST and tile in TILE_LIST. With in_temp set
    the files go into <gerald_dir>/Temp, otherwise into gerald_dir
    itself. The destination directory is created if it doesn't exist
    and its path is returned.
    """
    template = os.path.join(TESTDATA_DIR, 's_1_0001_score.txt')
    score_dir = os.path.join(gerald_dir, 'Temp') if in_temp else gerald_dir
    if not os.path.isdir(score_dir):
        os.mkdir(score_dir)

    for lane in LANE_LIST:
        for tile in TILE_LIST:
            score_name = 's_%d_%04d_score.txt' % (lane, tile)
            shutil.copy(template, os.path.join(score_dir, score_name))

    return score_dir
189
def make_matrix_dir(bustard_dir):
    """
    Create one matrix file per lane in <bustard_dir>/Matrix/

    Each s_<lane>_02_matrix.txt is a copy of the
    '42BRJAAXX_8_02_matrix.txt' sample (from pipeline 1.4). The Matrix
    directory is created if it doesn't exist.
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    template = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
    for lane in LANE_LIST:
        matrix_name = 's_%d_02_matrix.txt' % (lane,)
        shutil.copy(template, os.path.join(matrix_dir, matrix_name))
204
def make_matrix(matrix_filename):
    """
    Write a fixed frequency response matrix to matrix_filename

    Any existing file at that path is overwritten. Returns None.
    """
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    # The context manager closes the file even if the write fails.
    with open(matrix_filename, 'w') as f:
        f.write(contents)
219
def make_matrix_dir_rta160(bustard_dir):
    """
    Create one matrix file per lane in <bustard_dir>/Matrix/

    Each s_<lane>_1_matrix.txt is a copy of the
    '61MMFAAXX_4_1_matrix.txt' sample. The Matrix directory is created
    if it doesn't exist.
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    template = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
    for lane in LANE_LIST:
        matrix_name = 's_%d_1_matrix.txt' % (lane,)
        shutil.copy(template, os.path.join(matrix_dir, matrix_name))
233
def make_matrix_dir_rta_1_10(bustard_dir):
    """
    Create matrix files in <bustard_dir>/Matrix/

    Delegates to make_matrix_dir_rta160, so the files produced are the
    same as for that function.
    """
    make_matrix_dir_rta160(bustard_dir)
236
def make_phasing_dir(bustard_dir):
    """
    Create one phasing file per lane in <bustard_dir>/Phasing/

    Each s_<lane>_01_phasing.xml is a copy of the
    '42BRJAAXX_8_01_phasing.xml' sample (from pipeline 1.4). The Phasing
    directory is created if it doesn't exist.
    """
    phasing_dir = os.path.join(bustard_dir, 'Phasing')
    if not os.path.isdir(phasing_dir):
        os.mkdir(phasing_dir)

    template = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
    for lane in LANE_LIST:
        phasing_name = 's_%d_01_phasing.xml' % (lane,)
        shutil.copy(template, os.path.join(phasing_dir, phasing_name))
251
def make_phasing_params(bustard_dir):
    """
    Write a params<lane>.xml phasing file into bustard_dir for each lane

    Every file gets the same fixed Phasing / Prephasing values.
    bustard_dir must already exist. Returns None.
    """
    contents = """<Parameters>
  <Phasing>0.009900</Phasing>
  <Prephasing>0.003500</Prephasing>
</Parameters>
"""
    for lane in LANE_LIST:
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as f:
            f.write(contents)
262
def make_gerald_config_026(gerald_dir):
    """
    Copy the 'gerald_config_0.2.6.xml' sample to <gerald_dir>/config.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_0.2.6.xml'),
                os.path.join(gerald_dir, 'config.xml'))
267
def make_gerald_config_100(gerald_dir):
    """
    Copy the 'gerald_config_1.0.xml' sample to <gerald_dir>/config.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.0.xml'),
                os.path.join(gerald_dir, 'config.xml'))
272
def make_gerald_config_1_7(gerald_dir):
    """
    Copy the CASAVA 1.7 gerald config sample to <gerald_dir>/config.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.7.xml'),
                os.path.join(gerald_dir, 'config.xml'))
278
def make_summary_htm_100(gerald_dir):
    """
    Copy the 'Summary-pipeline100.htm' sample to <gerald_dir>/Summary.htm
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
283
def make_summary_htm_110(gerald_dir):
    """
    Copy the 'Summary-pipeline110.htm' sample to <gerald_dir>/Summary.htm
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
288
def make_summary_paired_htm(gerald_dir):
    """
    Copy the 'Summary-paired-pipeline110.htm' sample to
    <gerald_dir>/Summary.htm
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-paired-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
293
def make_summary_ipar130_htm(gerald_dir):
    """
    Copy the 'Summary-ipar130.htm' sample to <gerald_dir>/Summary.htm
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-ipar130.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
298
def make_summary_rta160_xml(gerald_dir):
    """
    Copy the 'Summary-rta160.xml' sample to <gerald_dir>/Summary.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-rta160.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
303
304
def make_summary_casava1_7_xml(gerald_dir):
    """
    Copy the 'Summary-casava1.7.xml' sample to <gerald_dir>/Summary.xml
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-casava1.7.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
309
310
def make_eland_results(gerald_dir):
    """
    Write a canned s_<lane>_eland_result.txt into gerald_dir for each lane

    Every lane in LANE_LIST receives the same four-record result text.
    gerald_dir must already exist. Returns None.
    """
    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
"""
    for i in LANE_LIST:
        pathname = os.path.join(gerald_dir,
                                's_%d_eland_result.txt' % (i,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as f:
            f.write(eland_result)
323
def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST):
    """
    Write canned eland_multi files into gerald_dir for each lane

    Two fixed result texts are available. When paired is true, the
    first is written to s_<lane>_1_eland_multi.txt and the second to
    s_<lane>_2_eland_multi.txt; otherwise only the first is written, to
    s_<lane>_eland_multi.txt. lane_list selects the lanes to generate.
    gerald_dir must already exist. Returns None.
    """
    eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
""", """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
"""]
    if paired:
        # Read numbers in the file names are 1-based.
        for e, contents in enumerate(eland_multi, 1):
            for i in lane_list:
                pathname = os.path.join(gerald_dir,
                                        's_%d_%d_eland_multi.txt' % (i, e))
                # The context manager closes the file even if the write fails.
                with open(pathname, 'w') as f:
                    f.write(contents)
    else:
        for i in lane_list:
            pathname = os.path.join(gerald_dir,
                                    's_%d_eland_multi.txt' % (i,))
            with open(pathname, 'w') as f:
                f.write(eland_multi[0])
354
def make_eland_export(gerald_dir, paired=False, lane_list=LANE_LIST):
    """
    Copy the 'casava_1.7_export.txt' sample into gerald_dir for each lane

    One s_<lane>_export.txt is written per entry in lane_list. The
    paired argument is accepted but not used. Returns None.
    """
    template = os.path.join(TESTDATA_DIR, 'casava_1.7_export.txt')

    for lane in lane_list:
        export_name = 's_%d_export.txt' % (lane,)
        shutil.copy(template, os.path.join(gerald_dir, export_name))
362
363
def make_scarf(gerald_dir, lane_list=LANE_LIST):
    """
    Write a canned s_<lane>_sequence.txt into gerald_dir for each lane

    Every lane in lane_list receives the same five colon-separated
    records. gerald_dir must already exist. Returns None.
    """
    # Raw string: the quality strings contain literal backslashes. In a
    # normal literal "\a" and "\b" turn into BEL / backspace characters,
    # leaving those quality strings one character shorter than their
    # 37-base reads.
    seq = r"""HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
"""
    for l in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (l,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as f:
            f.write(seq)
376
def make_fastq(gerald_dir, lane_list=LANE_LIST):
    """
    Write a canned fastq s_<lane>_sequence.txt into gerald_dir per lane

    Every lane in lane_list receives the same three records.
    gerald_dir must already exist. Returns None.
    """
    # Raw string: the first quality line contains two literal
    # backslashes. Written as a normal literal, the second one ("\b")
    # became a backspace character, leaving 36 quality characters for a
    # 37-base read.
    seq = r"""@HWI-EAS229:1:2:182:712#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:182:712#0/1
\bab_bbaabbababbaaa]]D]bb_baabbab\baa
@HWI-EAS229:1:2:198:621#0/1
CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
+HWI-EAS229:1:2:198:621#0/1
[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
@HWI-EAS229:1:2:209:1321#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:209:1321#0/1
_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
"""
    for l in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (l,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as f:
            f.write(seq)
396
397
def ls_tree(root):
    """
    Print the path of every file found beneath root, one per line

    Directories themselves are not listed, only the files they contain.
    """
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print(os.path.join(dirpath, filename))
402
403
class BaseCallInfo(object):
    """Describe which sample files and tiles to use for mock base call data

    Instances are read by make_qseqs through three attributes:
    qseq_file (name of the qseq sample in the test data directory),
    tile_list (tile numbers to generate files for) and
    basecall_summary (name of the summary sample file).
    """
    def __init__(self, qseq_file, tile_list, basecall_summary):
        # Plain value holder: the arguments are stored unchanged.
        self.basecall_summary = basecall_summary
        self.tile_list = tile_list
        self.qseq_file = qseq_file
411
# Mock data settings for a first generation HiSeq flowcell: the AA01CCABXX
# sample files together with the HiSeq tile numbering.
ABXX_BASE_CALL_INFO = BaseCallInfo(
    qseq_file='AA01CCABXX_8_2_2207_qseq.txt',
    tile_list=HISEQ_TILE_LIST,
    basecall_summary='AA01CCABXX_BustardSummary.xml',
)