assume we are writing text with gzip
[htsworkflow.git] / htsworkflow / pipelines / test / simulate_runfolder.py
1 """
2 Create simulated solexa/illumina runfolders for testing
3 """
4 from __future__ import print_function, unicode_literals
5
6 import gzip
7 import os
8 import shutil
9
10 TEST_CODE_DIR = os.path.split(__file__)[0]
11 TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')
12 LANE_LIST = range(1,9)
13 TILE_LIST = range(1,101)
14 HISEQ_TILE_LIST = [1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108,
15                    1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
16                    2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108,
17                    2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,]
18
19 def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
20     firecrest_dir = os.path.join(data_dir,
21                                  'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
22                                  )
23     os.mkdir(firecrest_dir)
24     return firecrest_dir
25
26 def make_ipar_dir(data_dir, version='1.01'):
27     """
28     Construct an artificial ipar parameter file and directory
29     """
30     ipar1_01_file = os.path.join(TESTDATA_DIR, 'IPAR1.01.params')
31     shutil.copy(ipar1_01_file, os.path.join(data_dir, '.params'))
32
33     ipar_dir = os.path.join(data_dir, 'IPAR_%s' % (version,))
34     if not os.path.exists(ipar_dir):
35       os.mkdir(ipar_dir)
36     return ipar_dir
37
38 def make_flowcell_id(runfolder_dir, flowcell_id=None):
39     if flowcell_id is None:
40         flowcell_id = '207BTAAXY'
41
42     config = """<?xml version="1.0"?>
43 <FlowcellId>
44   <Text>%s</Text>
45 </FlowcellId>""" % (flowcell_id,)
46     config_dir = os.path.join(runfolder_dir, 'Config')
47
48     if not os.path.exists(config_dir):
49         os.mkdir(config_dir)
50     pathname = os.path.join(config_dir, 'FlowcellId.xml')
51     f = open(pathname,'w')
52     f.write(config)
53     f.close()
54
55 def make_runinfo(runfolder_dir, flowcell_id):
56     """Simulate a RunInfo.xml file created by >= RTA 1.9
57     """
58     xml = '''<?xml version="1.0"?>
59 <RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
60   <Run Id="{runfolder}" Number="101">
61     <Flowcell>{flowcell}</Flowcell>
62     <Instrument>SN787</Instrument>
63     <Date>110815</Date>
64     <Reads>
65       <Read Number="1" NumCycles="50" IsIndexedRead="N" />
66       <Read Number="2" NumCycles="7" IsIndexedRead="Y" />
67     </Reads>
68     <FlowcellLayout LaneCount="8" SurfaceCount="2" SwathCount="3" TileCount="8" />
69     <AlignToPhiX />
70   </Run>
71 </RunInfo>
72 '''
73     path, runfolder = os.path.split(runfolder_dir)
74     runinfo = os.path.join(runfolder_dir, 'RunInfo.xml')
75     stream = open(runinfo, 'w')
76     stream.write(xml.format(runfolder=runfolder, flowcell=flowcell_id))
77     stream.close()
78     return runinfo
79
80 def make_bustard_config132(image_dir):
81     source = os.path.join(TESTDATA_DIR, 'bustard-config132.xml')
82     destination = os.path.join(image_dir, 'config.xml')
83     shutil.copy(source, destination)
84
85 def make_aligned_config_1_12(aligned_dir):
86     """This is rouglhly equivalent to the old gerald file"""
87     source = os.path.join(TESTDATA_DIR, '1_12', 'aligned_config_1_12.xml')
88     destination = os.path.join(aligned_dir, 'config.xml')
89     shutil.copy(source, destination)
90
91 def make_unaligned_config_1_12(unaligned_dir):
92     demultiplex_pairs = [ # (src,
93       # dest),
94         (os.path.join(TESTDATA_DIR, '1_12', 'demultiplex_1.12.4.2.xml'),
95          os.path.join(unaligned_dir, 'DemultiplexConfig.xml')),
96         (os.path.join(TESTDATA_DIR, '1_12',
97                       'demultiplexed_bustard_1.12.4.2.xml'),
98          os.path.join(unaligned_dir, 'DemultiplexedBustardConfig.xml')),
99         (os.path.join(TESTDATA_DIR, '1_12',
100                       'demultiplexed_summary_1.12.4.2.xml'),
101          os.path.join(unaligned_dir, 'DemultiplexedBustardSummary.xml')),
102     ]
103     for src, dest in demultiplex_pairs:
104         shutil.copy(src, dest)
105         
106 def make_unaligned_status_1_12(unaligned_dir, flowcell_id):
107     basecall_status = ['All.htm', 'Demultiplex_Stats.htm', 'IVC.htm']
108     test_data_root = os.path.join(TESTDATA_DIR, '1_12', 'basecall_stats')
109     basecall_stats = os.path.join(unaligned_dir, 
110                                   'Basecall_Stats_{0}'.format(flowcell_id))
111     os.mkdir(basecall_stats)
112     for filename in basecall_status:
113         source = os.path.join(test_data_root, filename)
114         destination = os.path.join(basecall_stats, filename)
115         shutil.copy(source, destination)
116
117 def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
118     """
119     Construct an artificial RTA Intensities parameter file and directory
120     """
121     intensities_dir = os.path.join(data_dir, 'Intensities')
122     if not os.path.exists(intensities_dir):
123       os.mkdir(intensities_dir)
124
125     param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml')
126     shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
127
128     return intensities_dir
129
130 def make_rta_basecalls_1460(intensities_dir):
131     """
132     Construct an artificial RTA Intensities parameter file and directory
133     """
134     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
135     if not os.path.exists(basecalls_dir):
136       os.mkdir(basecalls_dir)
137
138     param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml')
139     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
140
141     return basecalls_dir
142
143 def make_rta_intensities_1870(data_dir, version='1.8.70.0'):
144     """
145     Construct an artificial RTA Intensities parameter file and directory
146     """
147     intensities_dir = os.path.join(data_dir, 'Intensities')
148     if not os.path.exists(intensities_dir):
149       os.mkdir(intensities_dir)
150
151     param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml')
152     shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
153
154     return intensities_dir
155
156 def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
157     """
158     Construct an artificial RTA Intensities parameter file and directory
159     """
160     intensities_dir = os.path.join(data_dir, 'Intensities')
161     if not os.path.exists(intensities_dir):
162       os.mkdir(intensities_dir)
163
164     param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml')
165     shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
166
167     return intensities_dir
168
169 def make_rta_intensities_1_12(data_dir, version='1.12.4.2'):
170     """
171     Construct an artificial RTA Intensities parameter file and directory
172     """
173     intensities_dir = os.path.join(data_dir, 'Intensities')
174     if not os.path.exists(intensities_dir):
175       os.mkdir(intensities_dir)
176
177     param_file = os.path.join(TESTDATA_DIR, '1_12',
178                               'rta_intensities_config_1.12.4.2.xml')
179     shutil.copy(param_file, os.path.join(intensities_dir, 'RTAConfig.xml'))
180
181     return intensities_dir
182
183 def make_rta_basecalls_1870(intensities_dir):
184     """
185     Construct an artificial RTA Intensities parameter file and directory
186     """
187     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
188     if not os.path.exists(basecalls_dir):
189       os.mkdir(basecalls_dir)
190
191     param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml')
192     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
193
194     return basecalls_dir
195
196 def make_rta_basecalls_1_10(intensities_dir):
197     """
198     Construct an artificial RTA Intensities parameter file and directory
199     """
200     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
201     if not os.path.exists(basecalls_dir):
202         os.mkdir(basecalls_dir)
203
204     make_qseqs(basecalls_dir, basecall_info=ABXX_BASE_CALL_INFO)
205     param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml')
206     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
207
208     return basecalls_dir
209
210 def make_rta_basecalls_1_12(intensities_dir):
211     """
212     Construct an artificial RTA Intensities parameter file and directory
213     """
214     basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
215     if not os.path.exists(basecalls_dir):
216         os.mkdir(basecalls_dir)
217
218     make_qseqs(basecalls_dir, basecall_info=ABXX_BASE_CALL_INFO)
219     param_file = os.path.join(TESTDATA_DIR, '1_12',
220                               'rta_basecalls_config_1.12.4.2.xml')
221     shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
222
223     return basecalls_dir
224
225
226 def make_qseqs(bustard_dir, basecall_info=None):
227     """
228     Fill gerald directory with qseq files
229     """
230     if basecall_info is None:
231         qseq_file = '42BRJAAXX_8_1_0039_qseq.txt'
232         tile_list = TILE_LIST
233         summary_file = '42BRJAAXX_BustardSummary.xml'
234     else:
235         qseq_file = basecall_info.qseq_file
236         tile_list = basecall_info.tile_list
237         summary_file = basecall_info.basecall_summary
238
239     # 42BRJ 8 1 0039 happened to be a better than usual tile, in that there
240     # was actually sequence at the start
241     source = os.path.join(TESTDATA_DIR, qseq_file)
242     destdir = bustard_dir
243     if not os.path.isdir(destdir):
244         os.mkdir(destdir)
245
246     for lane in LANE_LIST:
247         for tile in tile_list:
248             destination = os.path.join(bustard_dir, 's_%d_1_%04d_qseq.txt' % (lane, tile))
249             shutil.copy(source, destination)
250
251     make_matrix_dir(bustard_dir)
252     make_phasing_dir(bustard_dir)
253
254     summary_source = os.path.join(TESTDATA_DIR, summary_file)
255     summary_dest = os.path.join(bustard_dir, 'BustardSummary.xml')
256     shutil.copy(summary_source, summary_dest)
257
258     return destdir
259
260 def make_scores(gerald_dir, in_temp=True):
261     """
262     Fill gerald directory with score temp files
263     will create the directory if it doesn't exist.
264     """
265     source = os.path.join(TESTDATA_DIR, 's_1_0001_score.txt')
266     destdir = gerald_dir
267     if in_temp:
268         destdir = os.path.join(destdir, 'Temp')
269     if not os.path.isdir(destdir):
270         os.mkdir(destdir)
271
272     for lane in LANE_LIST:
273         for tile in TILE_LIST:
274             destination = os.path.join(destdir, 's_%d_%04d_score.txt' % (lane, tile))
275             shutil.copy(source, destination)
276
277     return destdir
278
279 def make_matrix_dir(bustard_dir):
280     """
281     Create several matrix files in <bustard_dir>/Matrix/
282
283     from pipeline 1.4
284     """
285     destdir = os.path.join(bustard_dir, 'Matrix')
286     if not os.path.isdir(destdir):
287         os.mkdir(destdir)
288
289     source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
290     for lane in LANE_LIST:
291         destination = os.path.join(destdir, 's_%d_02_matrix.txt' % ( lane, ))
292         shutil.copy(source, destination)
293
294 def make_matrix(matrix_filename):
295     contents = """# Auto-generated frequency response matrix
296 > A
297 > C
298 > G
299 > T
300 0.77 0.15 -0.04 -0.04
301 0.76 1.02 -0.05 -0.06
302 -0.10 -0.10 1.17 -0.03
303 -0.13 -0.12 0.80 1.27
304 """
305     f = open(matrix_filename, 'w')
306     f.write(contents)
307     f.close()
308
309 def make_matrix_dir_rta160(bustard_dir):
310     """
311     Create several matrix files in <bustard_dir>/Matrix/
312     """
313     destdir = os.path.join(bustard_dir, 'Matrix')
314     if not os.path.isdir(destdir):
315         os.mkdir(destdir)
316
317     source = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
318     lane_fragments = [ "_%d" % (l,) for l in LANE_LIST]
319     for fragment in lane_fragments:
320         destination = os.path.join(destdir, 's%s_1_matrix.txt' % ( fragment, ))
321         shutil.copy(source, destination)
322
323 def make_matrix_dir_rta_1_10(bustard_dir):
324     make_matrix_dir_rta160(bustard_dir)
325
326 def make_matrix_dir_rta_1_12(bustard_dir):
327     make_matrix_dir_rta160(bustard_dir)
328
329 def make_phasing_dir(bustard_dir):
330     """
331     Create several phasing files in <bustard_dir>/Phasing/
332
333     from pipeline 1.4
334     """
335     destdir = os.path.join(bustard_dir, 'Phasing')
336     if not os.path.isdir(destdir):
337         os.mkdir(destdir)
338
339     source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
340     for lane in LANE_LIST:
341         destination = os.path.join(destdir, 's_%d_01_phasing.xml' % ( lane, ))
342         shutil.copy(source, destination)
343
344 def make_phasing_params(bustard_dir):
345     for lane in LANE_LIST:
346         pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
347         f = open(pathname, 'w')
348         f.write("""<Parameters>
349   <Phasing>0.009900</Phasing>
350   <Prephasing>0.003500</Prephasing>
351 </Parameters>
352 """)
353         f.close()
354
355 def make_gerald_config_026(gerald_dir):
356     source = os.path.join(TESTDATA_DIR, 'gerald_config_0.2.6.xml')
357     destination = os.path.join(gerald_dir, 'config.xml')
358     shutil.copy(source, destination)
359
360 def make_gerald_config_100(gerald_dir):
361     source = os.path.join(TESTDATA_DIR, 'gerald_config_1.0.xml')
362     destination = os.path.join(gerald_dir, 'config.xml')
363     shutil.copy(source, destination)
364
365 def make_gerald_config_1_7(gerald_dir):
366     """CASAVA 1.7 gerald config"""
367     source = os.path.join(TESTDATA_DIR, 'gerald_config_1.7.xml')
368     destination = os.path.join(gerald_dir, 'config.xml')
369     shutil.copy(source, destination)
370
371 def make_summary_htm_100(gerald_dir):
372     source = os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm')
373     destination = os.path.join(gerald_dir, 'Summary.htm')
374     shutil.copy(source, destination)
375
376 def make_summary_htm_110(gerald_dir):
377     source = os.path.join(TESTDATA_DIR, 'Summary-pipeline110.htm')
378     destination = os.path.join(gerald_dir, 'Summary.htm')
379     shutil.copy(source, destination)
380
381 def make_summary_paired_htm(gerald_dir):
382     source = os.path.join(TESTDATA_DIR, 'Summary-paired-pipeline110.htm')
383     destination = os.path.join(gerald_dir, 'Summary.htm')
384     shutil.copy(source, destination)
385
386 def make_summary_ipar130_htm(gerald_dir):
387     source = os.path.join(TESTDATA_DIR, 'Summary-ipar130.htm')
388     destination = os.path.join(gerald_dir, 'Summary.htm')
389     shutil.copy(source, destination)
390
391 def make_summary_rta160_xml(gerald_dir):
392     source = os.path.join(TESTDATA_DIR, 'Summary-rta160.xml')
393     destination = os.path.join(gerald_dir, 'Summary.xml')
394     shutil.copy(source, destination)
395
396
397 def make_summary_casava1_7_xml(gerald_dir):
398     source = os.path.join(TESTDATA_DIR, 'Summary-casava1.7.xml')
399     destination = os.path.join(gerald_dir, 'Summary.xml')
400     shutil.copy(source, destination)
401
402 def make_status_rta1_12(datadir):
403     sourcedir = os.path.join(TESTDATA_DIR, '1_12')
404     status_htm = os.path.join(sourcedir, 'Status.htm')
405     destination = os.path.join(datadir, 'Status.htm')
406     shutil.copy(status_htm, destination)
407
408     status_dir = os.path.join(datadir, 'Status_Files')
409     status_source_dir = os.path.join(sourcedir, 'Status_Files')
410     shutil.copytree(status_source_dir, status_dir)
411
412     report_source_dir = os.path.join(sourcedir, 'reports')
413     report_dir = os.path.join(datadir, 'reports')
414     shutil.copytree(report_source_dir, report_dir)
415
416 def make_eland_results(gerald_dir):
417     eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
418 >HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
419 >HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
420 >HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
421 """
422     for i in LANE_LIST:
423         pathname = os.path.join(gerald_dir,
424                                 's_%d_eland_result.txt' % (i,))
425         f = open(pathname, 'w')
426         f.write(eland_result)
427         f.close()
428
429 def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST):
430     eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
431 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
432 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
433 >HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
434 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
435 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
436 """, """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
437 >HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC
438 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
439 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
440 >HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
441 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
442 >HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
443 """]
444     if paired:
445         for e in [1,2]:
446             for i in lane_list:
447                 pathname = os.path.join(gerald_dir,
448                                         's_%d_%d_eland_multi.txt' % (i,e))
449                 f = open(pathname, 'w')
450                 f.write(eland_multi[e-1])
451                 f.close()
452     else:
453         for i in lane_list:
454             pathname = os.path.join(gerald_dir,
455                                     's_%d_eland_multi.txt' % (i,))
456             f = open(pathname, 'w')
457             f.write(eland_multi[0])
458             f.close()
459
460 def make_eland_export(gerald_dir, paired=False, lane_list=LANE_LIST):
461     source = os.path.join(TESTDATA_DIR, 'casava_1.7_export.txt')
462
463     for i in lane_list:
464         destination = os.path.join(gerald_dir,
465                                    's_%d_export.txt' % (i,))
466         shutil.copy(source, destination)
467
468
469 def make_scarf(gerald_dir, lane_list=LANE_LIST):
470     seq = """HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
471 HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
472 HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
473 HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
474 HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
475 """
476     for l in lane_list:
477         pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
478         f = open(pathname,'w')
479         f.write(seq)
480         f.close()
481
482 def make_fastq(gerald_dir, lane_list=LANE_LIST):
483     seq = """@HWI-EAS229:1:2:182:712#0/1
484 AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
485 +HWI-EAS229:1:2:182:712#0/1
486 \\bab_bbaabbababbaaa]]D]bb_baabbab\baa
487 @HWI-EAS229:1:2:198:621#0/1
488 CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
489 +HWI-EAS229:1:2:198:621#0/1
490 [aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
491 @HWI-EAS229:1:2:209:1321#0/1
492 AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
493 +HWI-EAS229:1:2:209:1321#0/1
494 _bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
495 """
496     for l in lane_list:
497         pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
498         f = open(pathname,'w')
499         f.write(seq)
500         f.close()
501
502 UNALIGNED_READS = [1,2]
503 UNALIGNED_SAMPLES = [ (1, UNALIGNED_READS, '11111', None, None),
504                       (2, UNALIGNED_READS, '11112', None, None),
505                       (3, UNALIGNED_READS, '11113', 1, 'ATCACG'),
506                       (3, UNALIGNED_READS, '11113', 2, 'CGATGT'),
507                       (3, UNALIGNED_READS, '11113', 3, 'TTAGGC'),
508                       (4, UNALIGNED_READS, '11114', 6, 'GCCAAT'),
509                       (5, UNALIGNED_READS, '11115', 1, 'ATCACG'),
510                       (5, UNALIGNED_READS, '11116', 7, 'ACTTGA'),
511                       (5, UNALIGNED_READS, '11117', 9, 'GATCAG'),
512                       (6, UNALIGNED_READS, '11118', 1, 'ATCACG'),
513                       (7, UNALIGNED_READS, '11119', 2, 'CGATGT'),
514                       (8, UNALIGNED_READS, '11120', 3, 'TTAGGC'),
515                       (1, UNALIGNED_READS, None, None, None),
516                       (2, UNALIGNED_READS, None, None, None),
517                       (3, UNALIGNED_READS, None, None, None),
518                       (4, UNALIGNED_READS, None, None, None),
519                       (5, UNALIGNED_READS, None, None, None)]
520
521
522 def make_aligned_eland_export(aligned_dir, flowcell_id):
523     summary_source = os.path.join(TESTDATA_DIR, 'sample_summary_1_12.htm')
524     for lane, read, project_id, index_id, index_seq in UNALIGNED_SAMPLES:
525         paths = DemultiplexedPaths(aligned_dir,
526                                    flowcell_id,
527                                    lane,
528                                    project_id,
529                                    index_id,
530                                    index_seq)
531         paths.make_sample_dirs()
532         paths.make_summary_dirs()
533         summary_dest = os.path.join(paths.summary_dir, 'Sample_Summary.htm')
534         shutil.copy(summary_source, summary_dest)
535
536         body = get_aligned_sample_export(lane, index_seq)
537         for split in ['001','002']:
538             for read in UNALIGNED_READS:
539                 suffix = 'R{0}_{1}_export.txt.gz'.format(read, split)
540                 pathname = paths.make_test_filename(suffix)
541                 stream = gzip.open(pathname, 'wt')
542                 stream.write(body)
543                 stream.close()
544
545
546 def make_unaligned_fastqs_1_12(unaligned_dir, flowcell_id):
547     """Create a default mix of unaligned sample files
548     """
549     for lane, read, name, index_id, index in UNALIGNED_SAMPLES:
550         make_unaligned_fastq_sample_1_12(unaligned_dir,
551                                          flowcell_id,
552                                          lane,
553                                          read,
554                                          name,
555                                          index_id,
556                                          index)
557
558 def make_unaligned_fastq_sample_1_12(unaligned_dir,
559                                      flowcell_id,
560                                      lane,
561                                      reads,
562                                      project_id,
563                                      index_id=None,
564                                      index_seq=None):
565
566     paths = DemultiplexedPaths(unaligned_dir,
567                                flowcell_id,
568                                lane,
569                                project_id,
570                                index_id,
571                                index_seq)
572     paths.make_sample_dirs()
573
574     sample_seq = get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq)
575     for split in ['001','002']:
576         for read in reads:
577             suffix = 'R{0}_{1}.fastq.gz'.format(read, split)
578             pathname = paths.make_test_filename(suffix)
579             stream = gzip.open(pathname, 'wt')
580             stream.write(sample_seq)
581             stream.close()
582
583     sheetname = os.path.join(paths.sample_dir, 'SampleSheet.csv')
584     stream = open(sheetname, 'w')
585     stream.write('FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject'+os.linesep)
586     template = '{flowcell},{lane},{id},mm9,{index},Sample #{id},N,PR_indexing,Operator,{sample_project}'+os.linesep
587     stream.write(template.format(flowcell=flowcell_id,
588                                  lane=lane,
589                                  id=paths.sample_id,
590                                  index=paths.index_seq,
591                                  sample_project=paths.sample_project))
592     stream.close()
593
594
595 class DemultiplexedPaths(object):
596     def __init__(self, basedir, flowcell_id, lane, project_id, index_id, index_seq):
597         if lane not in LANE_LIST:
598             raise ValueError("Invalid lane ID: {0}".format(lane))
599         self.basedir = basedir
600         self.flowcell_id = flowcell_id
601         self.lane = lane
602
603         if project_id is None:
604             # undetermined
605             self.index_seq = ''
606             self.sample_id = 'lane{0}'.format(lane)
607             self.sample_project = 'Undetermined_indices'
608             self.rootname = 'lane{lane}_Undetermined_L00{lane}_'.format(
609                 lane=lane)
610             self.project_dir = 'Undetermined_indices'
611             self.sample_dir = 'Sample_lane{lane}'.format(lane=lane)
612         elif index_seq is None:
613             self.index_seq = ''
614             self.sample_id = project_id
615             self.sample_project = '{project_id}'.format(project_id=project_id)
616             self.rootname = '{project_id}_NoIndex_L00{lane}_'.format(
617                 project_id=project_id,
618                 lane=lane)
619             self.project_dir = 'Project_' + self.sample_project
620             self.sample_dir = 'Sample_{project_id}'.format(
621                 project_id=project_id)
622         else:
623             self.index_seq = index_seq
624             self.sample_id = project_id
625             self.sample_project = '{project_id}_Index{index_id}'.format(
626                 project_id=project_id,
627                 index_id=index_id)
628             self.rootname = '{project_id}_{index}_L00{lane}_'.format(
629                 project_id=project_id,
630                 index=index_seq,
631                 lane=lane)
632             self.project_dir = 'Project_' + self.sample_project
633             self.sample_dir = 'Sample_{project_id}'.format(
634                 project_id=project_id)
635
636         self.project_dir = os.path.join(self.basedir, self.project_dir)
637         self.sample_dir = os.path.join(self.project_dir, self.sample_dir)
638         self.summary_dir = 'Summary_Stats_{0}'.format(self.flowcell_id)
639         self.summary_dir = os.path.join(self.project_dir, self.summary_dir)
640
641
642     def make_sample_dirs(self):
643         if not os.path.isdir(self.project_dir):
644             os.mkdir(self.project_dir)
645         if not os.path.isdir(self.sample_dir):
646             os.mkdir(self.sample_dir)
647
648     def make_summary_dirs(self):
649         if not os.path.isdir(self.summary_dir):
650             os.mkdir(self.summary_dir)
651
652     def make_test_filename(self, suffix):
653         filename = self.rootname + suffix
654         pathname = os.path.join(self.sample_dir, filename)
655         return pathname
656
657     def dump(self):
658         print('index seq: {0}'.format(self.index_seq))
659
660         print('project dir: {0}'.format(self.project_dir))
661         print('sample dir: {0}'.format(self.sample_dir))
662         print('rootname: {0}'.format(self.rootname))
663         print('path: {0}'.format(
664             os.path.join(self.project_dir,
665                          self.sample_dir,
666                          self.rootname+'R1_001.fastq.gz')))
667
668
669 def get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq):
670     seq = """@HWI-ST0787:101:{flowcell}:{lane}:1101:2416:3469 1:Y:0:{index}
671 TCCTTCATTCCACCGGAGTCTGTGGAATTCTCGGGTGCCAAGGAACTCCA
672 +
673 CCCFFFFFHHHHHJJJJJJJJJIJJJJJJJJJJJJJJJJIIJJIIJJJJJ
674 @HWI-ST0787:101:{flowcell}:{lane}:1101:2677:3293 1:Y:0:{index}
675 TGGAAATCCATTGGGGTTTCCCCTGGAATTCTCGGGTGCCAAGGAACTCC
676 +
677 @CCFF3BDHHHHHIIIIIHHIIIDIIIGIIIEGIIIIIIIIIIIIIIIHH
678 @HWI-ST0787:101:{flowcell}:{lane}:1101:2616:3297 1:Y:0:{index}
679 TAATACTGCCGGGTAATGATGGCTGGAATTCTCGGGTGCCAAGGAACTCC
680 +
681 CCCFFFFFHHHHHCGHJJJJJJJJJJJJJJJJJIIJJJJJJJJJIHJJJI
682 @HWI-ST0787:101:{flowcell}:{lane}:1101:2545:3319 1:N:0:{index}
683 TCCTTCATTCCACCGGAGTCTGCTGGAATTCTCGGGTGCCAAGGAACTCC
684 +
685 CCCFFFFFHHHFHJGIGHIJHIIGHIGIGIGEHFIJJJIHIJHJIIJJIH
686 """.format(flowcell=flowcell_id, lane=lane, index=index_seq)
687     return seq
688
689 def get_aligned_sample_export(lane, index_seq):
690     body = """HWI-ST0787\t102\t{lane}\t1101\t1207\t1993\t{index}\t1\tAANGGATTCGATCCGGCTTAAGAGATGAAAACCGAAAGGGCCGACCGAA\taaBS`ccceg[`ae[dRR_[[SPPPP__ececfYYWaegh^\\ZLLY\\X`\tNM\t\t\t\t\t\t
691 HWI-ST0787\t102\t{lane}\t1101\t1478\t1997\t{index}\t1\tCAAGAACCCCGGGGGGGGGGGGGCAGAGAGGGGGAATTTTTTTTTTGTT\tBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\tNM\t\t\t\t\t\t\t\t\t\t\tN
692 HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\tchr9.fa\t67717938\tR\t99\t72
693 HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\t3:4:3\t\t\t\t\t\t\t\t\t\t\tY
694 """.format(lane=lane, index=index_seq)
695     return body
696
697 def print_ls_tree(root):
698     """List tree contents, useful for debugging.
699     """
700     for dirpath, dirnames, filenames in os.walk(root):
701         for filename in filenames:
702             print(os.path.join(dirpath, filename))
703
704
705 class BaseCallInfo(object):
706     """Provide customization for how to setup the base call mock data
707     """
708     def __init__(self, qseq_file, tile_list, basecall_summary):
709         self.qseq_file = qseq_file
710         self.tile_list = tile_list
711         self.basecall_summary = basecall_summary
712
713 # First generation HiSeq Flowcell
714 ABXX_BASE_CALL_INFO = BaseCallInfo(
715     qseq_file='AA01CCABXX_8_2_2207_qseq.txt',
716     tile_list = HISEQ_TILE_LIST,
717     basecall_summary = 'AA01CCABXX_BustardSummary.xml')