Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / test / simulate_runfolder.py
1 """
2 Create simulated solexa/illumina runfolders for testing
3 """
4 import gzip
5 import os
6 import shutil
7
# Directory holding this module, and the fixture directory next to it.
TEST_CODE_DIR = os.path.dirname(__file__)
TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')

# Lane numbers 1..8 and tile numbers 1..100 used when fabricating files.
LANE_LIST = list(range(1, 9))
TILE_LIST = list(range(1, 101))

# Four-digit tile numbers: the prefixes 11, 12, 21 and 22, each followed by
# the two-digit suffixes 01..08 (32 tiles in total).
HISEQ_TILE_LIST = [
    prefix * 100 + suffix
    for prefix in (11, 12, 21, 22)
    for suffix in range(1, 9)
]
16
def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
    """Create an empty Firecrest directory below ``data_dir``.

    The directory is named
    ``C<start>-<stop>_Firecrest<version>_12-04-2008_diane``.  It is made
    with ``os.mkdir``, so an already existing directory raises an error.

    Returns the path of the new directory.
    """
    dirname = 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
    target = os.path.join(data_dir, dirname)
    os.mkdir(target)
    return target
23
def make_ipar_dir(data_dir, version='1.01'):
    """
    Set up a simulated IPAR directory and its parameter file.

    The IPAR1.01.params fixture is copied to ``<data_dir>/.params``
    whatever ``version`` is; ``version`` only affects the directory name
    ``IPAR_<version>``.  An existing directory is reused.

    Returns the path of the IPAR directory.
    """
    params_source = os.path.join(TESTDATA_DIR, 'IPAR1.01.params')
    params_dest = os.path.join(data_dir, '.params')
    shutil.copy(params_source, params_dest)

    target = os.path.join(data_dir, 'IPAR_%s' % (version,))
    if os.path.exists(target):
        return target
    os.mkdir(target)
    return target
35
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """Write ``Config/FlowcellId.xml`` inside ``runfolder_dir``.

    :param runfolder_dir: existing run folder; its ``Config``
        subdirectory is created when missing.
    :param flowcell_id: flowcell name to record; defaults to
        ``'207BTAAXY'`` when None.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
  <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)

    config_dir = os.path.join(runfolder_dir, 'Config')
    if not os.path.exists(config_dir):
        os.mkdir(config_dir)

    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # The context manager closes the file even if the write fails.
    with open(pathname, 'w') as stream:
        stream.write(config)
52
def make_runinfo(runfolder_dir, flowcell_id):
    """Simulate a RunInfo.xml file created by >= RTA 1.9

    The final component of ``runfolder_dir`` becomes the ``Run Id``
    attribute and ``flowcell_id`` fills the ``Flowcell`` element.

    :param runfolder_dir: existing run folder that receives the file.
    :param flowcell_id: flowcell name to record.
    :returns: path of the ``RunInfo.xml`` that was written.
    """
    xml = '''<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
  <Run Id="{runfolder}" Number="101">
    <Flowcell>{flowcell}</Flowcell>
    <Instrument>SN787</Instrument>
    <Date>110815</Date>
    <Reads>
      <Read Number="1" NumCycles="50" IsIndexedRead="N" />
      <Read Number="2" NumCycles="7" IsIndexedRead="Y" />
    </Reads>
    <FlowcellLayout LaneCount="8" SurfaceCount="2" SwathCount="3" TileCount="8" />
    <AlignToPhiX />
  </Run>
</RunInfo>
'''
    # normpath drops a trailing separator; without it a path such as
    # "/runs/folder/" would produce an empty Run Id.
    runfolder = os.path.basename(os.path.normpath(runfolder_dir))
    runinfo = os.path.join(runfolder_dir, 'RunInfo.xml')
    with open(runinfo, 'w') as stream:
        stream.write(xml.format(runfolder=runfolder, flowcell=flowcell_id))
    return runinfo
77
def make_bustard_config132(image_dir):
    """Copy the bustard-config132.xml fixture to ``<image_dir>/config.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'bustard-config132.xml'),
                os.path.join(image_dir, 'config.xml'))
82
def make_aligned_config_1_12(aligned_dir):
    """Copy the 1_12 aligned config fixture to ``<aligned_dir>/config.xml``.

    This is roughly equivalent to the old gerald file.
    """
    fixture = os.path.join(TESTDATA_DIR, '1_12', 'aligned_config_1_12.xml')
    shutil.copy(fixture, os.path.join(aligned_dir, 'config.xml'))
88
def make_unaligned_config_1_12(unaligned_dir):
    """Copy the three 1_12 demultiplex fixtures into ``unaligned_dir``.

    Each fixture is renamed on the way in, see ``renames`` below.
    """
    fixture_dir = os.path.join(TESTDATA_DIR, '1_12')
    # (fixture file name, name it receives inside unaligned_dir)
    renames = (
        ('demultiplex_1.12.4.2.xml',
         'DemultiplexConfig.xml'),
        ('demultiplexed_bustard_1.12.4.2.xml',
         'DemultiplexedBustardConfig.xml'),
        ('demultiplexed_summary_1.12.4.2.xml',
         'DemultiplexedBustardSummary.xml'),
    )
    for fixture_name, installed_name in renames:
        shutil.copy(os.path.join(fixture_dir, fixture_name),
                    os.path.join(unaligned_dir, installed_name))
103         
def make_unaligned_status_1_12(unaligned_dir, flowcell_id):
    """Create ``Basecall_Stats_<flowcell_id>`` with its three report pages.

    The directory is made with ``os.mkdir`` inside ``unaligned_dir`` and
    then filled with All.htm, Demultiplex_Stats.htm and IVC.htm from the
    1_12/basecall_stats fixtures.
    """
    fixture_dir = os.path.join(TESTDATA_DIR, '1_12', 'basecall_stats')
    stats_dir = os.path.join(unaligned_dir,
                             'Basecall_Stats_{0}'.format(flowcell_id))
    os.mkdir(stats_dir)
    for page in ('All.htm', 'Demultiplex_Stats.htm', 'IVC.htm'):
        shutil.copy(os.path.join(fixture_dir, page),
                    os.path.join(stats_dir, page))
114
def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
    """
    Create ``<data_dir>/Intensities`` holding a simulated RTA config.xml.

    The config comes from the rta_intensities_config.xml fixture.  An
    existing Intensities directory is reused.  ``version`` is accepted
    but not used by this function.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml'),
                os.path.join(target, 'config.xml'))
    return target
127
def make_rta_basecalls_1460(intensities_dir):
    """
    Create ``<intensities_dir>/BaseCalls`` holding a simulated config.xml.

    The config comes from the rta_basecalls_config.xml fixture.  An
    existing BaseCalls directory is reused.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml'),
                os.path.join(target, 'config.xml'))
    return target
140
def make_rta_intensities_1870(data_dir, version='1.8.70.0'):
    """
    Create ``<data_dir>/Intensities`` holding a simulated RTA config.xml.

    The config comes from the rta_intensities_config_1870.xml fixture.
    An existing Intensities directory is reused.  ``version`` is
    accepted but not used by this function.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    fixture = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml')
    shutil.copy(fixture, os.path.join(target, 'config.xml'))
    return target
153
def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
    """
    Create ``<data_dir>/Intensities`` holding a simulated RTA config.xml.

    The config comes from the rta_intensities_config_1.10.xml fixture.
    An existing Intensities directory is reused.  ``version`` is
    accepted but not used by this function.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    fixture = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml')
    shutil.copy(fixture, os.path.join(target, 'config.xml'))
    return target
166
def make_rta_intensities_1_12(data_dir, version='1.12.4.2'):
    """
    Create ``<data_dir>/Intensities`` holding a simulated RTAConfig.xml.

    Unlike the older variants the file is installed as ``RTAConfig.xml``;
    it comes from the 1_12/rta_intensities_config_1.12.4.2.xml fixture.
    An existing Intensities directory is reused.  ``version`` is
    accepted but not used by this function.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    fixture = os.path.join(TESTDATA_DIR, '1_12',
                           'rta_intensities_config_1.12.4.2.xml')
    shutil.copy(fixture, os.path.join(target, 'RTAConfig.xml'))
    return target
180
def make_rta_basecalls_1870(intensities_dir):
    """
    Create ``<intensities_dir>/BaseCalls`` holding a simulated config.xml.

    The config comes from the rta_basecalls_config_1870.xml fixture.  An
    existing BaseCalls directory is reused.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    fixture = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml')
    shutil.copy(fixture, os.path.join(target, 'config.xml'))
    return target
193
def make_rta_basecalls_1_10(intensities_dir):
    """
    Create ``<intensities_dir>/BaseCalls`` with qseq files and config.xml.

    The directory is filled by ``make_qseqs`` using
    ``ABXX_BASE_CALL_INFO``, then the rta_basecalls_config_1.10.xml
    fixture is installed as ``config.xml``.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    make_qseqs(target, basecall_info=ABXX_BASE_CALL_INFO)

    fixture = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml')
    shutil.copy(fixture, os.path.join(target, 'config.xml'))
    return target
207
def make_rta_basecalls_1_12(intensities_dir):
    """
    Create ``<intensities_dir>/BaseCalls`` with qseq files and config.xml.

    The directory is filled by ``make_qseqs`` using
    ``ABXX_BASE_CALL_INFO``, then the
    1_12/rta_basecalls_config_1.12.4.2.xml fixture is installed as
    ``config.xml``.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    make_qseqs(target, basecall_info=ABXX_BASE_CALL_INFO)

    fixture = os.path.join(TESTDATA_DIR, '1_12',
                           'rta_basecalls_config_1.12.4.2.xml')
    shutil.copy(fixture, os.path.join(target, 'config.xml'))
    return target
222
223
def make_qseqs(bustard_dir, basecall_info=None):
    """
    Fill ``bustard_dir`` with qseq files, matrix/phasing data and a summary.

    One ``s_<lane>_1_<tile>_qseq.txt`` copy of a single fixture is made
    for every lane in ``LANE_LIST`` and every tile of the chosen tile
    list.  ``make_matrix_dir`` and ``make_phasing_dir`` are then run on
    the directory and a ``BustardSummary.xml`` is installed.

    ``basecall_info`` supplies ``qseq_file``, ``tile_list`` and
    ``basecall_summary``; when it is None the 42BRJAAXX fixtures and
    ``TILE_LIST`` are used.

    Returns ``bustard_dir``, which is created when it does not exist.
    """
    if basecall_info is not None:
        qseq_name = basecall_info.qseq_file
        tiles = basecall_info.tile_list
        summary_name = basecall_info.basecall_summary
    else:
        # 42BRJ 8 1 0039 happened to be a better than usual tile, in that
        # there was actually sequence at the start
        qseq_name = '42BRJAAXX_8_1_0039_qseq.txt'
        tiles = TILE_LIST
        summary_name = '42BRJAAXX_BustardSummary.xml'

    qseq_fixture = os.path.join(TESTDATA_DIR, qseq_name)
    if not os.path.isdir(bustard_dir):
        os.mkdir(bustard_dir)

    for lane in LANE_LIST:
        for tile in tiles:
            copy_name = 's_%d_1_%04d_qseq.txt' % (lane, tile)
            shutil.copy(qseq_fixture, os.path.join(bustard_dir, copy_name))

    make_matrix_dir(bustard_dir)
    make_phasing_dir(bustard_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, summary_name),
                os.path.join(bustard_dir, 'BustardSummary.xml'))

    return bustard_dir
257
def make_scores(gerald_dir, in_temp=True):
    """
    Fill a gerald directory with per-tile score files.

    When ``in_temp`` is true the files go to ``<gerald_dir>/Temp``,
    otherwise straight into ``gerald_dir``.  The destination directory is
    created if it doesn't exist.  Every lane in ``LANE_LIST`` and tile in
    ``TILE_LIST`` gets a copy of the s_1_0001_score.txt fixture.

    Returns the directory the files were written to.
    """
    fixture = os.path.join(TESTDATA_DIR, 's_1_0001_score.txt')
    target = os.path.join(gerald_dir, 'Temp') if in_temp else gerald_dir
    if not os.path.isdir(target):
        os.mkdir(target)

    for lane in LANE_LIST:
        for tile in TILE_LIST:
            copy_name = 's_%d_%04d_score.txt' % (lane, tile)
            shutil.copy(fixture, os.path.join(target, copy_name))

    return target
276
def make_matrix_dir(bustard_dir):
    """
    Populate ``<bustard_dir>/Matrix/`` with one matrix file per lane.

    Each lane in ``LANE_LIST`` receives ``s_<lane>_02_matrix.txt``, a
    copy of the 42BRJAAXX_8_02_matrix.txt fixture (from pipeline 1.4).
    The Matrix directory is created when missing.
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    fixture = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
    for lane in LANE_LIST:
        copy_name = 's_%d_02_matrix.txt' % (lane,)
        shutil.copy(fixture, os.path.join(matrix_dir, copy_name))
291
def make_matrix(matrix_filename):
    """Write a fixed frequency response matrix to ``matrix_filename``.

    The file is created or overwritten.  It holds a comment line, the
    four ``> A``/``> C``/``> G``/``> T`` header lines and four rows of
    four coefficients.
    """
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    # The context manager closes the file even if the write fails.
    with open(matrix_filename, 'w') as stream:
        stream.write(contents)
306
def make_matrix_dir_rta160(bustard_dir):
    """
    Populate ``<bustard_dir>/Matrix/`` with one matrix file per lane.

    Each lane in ``LANE_LIST`` receives ``s_<lane>_1_matrix.txt``, a copy
    of the 61MMFAAXX_4_1_matrix.txt fixture.  The Matrix directory is
    created when missing.
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    fixture = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
    for lane in LANE_LIST:
        copy_name = 's_%d_1_matrix.txt' % (lane,)
        shutil.copy(fixture, os.path.join(matrix_dir, copy_name))
320
def make_matrix_dir_rta_1_10(bustard_dir):
    """Populate ``<bustard_dir>/Matrix/`` by delegating to make_matrix_dir_rta160."""
    make_matrix_dir_rta160(bustard_dir)
323
def make_matrix_dir_rta_1_12(bustard_dir):
    """Populate ``<bustard_dir>/Matrix/`` by delegating to make_matrix_dir_rta160."""
    make_matrix_dir_rta160(bustard_dir)
326
def make_phasing_dir(bustard_dir):
    """
    Populate ``<bustard_dir>/Phasing/`` with one phasing file per lane.

    Each lane in ``LANE_LIST`` receives ``s_<lane>_01_phasing.xml``, a
    copy of the 42BRJAAXX_8_01_phasing.xml fixture (from pipeline 1.4).
    The Phasing directory is created when missing.
    """
    phasing_dir = os.path.join(bustard_dir, 'Phasing')
    if not os.path.isdir(phasing_dir):
        os.mkdir(phasing_dir)

    fixture = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
    for lane in LANE_LIST:
        copy_name = 's_%d_01_phasing.xml' % (lane,)
        shutil.copy(fixture, os.path.join(phasing_dir, copy_name))
341
def make_phasing_params(bustard_dir):
    """Write a ``params<lane>.xml`` into ``bustard_dir`` for each lane.

    Every lane in ``LANE_LIST`` gets the same fixed Phasing and
    Prephasing values.  ``bustard_dir`` must already exist.
    """
    contents = """<Parameters>
  <Phasing>0.009900</Phasing>
  <Prephasing>0.003500</Prephasing>
</Parameters>
"""
    for lane in LANE_LIST:
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(contents)
352
def make_gerald_config_026(gerald_dir):
    """Copy the gerald_config_0.2.6.xml fixture to ``<gerald_dir>/config.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_0.2.6.xml'),
                os.path.join(gerald_dir, 'config.xml'))
357
def make_gerald_config_100(gerald_dir):
    """Copy the gerald_config_1.0.xml fixture to ``<gerald_dir>/config.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.0.xml'),
                os.path.join(gerald_dir, 'config.xml'))
362
def make_gerald_config_1_7(gerald_dir):
    """Copy the CASAVA 1.7 gerald config fixture to ``<gerald_dir>/config.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.7.xml'),
                os.path.join(gerald_dir, 'config.xml'))
368
def make_summary_htm_100(gerald_dir):
    """Copy the Summary-pipeline100.htm fixture to ``<gerald_dir>/Summary.htm``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
373
def make_summary_htm_110(gerald_dir):
    """Copy the Summary-pipeline110.htm fixture to ``<gerald_dir>/Summary.htm``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
378
def make_summary_paired_htm(gerald_dir):
    """Copy the Summary-paired-pipeline110.htm fixture to ``<gerald_dir>/Summary.htm``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-paired-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
383
def make_summary_ipar130_htm(gerald_dir):
    """Copy the Summary-ipar130.htm fixture to ``<gerald_dir>/Summary.htm``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-ipar130.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
388
def make_summary_rta160_xml(gerald_dir):
    """Copy the Summary-rta160.xml fixture to ``<gerald_dir>/Summary.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-rta160.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
393
394
def make_summary_casava1_7_xml(gerald_dir):
    """Copy the Summary-casava1.7.xml fixture to ``<gerald_dir>/Summary.xml``."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-casava1.7.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
399
def make_status_rta1_12(datadir):
    """Copy the 1_12 status page and its support trees into ``datadir``.

    ``Status.htm`` is copied as a file; ``Status_Files`` and ``reports``
    are copied recursively with ``shutil.copytree``, in that order.
    """
    fixture_dir = os.path.join(TESTDATA_DIR, '1_12')
    shutil.copy(os.path.join(fixture_dir, 'Status.htm'),
                os.path.join(datadir, 'Status.htm'))

    for tree in ('Status_Files', 'reports'):
        tree_dest = os.path.join(datadir, tree)
        tree_source = os.path.join(fixture_dir, tree)
        shutil.copytree(tree_source, tree_dest)
413
def make_eland_results(gerald_dir):
    """Write ``s_<lane>_eland_result.txt`` into ``gerald_dir`` for each lane.

    Every lane in ``LANE_LIST`` receives the same four-record eland
    result text.  ``gerald_dir`` must already exist.
    """
    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
"""
    for lane in LANE_LIST:
        pathname = os.path.join(gerald_dir,
                                's_%d_eland_result.txt' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(eland_result)
426
def make_eland_multi(gerald_dir, paired=False, lane_list=None):
    """Write eland multi-alignment files into ``gerald_dir``.

    :param gerald_dir: existing directory that receives the files.
    :param paired: when true, write ``s_<lane>_1_eland_multi.txt`` and
        ``s_<lane>_2_eland_multi.txt`` (each end has its own text);
        otherwise write a single ``s_<lane>_eland_multi.txt`` with the
        first text.
    :param lane_list: lanes to write; defaults to ``LANE_LIST``.  The
        default is resolved at call time rather than being a shared
        mutable default argument.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
""", """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
"""]
    if paired:
        for end in [1, 2]:
            for lane in lane_list:
                pathname = os.path.join(
                    gerald_dir, 's_%d_%d_eland_multi.txt' % (lane, end))
                with open(pathname, 'w') as stream:
                    stream.write(eland_multi[end - 1])
    else:
        for lane in lane_list:
            pathname = os.path.join(gerald_dir,
                                    's_%d_eland_multi.txt' % (lane,))
            with open(pathname, 'w') as stream:
                stream.write(eland_multi[0])
457
def make_eland_export(gerald_dir, paired=False, lane_list=LANE_LIST):
    """Copy the casava_1.7_export.txt fixture once per lane.

    Each lane in ``lane_list`` receives ``<gerald_dir>/s_<lane>_export.txt``.
    ``paired`` is accepted but not used by this function.
    """
    fixture = os.path.join(TESTDATA_DIR, 'casava_1.7_export.txt')

    for lane in lane_list:
        copy_name = 's_%d_export.txt' % (lane,)
        shutil.copy(fixture, os.path.join(gerald_dir, copy_name))
465
466
def make_scarf(gerald_dir, lane_list=None):
    """Write a five-read scarf ``s_<lane>_sequence.txt`` for each lane.

    :param gerald_dir: existing directory that receives the files.
    :param lane_list: lanes to write; defaults to ``LANE_LIST``.  The
        default is resolved at call time rather than being a shared
        mutable default argument.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    # Raw string: the quality strings contain backslashes.  In a normal
    # string "\a" and "\b" turn into BEL and backspace characters, which
    # left those quality strings one character shorter than their reads.
    seq = r"""HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
"""
    for lane in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (lane,))
        with open(pathname, 'w') as stream:
            stream.write(seq)
479
def make_fastq(gerald_dir, lane_list=None):
    """Write a three-read fastq ``s_<lane>_sequence.txt`` for each lane.

    :param gerald_dir: existing directory that receives the files.
    :param lane_list: lanes to write; defaults to ``LANE_LIST``.  The
        default is resolved at call time rather than being a shared
        mutable default argument.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    # Both backslashes in the first quality line are escaped.  The second
    # one used to be written as "\b", i.e. a backspace character, which
    # made that quality string shorter than its read.
    seq = """@HWI-EAS229:1:2:182:712#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:182:712#0/1
\\bab_bbaabbababbaaa]]D]bb_baabbab\\baa
@HWI-EAS229:1:2:198:621#0/1
CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
+HWI-EAS229:1:2:198:621#0/1
[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
@HWI-EAS229:1:2:209:1321#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:209:1321#0/1
_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
"""
    for lane in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (lane,))
        with open(pathname, 'w') as stream:
            stream.write(seq)
499
UNALIGNED_READS = [1, 2]

# Every entry is (lane, reads, project id, index id, index sequence).
# A project with no index has None for both index fields; a project id of
# None marks the lane's undetermined-indices sample.
UNALIGNED_SAMPLES = [
    (lane, UNALIGNED_READS, project_id, index_id, index_seq)
    for lane, project_id, index_id, index_seq in (
        (1, '11111', None, None),
        (2, '11112', None, None),
        (3, '11113', 1, 'ATCACG'),
        (3, '11113', 2, 'CGATGT'),
        (3, '11113', 3, 'TTAGGC'),
        (4, '11114', 6, 'GCCAAT'),
        (5, '11115', 1, 'ATCACG'),
        (5, '11116', 7, 'ACTTGA'),
        (5, '11117', 9, 'GATCAG'),
        (6, '11118', 1, 'ATCACG'),
        (7, '11119', 2, 'CGATGT'),
        (8, '11120', 3, 'TTAGGC'),
    )
] + [
    # Undetermined entries for lanes 1 through 5.
    (lane, UNALIGNED_READS, None, None, None) for lane in range(1, 6)
]
518
519
def make_aligned_eland_export(aligned_dir, flowcell_id):
    """Create simulated aligned export files for every default sample.

    For each entry of ``UNALIGNED_SAMPLES`` the project, sample and
    summary directories are created below ``aligned_dir``, the
    sample_summary_1_12.htm fixture is installed as
    ``Sample_Summary.htm``, and gzip-compressed export files are written
    for splits 001/002 and each read in ``UNALIGNED_READS``.

    :param aligned_dir: existing directory that receives the project tree.
    :param flowcell_id: flowcell name used in the summary directory name.
    """
    summary_source = os.path.join(TESTDATA_DIR, 'sample_summary_1_12.htm')
    # The per-sample reads entry is not used; files are written for every
    # read in UNALIGNED_READS.
    for lane, _reads, project_id, index_id, index_seq in UNALIGNED_SAMPLES:
        paths = DemultiplexedPaths(aligned_dir,
                                   flowcell_id,
                                   lane,
                                   project_id,
                                   index_id,
                                   index_seq)
        paths.make_sample_dirs()
        paths.make_summary_dirs()
        summary_dest = os.path.join(paths.summary_dir, 'Sample_Summary.htm')
        shutil.copy(summary_source, summary_dest)

        # gzip.open(..., 'wb') is a binary stream under Python 3, so the
        # text has to be encoded; writing the str raised TypeError.
        body = get_aligned_sample_export(lane, index_seq).encode('utf-8')
        for split in ['001', '002']:
            for read in UNALIGNED_READS:
                suffix = 'R{0}_{1}_export.txt.gz'.format(read, split)
                pathname = paths.make_test_filename(suffix)
                with gzip.open(pathname, 'wb') as stream:
                    stream.write(body)
542
543
def make_unaligned_fastqs_1_12(unaligned_dir, flowcell_id):
    """Create the default mix of unaligned sample files.

    Runs ``make_unaligned_fastq_sample_1_12`` once for every entry of
    ``UNALIGNED_SAMPLES``.
    """
    for sample in UNALIGNED_SAMPLES:
        lane, reads, project_id, index_id, index_seq = sample
        make_unaligned_fastq_sample_1_12(
            unaligned_dir, flowcell_id,
            lane, reads, project_id, index_id, index_seq)
555
def make_unaligned_fastq_sample_1_12(unaligned_dir,
                                     flowcell_id,
                                     lane,
                                     reads,
                                     project_id,
                                     index_id=None,
                                     index_seq=None):
    """Create the fastq files and sample sheet for one unaligned sample.

    Gzip-compressed fastq files are written for splits 001/002 and every
    entry of ``reads``, followed by a two-line ``SampleSheet.csv`` in the
    sample directory.

    :param unaligned_dir: existing directory that receives the project tree.
    :param flowcell_id: flowcell name used in read names and the sheet.
    :param lane: lane number; must be one of ``LANE_LIST``.
    :param reads: iterable of read numbers to generate files for.
    :param project_id: project/sample id, or None for the undetermined
        sample of the lane.
    :param index_id: index number, used in the project directory name.
    :param index_seq: index sequence, or None for a non-indexed sample.
    """
    paths = DemultiplexedPaths(unaligned_dir,
                               flowcell_id,
                               lane,
                               project_id,
                               index_id,
                               index_seq)
    paths.make_sample_dirs()

    # gzip.open(..., 'wb') is a binary stream under Python 3, so the text
    # has to be encoded; writing the str raised TypeError.
    sample_seq = get_unaligned_sample_fastq_data(
        flowcell_id, lane, index_seq).encode('utf-8')
    for split in ['001', '002']:
        for read in reads:
            suffix = 'R{0}_{1}.fastq.gz'.format(read, split)
            pathname = paths.make_test_filename(suffix)
            with gzip.open(pathname, 'wb') as stream:
                stream.write(sample_seq)

    sheetname = os.path.join(paths.sample_dir, 'SampleSheet.csv')
    header = 'FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject'
    template = '{flowcell},{lane},{id},mm9,{index},Sample #{id},N,PR_indexing,Operator,{sample_project}'
    # Text-mode files translate '\n' to the platform line ending on
    # write; writing os.linesep here doubled the carriage return on
    # Windows.
    with open(sheetname, 'w') as stream:
        stream.write(header + '\n')
        stream.write(template.format(flowcell=flowcell_id,
                                     lane=lane,
                                     id=paths.sample_id,
                                     index=paths.index_seq,
                                     sample_project=paths.sample_project)
                     + '\n')
591
592
class DemultiplexedPaths(object):
    """Compute directory and file names for one demultiplexed sample.

    Three layouts are produced, depending on the arguments:

    * ``project_id`` is None: the lane's undetermined sample, stored under
      ``Undetermined_indices/Sample_lane<lane>``.
    * ``index_seq`` is None: a non-indexed sample under
      ``Project_<project_id>/Sample_<project_id>``.
    * otherwise: an indexed sample under
      ``Project_<project_id>_Index<index_id>/Sample_<project_id>``.

    After construction ``project_dir``, ``sample_dir`` and ``summary_dir``
    are paths that already include ``basedir``.
    """

    def __init__(self, basedir, flowcell_id, lane, project_id, index_id, index_seq):
        """Derive all names; nothing is created on disk.

        :raises ValueError: if ``lane`` is not in ``LANE_LIST``.
        """
        if lane not in LANE_LIST:
            raise ValueError("Invalid lane ID: {0}".format(lane))
        self.basedir = basedir
        self.flowcell_id = flowcell_id
        self.lane = lane

        if project_id is None:
            # undetermined
            self.index_seq = ''
            self.sample_id = 'lane{0}'.format(lane)
            self.sample_project = 'Undetermined_indices'
            self.rootname = 'lane{lane}_Undetermined_L00{lane}_'.format(
                lane=lane)
            self.project_dir = 'Undetermined_indices'
            self.sample_dir = 'Sample_lane{lane}'.format(lane=lane)
        elif index_seq is None:
            # project without an index
            self.index_seq = ''
            self.sample_id = project_id
            self.sample_project = '{project_id}'.format(project_id=project_id)
            self.rootname = '{project_id}_NoIndex_L00{lane}_'.format(
                project_id=project_id,
                lane=lane)
            self.project_dir = 'Project_' + self.sample_project
            self.sample_dir = 'Sample_{project_id}'.format(
                project_id=project_id)
        else:
            # indexed project
            self.index_seq = index_seq
            self.sample_id = project_id
            self.sample_project = '{project_id}_Index{index_id}'.format(
                project_id=project_id,
                index_id=index_id)
            self.rootname = '{project_id}_{index}_L00{lane}_'.format(
                project_id=project_id,
                index=index_seq,
                lane=lane)
            self.project_dir = 'Project_' + self.sample_project
            self.sample_dir = 'Sample_{project_id}'.format(
                project_id=project_id)

        # Turn the bare directory names into paths rooted at basedir.
        self.project_dir = os.path.join(self.basedir, self.project_dir)
        self.sample_dir = os.path.join(self.project_dir, self.sample_dir)
        self.summary_dir = 'Summary_Stats_{0}'.format(self.flowcell_id)
        self.summary_dir = os.path.join(self.project_dir, self.summary_dir)

    def make_sample_dirs(self):
        """Create the project and sample directories if they are missing."""
        if not os.path.isdir(self.project_dir):
            os.mkdir(self.project_dir)
        if not os.path.isdir(self.sample_dir):
            os.mkdir(self.sample_dir)

    def make_summary_dirs(self):
        """Create the summary directory if it is missing.

        The project directory must already exist.
        """
        if not os.path.isdir(self.summary_dir):
            os.mkdir(self.summary_dir)

    def make_test_filename(self, suffix):
        """Return ``<sample_dir>/<rootname><suffix>``."""
        filename = self.rootname + suffix
        pathname = os.path.join(self.sample_dir, filename)
        return pathname

    def dump(self):
        """Print the computed names, useful for debugging."""
        print('index seq: {0}'.format(self.index_seq))

        print('project dir: {0}'.format(self.project_dir))
        print('sample dir: {0}'.format(self.sample_dir))
        print('rootname: {0}'.format(self.rootname))
        # sample_dir already contains project_dir, so joining the two
        # again printed a doubled path whenever basedir was relative.
        print('path: {0}'.format(
            self.make_test_filename('R1_001.fastq.gz')))
665
666
def get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq):
    """Return four fastq records as one string.

    ``flowcell_id``, ``lane`` and ``index_seq`` are substituted into the
    header line of every record with ``str.format``.
    """
    template = """@HWI-ST0787:101:{flowcell}:{lane}:1101:2416:3469 1:Y:0:{index}
TCCTTCATTCCACCGGAGTCTGTGGAATTCTCGGGTGCCAAGGAACTCCA
+
CCCFFFFFHHHHHJJJJJJJJJIJJJJJJJJJJJJJJJJIIJJIIJJJJJ
@HWI-ST0787:101:{flowcell}:{lane}:1101:2677:3293 1:Y:0:{index}
TGGAAATCCATTGGGGTTTCCCCTGGAATTCTCGGGTGCCAAGGAACTCC
+
@CCFF3BDHHHHHIIIIIHHIIIDIIIGIIIEGIIIIIIIIIIIIIIIHH
@HWI-ST0787:101:{flowcell}:{lane}:1101:2616:3297 1:Y:0:{index}
TAATACTGCCGGGTAATGATGGCTGGAATTCTCGGGTGCCAAGGAACTCC
+
CCCFFFFFHHHHHCGHJJJJJJJJJJJJJJJJJIIJJJJJJJJJIHJJJI
@HWI-ST0787:101:{flowcell}:{lane}:1101:2545:3319 1:N:0:{index}
TCCTTCATTCCACCGGAGTCTGCTGGAATTCTCGGGTGCCAAGGAACTCC
+
CCCFFFFFHHHFHJGIGHIJHIIGHIGIGIGEHFIJJJIHIJHJIIJJIH
"""
    fields = {'flowcell': flowcell_id, 'lane': lane, 'index': index_seq}
    return template.format(**fields)
686
def get_aligned_sample_export(lane, index_seq):
    """Return four tab-separated export records as one string.

    ``lane`` and ``index_seq`` are substituted into every record with
    ``str.format``.

    Every literal backslash in the quality strings is written as ``\\\\``.
    Two of them used to be the unrecognised escape ``\\c``, which Python
    reports as an invalid escape sequence; the produced text is unchanged.
    """
    body = """HWI-ST0787\t102\t{lane}\t1101\t1207\t1993\t{index}\t1\tAANGGATTCGATCCGGCTTAAGAGATGAAAACCGAAAGGGCCGACCGAA\taaBS`ccceg[`ae[dRR_[[SPPPP__ececfYYWaegh^\\ZLLY\\X`\tNM\t\t\t\t\t\t
HWI-ST0787\t102\t{lane}\t1101\t1478\t1997\t{index}\t1\tCAAGAACCCCGGGGGGGGGGGGGCAGAGAGGGGGAATTTTTTTTTTGTT\tBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\tNM\t\t\t\t\t\t\t\t\t\t\tN
HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\tchr9.fa\t67717938\tR\t99\t72
HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\t3:4:3\t\t\t\t\t\t\t\t\t\t\tY
""".format(lane=lane, index=index_seq)
    return body
694
def print_ls_tree(root):
    """Print the path of every file found below ``root``.

    Directories themselves are not printed.  Useful for debugging.
    """
    found = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root)
        for name in names
    )
    for pathname in found:
        print(pathname)
701
702
class BaseCallInfo(object):
    """Describe which fixtures to use when mocking up base call data.

    The three constructor arguments are stored unchanged as attributes
    of the same names:

    qseq_file -- fixture file name copied for every qseq file
    tile_list -- tile numbers to generate qseq files for
    basecall_summary -- fixture file name installed as BustardSummary.xml
    """
    def __init__(self, qseq_file, tile_list, basecall_summary):
        self.qseq_file, self.tile_list, self.basecall_summary = (
            qseq_file, tile_list, basecall_summary)
710
# First generation HiSeq Flowcell: qseq and summary fixtures taken from
# AA01CCABXX, generated for every tile in HISEQ_TILE_LIST.
ABXX_BASE_CALL_INFO = BaseCallInfo(
    qseq_file='AA01CCABXX_8_2_2207_qseq.txt',
    tile_list=HISEQ_TILE_LIST,
    basecall_summary='AA01CCABXX_BustardSummary.xml',
)