Fix bugs introduced by the improved HiSeq runfolder scanning.
[htsworkflow.git] / htsworkflow / pipelines / test / simulate_runfolder.py
1 """
2 Create simulated solexa/illumina runfolders for testing
3 """
4 import gzip
5 import os
6 import shutil
7
# Directory holding this module; the fixture files live in its 'testdata'
# subdirectory.
TEST_CODE_DIR = os.path.dirname(__file__)
TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')

# Lanes 1..8 and tiles 1..100 used when generating per-lane/per-tile files.
LANE_LIST = range(1, 9)
TILE_LIST = range(1, 101)

# Tile numbers used for the HiSeq mock data (see ABXX_BASE_CALL_INFO).
HISEQ_TILE_LIST = [
    1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108,
    1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
    2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108,
    2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,
]
16
def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
    """
    Create a Firecrest-style analysis directory inside data_dir.

    The directory is named
    C<start>-<stop>_Firecrest<version>_12-04-2008_diane.  os.mkdir is
    called unconditionally, so an already existing directory raises.

    Returns the path of the new directory.
    """
    dirname = 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
    firecrest_dir = os.path.join(data_dir, dirname)
    os.mkdir(firecrest_dir)
    return firecrest_dir
23
def make_ipar_dir(data_dir, version='1.01'):
    """
    Build a mock IPAR directory and parameter file under data_dir.

    The IPAR1.01.params fixture is copied to <data_dir>/.params, then
    <data_dir>/IPAR_<version> is created unless that path already exists.

    Returns the path of the IPAR directory.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'IPAR1.01.params'),
                os.path.join(data_dir, '.params'))

    target = os.path.join(data_dir, 'IPAR_%s' % (version,))
    if not os.path.exists(target):
        os.mkdir(target)
    return target
35
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """
    Write <runfolder_dir>/Config/FlowcellId.xml for the given flowcell.

    flowcell_id defaults to '207BTAAXY' when None.  The Config directory
    is created if it does not exist yet.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
  <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)
    config_dir = os.path.join(runfolder_dir, 'Config')

    if not os.path.exists(config_dir):
        os.mkdir(config_dir)
    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # 'with' guarantees the handle is closed even if the write fails.
    with open(pathname, 'w') as f:
        f.write(config)
52
def make_bustard_config132(image_dir):
    """Copy the bustard-config132.xml fixture to <image_dir>/config.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'bustard-config132.xml'),
                os.path.join(image_dir, 'config.xml'))
57
def make_aligned_config_1_12(aligned_dir):
    """
    Copy the aligned_config_1_12.xml fixture to <aligned_dir>/config.xml.

    This is roughly equivalent to the old gerald config file.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'aligned_config_1_12.xml'),
                os.path.join(aligned_dir, 'config.xml'))
63
def make_unaligned_config_1_12(unaligned_dir):
    """Copy the three demultiplex fixture files into unaligned_dir."""
    # (fixture name in TESTDATA_DIR, file name inside unaligned_dir)
    fixtures = (
        ('demultiplex_1.12.4.2.xml', 'DemultiplexConfig.xml'),
        ('demultiplexed_bustard_1.12.4.2.xml',
         'DemultiplexedBustardConfig.xml'),
        ('demultiplexed_summary_1.12.4.2.xml',
         'DemultiplexedBustardSummary.xml'),
    )
    for fixture, target in fixtures:
        shutil.copy(os.path.join(TESTDATA_DIR, fixture),
                    os.path.join(unaligned_dir, target))
76
def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
    """
    Build a mock RTA Intensities directory under data_dir.

    Creates <data_dir>/Intensities if it is missing and copies the
    rta_intensities_config.xml fixture into it as config.xml.
    The version argument is accepted but not used here.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml'),
                os.path.join(target, 'config.xml'))
    return target
89
def make_rta_basecalls_1460(intensities_dir):
    """
    Build a mock RTA BaseCalls directory under intensities_dir.

    Creates <intensities_dir>/BaseCalls if it is missing and copies the
    rta_basecalls_config.xml fixture into it as config.xml.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml'),
                os.path.join(target, 'config.xml'))
    return target
102
def make_rta_intensities_1870(data_dir, version='1.8.70.0'):
    """
    Build a mock RTA Intensities directory under data_dir.

    Creates <data_dir>/Intensities if it is missing and copies the
    rta_intensities_config_1870.xml fixture into it as config.xml.
    The version argument is accepted but not used here.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml'),
                os.path.join(target, 'config.xml'))
    return target
115
def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
    """
    Build a mock RTA Intensities directory under data_dir.

    Creates <data_dir>/Intensities if it is missing and copies the
    rta_intensities_config_1.10.xml fixture into it as config.xml.
    The version argument is accepted but not used here.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml'),
                os.path.join(target, 'config.xml'))
    return target
128
def make_rta_intensities_1_12(data_dir, version='1.12.4.2'):
    """
    Build a mock RTA Intensities directory under data_dir.

    Creates <data_dir>/Intensities if it is missing and copies the
    rta_intensities_config_1.12.4.2.xml fixture into it as RTAConfig.xml.
    The version argument is accepted but not used here.

    Returns the path of the Intensities directory.
    """
    target = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(
        os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.12.4.2.xml'),
        os.path.join(target, 'RTAConfig.xml'))
    return target
141
def make_rta_basecalls_1870(intensities_dir):
    """
    Build a mock RTA BaseCalls directory under intensities_dir.

    Creates <intensities_dir>/BaseCalls if it is missing and copies the
    rta_basecalls_config_1870.xml fixture into it as config.xml.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml'),
                os.path.join(target, 'config.xml'))
    return target
154
def make_rta_basecalls_1_10(intensities_dir):
    """
    Build a mock RTA BaseCalls directory, including qseq files.

    Creates <intensities_dir>/BaseCalls if it is missing, fills it via
    make_qseqs using ABXX_BASE_CALL_INFO, then copies the
    rta_basecalls_config_1.10.xml fixture into it as config.xml.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    make_qseqs(target, basecall_info=ABXX_BASE_CALL_INFO)
    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml'),
                os.path.join(target, 'config.xml'))
    return target
168
def make_rta_basecalls_1_12(intensities_dir):
    """
    Build a mock RTA BaseCalls directory, including qseq files.

    Creates <intensities_dir>/BaseCalls if it is missing, fills it via
    make_qseqs using ABXX_BASE_CALL_INFO, then copies the
    rta_basecalls_config_1.12.4.2.xml fixture into it as config.xml.

    Returns the path of the BaseCalls directory.
    """
    target = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target):
        os.mkdir(target)

    make_qseqs(target, basecall_info=ABXX_BASE_CALL_INFO)
    shutil.copy(
        os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.12.4.2.xml'),
        os.path.join(target, 'config.xml'))
    return target
182
183
def make_qseqs(bustard_dir, basecall_info=None):
    """
    Fill bustard_dir with qseq files plus matrix, phasing and summary data.

    When basecall_info is None the 42BRJAAXX fixtures and TILE_LIST are
    used; otherwise the qseq file, tile list and summary file are read
    from basecall_info.  bustard_dir is created if it is not a directory.

    Returns bustard_dir.
    """
    if basecall_info is not None:
        qseq_file = basecall_info.qseq_file
        tile_list = basecall_info.tile_list
        summary_file = basecall_info.basecall_summary
    else:
        # 42BRJ 8 1 0039 happened to be a better than usual tile, in that
        # there was actually sequence at the start
        qseq_file = '42BRJAAXX_8_1_0039_qseq.txt'
        tile_list = TILE_LIST
        summary_file = '42BRJAAXX_BustardSummary.xml'

    if not os.path.isdir(bustard_dir):
        os.mkdir(bustard_dir)

    # The same fixture is copied once for every lane/tile combination.
    qseq_template = os.path.join(TESTDATA_DIR, qseq_file)
    for lane in LANE_LIST:
        for tile in tile_list:
            qseq_name = 's_%d_1_%04d_qseq.txt' % (lane, tile)
            shutil.copy(qseq_template, os.path.join(bustard_dir, qseq_name))

    make_matrix_dir(bustard_dir)
    make_phasing_dir(bustard_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, summary_file),
                os.path.join(bustard_dir, 'BustardSummary.xml'))

    return bustard_dir
217
def make_scores(gerald_dir, in_temp=True):
    """
    Fill a gerald directory with per-lane, per-tile score files.

    Files go into <gerald_dir>/Temp when in_temp is true, otherwise
    directly into gerald_dir.  The destination directory is created if
    it doesn't exist.

    Returns the directory the score files were written to.
    """
    if in_temp:
        target_dir = os.path.join(gerald_dir, 'Temp')
    else:
        target_dir = gerald_dir
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    score_template = os.path.join(TESTDATA_DIR, 's_1_0001_score.txt')
    for lane in LANE_LIST:
        for tile in TILE_LIST:
            score_name = 's_%d_%04d_score.txt' % (lane, tile)
            shutil.copy(score_template, os.path.join(target_dir, score_name))

    return target_dir
236
def make_matrix_dir(bustard_dir):
    """
    Populate <bustard_dir>/Matrix/ with one matrix file per lane.

    Each s_<lane>_02_matrix.txt is a copy of the
    42BRJAAXX_8_02_matrix.txt fixture (from pipeline 1.4).
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    fixture = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
    for lane in LANE_LIST:
        matrix_name = 's_%d_02_matrix.txt' % (lane,)
        shutil.copy(fixture, os.path.join(matrix_dir, matrix_name))
251
def make_matrix(matrix_filename):
    """
    Write a fixed frequency response matrix to matrix_filename.

    Any existing file at that path is overwritten.
    """
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    # 'with' guarantees the handle is closed even if the write fails.
    with open(matrix_filename, 'w') as f:
        f.write(contents)
266
def make_matrix_dir_rta160(bustard_dir):
    """
    Populate <bustard_dir>/Matrix/ with one matrix file per lane.

    Each s_<lane>_1_matrix.txt is a copy of the
    61MMFAAXX_4_1_matrix.txt fixture.
    """
    matrix_dir = os.path.join(bustard_dir, 'Matrix')
    if not os.path.isdir(matrix_dir):
        os.mkdir(matrix_dir)

    fixture = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
    for lane in LANE_LIST:
        matrix_name = 's_%d_1_matrix.txt' % (lane,)
        shutil.copy(fixture, os.path.join(matrix_dir, matrix_name))
280
def make_matrix_dir_rta_1_10(bustard_dir):
    """Create the Matrix directory; delegates to make_matrix_dir_rta160."""
    return make_matrix_dir_rta160(bustard_dir)
283
def make_matrix_dir_rta_1_12(bustard_dir):
    """Create the Matrix directory; delegates to make_matrix_dir_rta160."""
    return make_matrix_dir_rta160(bustard_dir)
286
def make_phasing_dir(bustard_dir):
    """
    Populate <bustard_dir>/Phasing/ with one phasing file per lane.

    Each s_<lane>_01_phasing.xml is a copy of the
    42BRJAAXX_8_01_phasing.xml fixture (from pipeline 1.4).
    """
    phasing_dir = os.path.join(bustard_dir, 'Phasing')
    if not os.path.isdir(phasing_dir):
        os.mkdir(phasing_dir)

    fixture = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
    for lane in LANE_LIST:
        phasing_name = 's_%d_01_phasing.xml' % (lane,)
        shutil.copy(fixture, os.path.join(phasing_dir, phasing_name))
301
def make_phasing_params(bustard_dir):
    """
    Write a params<lane>.xml file into bustard_dir for every lane.

    Every file gets the same fixed Phasing/Prephasing values.
    """
    contents = """<Parameters>
  <Phasing>0.009900</Phasing>
  <Prephasing>0.003500</Prephasing>
</Parameters>
"""
    for lane in LANE_LIST:
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
        # 'with' guarantees the handle is closed even if the write fails.
        with open(pathname, 'w') as f:
            f.write(contents)
312
def make_gerald_config_026(gerald_dir):
    """Copy the gerald_config_0.2.6.xml fixture to <gerald_dir>/config.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_0.2.6.xml'),
                os.path.join(gerald_dir, 'config.xml'))
317
def make_gerald_config_100(gerald_dir):
    """Copy the gerald_config_1.0.xml fixture to <gerald_dir>/config.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.0.xml'),
                os.path.join(gerald_dir, 'config.xml'))
322
def make_gerald_config_1_7(gerald_dir):
    """Copy the CASAVA 1.7 gerald config fixture to <gerald_dir>/config.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.7.xml'),
                os.path.join(gerald_dir, 'config.xml'))
328
def make_summary_htm_100(gerald_dir):
    """Copy the Summary-pipeline100.htm fixture to <gerald_dir>/Summary.htm."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
333
def make_summary_htm_110(gerald_dir):
    """Copy the Summary-pipeline110.htm fixture to <gerald_dir>/Summary.htm."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
338
def make_summary_paired_htm(gerald_dir):
    """Copy Summary-paired-pipeline110.htm to <gerald_dir>/Summary.htm."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-paired-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
343
def make_summary_ipar130_htm(gerald_dir):
    """Copy the Summary-ipar130.htm fixture to <gerald_dir>/Summary.htm."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-ipar130.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
348
def make_summary_rta160_xml(gerald_dir):
    """Copy the Summary-rta160.xml fixture to <gerald_dir>/Summary.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-rta160.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
353
354
def make_summary_casava1_7_xml(gerald_dir):
    """Copy the Summary-casava1.7.xml fixture to <gerald_dir>/Summary.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-casava1.7.xml'),
                os.path.join(gerald_dir, 'Summary.xml'))
359
def make_summary_rta1_12(status_dir):
    """Copy the Summary-rta1_12.htm fixture to <status_dir>/Summary.htm."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-rta1_12.htm'),
                os.path.join(status_dir, 'Summary.htm'))
364
def make_eland_results(gerald_dir):
    """
    Write an s_<lane>_eland_result.txt file into gerald_dir for every lane.

    Every lane receives the same four-record eland result body.
    """
    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
"""
    for i in LANE_LIST:
        pathname = os.path.join(gerald_dir,
                                's_%d_eland_result.txt' % (i,))
        # 'with' guarantees the handle is closed even if the write fails.
        with open(pathname, 'w') as f:
            f.write(eland_result)
377
def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST):
    """
    Write eland multi files into gerald_dir for each lane in lane_list.

    When paired is true, s_<lane>_1_eland_multi.txt and
    s_<lane>_2_eland_multi.txt are written, using the first and second
    body respectively.  Otherwise a single s_<lane>_eland_multi.txt with
    the first body is written per lane.
    """
    eland_multi = [""">HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
""", """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM
>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0
"""]
    if paired:
        for e in [1,2]:
            for i in lane_list:
                pathname = os.path.join(gerald_dir,
                                        's_%d_%d_eland_multi.txt' % (i,e))
                # 'with' closes the handle even if the write fails.
                with open(pathname, 'w') as f:
                    f.write(eland_multi[e-1])
    else:
        for i in lane_list:
            pathname = os.path.join(gerald_dir,
                                    's_%d_eland_multi.txt' % (i,))
            with open(pathname, 'w') as f:
                f.write(eland_multi[0])
408
def make_eland_export(gerald_dir, paired=False, lane_list=LANE_LIST):
    """
    Copy the casava_1.7_export.txt fixture to s_<lane>_export.txt per lane.

    One copy is made in gerald_dir for each lane in lane_list.  The
    paired argument is accepted but not used here.
    """
    fixture = os.path.join(TESTDATA_DIR, 'casava_1.7_export.txt')
    for lane in lane_list:
        export_name = 's_%d_export.txt' % (lane,)
        shutil.copy(fixture, os.path.join(gerald_dir, export_name))
416
417
def make_scarf(gerald_dir, lane_list=LANE_LIST):
    """
    Write an s_<lane>_sequence.txt file in scarf format for each lane.

    Every lane in lane_list receives the same five-read body.
    """
    # NOTE(review): this is not a raw string, so the "\a" sequences in the
    # first two quality strings are BEL escapes, making those quality
    # strings one character shorter than their reads.  Left unchanged here
    # because existing tests may depend on the exact bytes - confirm.
    seq = """HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
"""
    for l in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
        # 'with' guarantees the handle is closed even if the write fails.
        with open(pathname,'w') as f:
            f.write(seq)
430
def make_fastq(gerald_dir, lane_list=LANE_LIST):
    """
    Write an s_<lane>_sequence.txt file in fastq format for each lane.

    Every lane in lane_list receives the same three-read body.
    """
    # NOTE(review): this is not a raw string; the first quality line
    # escapes its leading backslash, but the later "\b" is a backspace
    # escape, making that quality string one character shorter than its
    # read.  Left unchanged here because existing tests may depend on the
    # exact bytes - confirm.
    seq = """@HWI-EAS229:1:2:182:712#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:182:712#0/1
\\bab_bbaabbababbaaa]]D]bb_baabbab\baa
@HWI-EAS229:1:2:198:621#0/1
CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
+HWI-EAS229:1:2:198:621#0/1
[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
@HWI-EAS229:1:2:209:1321#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:209:1321#0/1
_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
"""
    for l in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' %(l,))
        # 'with' guarantees the handle is closed even if the write fails.
        with open(pathname,'w') as f:
            f.write(seq)
450
# Read numbers generated for every unaligned sample.
UNALIGNED_READS = [1, 2]

# Each entry is (lane, reads, project_id, index_id, index_seq).
# A project_id of None marks the undetermined-index sample of a lane;
# an index_seq of None marks a project without an index.
UNALIGNED_SAMPLES = [
    (1, UNALIGNED_READS, '11111', None, None),
    (2, UNALIGNED_READS, '11112', None, None),
    (3, UNALIGNED_READS, '11113', 1, 'ATCACG'),
    (3, UNALIGNED_READS, '11113', 2, 'CGATGT'),
    (3, UNALIGNED_READS, '11113', 3, 'TTAGGC'),
    (4, UNALIGNED_READS, '11114', 6, 'GCCAAT'),
    (5, UNALIGNED_READS, '11115', 1, 'ATCACG'),
    (5, UNALIGNED_READS, '11116', 7, 'ACTTGA'),
    (5, UNALIGNED_READS, '11117', 9, 'GATCAG'),
    (6, UNALIGNED_READS, '11118', 1, 'ATCACG'),
    (7, UNALIGNED_READS, '11119', 2, 'CGATGT'),
    (8, UNALIGNED_READS, '11120', 3, 'TTAGGC'),
    (1, UNALIGNED_READS, None, None, None),
    (2, UNALIGNED_READS, None, None, None),
    (3, UNALIGNED_READS, None, None, None),
    (4, UNALIGNED_READS, None, None, None),
    (5, UNALIGNED_READS, None, None, None),
]
469
470
def make_aligned_eland_export(aligned_dir, flowcell_id):
    """
    Create mock aligned export files for every entry of UNALIGNED_SAMPLES.

    For each sample the project, sample and summary directories are
    created, the sample_summary_1_12.htm fixture is copied in as
    Sample_Summary.htm, and a gzipped export file is written for each
    split ('001', '002') and each read in UNALIGNED_READS.
    """
    summary_source = os.path.join(TESTDATA_DIR, 'sample_summary_1_12.htm')
    # The per-sample read list is not needed here; reads are taken from
    # UNALIGNED_READS below, so it is bound to a throwaway name instead of
    # being shadowed by the inner loop variable.
    for lane, _reads, project_id, index_id, index_seq in UNALIGNED_SAMPLES:
        paths = DemultiplexedPaths(aligned_dir,
                                   flowcell_id,
                                   lane,
                                   project_id,
                                   index_id,
                                   index_seq)
        paths.make_sample_dirs()
        paths.make_summary_dirs()
        summary_dest = os.path.join(paths.summary_dir, 'Sample_Summary.htm')
        shutil.copy(summary_source, summary_dest)

        body = get_unaligned_sample_export(lane, index_seq)
        for split in ['001','002']:
            for read in UNALIGNED_READS:
                suffix = 'R{0}_{1}_export.txt.gz'.format(read, split)
                pathname = paths.make_test_filename(suffix)
                stream = gzip.open(pathname, 'w')
                # try/finally rather than 'with': GzipFile only became a
                # context manager in Python 2.7.
                try:
                    stream.write(body)
                finally:
                    stream.close()
493
494
def make_unaligned_fastqs_1_12(unaligned_dir, flowcell_id):
    """
    Create the default mix of unaligned sample files.

    Calls make_unaligned_fastq_sample_1_12 once for every entry of
    UNALIGNED_SAMPLES.
    """
    for sample in UNALIGNED_SAMPLES:
        lane, reads, project_id, index_id, index_seq = sample
        make_unaligned_fastq_sample_1_12(
            unaligned_dir, flowcell_id,
            lane, reads, project_id, index_id, index_seq)
506
def make_unaligned_fastq_sample_1_12(unaligned_dir,
                                     flowcell_id,
                                     lane,
                                     reads,
                                     project_id,
                                     index_id=None,
                                     index_seq=None):
    """
    Create the mock unaligned fastq files and sample sheet for one sample.

    The project and sample directories are created as needed.  A gzipped
    fastq file is written for each split ('001', '002') and each read in
    reads, followed by a two-line SampleSheet.csv in the sample directory.
    """
    paths = DemultiplexedPaths(unaligned_dir,
                               flowcell_id,
                               lane,
                               project_id,
                               index_id,
                               index_seq)
    paths.make_sample_dirs()

    sample_seq = get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq)
    for split in ['001','002']:
        for read in reads:
            suffix = 'R{0}_{1}.fastq.gz'.format(read, split)
            pathname = paths.make_test_filename(suffix)
            stream = gzip.open(pathname, 'w')
            # try/finally rather than 'with': GzipFile only became a
            # context manager in Python 2.7.
            try:
                stream.write(sample_seq)
            finally:
                stream.close()

    sheetname = os.path.join(paths.sample_dir, 'SampleSheet.csv')
    header = 'FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject'+os.linesep
    template = '{flowcell},{lane},{id},mm9,{index},Sample #{id},N,PR_indexing,Operator,{sample_project}'+os.linesep
    # 'with' guarantees the sheet is closed even if a write fails.
    with open(sheetname, 'w') as stream:
        stream.write(header)
        stream.write(template.format(flowcell=flowcell_id,
                                     lane=lane,
                                     id=paths.sample_id,
                                     index=paths.index_seq,
                                     sample_project=paths.sample_project))
542
543
class DemultiplexedPaths(object):
    """
    Compute the directory and file names for one demultiplexed sample.

    Three cases are distinguished by the constructor arguments:

    * project_id is None: the lane's undetermined-index sample, placed
      under Undetermined_indices/Sample_lane<lane>.
    * index_seq is None: a project without an index, placed under
      Project_<project_id>/Sample_<project_id>.
    * otherwise: an indexed sample, placed under
      Project_<project_id>_Index<index_id>/Sample_<project_id>.

    After construction project_dir, sample_dir and summary_dir are full
    paths below basedir; nothing is created on disk until the make_*
    methods are called.
    """
    def __init__(self, basedir, flowcell_id, lane, project_id, index_id, index_seq):
        """
        Raises ValueError if lane is not in LANE_LIST.
        """
        if lane not in LANE_LIST:
            raise ValueError("Invalid lane ID: {0}".format(lane))
        self.basedir = basedir
        self.flowcell_id = flowcell_id
        self.lane = lane

        if project_id is None:
            # undetermined
            self.index_seq = ''
            self.sample_id = 'lane{0}'.format(lane)
            self.sample_project = 'Undetermined_indices'
            self.rootname = 'lane{lane}_Undetermined_L00{lane}_'.format(
                lane=lane)
            self.project_dir = 'Undetermined_indices'
            self.sample_dir = 'Sample_lane{lane}'.format(lane=lane)
        elif index_seq is None:
            # project without an index
            self.index_seq = ''
            self.sample_id = project_id
            self.sample_project = '{project_id}'.format(project_id=project_id)
            self.rootname = '{project_id}_NoIndex_L00{lane}_'.format(
                project_id=project_id,
                lane=lane)
            self.project_dir = 'Project_' + self.sample_project
            self.sample_dir = 'Sample_{project_id}'.format(
                project_id=project_id)
        else:
            # indexed sample
            self.index_seq = index_seq
            self.sample_id = project_id
            self.sample_project = '{project_id}_Index{index_id}'.format(
                project_id=project_id,
                index_id=index_id)
            self.rootname = '{project_id}_{index}_L00{lane}_'.format(
                project_id=project_id,
                index=index_seq,
                lane=lane)
            self.project_dir = 'Project_' + self.sample_project
            self.sample_dir = 'Sample_{project_id}'.format(
                project_id=project_id)

        # Turn the bare directory names chosen above into full paths.
        self.project_dir = os.path.join(self.basedir, self.project_dir)
        self.sample_dir = os.path.join(self.project_dir, self.sample_dir)
        self.summary_dir = 'Summary_Stats_{0}'.format(self.flowcell_id)
        self.summary_dir = os.path.join(self.project_dir, self.summary_dir)


    def make_sample_dirs(self):
        """Create the project directory and the sample directory if missing."""
        if not os.path.isdir(self.project_dir):
            os.mkdir(self.project_dir)
        if not os.path.isdir(self.sample_dir):
            os.mkdir(self.sample_dir)

    def make_summary_dirs(self):
        """Create the summary directory if missing."""
        if not os.path.isdir(self.summary_dir):
            os.mkdir(self.summary_dir)

    def make_test_filename(self, suffix):
        """Return the path of <rootname><suffix> inside the sample directory."""
        filename = self.rootname + suffix
        pathname = os.path.join(self.sample_dir, filename)
        return pathname

    def dump(self):
        """Print the computed names and an example fastq path."""
        print ('index seq: {0}'.format(self.index_seq))

        print ('project dir: {0}'.format(self.project_dir))
        print ('sample dir: {0}'.format(self.sample_dir))
        print ('rootname: {0}'.format(self.rootname))
        # sample_dir already contains project_dir, so joining both again
        # would duplicate the project component for a relative basedir.
        print ('path: {0}'.format(
            self.make_test_filename('R1_001.fastq.gz')))
616
617
def get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq):
    """
    Return the text of a four-record fastq file for a mock sample.

    flowcell_id, lane and index_seq are substituted into each record's
    header line; the reads and quality strings are fixed.
    """
    fastq_template = """@HWI-ST0787:101:{flowcell}:{lane}:1101:2416:3469 1:Y:0:{index}
TCCTTCATTCCACCGGAGTCTGTGGAATTCTCGGGTGCCAAGGAACTCCA
+
CCCFFFFFHHHHHJJJJJJJJJIJJJJJJJJJJJJJJJJIIJJIIJJJJJ
@HWI-ST0787:101:{flowcell}:{lane}:1101:2677:3293 1:Y:0:{index}
TGGAAATCCATTGGGGTTTCCCCTGGAATTCTCGGGTGCCAAGGAACTCC
+
@CCFF3BDHHHHHIIIIIHHIIIDIIIGIIIEGIIIIIIIIIIIIIIIHH
@HWI-ST0787:101:{flowcell}:{lane}:1101:2616:3297 1:Y:0:{index}
TAATACTGCCGGGTAATGATGGCTGGAATTCTCGGGTGCCAAGGAACTCC
+
CCCFFFFFHHHHHCGHJJJJJJJJJJJJJJJJJIIJJJJJJJJJIHJJJI
@HWI-ST0787:101:{flowcell}:{lane}:1101:2545:3319 1:N:0:{index}
TCCTTCATTCCACCGGAGTCTGCTGGAATTCTCGGGTGCCAAGGAACTCC
+
CCCFFFFFHHHFHJGIGHIJHIIGHIGIGIGEHFIJJJIHIJHJIIJJIH
"""
    substitutions = {
        'flowcell': flowcell_id,
        'lane': lane,
        'index': index_seq,
    }
    return fastq_template.format(**substitutions)
637
def get_unaligned_sample_export(lane, index_seq):
    """
    Return the text of a three-record export file for a mock sample.

    lane and index_seq are substituted into every record; all other
    fields are fixed.
    """
    export_template = """HWI-ST0787\t102\t{lane}\t1101\t1207\t1993\t{index}\t1\tAANGGATTCGATCCGGCTTAAGAGATGAAAACCGAAAGGGCCGACCGAA\taaBS`ccceg[`ae[dRR_[[SPPPP__ececfYYWaegh^\\ZLLY\\X`\tNM\t\t\t\t\t\t
HWI-ST0787\t102     {lane}       1101    1478    1997    {index}  1       CAAGAACCCCGGGGGGGGGGGGGCAGAGAGGGGGAATTTTTTTTTTGTT       BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB       NM                                                                                      N
HWI-ST0787      102     {lane}       1101    1625    1994    {index}  1       AANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA       \^BS\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c       chrII.fa
"""
    return export_template.format(lane=lane, index=index_seq)
644
def ls_tree(root):
    """Print the path of every file found below root, one per line."""
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Parenthesised single-argument form works as both the
            # Python 2 print statement and the Python 3 print function.
            print(os.path.join(dirpath, filename))
649
650
class BaseCallInfo(object):
    """Bundle the settings that customize the mock base call data.

    qseq_file and basecall_summary name fixture files and tile_list is
    the list of tile numbers to generate; make_qseqs reads all three.
    """
    def __init__(self, qseq_file, tile_list, basecall_summary):
        (self.qseq_file,
         self.tile_list,
         self.basecall_summary) = (qseq_file, tile_list, basecall_summary)
658
# Mock base call settings for a first generation HiSeq flowcell:
# the AA01CCABXX fixtures combined with the HiSeq tile numbering.
ABXX_BASE_CALL_INFO = BaseCallInfo(
    'AA01CCABXX_8_2_2207_qseq.txt',
    HISEQ_TILE_LIST,
    'AA01CCABXX_BustardSummary.xml')