Add support for extracting data out of Illumina's new RTA runfolder.
[htsworkflow.git] / htsworkflow / pipelines / test / simulate_runfolder.py
1 """
2 Create simulated solexa/illumina runfolders for testing
3 """
4
5 import os
6 import shutil
7
# Directory that holds this module.
TEST_CODE_DIR = os.path.dirname(__file__)
# Fixture files copied into the simulated runfolders are looked up here.
TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')
# Default lane numbers used by the generators below: 1 through 8 inclusive.
LANE_LIST = range(1, 9)
11
def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
    """
    Create a Firecrest-style directory inside data_dir and return its path.

    The directory name is built from start, stop and version, e.g.
    C1-37_Firecrest1.9.2_12-04-2008_diane for the defaults.  os.mkdir is
    used directly, so an already existing directory raises an error.
    """
    dirname = 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
    firecrest_dir = os.path.join(data_dir, dirname)
    os.mkdir(firecrest_dir)
    return firecrest_dir
18     
def make_ipar_dir(data_dir, version='1.01'):
    """
    Construct an artificial ipar parameter file and directory.

    The IPAR1.01.params fixture is copied to data_dir/.params (the same
    fixture is used whatever version is requested), then the
    IPAR_<version> directory is created under data_dir unless it is
    already there.  Returns the path of that directory.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'IPAR1.01.params'),
                os.path.join(data_dir, '.params'))

    ipar_dir = os.path.join(data_dir, 'IPAR_%s' % (version,))
    if os.path.exists(ipar_dir):
        return ipar_dir
    os.mkdir(ipar_dir)
    return ipar_dir
30
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """
    Write Config/FlowcellId.xml inside runfolder_dir.

    flowcell_id is the text placed in the <Text> element; when it is
    None the id '207BTAAXY' is used.  The Config directory is created
    if it does not exist yet, and an existing FlowcellId.xml is
    overwritten.  Nothing is returned.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
  <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)

    config_dir = os.path.join(runfolder_dir, 'Config')
    if not os.path.exists(config_dir):
        os.mkdir(config_dir)

    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # The context manager closes the file even if the write fails.
    with open(pathname, 'w') as stream:
        stream.write(config)
47
def make_bustard_config132(gerald_dir):
    """
    Copy the bustard-config132.xml fixture into gerald_dir as config.xml.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'bustard-config132.xml'),
                os.path.join(gerald_dir, 'config.xml'))
52
def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
    """
    Construct an artificial RTA Intensities directory and config file.

    Ensures data_dir/Intensities exists, copies the
    rta_intensities_config.xml fixture into it as config.xml and
    returns the directory path.  The version argument is accepted but
    not used by this function.
    """
    intensities_dir = os.path.join(data_dir, 'Intensities')
    already_there = os.path.exists(intensities_dir)
    if not already_there:
        os.mkdir(intensities_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml'),
                os.path.join(intensities_dir, 'config.xml'))
    return intensities_dir
65
def make_rta_basecalls_1460(intensities_dir):
    """
    Construct an artificial RTA BaseCalls directory and config file.

    Ensures intensities_dir/BaseCalls exists, copies the
    rta_basecalls_config.xml fixture into it as config.xml and returns
    the directory path.
    """
    basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
    already_there = os.path.exists(basecalls_dir)
    if not already_there:
        os.mkdir(basecalls_dir)

    shutil.copy(os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml'),
                os.path.join(basecalls_dir, 'config.xml'))
    return basecalls_dir
78
79
def make_matrix(matrix_filename):
    """
    Write a fixed frequency response matrix fixture to matrix_filename.

    The file consists of a comment line, four '> <base>' header lines
    and four rows of four numbers.  An existing file is overwritten.
    """
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    # The context manager closes the file even if the write fails.
    with open(matrix_filename, 'w') as stream:
        stream.write(contents)
94
def make_phasing_params(bustard_dir):
    """
    Write params<lane>.xml into bustard_dir for lanes 1 through 8.

    Every file gets the same <Parameters> document with fixed Phasing
    and Prephasing values.
    """
    contents = """<Parameters>
  <Phasing>0.009900</Phasing>
  <Prephasing>0.003500</Prephasing>
</Parameters>
"""
    for lane in range(1, 9):
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(contents)
105
def make_gerald_config_026(gerald_dir):
    """
    Copy the gerald_config_0.2.6.xml fixture into gerald_dir as config.xml.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_0.2.6.xml'),
                os.path.join(gerald_dir, 'config.xml'))
110
def make_gerald_config_100(gerald_dir):
    """
    Copy the gerald_config_1.0.xml fixture into gerald_dir as config.xml.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'gerald_config_1.0.xml'),
                os.path.join(gerald_dir, 'config.xml'))
115
def make_summary_htm_100(gerald_dir):
    """
    Copy the Summary-pipeline100.htm fixture into gerald_dir as Summary.htm.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
120
def make_summary_htm_110(gerald_dir):
    """
    Copy the Summary-pipeline110.htm fixture into gerald_dir as Summary.htm.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
125
def make_summary_paired_htm(gerald_dir):
    """
    Copy the Summary-paired-pipeline110.htm fixture into gerald_dir as
    Summary.htm.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-paired-pipeline110.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
130
def make_summary_ipar130_htm(gerald_dir):
    """
    Copy the Summary-ipar130.htm fixture into gerald_dir as Summary.htm.
    """
    shutil.copy(os.path.join(TESTDATA_DIR, 'Summary-ipar130.htm'),
                os.path.join(gerald_dir, 'Summary.htm'))
135
def make_eland_results(gerald_dir):
    """
    Write s_<lane>_eland_result.txt into gerald_dir for lanes 1 through 8.

    Every file receives the same four fixture records.
    """
    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
"""
    for lane in range(1, 9):
        pathname = os.path.join(gerald_dir,
                                's_%d_eland_result.txt' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(eland_result)
148
def make_eland_multi(gerald_dir, paired=False, lane_list=None):
    """
    Write eland multi fixture files into gerald_dir.

    When paired is true, s_<lane>_<read>_eland_multi.txt is written for
    read 1 and read 2 of every lane; read 2 uses a second fixture that
    additionally contains a QC record.  Otherwise a single
    s_<lane>_eland_multi.txt holding the read 1 fixture is written per
    lane.

    lane_list is an iterable of lane numbers; None (the default) means
    the module-level LANE_LIST, looked up when the function is called.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    # Records are kept one per string so the significant whitespace
    # (including the trailing space on the QC record) stays visible.
    read1_records = (
        '>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0',
        '>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0',
        '>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0',
    )
    read2_records = (
        '>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT      NM',
        '>HWI-EAS229_60_30DP9AAXX:1:1:1221:788   NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT     QC ',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:2   chr5.fa:55269838R0',
        '>HWI-EAS229_60_30DP9AAXX:1:1:1121:379   AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG   2:1:0   chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R',
        '>HWI-EAS229_60_30DP9AAXX:1:1:892:1155   ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT   0:9:10  chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample1:55269838R0',
        '>HWI-EAS229_60_30DP9AAXX:1:1:931:747    AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA   1:0:0   spike.fa/sample2:55269838R0',
    )
    # Each fixture is newline-terminated, one record per line.
    eland_multi = ['\n'.join(records) + '\n'
                   for records in (read1_records, read2_records)]

    # Collect (filename, contents) pairs in the order they are written.
    if paired:
        targets = [('s_%d_%d_eland_multi.txt' % (lane, read),
                    eland_multi[read - 1])
                   for read in (1, 2)
                   for lane in lane_list]
    else:
        targets = [('s_%d_eland_multi.txt' % (lane,), eland_multi[0])
                   for lane in lane_list]

    for filename, contents in targets:
        # The context manager closes the file even if the write fails.
        with open(os.path.join(gerald_dir, filename), 'w') as stream:
            stream.write(contents)
179
def make_scarf(gerald_dir, lane_list=None):
    """
    Write s_<lane>_sequence.txt into gerald_dir for every lane, each
    holding the same five colon-separated records whose last two
    fields are the read and its quality string.

    lane_list is an iterable of lane numbers; None (the default) means
    the module-level LANE_LIST, looked up when the function is called.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    # Raw string: the quality fields contain literal backslashes.  In a
    # normal string literal "\a" and "\b" are the BEL and backspace
    # escapes, which would leave a quality string one character shorter
    # than its read.
    seq = r"""HWI-EAS229_92_30VNBAAXX:1:1:0:161:NCAATTACACGACGCTAGCCCTAAAGCTATTTCGAGG:E[aaaabb^a\a_^^a[S`ba_WZUXaaaaaaUKPER
HWI-EAS229_92_30VNBAAXX:1:1:0:447:NAGATGCGCATTTGAAGTAGGAGCAAAAGATCAAGGT:EUabaab^baabaaaaaaaa^^Uaaaaa\aaaa__`a
HWI-EAS229_92_30VNBAAXX:1:1:0:1210:NATAGCCTCTATAGAAGCCACTATTATTTTTTTCTTA:EUa`]`baaaaa^XQU^a`S``S_`J_aaaaaabb^V
HWI-EAS229_92_30VNBAAXX:1:1:0:1867:NTGGAGCAGATATAAAAACAGATGGTGACGTTGAAGT:E[^UaaaUaba^aaa^aa^XV\baaLaLaaaaQVXV^
HWI-EAS229_92_30VNBAAXX:1:1:0:1898:NAGCTCGTGTCGTGAGATGTTAGGTTAAGTCCTGCAA:EK_aaaaaaaaaaaUZaaZaXM[aaaXSM\aaZ]URE
"""
    for lane in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(seq)
192
def make_fastq(gerald_dir, lane_list=None):
    """
    Write s_<lane>_sequence.txt into gerald_dir for every lane, each
    holding the same three four-line fastq records.

    lane_list is an iterable of lane numbers; None (the default) means
    the module-level LANE_LIST, looked up when the function is called.
    """
    if lane_list is None:
        lane_list = LANE_LIST

    # Raw string: the first quality line contains literal backslashes.
    # In a normal string literal "\b" is the backspace escape, which
    # would corrupt and shorten that quality line.
    seq = r"""@HWI-EAS229:1:2:182:712#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:182:712#0/1
\bab_bbaabbababbaaa]]D]bb_baabbab\baa
@HWI-EAS229:1:2:198:621#0/1
CCCCCCCCCCCCCCCCCCCCCNCCCCCCCCCCCCCCC
+HWI-EAS229:1:2:198:621#0/1
[aaaaaaa`_`aaaaaaa[`ZDZaaaaaaaaaaaaaa
@HWI-EAS229:1:2:209:1321#0/1
AAAAAAAAAAAAAAAAAAAAANAAAAAAAAAAAAAAA
+HWI-EAS229:1:2:209:1321#0/1
_bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa
"""
    for lane in lane_list:
        pathname = os.path.join(gerald_dir, 's_%d_sequence.txt' % (lane,))
        # The context manager closes the file even if the write fails.
        with open(pathname, 'w') as stream:
            stream.write(seq)
212
213