"""
Create simulated solexa/illumina runfolders for testing
"""
-
+import gzip
import os
import shutil
f.write(config)
f.close()
def make_runinfo(runfolder_dir, flowcell_id):
    """Simulate a RunInfo.xml file created by >= RTA 1.9

    The Run Id attribute is set to the last path component of
    runfolder_dir and the Flowcell element to flowcell_id.  The
    runfolder directory must already exist.

    Returns the path of the RunInfo.xml file that was written.
    """
    xml = '''<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
 <Run Id="{runfolder}" Number="101">
 <Flowcell>{flowcell}</Flowcell>
 <Instrument>SN787</Instrument>
 <Date>110815</Date>
 <Reads>
 <Read Number="1" NumCycles="50" IsIndexedRead="N" />
 <Read Number="2" NumCycles="7" IsIndexedRead="Y" />
 </Reads>
 <FlowcellLayout LaneCount="8" SurfaceCount="2" SwathCount="3" TileCount="8" />
 <AlignToPhiX />
 </Run>
</RunInfo>
'''
    head, runfolder = os.path.split(runfolder_dir)
    if not runfolder:
        # A trailing separator makes os.path.split return an empty tail;
        # the runfolder name is then the last component of the head.
        runfolder = os.path.basename(head)

    runinfo = os.path.join(runfolder_dir, 'RunInfo.xml')
    # The with block closes the file even if the write fails.
    with open(runinfo, 'w') as stream:
        stream.write(xml.format(runfolder=runfolder, flowcell=flowcell_id))
    return runinfo
+
def make_bustard_config132(image_dir):
    """Install the bustard-config132.xml test file as image_dir/config.xml."""
    shutil.copy(os.path.join(TESTDATA_DIR, 'bustard-config132.xml'),
                os.path.join(image_dir, 'config.xml'))
def make_aligned_config_1_12(aligned_dir):
    """Install the 1.12 aligned config as aligned_dir/config.xml.

    This is roughly equivalent to the old gerald file.
    """
    shutil.copy(
        os.path.join(TESTDATA_DIR, '1_12', 'aligned_config_1_12.xml'),
        os.path.join(aligned_dir, 'config.xml'))

+
def make_unaligned_config_1_12(unaligned_dir):
    """Copy the 1.12 demultiplexing config files into unaligned_dir.

    Each test data file is installed under the fixed name listed
    alongside it below.
    """
    source_dir = os.path.join(TESTDATA_DIR, '1_12')
    # (test data file name, name it is installed under)
    installed_names = (
        ('demultiplex_1.12.4.2.xml',
         'DemultiplexConfig.xml'),
        ('demultiplexed_bustard_1.12.4.2.xml',
         'DemultiplexedBustardConfig.xml'),
        ('demultiplexed_summary_1.12.4.2.xml',
         'DemultiplexedBustardSummary.xml'),
    )
    for source_name, installed_name in installed_names:
        shutil.copy(os.path.join(source_dir, source_name),
                    os.path.join(unaligned_dir, installed_name))

+
def make_rta_intensities_1460(data_dir, version='1.4.6.0'):
"""
Construct an artificial RTA Intensities parameter file and directory
return intensities_dir
def make_rta_intensities_1_12(data_dir, version='1.12.4.2'):
    """
    Construct an artificial RTA Intensities parameter file and directory

    Creates data_dir/Intensities if it does not exist yet and installs
    the 1.12.4.2 test configuration there as RTAConfig.xml.  The version
    argument is accepted but not used by this function.

    Returns the path of the Intensities directory.
    """
    target_dir = os.path.join(data_dir, 'Intensities')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    shutil.copy(
        os.path.join(TESTDATA_DIR, '1_12',
                     'rta_intensities_config_1.12.4.2.xml'),
        os.path.join(target_dir, 'RTAConfig.xml'))

    return target_dir

+
def make_rta_basecalls_1870(intensities_dir):
"""
Construct an artificial RTA Intensities parameter file and directory
return basecalls_dir
def make_rta_basecalls_1_12(intensities_dir):
    """
    Construct an artificial RTA BaseCalls directory and config file

    Creates intensities_dir/BaseCalls if it does not exist yet, fills it
    using make_qseqs with ABXX_BASE_CALL_INFO, and installs the 1.12.4.2
    test configuration there as config.xml.

    Returns the path of the BaseCalls directory.
    """
    target_dir = os.path.join(intensities_dir, 'BaseCalls')
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    make_qseqs(target_dir, basecall_info=ABXX_BASE_CALL_INFO)
    shutil.copy(
        os.path.join(TESTDATA_DIR, '1_12',
                     'rta_basecalls_config_1.12.4.2.xml'),
        os.path.join(target_dir, 'config.xml'))

    return target_dir

+
+
def make_qseqs(bustard_dir, basecall_info=None):
"""
Fill gerald directory with qseq files
def make_matrix_dir_rta_1_10(bustard_dir):
    """Create the matrix directory by delegating to make_matrix_dir_rta160."""
    make_matrix_dir_rta160(bustard_dir)
def make_matrix_dir_rta_1_12(bustard_dir):
    """Create the matrix directory by delegating to make_matrix_dir_rta160."""
    make_matrix_dir_rta160(bustard_dir)
+
def make_phasing_dir(bustard_dir):
"""
Create several phasing files in <bustard_dir>/Phasing/
destination = os.path.join(gerald_dir, 'Summary.xml')
shutil.copy(source, destination)
def make_status_rta1_12(datadir):
    """Copy the 1.12 status page and its supporting trees into datadir.

    Status.htm is copied as a single file; the Status_Files and reports
    directories are copied recursively with shutil.copytree.
    """
    source_root = os.path.join(TESTDATA_DIR, '1_12')
    shutil.copy(os.path.join(source_root, 'Status.htm'),
                os.path.join(datadir, 'Status.htm'))

    for tree_name in ('Status_Files', 'reports'):
        shutil.copytree(os.path.join(source_root, tree_name),
                        os.path.join(datadir, tree_name))
def make_eland_results(gerald_dir):
eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
f.write(seq)
f.close()
-
-def ls_tree(root):
# Read numbers generated for every simulated sample.
UNALIGNED_READS = [1, 2]

# One entry per simulated sample, laid out as
#   (lane, reads, project_id, index_id, index_seq)
# Entries whose project_id is None are handled as undetermined-index
# samples by DemultiplexedPaths.
UNALIGNED_SAMPLES = [
    (1, UNALIGNED_READS, '11111', None, None),
    (2, UNALIGNED_READS, '11112', None, None),
    (3, UNALIGNED_READS, '11113', 1, 'ATCACG'),
    (3, UNALIGNED_READS, '11113', 2, 'CGATGT'),
    (3, UNALIGNED_READS, '11113', 3, 'TTAGGC'),
    (4, UNALIGNED_READS, '11114', 6, 'GCCAAT'),
    (5, UNALIGNED_READS, '11115', 1, 'ATCACG'),
    (5, UNALIGNED_READS, '11116', 7, 'ACTTGA'),
    (5, UNALIGNED_READS, '11117', 9, 'GATCAG'),
    (6, UNALIGNED_READS, '11118', 1, 'ATCACG'),
    (7, UNALIGNED_READS, '11119', 2, 'CGATGT'),
    (8, UNALIGNED_READS, '11120', 3, 'TTAGGC'),
    (1, UNALIGNED_READS, None, None, None),
    (2, UNALIGNED_READS, None, None, None),
    (3, UNALIGNED_READS, None, None, None),
    (4, UNALIGNED_READS, None, None, None),
    (5, UNALIGNED_READS, None, None, None),
]
+
+
def make_aligned_eland_export(aligned_dir, flowcell_id):
    """Populate aligned_dir with simulated eland export files.

    For every entry in UNALIGNED_SAMPLES this creates the project,
    sample and summary directories, copies the sample summary test page
    in as Sample_Summary.htm, and writes one gzipped export file for
    each read of the sample and each split ('001' and '002').
    """
    summary_source = os.path.join(TESTDATA_DIR, 'sample_summary_1_12.htm')
    for lane, reads, project_id, index_id, index_seq in UNALIGNED_SAMPLES:
        paths = DemultiplexedPaths(aligned_dir,
                                   flowcell_id,
                                   lane,
                                   project_id,
                                   index_id,
                                   index_seq)
        paths.make_sample_dirs()
        paths.make_summary_dirs()
        summary_dest = os.path.join(paths.summary_dir, 'Sample_Summary.htm')
        shutil.copy(summary_source, summary_dest)

        body = get_aligned_sample_export(lane, index_seq)
        for split in ['001', '002']:
            # Honour the reads listed for this sample instead of always
            # using the module-wide UNALIGNED_READS, matching what
            # make_unaligned_fastq_sample_1_12 does.
            for read in reads:
                suffix = 'R{0}_{1}_export.txt.gz'.format(read, split)
                pathname = paths.make_test_filename(suffix)
                stream = gzip.open(pathname, 'w')
                # try/finally rather than "with": GzipFile only became a
                # context manager in Python 2.7.
                try:
                    stream.write(body)
                finally:
                    stream.close()


+
+
def make_unaligned_fastqs_1_12(unaligned_dir, flowcell_id):
    """Create a default mix of unaligned sample files

    Calls make_unaligned_fastq_sample_1_12 once for every entry in
    UNALIGNED_SAMPLES.
    """
    for sample in UNALIGNED_SAMPLES:
        lane, reads, project_id, index_id, index_seq = sample
        make_unaligned_fastq_sample_1_12(
            unaligned_dir, flowcell_id,
            lane, reads, project_id, index_id, index_seq)

+
def make_unaligned_fastq_sample_1_12(unaligned_dir,
                                     flowcell_id,
                                     lane,
                                     reads,
                                     project_id,
                                     index_id=None,
                                     index_seq=None):
    """Create the fastq files and sample sheet for one unaligned sample.

    The project and sample directories are created below unaligned_dir
    if needed.  One gzipped fastq file is written for each entry of
    reads and each split ('001' and '002'), followed by a two line
    SampleSheet.csv (header plus one row) in the sample directory.

    The directory and file naming is decided by DemultiplexedPaths from
    lane, project_id, index_id and index_seq.
    """
    paths = DemultiplexedPaths(unaligned_dir,
                               flowcell_id,
                               lane,
                               project_id,
                               index_id,
                               index_seq)
    paths.make_sample_dirs()

    sample_seq = get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq)
    for split in ['001', '002']:
        for read in reads:
            suffix = 'R{0}_{1}.fastq.gz'.format(read, split)
            pathname = paths.make_test_filename(suffix)
            stream = gzip.open(pathname, 'w')
            # try/finally rather than "with": GzipFile only became a
            # context manager in Python 2.7.
            try:
                stream.write(sample_seq)
            finally:
                stream.close()

    # The sheet is opened in text mode, which already translates '\n' to
    # the platform line ending; writing os.linesep here would produce
    # '\r\r\n' on Windows.
    header = 'FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject\n'
    template = '{flowcell},{lane},{id},mm9,{index},Sample #{id},N,PR_indexing,Operator,{sample_project}\n'
    sheetname = os.path.join(paths.sample_dir, 'SampleSheet.csv')
    with open(sheetname, 'w') as stream:
        stream.write(header)
        stream.write(template.format(flowcell=flowcell_id,
                                     lane=lane,
                                     id=paths.sample_id,
                                     index=paths.index_seq,
                                     sample_project=paths.sample_project))


+
+
class DemultiplexedPaths(object):
    """Directory and file names for one simulated demultiplexed sample.

    Three kinds of sample are distinguished:

    * project_id is None: an undetermined-index sample, stored under
      Undetermined_indices/Sample_lane<lane>
    * index_seq is None: a sample without an index, stored under
      Project_<project_id>/Sample_<project_id>
    * otherwise: an indexed sample, stored under
      Project_<project_id>_Index<index_id>/Sample_<project_id>

    After construction project_dir, sample_dir and summary_dir are
    paths below basedir; nothing is created on disk until
    make_sample_dirs or make_summary_dirs is called.
    """

    def __init__(self, basedir, flowcell_id, lane, project_id, index_id, index_seq):
        """Work out every name for the sample.

        Raises ValueError if lane is not in LANE_LIST.
        """
        if lane not in LANE_LIST:
            raise ValueError("Invalid lane ID: {0}".format(lane))
        self.basedir = basedir
        self.flowcell_id = flowcell_id
        self.lane = lane

        if project_id is None:
            # undetermined
            self.index_seq = ''
            self.sample_id = 'lane{0}'.format(lane)
            self.sample_project = 'Undetermined_indices'
            self.rootname = 'lane{lane}_Undetermined_L00{lane}_'.format(
                lane=lane)
            project_name = 'Undetermined_indices'
            sample_name = 'Sample_lane{lane}'.format(lane=lane)
        elif index_seq is None:
            # project without an index
            self.index_seq = ''
            self.sample_id = project_id
            self.sample_project = '{project_id}'.format(project_id=project_id)
            self.rootname = '{project_id}_NoIndex_L00{lane}_'.format(
                project_id=project_id,
                lane=lane)
            project_name = 'Project_' + self.sample_project
            sample_name = 'Sample_{project_id}'.format(
                project_id=project_id)
        else:
            # indexed project
            self.index_seq = index_seq
            self.sample_id = project_id
            self.sample_project = '{project_id}_Index{index_id}'.format(
                project_id=project_id,
                index_id=index_id)
            self.rootname = '{project_id}_{index}_L00{lane}_'.format(
                project_id=project_id,
                index=index_seq,
                lane=lane)
            project_name = 'Project_' + self.sample_project
            sample_name = 'Sample_{project_id}'.format(
                project_id=project_id)

        # The sample and summary directories both live inside the
        # project directory.
        self.project_dir = os.path.join(self.basedir, project_name)
        self.sample_dir = os.path.join(self.project_dir, sample_name)
        self.summary_dir = os.path.join(
            self.project_dir,
            'Summary_Stats_{0}'.format(self.flowcell_id))

    def make_sample_dirs(self):
        """Create the project and sample directories if they are missing."""
        if not os.path.isdir(self.project_dir):
            os.mkdir(self.project_dir)
        if not os.path.isdir(self.sample_dir):
            os.mkdir(self.sample_dir)

    def make_summary_dirs(self):
        """Create the summary directory if it is missing."""
        if not os.path.isdir(self.summary_dir):
            os.mkdir(self.summary_dir)

    def make_test_filename(self, suffix):
        """Return the path of rootname + suffix inside the sample directory."""
        filename = self.rootname + suffix
        pathname = os.path.join(self.sample_dir, filename)
        return pathname

    def dump(self):
        """Print the computed names, useful for debugging."""
        print('index seq: {0}'.format(self.index_seq))

        print('project dir: {0}'.format(self.project_dir))
        print('sample dir: {0}'.format(self.sample_dir))
        print('rootname: {0}'.format(self.rootname))
        # sample_dir already includes project_dir, so it must not be
        # joined onto project_dir again: with a relative basedir that
        # would repeat the leading directories.
        print('path: {0}'.format(
            self.make_test_filename('R1_001.fastq.gz')))


+
+
def get_unaligned_sample_fastq_data(flowcell_id, lane, index_seq):
    """Return four fastq records for the given flowcell, lane and index.

    Only the record header lines depend on the arguments; the sequence
    and quality lines are fixed.
    """
    header = ('@HWI-ST0787:101:{flowcell}:{lane}:1101:{position} '
              '1:{flag}:0:{index}')
    # (x:y position, filter flag, sequence, quality)
    records = (
        ('2416:3469', 'Y',
         'TCCTTCATTCCACCGGAGTCTGTGGAATTCTCGGGTGCCAAGGAACTCCA',
         'CCCFFFFFHHHHHJJJJJJJJJIJJJJJJJJJJJJJJJJIIJJIIJJJJJ'),
        ('2677:3293', 'Y',
         'TGGAAATCCATTGGGGTTTCCCCTGGAATTCTCGGGTGCCAAGGAACTCC',
         '@CCFF3BDHHHHHIIIIIHHIIIDIIIGIIIEGIIIIIIIIIIIIIIIHH'),
        ('2616:3297', 'Y',
         'TAATACTGCCGGGTAATGATGGCTGGAATTCTCGGGTGCCAAGGAACTCC',
         'CCCFFFFFHHHHHCGHJJJJJJJJJJJJJJJJJIIJJJJJJJJJIHJJJI'),
        ('2545:3319', 'N',
         'TCCTTCATTCCACCGGAGTCTGCTGGAATTCTCGGGTGCCAAGGAACTCC',
         'CCCFFFFFHHHFHJGIGHIJHIIGHIGIGIGEHFIJJJIHIJHJIIJJIH'),
    )
    lines = []
    for position, flag, bases, quality in records:
        lines.append(header.format(flowcell=flowcell_id,
                                   lane=lane,
                                   position=position,
                                   flag=flag,
                                   index=index_seq))
        lines.append(bases)
        lines.append('+')
        lines.append(quality)
    # Every line, including the last, is newline terminated.
    return '\n'.join(lines) + '\n'

+
def get_aligned_sample_export(lane, index_seq):
    """Return four tab separated export records for the given lane and index.

    lane is substituted into the third column and index_seq into the
    seventh column of every record; the remaining columns are fixed.
    """
    # Backslashes inside the quality strings are written as '\\' so the
    # literal contains no invalid escape sequences.
    body = """HWI-ST0787\t102\t{lane}\t1101\t1207\t1993\t{index}\t1\tAANGGATTCGATCCGGCTTAAGAGATGAAAACCGAAAGGGCCGACCGAA\taaBS`ccceg[`ae[dRR_[[SPPPP__ececfYYWaegh^\\ZLLY\\X`\tNM\t\t\t\t\t\t
HWI-ST0787\t102\t{lane}\t1101\t1478\t1997\t{index}\t1\tCAAGAACCCCGGGGGGGGGGGGGCAGAGAGGGGGAATTTTTTTTTTGTT\tBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\tNM\t\t\t\t\t\t\t\t\t\t\tN
HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\tchr9.fa\t67717938\tR\t99\t72
HWI-ST0787\t102\t{lane}\t1101\t1625\t1994\t{index}\t1\tAANAATGCTACAGAGACAAAACAAAACTGATATGAAAGTTGAGAATAAA\tB^BS\\cccgegg[Q[QQQ[`egdgffbeggfgh^^YcfgfhXaHY^O^c\t3:4:3\t\t\t\t\t\t\t\t\t\t\tY
""".format(lane=lane, index=index_seq)
    return body

+
def print_ls_tree(root):
    """List tree contents, useful for debugging.

    Prints the path of every file found below root, one per line, in
    the order os.walk yields them.  Directories themselves are not
    printed.
    """
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Parenthesised single-argument form prints the same text
            # under both Python 2 and Python 3.
            print(os.path.join(dirpath, filename))