From: Diane Trout Date: Mon, 12 Sep 2011 22:29:20 +0000 (-0700) Subject: Add support for first-gen HiSeq flowcells (e.g. ABXX) X-Git-Tag: 0.5.4~1 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=0123277f86af843d47170d56d217f45e7d7cb4a6 Add support for first-gen HiSeq flowcells (e.g. ABXX) --- diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py index 271a651..bd478aa 100644 --- a/htsworkflow/pipelines/eland.py +++ b/htsworkflow/pipelines/eland.py @@ -44,7 +44,7 @@ class ResultLane(object): self.lane_id = lane_id self.end = end self._reads = None - + if xml is not None: self.set_elements(xml) @@ -182,13 +182,13 @@ class ElandLane(ResultLane): reads += 1 fields = line.split() # fields[2] = QC/NM/or number of matches - score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], + score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], match_codes) if score_type == ElandLane.SCORE_READ: # when there are too many hits, eland writes a - where # it would have put the list of hits. # or in a different version of eland, it just leaves - # that column blank, and only outputs 3 fields. + # that column blank, and only outputs 3 fields. if len(fields) < 4 or fields[LOCATION_INDEX] == '-': continue @@ -213,16 +213,16 @@ class ElandLane(ResultLane): reads += 1 fields = line.split() # fields[2] = QC/NM/or number of matches - score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], + score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], match_codes) if score_type == ElandLane.SCORE_UNRECOGNIZED: # export files have three states for the match field - # QC code, count of multi-reads, or a single + # QC code, count of multi-reads, or a single # read location. The score_mapped_mismatches function # only understands the first two types. # if we get unrecognized, that implies the field is probably # a location. - code = self._count_mapped_export(mapped_reads, + code = self._count_mapped_export(mapped_reads, fields[LOCATION_INDEX], fields[DESCRIPTOR_INDEX]) match_codes[code] += 1 @@ -232,7 +232,7 @@ class ElandLane(ResultLane): def _score_mapped_mismatches(self, match, match_codes): """Update match_codes with eland map counts, or failure code. - + Returns True if the read mapped, false if it was an error code. """ groups = ElandLane.MATCH_COUNTS_RE.match(match) @@ -260,12 +260,12 @@ class ElandLane(ResultLane): match_codes['U1'] += 1 elif one_mismatches < 255: match_codes['R1'] += one_mismatches - + if two_mismatches == 1: match_codes['U2'] += 1 elif two_mismatches < 255: match_codes['R2'] += two_mismatches - + return ElandLane.SCORE_READ @@ -284,9 +284,9 @@ class ElandLane(ResultLane): def _count_mapped_export(self, mapped_reads, match_string, descriptor): """Count a read as defined in an export file - + match_string contains the chromosome - descriptor contains the an ecoding of bases that match, mismatch, + descriptor contains the an ecoding of bases that match, mismatch, and have indels. returns the "best" match code @@ -320,25 +320,25 @@ class ElandLane(ResultLane): def _get_no_match(self): if self._mapped_reads is None: - self._update() + self._update() return self._match_codes['NM'] - no_match = property(_get_no_match, + no_match = property(_get_no_match, doc="total reads that didn't match the target genome.") def _get_no_match_percent(self): - return float(self.no_match)/self.reads * 100 + return float(self.no_match)/self.reads * 100 no_match_percent = property(_get_no_match_percent, doc="no match reads as percent of total") def _get_qc_failed(self): if self._mapped_reads is None: - self._update() + self._update() return self._match_codes['QC'] qc_failed = property(_get_qc_failed, doc="total reads that didn't match the target genome.") def _get_qc_failed_percent(self): - return float(self.qc_failed)/self.reads * 100 + return float(self.qc_failed)/self.reads * 100 qc_failed_percent = property(_get_qc_failed_percent, doc="QC failed reads as percent of total") @@ -361,7 +361,7 @@ class ElandLane(ResultLane): return sum repeat_reads = property(_get_repeat_reads, doc="total repeat reads") - + def get_elements(self): lane = ElementTree.Element(ElandLane.LANE, {'version': @@ -565,7 +565,7 @@ class ELAND(object): raise ValueError('Expecting %s', ELAND.ELAND) for element in list(tree): lane_id = int(element.attrib[ELAND.LANE_ID]) - end = int(element.attrib.get(ELAND.END, 0)) + end = int(element.attrib.get(ELAND.END, 0)) if element.tag.lower() == ElandLane.LANE.lower(): lane = ElandLane(xml=element) elif element.tag.lower() == SequenceLane.LANE.lower(): @@ -598,16 +598,16 @@ def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_map # split_name = name.split('_') # lane_id = int(split_name[1]) + genome_map = {} if genome_maps is not None: genome_map = genome_maps[lane_id] elif gerald is not None: genome_dir = gerald.lanes[lane_id].eland_genome - genome_map = build_genome_fasta_map(genome_dir) - else: - genome_map = {} + if genome_dir is not None: + genome_map = build_genome_fasta_map(genome_dir) lane = ElandLane(pathname, lane_id, end, genome_map) - + if end is None: effective_end = 0 else: @@ -643,7 +643,7 @@ def eland(gerald_dir, gerald=None, genome_maps=None): if os.path.isdir(basedir_temp): basedirs.append(basedir_temp) - + # the order in patterns determines the preference for what # will be found. MAPPED_ELAND = 0 @@ -723,7 +723,7 @@ def main(cmdline=None): e = eland(a) print e.get_elements() - return + return if __name__ == "__main__": diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 58f3081..f62b4f2 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -56,14 +56,18 @@ class Gerald(object): # default to the chipwide parameters if there isn't an # entry in the lane specific paramaters if genome is None: - subtree = self._gerald.tree.find('ChipWideRunParameters') - container = subtree.find('ELAND_GENOME') - genome = container.text + genome = self._gerald._get_chip_attribute('ELAND_GENOME') + # ignore flag value + if genome == 'Need_to_specify_ELAND_genome_directory': + genome = None return genome eland_genome = property(_get_eland_genome) def _get_read_length(self): - return self.__get_attribute('READ_LENGTH') + read_length = self.__get_attribute('READ_LENGTH') + if read_length is None: + read_length = self._gerald._get_chip_attribute('READ_LENGTH') + return read_length read_length = property(_get_read_length) def _get_use_bases(self): @@ -154,7 +158,7 @@ class Gerald(object): root = '' else: root = os.path.join(root,'') - + experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR') if experiment_dir is None: return None @@ -165,13 +169,16 @@ class Gerald(object): dirnames = experiment_dir.split(os.path.sep) return dirnames[0] runfolder_name = property(_get_runfolder_name) - + def _get_version(self): if self.tree is None: return None return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') version = property(_get_version) + def _get_chip_attribute(self, value): + return self.tree.findtext('ChipWideRunParameters/%s' % (value,)) + def dump(self): """ Debugging function, report current object diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index 5b79c2e..0b06c0b 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -9,14 +9,18 @@ TEST_CODE_DIR = os.path.split(__file__)[0] TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata') LANE_LIST = range(1,9) TILE_LIST = range(1,101) +HISEQ_TILE_LIST = [1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, + 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, + 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, + 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,] def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37): - firecrest_dir = os.path.join(data_dir, + firecrest_dir = os.path.join(data_dir, 'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version) ) os.mkdir(firecrest_dir) return firecrest_dir - + def make_ipar_dir(data_dir, version='1.01'): """ Construct an artificial ipar parameter file and directory @@ -58,7 +62,7 @@ def make_rta_intensities_1460(data_dir, version='1.4.6.0'): intensities_dir = os.path.join(data_dir, 'Intensities') if not os.path.exists(intensities_dir): os.mkdir(intensities_dir) - + param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml') shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml')) @@ -71,7 +75,7 @@ def make_rta_basecalls_1460(intensities_dir): basecalls_dir = os.path.join(intensities_dir, 'BaseCalls') if not os.path.exists(basecalls_dir): os.mkdir(basecalls_dir) - + param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml') shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml')) @@ -84,12 +88,25 @@ def make_rta_intensities_1870(data_dir, version='1.8.70.0'): intensities_dir = os.path.join(data_dir, 'Intensities') if not os.path.exists(intensities_dir): os.mkdir(intensities_dir) - + param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml') shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml')) return intensities_dir +def make_rta_intensities_1_10(data_dir, version='1.10.36.0'): + """ + Construct an artificial RTA Intensities parameter file and directory + """ + intensities_dir = os.path.join(data_dir, 'Intensities') + if not os.path.exists(intensities_dir): + os.mkdir(intensities_dir) + + param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml') + shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml')) + + return intensities_dir + def make_rta_basecalls_1870(intensities_dir): """ Construct an artificial RTA Intensities parameter file and directory @@ -97,35 +114,58 @@ def make_rta_basecalls_1870(intensities_dir): basecalls_dir = os.path.join(intensities_dir, 'BaseCalls') if not os.path.exists(basecalls_dir): os.mkdir(basecalls_dir) - + param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml') shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml')) return basecalls_dir -def make_qseqs(bustard_dir, in_temp=True): +def make_rta_basecalls_1_10(intensities_dir): + """ + Construct an artificial RTA Intensities parameter file and directory + """ + basecalls_dir = os.path.join(intensities_dir, 'BaseCalls') + if not os.path.exists(basecalls_dir): + os.mkdir(basecalls_dir) + + make_qseqs(basecalls_dir, basecall_info=ABXX_BASE_CALL_INFO) + param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml') + shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml')) + + return basecalls_dir + +def make_qseqs(bustard_dir, in_temp=True, basecall_info=None): """ Fill gerald directory with qseq files """ + if basecall_info is None: + qseq_file = '42BRJAAXX_8_1_0039_qseq.txt' + tile_list = TILE_LIST + summary_file = '42BRJAAXX_BustardSummary.xml' + else: + qseq_file = basecall_info.qseq_file + tile_list = basecall_info.tile_list + summary_file = basecall_info.basecall_summary + # 42BRJ 8 1 0039 happened to be a better than usual tile, in that there # was actually sequence at the start - source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_1_0039_qseq.txt') + source = os.path.join(TESTDATA_DIR, qseq_file) destdir = bustard_dir if not os.path.isdir(destdir): os.mkdir(destdir) - + for lane in LANE_LIST: - for tile in TILE_LIST: + for tile in tile_list: destination = os.path.join(bustard_dir, 's_%d_1_%04d_qseq.txt' % (lane, tile)) shutil.copy(source, destination) make_matrix_dir(bustard_dir) make_phasing_dir(bustard_dir) - summary_source = os.path.join(TESTDATA_DIR, '42BRJAAXX_BustardSummary.xml') + summary_source = os.path.join(TESTDATA_DIR, summary_file) summary_dest = os.path.join(bustard_dir, 'BustardSummary.xml') shutil.copy(summary_source, summary_dest) - + return destdir def make_scores(gerald_dir, in_temp=True): @@ -139,29 +179,29 @@ def make_scores(gerald_dir, in_temp=True): destdir = os.path.join(destdir, 'Temp') if not os.path.isdir(destdir): os.mkdir(destdir) - + for lane in LANE_LIST: for tile in TILE_LIST: destination = os.path.join(destdir, 's_%d_%04d_score.txt' % (lane, tile)) shutil.copy(source, destination) - + return destdir def make_matrix_dir(bustard_dir): """ Create several matrix files in /Matrix/ - from pipeline 1.4 + from pipeline 1.4 """ destdir = os.path.join(bustard_dir, 'Matrix') if not os.path.isdir(destdir): os.mkdir(destdir) - + source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt') for lane in LANE_LIST: destination = os.path.join(destdir, 's_%d_02_matrix.txt' % ( lane, )) shutil.copy(source, destination) - + def make_matrix(matrix_filename): contents = """# Auto-generated frequency response matrix > A @@ -184,13 +224,13 @@ def make_matrix_dir_rta160(bustard_dir): destdir = os.path.join(bustard_dir, 'Matrix') if not os.path.isdir(destdir): os.mkdir(destdir) - + source = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt') lane_fragments = [ "_%d" % (l,) for l in LANE_LIST] for fragment in lane_fragments: destination = os.path.join(destdir, 's%s_1_matrix.txt' % ( fragment, )) shutil.copy(source, destination) - + def make_phasing_dir(bustard_dir): """ Create several phasing files in /Phasing/ @@ -200,12 +240,12 @@ def make_phasing_dir(bustard_dir): destdir = os.path.join(bustard_dir, 'Phasing') if not os.path.isdir(destdir): os.mkdir(destdir) - + source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml') for lane in LANE_LIST: destination = os.path.join(destdir, 's_%d_01_phasing.xml' % ( lane, )) shutil.copy(source, destination) - + def make_phasing_params(bustard_dir): for lane in LANE_LIST: pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) @@ -227,6 +267,11 @@ def make_gerald_config_100(gerald_dir): destination = os.path.join(gerald_dir, 'config.xml') shutil.copy(source, destination) +def make_gerald_config_1_10(gerald_dir): + source = os.path.join(TESTDATA_DIR, 'gerald_config_1.10.xml') + destination = os.path.join(gerald_dir, 'config.xml') + shutil.copy(source, destination) + def make_summary_htm_100(gerald_dir): source = os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm') destination = os.path.join(gerald_dir, 'Summary.htm') @@ -252,6 +297,13 @@ def make_summary_rta160_xml(gerald_dir): destination = os.path.join(gerald_dir, 'Summary.xml') shutil.copy(source, destination) + +def make_summary_rta1_10_xml(gerald_dir): + source = os.path.join(TESTDATA_DIR, 'Summary-rta1.10.xml') + destination = os.path.join(gerald_dir, 'Summary.xml') + shutil.copy(source, destination) + + def make_eland_results(gerald_dir): eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. >HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T @@ -273,7 +325,7 @@ def make_eland_multi(gerald_dir, paired=False, lane_list=LANE_LIST): >HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:0 spike.fa/sample1:55269838R0 >HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:0 spike.fa/sample2:55269838R0 """, """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT NM ->HWI-EAS229_60_30DP9AAXX:1:1:1221:788 NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT QC +>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT QC >HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:2 chr5.fa:55269838R0 >HWI-EAS229_60_30DP9AAXX:1:1:1121:379 AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG 2:1:0 chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R >HWI-EAS229_60_30DP9AAXX:1:1:892:1155 ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT 0:9:10 chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1 @@ -339,3 +391,16 @@ _bbbbbaaababaabbbbab]D]aaaaaaaaaaaaaa f.close() +class BaseCallInfo(object): + """Provide customization for how to setup the base call mock data + """ + def __init__(self, qseq_file, tile_list, basecall_summary): + self.qseq_file = qseq_file + self.tile_list = tile_list + self.basecall_summary = basecall_summary + +# First generation HiSeq Flowcell +ABXX_BASE_CALL_INFO = BaseCallInfo( + qseq_file='AA01CCABXX_8_2_2207_qseq.txt', + tile_list = HISEQ_TILE_LIST, + basecall_summary = 'AA01CCABXX_BustardSummary.xml')