self.lane_id = lane_id
self.end = end
self._reads = None
-
+
if xml is not None:
self.set_elements(xml)
reads += 1
fields = line.split()
# fields[2] = QC/NM/or number of matches
- score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
+ score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
match_codes)
if score_type == ElandLane.SCORE_READ:
# when there are too many hits, eland writes a - where
# it would have put the list of hits.
# or in a different version of eland, it just leaves
- # that column blank, and only outputs 3 fields.
+ # that column blank, and only outputs 3 fields.
if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
continue
reads += 1
fields = line.split()
# fields[2] = QC/NM/or number of matches
- score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
+ score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
match_codes)
if score_type == ElandLane.SCORE_UNRECOGNIZED:
# export files have three states for the match field
- # QC code, count of multi-reads, or a single
+ # QC code, count of multi-reads, or a single
# read location. The score_mapped_mismatches function
# only understands the first two types.
# if we get unrecognized, that implies the field is probably
# a location.
- code = self._count_mapped_export(mapped_reads,
+ code = self._count_mapped_export(mapped_reads,
fields[LOCATION_INDEX],
fields[DESCRIPTOR_INDEX])
match_codes[code] += 1
def _score_mapped_mismatches(self, match, match_codes):
"""Update match_codes with eland map counts, or failure code.
-
+
Returns True if the read mapped, false if it was an error code.
"""
groups = ElandLane.MATCH_COUNTS_RE.match(match)
match_codes['U1'] += 1
elif one_mismatches < 255:
match_codes['R1'] += one_mismatches
-
+
if two_mismatches == 1:
match_codes['U2'] += 1
elif two_mismatches < 255:
match_codes['R2'] += two_mismatches
-
+
return ElandLane.SCORE_READ
def _count_mapped_export(self, mapped_reads, match_string, descriptor):
"""Count a read as defined in an export file
-
+
match_string contains the chromosome
- descriptor contains the an ecoding of bases that match, mismatch,
+ descriptor contains the an ecoding of bases that match, mismatch,
and have indels.
returns the "best" match code
def _get_no_match(self):
if self._mapped_reads is None:
- self._update()
+ self._update()
return self._match_codes['NM']
- no_match = property(_get_no_match,
+ no_match = property(_get_no_match,
doc="total reads that didn't match the target genome.")
def _get_no_match_percent(self):
- return float(self.no_match)/self.reads * 100
+ return float(self.no_match)/self.reads * 100
no_match_percent = property(_get_no_match_percent,
doc="no match reads as percent of total")
def _get_qc_failed(self):
if self._mapped_reads is None:
- self._update()
+ self._update()
return self._match_codes['QC']
qc_failed = property(_get_qc_failed,
doc="total reads that didn't match the target genome.")
def _get_qc_failed_percent(self):
- return float(self.qc_failed)/self.reads * 100
+ return float(self.qc_failed)/self.reads * 100
qc_failed_percent = property(_get_qc_failed_percent,
doc="QC failed reads as percent of total")
return sum
repeat_reads = property(_get_repeat_reads,
doc="total repeat reads")
-
+
def get_elements(self):
lane = ElementTree.Element(ElandLane.LANE,
{'version':
raise ValueError('Expecting %s', ELAND.ELAND)
for element in list(tree):
lane_id = int(element.attrib[ELAND.LANE_ID])
- end = int(element.attrib.get(ELAND.END, 0))
+ end = int(element.attrib.get(ELAND.END, 0))
if element.tag.lower() == ElandLane.LANE.lower():
lane = ElandLane(xml=element)
elif element.tag.lower() == SequenceLane.LANE.lower():
# split_name = name.split('_')
# lane_id = int(split_name[1])
+ genome_map = {}
if genome_maps is not None:
genome_map = genome_maps[lane_id]
elif gerald is not None:
genome_dir = gerald.lanes[lane_id].eland_genome
- genome_map = build_genome_fasta_map(genome_dir)
- else:
- genome_map = {}
+ if genome_dir is not None:
+ genome_map = build_genome_fasta_map(genome_dir)
lane = ElandLane(pathname, lane_id, end, genome_map)
-
+
if end is None:
effective_end = 0
else:
if os.path.isdir(basedir_temp):
basedirs.append(basedir_temp)
-
+
# the order in patterns determines the preference for what
# will be found.
MAPPED_ELAND = 0
e = eland(a)
print e.get_elements()
- return
+ return
if __name__ == "__main__":
TESTDATA_DIR = os.path.join(TEST_CODE_DIR, 'testdata')
LANE_LIST = range(1,9)
TILE_LIST = range(1,101)
+HISEQ_TILE_LIST = [1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108,
+ 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
+ 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108,
+ 2201, 2202, 2203, 2204, 2205, 2206, 2207, 2208,]
def make_firecrest_dir(data_dir, version="1.9.2", start=1, stop=37):
- firecrest_dir = os.path.join(data_dir,
+ firecrest_dir = os.path.join(data_dir,
'C%d-%d_Firecrest%s_12-04-2008_diane' % (start, stop, version)
)
os.mkdir(firecrest_dir)
return firecrest_dir
-
+
def make_ipar_dir(data_dir, version='1.01'):
"""
Construct an artificial ipar parameter file and directory
intensities_dir = os.path.join(data_dir, 'Intensities')
if not os.path.exists(intensities_dir):
os.mkdir(intensities_dir)
-
+
param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config.xml')
shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
if not os.path.exists(basecalls_dir):
os.mkdir(basecalls_dir)
-
+
param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config.xml')
shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
intensities_dir = os.path.join(data_dir, 'Intensities')
if not os.path.exists(intensities_dir):
os.mkdir(intensities_dir)
-
+
param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1870.xml')
shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
return intensities_dir
+def make_rta_intensities_1_10(data_dir, version='1.10.36.0'):
+ """
+ Construct an artificial RTA Intensities parameter file and directory
+ """
+ intensities_dir = os.path.join(data_dir, 'Intensities')
+ if not os.path.exists(intensities_dir):
+ os.mkdir(intensities_dir)
+
+ param_file = os.path.join(TESTDATA_DIR, 'rta_intensities_config_1.10.xml')
+ shutil.copy(param_file, os.path.join(intensities_dir, 'config.xml'))
+
+ return intensities_dir
+
def make_rta_basecalls_1870(intensities_dir):
"""
Construct an artificial RTA Intensities parameter file and directory
basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
if not os.path.exists(basecalls_dir):
os.mkdir(basecalls_dir)
-
+
param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1870.xml')
shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
return basecalls_dir
-def make_qseqs(bustard_dir, in_temp=True):
+def make_rta_basecalls_1_10(intensities_dir):
+ """
+ Construct an artificial RTA Intensities parameter file and directory
+ """
+ basecalls_dir = os.path.join(intensities_dir, 'BaseCalls')
+ if not os.path.exists(basecalls_dir):
+ os.mkdir(basecalls_dir)
+
+ make_qseqs(basecalls_dir, basecall_info=ABXX_BASE_CALL_INFO)
+ param_file = os.path.join(TESTDATA_DIR, 'rta_basecalls_config_1.10.xml')
+ shutil.copy(param_file, os.path.join(basecalls_dir, 'config.xml'))
+
+ return basecalls_dir
+
+def make_qseqs(bustard_dir, in_temp=True, basecall_info=None):
"""
Fill gerald directory with qseq files
"""
+ if basecall_info is None:
+ qseq_file = '42BRJAAXX_8_1_0039_qseq.txt'
+ tile_list = TILE_LIST
+ summary_file = '42BRJAAXX_BustardSummary.xml'
+ else:
+ qseq_file = basecall_info.qseq_file
+ tile_list = basecall_info.tile_list
+ summary_file = basecall_info.basecall_summary
+
# 42BRJ 8 1 0039 happened to be a better than usual tile, in that there
# was actually sequence at the start
- source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_1_0039_qseq.txt')
+ source = os.path.join(TESTDATA_DIR, qseq_file)
destdir = bustard_dir
if not os.path.isdir(destdir):
os.mkdir(destdir)
-
+
for lane in LANE_LIST:
- for tile in TILE_LIST:
+ for tile in tile_list:
destination = os.path.join(bustard_dir, 's_%d_1_%04d_qseq.txt' % (lane, tile))
shutil.copy(source, destination)
make_matrix_dir(bustard_dir)
make_phasing_dir(bustard_dir)
- summary_source = os.path.join(TESTDATA_DIR, '42BRJAAXX_BustardSummary.xml')
+ summary_source = os.path.join(TESTDATA_DIR, summary_file)
summary_dest = os.path.join(bustard_dir, 'BustardSummary.xml')
shutil.copy(summary_source, summary_dest)
-
+
return destdir
def make_scores(gerald_dir, in_temp=True):
destdir = os.path.join(destdir, 'Temp')
if not os.path.isdir(destdir):
os.mkdir(destdir)
-
+
for lane in LANE_LIST:
for tile in TILE_LIST:
destination = os.path.join(destdir, 's_%d_%04d_score.txt' % (lane, tile))
shutil.copy(source, destination)
-
+
return destdir
def make_matrix_dir(bustard_dir):
"""
Create several matrix files in <bustard_dir>/Matrix/
- from pipeline 1.4
+ from pipeline 1.4
"""
destdir = os.path.join(bustard_dir, 'Matrix')
if not os.path.isdir(destdir):
os.mkdir(destdir)
-
+
source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_02_matrix.txt')
for lane in LANE_LIST:
destination = os.path.join(destdir, 's_%d_02_matrix.txt' % ( lane, ))
shutil.copy(source, destination)
-
+
def make_matrix(matrix_filename):
contents = """# Auto-generated frequency response matrix
> A
destdir = os.path.join(bustard_dir, 'Matrix')
if not os.path.isdir(destdir):
os.mkdir(destdir)
-
+
source = os.path.join(TESTDATA_DIR, '61MMFAAXX_4_1_matrix.txt')
lane_fragments = [ "_%d" % (l,) for l in LANE_LIST]
for fragment in lane_fragments:
destination = os.path.join(destdir, 's%s_1_matrix.txt' % ( fragment, ))
shutil.copy(source, destination)
-
+
def make_phasing_dir(bustard_dir):
"""
Create several phasing files in <bustard_dir>/Phasing/
destdir = os.path.join(bustard_dir, 'Phasing')
if not os.path.isdir(destdir):
os.mkdir(destdir)
-
+
source = os.path.join(TESTDATA_DIR, '42BRJAAXX_8_01_phasing.xml')
for lane in LANE_LIST:
destination = os.path.join(destdir, 's_%d_01_phasing.xml' % ( lane, ))
shutil.copy(source, destination)
-
+
def make_phasing_params(bustard_dir):
for lane in LANE_LIST:
pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
destination = os.path.join(gerald_dir, 'config.xml')
shutil.copy(source, destination)
+def make_gerald_config_1_10(gerald_dir):
+ source = os.path.join(TESTDATA_DIR, 'gerald_config_1.10.xml')
+ destination = os.path.join(gerald_dir, 'config.xml')
+ shutil.copy(source, destination)
+
def make_summary_htm_100(gerald_dir):
source = os.path.join(TESTDATA_DIR, 'Summary-pipeline100.htm')
destination = os.path.join(gerald_dir, 'Summary.htm')
destination = os.path.join(gerald_dir, 'Summary.xml')
shutil.copy(source, destination)
+
+def make_summary_rta1_10_xml(gerald_dir):
+ source = os.path.join(TESTDATA_DIR, 'Summary-rta1.10.xml')
+ destination = os.path.join(gerald_dir, 'Summary.xml')
+ shutil.copy(source, destination)
+
+
def make_eland_results(gerald_dir):
eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
>HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:0 spike.fa/sample1:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:0 spike.fa/sample2:55269838R0
""", """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT NM
->HWI-EAS229_60_30DP9AAXX:1:1:1221:788 NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT QC
+>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 NNNNNNNNNNNNNNGTGGTATGGCGGTGTCTGGTCGT QC
>HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:2 chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379 AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG 2:1:0 chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0,chr7.fa:22516603F1,chr9.fa:134886204R
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155 ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT 0:9:10 chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1
f.close()
+class BaseCallInfo(object):
+ """Provide customization for how to setup the base call mock data
+ """
+ def __init__(self, qseq_file, tile_list, basecall_summary):
+ self.qseq_file = qseq_file
+ self.tile_list = tile_list
+ self.basecall_summary = basecall_summary
+
+# First generation HiSeq Flowcell
+ABXX_BASE_CALL_INFO = BaseCallInfo(
+ qseq_file='AA01CCABXX_8_2_2207_qseq.txt',
+ tile_list = HISEQ_TILE_LIST,
+ basecall_summary = 'AA01CCABXX_BustardSummary.xml')