This also updates the report tools to be compatible with 1.0.
For multi reads I mapped 0/1/2-mismatch reads to U0/U1/U2 if the number of
reads equaled 1 (for each category separately), and I mapped reads >1 and <255
to R0/R1/R2.
Unfortunately 1.1rc1 changed the summary file, so this patch is not
compatible with it yet.
from glob import glob
import logging
import os
+import re
import stat
from htsworkflow.pipelines.runfolder import ElementTree
MATCH_ITEM = 'Code'
READS = 'Reads'
- def __init__(self, pathname=None, genome_map=None, xml=None):
+ ELAND_SINGLE = 0
+ ELAND_MULTI = 1
+ ELAND_EXTENDED = 2
+ ELAND_EXPORT = 3
+
def __init__(self, pathname=None, genome_map=None, eland_type=None, xml=None):
    """Describe one lane of eland results.

    pathname   -- eland output file for this lane (may be None)
    genome_map -- dict translating short chromosome names to full names
    eland_type -- one of the ELAND_* constants, or None to autodetect
                  later from the filename
    xml        -- previously serialized state to restore via set_elements
    """
    self.pathname = pathname
    self._sample_name = None
    self._lane_id = None
    if genome_map is None:
        genome_map = {}
    self.genome_map = genome_map
    # bug fix: the eland_type argument was previously discarded
    # (self.eland_type was unconditionally set to None)
    self.eland_type = eland_type
    if xml is not None:
        self.set_elements(xml)
def _guess_eland_type(self, pathname):
    """Set self.eland_type by inspecting the eland output filename.

    Does nothing if self.eland_type is already set.  Recognizes the
    substrings 'result', 'multi', 'extended', and 'export' (in that
    priority order) and falls back to ELAND_SINGLE.
    """
    if self.eland_type is not None:
        return
    # attempt to autodetect the eland file type from the file name;
    # these are literal substrings, so plain `in` beats a regex here
    name = os.path.basename(pathname)
    if 'result' in name:
        self.eland_type = self.ELAND_SINGLE
    elif 'multi' in name:
        self.eland_type = self.ELAND_MULTI
    elif 'extended' in name:
        self.eland_type = self.ELAND_EXTENDED
    elif 'export' in name:
        self.eland_type = self.ELAND_EXPORT
    else:
        # unknown name shape: assume the plain single-read format
        self.eland_type = self.ELAND_SINGLE
def _update(self):
"""
Actually read the file and actually count the reads
# can't do anything if we don't have a file to process
if self.pathname is None:
return
+ self._guess_eland_type(self.pathname)
if os.stat(self.pathname)[stat.ST_SIZE] == 0:
raise RuntimeError("Eland isn't done, try again later.")
logging.info("summarizing results for %s" % (self.pathname))
+
+ if self.eland_type == ElandLane.ELAND_SINGLE:
+ result = self._update_eland_result(self.pathname)
+ elif self.eland_type == ElandLane.ELAND_MULTI or \
+ self.eland_type == ElandLane.ELAND_EXTENDED:
+ result = self._update_eland_multi(self.pathname)
+ else:
+ raise NotImplementedError("Only support single/multi/extended eland files")
+ self._match_codes, self._mapped_reads, self._reads = result
+
+ def _update_eland_result(self, pathname):
reads = 0
mapped_reads = {}
'U0':0, 'U1':0, 'U2':0,
'R0':0, 'R1':0, 'R2':0,
}
- for line in autoopen(self.pathname,'r'):
+ for line in autoopen(pathname,'r'):
reads += 1
fields = line.split()
# code = fields[2]
continue
fasta = self.genome_map.get(fields[6], fields[6])
mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
- self._match_codes = match_codes
- self._mapped_reads = mapped_reads
- self._reads = reads
+ return match_codes, mapped_reads, reads
+
def _update_eland_multi(self, pathname):
    """Tally one eland_multi / eland_extended output file.

    Returns a tuple (match_codes, mapped_reads, reads):
      match_codes  -- counts for NM/QC/RM plus U0-U2 (unique at 0/1/2
                      mismatches) and R0-R2 (repeat hit counts)
      mapped_reads -- per-chromosome hit counts, translated through
                      self.genome_map
      reads        -- total number of lines read from the file
    """
    reads = 0
    mapped_reads = {}

    match_codes = {'NM':0, 'QC':0, 'RM':0,
                   'U0':0, 'U1':0, 'U2':0,
                   'R0':0, 'R1':0, 'R2':0,
                   }
    # raw string fixes the invalid "\d" escape in the old pattern
    match_counts_re = re.compile(r"(\d+):(\d+):(\d+)")
    for line in autoopen(pathname,'r'):
        reads += 1
        fields = line.split()
        # fields[2] = QC/NM/or colon-separated match counts
        groups = match_counts_re.match(fields[2])
        if groups is None:
            match_codes[fields[2]] += 1
        else:
            # when there are too many hits, eland writes a - where
            # it would have put the list of hits
            if fields[3] == '-':
                continue
            # a category count of exactly 1 is a unique match (U*);
            # NOTE(review): counts >= 255 are ignored — 255 appears to
            # be eland's saturation value, confirm against eland docs
            zero_mismatches = int(groups.group(1))
            if zero_mismatches == 1:
                match_codes['U0'] += 1
            elif zero_mismatches < 255:
                match_codes['R0'] += zero_mismatches

            one_mismatches = int(groups.group(2))
            if one_mismatches == 1:
                match_codes['U1'] += 1
            elif one_mismatches < 255:
                match_codes['R1'] += one_mismatches

            two_mismatches = int(groups.group(3))
            if two_mismatches == 1:
                match_codes['U2'] += 1
            elif two_mismatches < 255:
                match_codes['R2'] += two_mismatches

            chromo = None
            for match in fields[3].split(','):
                match_fragment = match.split(':')
                # entries without a "chromo:" prefix reuse the previous
                # chromosome: eland omits repeated chromosome names
                if len(match_fragment) == 2:
                    chromo = match_fragment[0]
                    pos = match_fragment[1]

                fasta = self.genome_map.get(chromo, chromo)
                assert fasta is not None
                mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
    return match_codes, mapped_reads, reads
def _update_name(self):
# extract the sample name
lane = ElandLane(xml=element)
self.results[lane_id] = lane
def check_for_eland_file(basedir, lane_id, pattern):
    """Return the path built from pattern % lane_id under basedir,
    or None when no such file exists.
    """
    candidate = os.path.join(basedir, pattern % (lane_id,))
    return candidate if os.path.exists(candidate) else None
+
def eland(basedir, gerald=None, genome_maps=None):
e = ELAND()
# lets handle compressed eland files too
file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
- for pathname in file_list:
+ lane_ids = ['1','2','3','4','5','6','7','8']
+ # the order in patterns determines the preference for what
+ # will be found.
+ patterns = ['s_%s_eland_result.txt',
+ 's_%s_eland_result.txt.bz2',
+ 's_%s_eland_result.txt.gz',
+ 's_%s_eland_extended.txt',
+ 's_%s_eland_extended.txt.bz2',
+ 's_%s_eland_extended.txt.gz',
+ 's_%s_eland_multi.txt',
+ 's_%s_eland_multi.txt.bz2',
+ 's_%s_eland_multi.txt.gz',]
+
+ for lane_id in lane_ids:
+ for p in patterns:
+ pathname = check_for_eland_file(basedir, lane_id, p)
+ if pathname is not None:
+ break
+ else:
+ continue
# yes the lane_id is also being computed in ElandLane._update
# I didn't want to clutter up my constructor
# but I needed to persist the sample_name/lane_id for
result = [record[0][start:end]]
outstream.write("\t".join(result))
outstream.write(os.linesep)
-
import time
from htsworkflow.pipelines.summary import Summary
-from htsworkflow.pipelines.eland import eland
+from htsworkflow.pipelines.eland import eland, ELAND
from htsworkflow.pipelines.runfolder import \
ElementTree, \
"""
Make it easy to access elements of LaneSpecificRunParameters from python
"""
def __init__(self, gerald, lane_id):
    """Bind this lane-parameter accessor to its parent gerald object
    and the lane id it describes.
    """
    self._lane_id = lane_id
    self._gerald = gerald
def __get_attribute(self, xml_tag):
subtree = self._gerald.tree.find('LaneSpecificRunParameters')
if len(container.getchildren()) > LANES_PER_FLOWCELL:
raise RuntimeError('GERALD config.xml file changed')
lanes = [x.tag.split('_')[1] for x in container.getchildren()]
- index = lanes.index(self._key)
+ index = lanes.index(self._lane_id)
element = container[index]
return element.text
def _get_analysis(self):
"""
def __init__(self, gerald):
    """Remember the parent gerald object; the lane table itself is
    built lazily on first access (see _initalize_lanes).
    """
    self._lane = None
    self._gerald = gerald
def _initalize_lanes(self):
    """
    Build the lane_id -> LaneParameters dictionary in self._lanes.
    (Method name keeps its historical spelling for compatibility.)
    """
    self._lanes = {}
    tree = self._gerald.tree
    analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
    # according to the pipeline specs I think their fields
    # are sampleName_laneID, with sampleName defaulting to s
    # since laneIDs are constant lets just try using
    # those consistently.
    for element in analysis:
        sample, lane_id = element.tag.split('_')
        self._lanes[lane_id] = Gerald.LaneParameters(self._gerald, lane_id)
    # bug fix: mark initialization complete so the accessors'
    # "if self._lane is None" guards stop rebuilding the table on
    # every single call (nothing ever set _lane before)
    self._lane = self._lanes
def __getitem__(self, key):
    """Return the LaneParameters for lane *key*, building the lane
    table on first use.
    """
    if self._lane is None:
        self._initalize_lanes()
    lane_table = self._lanes
    return lane_table[key]
def keys(self):
    """Return the lane ids present in LaneSpecificRunParameters."""
    if self._lane is None:
        self._initalize_lanes()
    lane_table = self._lanes
    return lane_table.keys()
def values(self):
    """Return the LaneParameters objects for every known lane."""
    if self._lane is None:
        self._initalize_lanes()
    lane_table = self._lanes
    return lane_table.values()
def items(self):
    """Return (lane_id, LaneParameters) pairs for every known lane."""
    if self._lane is None:
        self._initalize_lanes()
    lane_table = self._lanes
    return lane_table.items()
def __len__(self):
    """Return the number of lanes described by the config file."""
    if self._lane is None:
        self._initalize_lanes()
    return len(self._lanes)
def __init__(self, xml=None):
self.pathname = None
g.eland_results = eland(g.pathname, g)
return g
if __name__ == "__main__":
    # ad-hoc smoke test: parse the GERALD directory named on the
    # command line and build its result objects
    import sys
    g = gerald(sys.argv[1])
    #ElementTree.dump(g.get_elements())
\ No newline at end of file
report.append('')
return os.linesep.join(report)
def is_compressed(filename):
    """Return True if *filename* ends in a recognized compression
    suffix (.gz or .bz2), False otherwise.
    """
    # compute the extension once instead of twice, and use a single
    # membership test instead of a chain of comparisons
    return os.path.splitext(filename)[1] in ('.gz', '.bz2')
+
def extract_results(runs, output_base_dir=None):
if output_base_dir is None:
output_base_dir = os.getcwd()
for eland_lane in g.eland_results.values():
source_name = eland_lane.pathname
path, name = os.path.split(eland_lane.pathname)
- dest_name = os.path.join(cycle_dir, name+'.bz2')
-
- args = ['bzip2', '-9', '-c', source_name]
- logging.info('Running: %s' % ( " ".join(args) ))
- bzip_dest = open(dest_name, 'w')
- bzip = subprocess.Popen(args, stdout=bzip_dest)
- logging.info('Saving to %s' % (dest_name, ))
- bzip.wait()
+ dest_name = os.path.join(cycle_dir, name)
+ if is_compressed(name):
+ logging.info('Already compressed, Saving to %s' % (dest_name, ))
+ shutil.copy(source_name, dest_name)
+ else:
+ # not compressed
+ dest_name += '.bz2'
+ args = ['bzip2', '-9', '-c', source_name]
+ logging.info('Running: %s' % ( " ".join(args) ))
+ bzip_dest = open(dest_name, 'w')
+ bzip = subprocess.Popen(args, stdout=bzip_dest)
+ logging.info('Saving to %s' % (dest_name, ))
+ bzip.wait()
def clean_runs(runs):
"""
"""
Analyze the Summary.htm file produced by GERALD
"""
-
+import types
from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
f = open(pathname, 'w')
f.write(eland_result)
f.close()
+
def make_eland_multi(gerald_dir):
    """Write identical simulated eland_multi result files for lanes
    1-8 into *gerald_dir* (test fixture data).
    """
    eland_multi = """>HWI-EAS229_60_30DP9AAXX:1:1:1221:788 AAGATATCTACGACGTGGTATGGCGGTGTCTGGTCGT NM
>HWI-EAS229_60_30DP9AAXX:1:1:931:747 AAAAAAGCAAATTTCATTCACATGTTCTGTGTTCATA 1:0:2 chr5.fa:55269838R0
>HWI-EAS229_60_30DP9AAXX:1:1:1121:379 AGAAGAGACATTAAGAGTTCCTGAAATTTATATCTGG 2:1:0 chr16.fa:46189180R1,chr7.fa:122968519R0,chr8.fa:48197174F0
>HWI-EAS229_60_30DP9AAXX:1:1:892:1155 ACATTCTCCTTTCCTTCTGAAGTTTTTACGATTCTTT 0:9:10 chr10.fa:114298201F1,chr12.fa:8125072F1,19500297F2,42341293R2,chr13.fa:27688155R2,95069772R1,chr15.fa:51016475F2,chr16.fa:27052155F2,chr1.fa:192426217R2,chr21.fa:23685310R2,chr2.fa:106680068F1,chr3.fa:185226695F2,chr4.fa:106626808R2,chr5.fa:14704894F1,43530779F1,126543189F2,chr6.fa:74284101F1,chr7.fa:22516603F1,chr9.fa:134886204R
"""
    for lane in range(1, 9):
        target = os.path.join(gerald_dir, 's_%d_eland_multi.txt' % (lane,))
        with open(target, 'w') as stream:
            stream.write(eland_multi)
os.mkdir(gerald_dir)
make_gerald_config(gerald_dir)
make_summary100_htm(gerald_dir)
- make_eland_results(gerald_dir)
+ make_eland_multi(gerald_dir)
if obj is not None:
obj.temp_dir = temp_dir
self.failUnlessEqual(cur_lane.read_length, '32')
self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
+ # I want to be able to use a simple iterator
+ for l in g.lanes.values():
+ self.failUnlessEqual(l.analysis, 'eland')
+ self.failUnlessEqual(l.read_length, '32')
+ self.failUnlessEqual(l.use_bases, 'Y'*32)
+
# test data extracted from summary file
clusters = [None,
(96483, 9074), (133738, 7938),
def test_eland(self):
- dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
- 'chr2L.fa': 'dm3/chr2L.fa',
- 'Lambda.fa': 'Lambda.fa'}
- genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
- '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
+ hg_map = {'Lambda.fa': 'Lambda.fa'}
+ for i in range(1,22):
+ short_name = 'chr%d.fa' % (i,)
+ long_name = 'hg18/chr%d.fa' % (i,)
+ hg_map[short_name] = long_name
+
+ genome_maps = { '1':hg_map, '2':hg_map, '3':hg_map, '4':hg_map,
+ '5':hg_map, '6':hg_map, '7':hg_map, '8':hg_map }
eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
for i in range(1,9):
self.failUnlessEqual(lane.reads, 4)
self.failUnlessEqual(lane.sample_name, "s")
self.failUnlessEqual(lane.lane_id, unicode(i))
- self.failUnlessEqual(len(lane.mapped_reads), 3)
- self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
- self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
- self.failUnlessEqual(lane.match_codes['U1'], 2)
+ self.failUnlessEqual(len(lane.mapped_reads), 15)
+ self.failUnlessEqual(lane.mapped_reads['hg18/chr5.fa'], 4)
+ self.failUnlessEqual(lane.match_codes['U1'], 10)
self.failUnlessEqual(lane.match_codes['NM'], 1)
+ self.failUnlessEqual(lane.match_codes['QC'], 0)
xml = eland.get_elements()
# just make sure that element tree can serialize the tree
self.failUnlessEqual(l1.sample_name, l2.sample_name)
self.failUnlessEqual(l1.lane_id, l2.lane_id)
self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
- self.failUnlessEqual(len(l1.mapped_reads), 3)
+ self.failUnlessEqual(len(l1.mapped_reads), 15)
for k in l1.mapped_reads.keys():
self.failUnlessEqual(l1.mapped_reads[k],
l2.mapped_reads[k])
# do we get the flowcell id from the filename?
self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-10-30.xml')
+ name = 'run_207BTAAXX_%s.xml' % ( date.today().strftime('%Y-%m-%d'),)
+ self.failUnlessEqual(runs[0].name, name)
# do we get the flowcell id from the FlowcellId.xml file
make_flowcell_id(self.runfolder_dir, '207BTAAXY')
runs = runfolder.get_runs(self.runfolder_dir)
self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-10-30.xml')
+ name = 'run_207BTAAXY_%s.xml' % ( date.today().strftime('%Y-%m-%d'),)
+ self.failUnlessEqual(runs[0].name, name)
r1 = runs[0]
xml = r1.get_elements()