11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
15 class ElandLane(object):
17 Process an eland result file
21 SAMPLE_NAME = 'SampleName'
23 GENOME_MAP = 'GenomeMap'
24 GENOME_ITEM = 'GenomeItem'
25 MAPPED_READS = 'MappedReads'
26 MAPPED_ITEM = 'MappedItem'
27 MATCH_CODES = 'MatchCodes'
36 def __init__(self, pathname=None, genome_map=None, eland_type=None, xml=None):
37 self.pathname = pathname
38 self._sample_name = None
41 self._mapped_reads = None
42 self._match_codes = None
43 if genome_map is None:
45 self.genome_map = genome_map
46 self.eland_type = None
49 self.set_elements(xml)
51 def _guess_eland_type(self, pathname):
52 if self.eland_type is None:
53 # attempt autodetect eland file type
54 pathn, name = os.path.split(pathname)
55 if re.search('result', name):
56 self.eland_type = ElandLane.ELAND_SINGLE
57 elif re.search('multi', name):
58 self.eland_type = ElandLane.ELAND_MULTI
59 elif re.search('extended', name):
60 self.eland_type = ElandLane.ELAND_EXTENDED
61 elif re.search('export', name):
62 self.eland_type = ElandLane.ELAND_EXPORT
64 self.eland_type = ElandLane.ELAND_SINGLE
68 Actually read the file and actually count the reads
70 # can't do anything if we don't have a file to process
71 if self.pathname is None:
73 self._guess_eland_type(self.pathname)
75 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
76 raise RuntimeError("Eland isn't done, try again later.")
78 logging.info("summarizing results for %s" % (self.pathname))
80 if self.eland_type == ElandLane.ELAND_SINGLE:
81 result = self._update_eland_result(self.pathname)
82 elif self.eland_type == ElandLane.ELAND_MULTI or \
83 self.eland_type == ElandLane.ELAND_EXTENDED:
84 result = self._update_eland_multi(self.pathname)
86 raise NotImplementedError("Only support single/multi/extended eland files")
87 self._match_codes, self._mapped_reads, self._reads = result
89 def _update_eland_result(self, pathname):
93 match_codes = {'NM':0, 'QC':0, 'RM':0,
94 'U0':0, 'U1':0, 'U2':0,
95 'R0':0, 'R1':0, 'R2':0,
97 for line in autoopen(pathname,'r'):
101 # match_codes[code] = match_codes.setdefault(code, 0) + 1
102 # the QC/NM etc codes are in the 3rd field and always present
103 match_codes[fields[2]] += 1
104 # ignore lines that don't have a fasta filename
107 fasta = self.genome_map.get(fields[6], fields[6])
108 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
109 return match_codes, mapped_reads, reads
111 def _update_eland_multi(self, pathname):
115 match_codes = {'NM':0, 'QC':0, 'RM':0,
116 'U0':0, 'U1':0, 'U2':0,
117 'R0':0, 'R1':0, 'R2':0,
119 match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
120 for line in autoopen(pathname,'r'):
122 fields = line.split()
123 # fields[2] = QC/NM/or number of matches
124 groups = match_counts_re.match(fields[2])
126 match_codes[fields[2]] += 1
128 # when there are too many hit, eland writes a - where
129 # it would have put the list of hits
132 zero_mismatches = int(groups.group(1))
133 if zero_mismatches == 1:
134 match_codes['U0'] += 1
135 elif zero_mismatches < 255:
136 match_codes['R0'] += zero_mismatches
138 one_mismatches = int(groups.group(2))
139 if one_mismatches == 1:
140 match_codes['U1'] += 1
141 elif one_mismatches < 255:
142 match_codes['R1'] += one_mismatches
144 two_mismatches = int(groups.group(3))
145 if two_mismatches == 1:
146 match_codes['U2'] += 1
147 elif two_mismatches < 255:
148 match_codes['R2'] += two_mismatches
151 for match in fields[3].split(','):
152 match_fragment = match.split(':')
153 if len(match_fragment) == 2:
154 chromo = match_fragment[0]
155 pos = match_fragment[1]
157 fasta = self.genome_map.get(chromo, chromo)
158 assert fasta is not None
159 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
160 return match_codes, mapped_reads, reads
162 def _update_name(self):
163 # extract the sample name
164 if self.pathname is None:
167 path, name = os.path.split(self.pathname)
168 split_name = name.split('_')
169 self._sample_name = split_name[0]
170 self._lane_id = split_name[1]
172 def _get_sample_name(self):
173 if self._sample_name is None:
175 return self._sample_name
176 sample_name = property(_get_sample_name)
178 def _get_lane_id(self):
179 if self._lane_id is None:
182 lane_id = property(_get_lane_id)
184 def _get_reads(self):
185 if self._reads is None:
188 reads = property(_get_reads)
190 def _get_mapped_reads(self):
191 if self._mapped_reads is None:
193 return self._mapped_reads
194 mapped_reads = property(_get_mapped_reads)
196 def _get_match_codes(self):
197 if self._match_codes is None:
199 return self._match_codes
200 match_codes = property(_get_match_codes)
202 def get_elements(self):
203 lane = ElementTree.Element(ElandLane.LANE,
205 unicode(ElandLane.XML_VERSION)})
206 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
207 sample_tag.text = self.sample_name
208 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
209 lane_tag.text = self.lane_id
210 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
211 for k, v in self.genome_map.items():
212 item = ElementTree.SubElement(
213 genome_map, ElandLane.GENOME_ITEM,
214 {'name':k, 'value':unicode(v)})
215 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
216 for k, v in self.mapped_reads.items():
217 item = ElementTree.SubElement(
218 mapped_reads, ElandLane.MAPPED_ITEM,
219 {'name':k, 'value':unicode(v)})
220 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
221 for k, v in self.match_codes.items():
222 item = ElementTree.SubElement(
223 match_codes, ElandLane.MATCH_ITEM,
224 {'name':k, 'value':unicode(v)})
225 reads = ElementTree.SubElement(lane, ElandLane.READS)
226 reads.text = unicode(self.reads)
230 def set_elements(self, tree):
231 if tree.tag != ElandLane.LANE:
232 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
235 self._mapped_reads = {}
236 self._match_codes = {}
239 tag = element.tag.lower()
240 if tag == ElandLane.SAMPLE_NAME.lower():
241 self._sample_name = element.text
242 elif tag == ElandLane.LANE_ID.lower():
243 self._lane_id = element.text
244 elif tag == ElandLane.GENOME_MAP.lower():
245 for child in element:
246 name = child.attrib['name']
247 value = child.attrib['value']
248 self.genome_map[name] = value
249 elif tag == ElandLane.MAPPED_READS.lower():
250 for child in element:
251 name = child.attrib['name']
252 value = child.attrib['value']
253 self._mapped_reads[name] = int(value)
254 elif tag == ElandLane.MATCH_CODES.lower():
255 for child in element:
256 name = child.attrib['name']
257 value = int(child.attrib['value'])
258 self._match_codes[name] = value
259 elif tag == ElandLane.READS.lower():
260 self._reads = int(element.text)
262 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
266 Summarize information from eland files
270 ELAND = 'ElandCollection'
274 def __init__(self, xml=None):
275 # we need information from the gerald config.xml
279 self.set_elements(xml)
282 return len(self.results)
285 return self.results.keys()
288 return self.results.values()
291 return self.results.items()
293 def __getitem__(self, key):
294 return self.results[key]
296 def get_elements(self):
297 root = ElementTree.Element(ELAND.ELAND,
298 {'version': unicode(ELAND.XML_VERSION)})
299 for lane_id, lane in self.results.items():
300 eland_lane = lane.get_elements()
301 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
302 root.append(eland_lane)
305 def set_elements(self, tree):
306 if tree.tag.lower() != ELAND.ELAND.lower():
307 raise ValueError('Expecting %s', ELAND.ELAND)
308 for element in list(tree):
309 lane_id = element.attrib[ELAND.LANE_ID]
310 lane = ElandLane(xml=element)
311 self.results[lane_id] = lane
313 def check_for_eland_file(basedir, lane_id, pattern):
314 basename = pattern % (lane_id,)
315 pathname = os.path.join(basedir, basename)
316 if os.path.exists(pathname):
321 def eland(basedir, gerald=None, genome_maps=None):
324 file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
325 if len(file_list) == 0:
326 # lets handle compressed eland files too
327 file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
329 lane_ids = ['1','2','3','4','5','6','7','8']
330 # the order in patterns determines the preference for what
332 patterns = ['s_%s_eland_result.txt',
333 's_%s_eland_result.txt.bz2',
334 's_%s_eland_result.txt.gz',
335 's_%s_eland_extended.txt',
336 's_%s_eland_extended.txt.bz2',
337 's_%s_eland_extended.txt.gz',
338 's_%s_eland_multi.txt',
339 's_%s_eland_multi.txt.bz2',
340 's_%s_eland_multi.txt.gz',]
342 for lane_id in lane_ids:
344 pathname = check_for_eland_file(basedir, lane_id, p)
345 if pathname is not None:
349 # yes the lane_id is also being computed in ElandLane._update
350 # I didn't want to clutter up my constructor
351 # but I needed to persist the sample_name/lane_id for
352 # runfolder summary_report
353 path, name = os.path.split(pathname)
354 logging.info("Adding eland file %s" %(name,))
355 split_name = name.split('_')
356 lane_id = split_name[1]
358 if genome_maps is not None:
359 genome_map = genome_maps[lane_id]
360 elif gerald is not None:
361 genome_dir = gerald.lanes[lane_id].eland_genome
362 genome_map = build_genome_fasta_map(genome_dir)
366 eland_result = ElandLane(pathname, genome_map)
367 e.results[lane_id] = eland_result
370 def build_genome_fasta_map(genome_dir):
371 # build fasta to fasta file map
372 logging.info("Building genome map")
373 genome = genome_dir.split(os.path.sep)[-1]
375 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
377 if os.path.islink(vld_file):
379 vld_file = os.path.realpath(vld_file)
380 path, vld_name = os.path.split(vld_file)
381 name, ext = os.path.splitext(vld_name)
383 fasta_map[name] = name
385 fasta_map[name] = os.path.join(genome, name)
389 def extract_eland_sequence(instream, outstream, start, end):
391 Extract a chunk of sequence out of an eland file
393 for line in instream:
394 record = line.split()
396 result = [record[0], record[1][start:end]]
398 result = [record[0][start:end]]
399 outstream.write("\t".join(result))
400 outstream.write(os.linesep)