bdbe5758eb2ef59319d13151d169c91c9aacf04d
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10
11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
14
class ElandLane(object):
    """
    Summarize one eland result file for a single lane.

    The file named by ``pathname`` is parsed lazily, the first time one of
    the ``reads``, ``mapped_reads`` or ``match_codes`` properties (or a
    property derived from them) is read.  A summary previously produced by
    ``get_elements`` can be restored by passing it as ``xml``.
    """
    XML_VERSION = 2
    LANE = 'ElandLane'
    SAMPLE_NAME = 'SampleName'
    LANE_ID = 'LaneID'
    END = 'End'
    GENOME_MAP = 'GenomeMap'
    GENOME_ITEM = 'GenomeItem'
    MAPPED_READS = 'MappedReads'
    MAPPED_ITEM = 'MappedItem'
    MATCH_CODES = 'MatchCodes'
    MATCH_ITEM = 'Code'
    READS = 'Reads'

    ELAND_SINGLE = 0
    ELAND_MULTI = 1
    ELAND_EXTENDED = 2
    ELAND_EXPORT = 3

    def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
        """
        :param pathname: path of the eland file to summarize, or None
        :param lane_id: lane identifier stored with the summary
        :param end: read end stored with the summary, or None
        :param genome_map: dict translating sequence names found in the
                           eland file; names missing from the map are
                           used unchanged
        :param eland_type: one of the ELAND_* constants; when None the
                           type is guessed from the file name
        :param xml: optional ElandLane element to restore state from
        """
        self.pathname = pathname
        self._sample_name = None
        self.lane_id = lane_id
        self.end = end
        self._reads = None
        self._mapped_reads = None
        self._match_codes = None
        if genome_map is None:
            genome_map = {}
        self.genome_map = genome_map
        # honour an explicitly requested type; _guess_eland_type only
        # fills this in when it is still None
        self.eland_type = eland_type

        if xml is not None:
            self.set_elements(xml)

    def _guess_eland_type(self, pathname):
        """
        Set self.eland_type from the file name unless it is already set.

        The first keyword found in the file name wins; files matching no
        keyword are treated as single-match result files.
        """
        if self.eland_type is not None:
            return

        name = os.path.basename(pathname)
        keywords = (('result', ElandLane.ELAND_SINGLE),
                    ('multi', ElandLane.ELAND_MULTI),
                    ('extended', ElandLane.ELAND_EXTENDED),
                    ('export', ElandLane.ELAND_EXPORT))
        for keyword, eland_type in keywords:
            if keyword in name:
                self.eland_type = eland_type
                return
        self.eland_type = ElandLane.ELAND_SINGLE

    def _update(self):
        """
        Actually read the file and actually count the reads

        Does nothing when no pathname was provided.
        Raises RuntimeError if the file is empty and NotImplementedError
        for file types other than single/multi/extended.
        """
        # can't do anything if we don't have a file to process
        if self.pathname is None:
            return
        self._guess_eland_type(self.pathname)

        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
            raise RuntimeError("Eland isn't done, try again later.")

        logging.info("summarizing results for %s" % (self.pathname))

        if self.eland_type == ElandLane.ELAND_SINGLE:
            result = self._update_eland_result(self.pathname)
        elif self.eland_type in (ElandLane.ELAND_MULTI,
                                 ElandLane.ELAND_EXTENDED):
            result = self._update_eland_multi(self.pathname)
        else:
            raise NotImplementedError("Only support single/multi/extended eland files")
        self._match_codes, self._mapped_reads, self._reads = result

    def _update_eland_result(self, pathname):
        """
        Count the reads in a single-match eland result file.

        Returns a (match_codes, mapped_reads, reads) tuple.
        """
        reads = 0
        mapped_reads = {}

        match_codes = {'NM':0, 'QC':0, 'RM':0,
                       'U0':0, 'U1':0, 'U2':0,
                       'R0':0, 'R1':0, 'R2':0,
                      }
        stream = autoopen(pathname, 'r')
        try:
            for line in stream:
                reads += 1
                fields = line.split()
                # the QC/NM etc codes are in the 3rd field and always present
                match_codes[fields[2]] += 1
                # ignore lines that don't have a fasta filename
                if len(fields) < 7:
                    continue
                fasta = self.genome_map.get(fields[6], fields[6])
                mapped_reads[fasta] = mapped_reads.get(fasta, 0) + 1
        finally:
            # don't leave the (possibly compressed) stream open
            stream.close()
        return match_codes, mapped_reads, reads

    def _update_eland_multi(self, pathname):
        """
        Count the reads in a multi/extended eland file.

        Returns a (match_codes, mapped_reads, reads) tuple.
        """
        reads = 0
        mapped_reads = {}

        match_codes = {'NM':0, 'QC':0, 'RM':0,
                       'U0':0, 'U1':0, 'U2':0,
                       'R0':0, 'R1':0, 'R2':0,
                      }
        # (unique code, repeat code) for the 0, 1 and 2 mismatch counts
        code_pairs = (('U0', 'R0'), ('U1', 'R1'), ('U2', 'R2'))
        match_counts_re = re.compile(r"([\d]+):([\d]+):([\d]+)")
        stream = autoopen(pathname, 'r')
        try:
            for line in stream:
                reads += 1
                fields = line.split()
                # fields[2] = QC/NM/or number of matches
                groups = match_counts_re.match(fields[2])
                if groups is None:
                    match_codes[fields[2]] += 1
                    continue

                # when there are too many hits, eland writes a - where
                # it would have put the list of hits
                if len(fields) < 4 or fields[3] == '-':
                    continue

                for index, (unique_code, repeat_code) in enumerate(code_pairs):
                    count = int(groups.group(index + 1))
                    if count == 1:
                        match_codes[unique_code] += 1
                    elif count < 255:
                        match_codes[repeat_code] += count

                chromo = None
                for match in fields[3].split(','):
                    match_fragment = match.split(':')
                    # a hit without its own "name:" prefix is counted
                    # against the most recently named sequence
                    if len(match_fragment) == 2:
                        chromo = match_fragment[0]

                    fasta = self.genome_map.get(chromo, chromo)
                    assert fasta is not None
                    mapped_reads[fasta] = mapped_reads.get(fasta, 0) + 1
        finally:
            # don't leave the (possibly compressed) stream open
            stream.close()
        return match_codes, mapped_reads, reads

    def _update_name(self):
        """
        Extract the sample name: the file name up to its first underscore.
        """
        if self.pathname is None:
            return

        name = os.path.basename(self.pathname)
        self._sample_name = name.split('_')[0]

    def _get_sample_name(self):
        if self._sample_name is None:
            self._update_name()
        return self._sample_name
    sample_name = property(_get_sample_name,
                           doc="sample name taken from the file name")

    def _get_reads(self):
        if self._reads is None:
            self._update()
        return self._reads
    reads = property(_get_reads,
                     doc="total number of reads (lines) in the file")

    def _get_mapped_reads(self):
        if self._mapped_reads is None:
            self._update()
        return self._mapped_reads
    mapped_reads = property(_get_mapped_reads,
                            doc="dict of sequence name to hit count")

    def _get_match_codes(self):
        if self._match_codes is None:
            self._update()
        return self._match_codes
    match_codes = property(_get_match_codes,
                           doc="dict of eland match code to count")

    def _get_no_match(self):
        return self.match_codes['NM']
    no_match = property(_get_no_match,
                        doc="total reads that didn't match the target genome.")

    def _get_no_match_percent(self):
        return float(self.no_match)/self.reads * 100
    no_match_percent = property(_get_no_match_percent,
                                doc="no match reads as percent of total")

    def _get_qc_failed(self):
        return self.match_codes['QC']
    qc_failed = property(_get_qc_failed,
                         doc="total reads that failed QC.")

    def _get_qc_failed_percent(self):
        return float(self.qc_failed)/self.reads * 100
    qc_failed_percent = property(_get_qc_failed_percent,
                                 doc="QC failed reads as percent of total")

    def _get_unique_reads(self):
        codes = self.match_codes
        return sum(codes[code] for code in ('U0', 'U1', 'U2'))
    unique_reads = property(_get_unique_reads,
                            doc="total unique reads")

    def _get_repeat_reads(self):
        codes = self.match_codes
        return sum(codes[code] for code in ('R0', 'R1', 'R2'))
    repeat_reads = property(_get_repeat_reads,
                            doc="total repeat reads")

    def get_elements(self):
        """
        Return an ElandLane element describing this lane's summary.
        """
        lane = ElementTree.Element(ElandLane.LANE,
                                   {'version':
                                    unicode(ElandLane.XML_VERSION)})
        sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
        sample_tag.text = self.sample_name
        lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
        lane_tag.text = str(self.lane_id)
        if self.end is not None:
            end_tag = ElementTree.SubElement(lane, ElandLane.END)
            end_tag.text = str(self.end)
        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
        for k, v in self.genome_map.items():
            ElementTree.SubElement(
                genome_map, ElandLane.GENOME_ITEM,
                {'name':k, 'value':unicode(v)})
        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
        for k, v in self.mapped_reads.items():
            ElementTree.SubElement(
                mapped_reads, ElandLane.MAPPED_ITEM,
                {'name':k, 'value':unicode(v)})
        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
        for k, v in self.match_codes.items():
            ElementTree.SubElement(
                match_codes, ElandLane.MATCH_ITEM,
                {'name':k, 'value':unicode(v)})
        reads = ElementTree.SubElement(lane, ElandLane.READS)
        reads.text = unicode(self.reads)

        return lane

    def set_elements(self, tree):
        """
        Restore this lane's summary from an ElandLane element.

        Child tags are matched case-insensitively; unrecognized tags are
        logged and skipped.  Raises ValueError if ``tree`` is not an
        ElandLane element.
        """
        if tree.tag != ElandLane.LANE:
            raise ValueError('Expecting %s' % (ElandLane.LANE,))

        # reset dictionaries
        self._mapped_reads = {}
        self._match_codes = {}

        for element in tree:
            tag = element.tag.lower()
            if tag == ElandLane.SAMPLE_NAME.lower():
                self._sample_name = element.text
            elif tag == ElandLane.LANE_ID.lower():
                self.lane_id = int(element.text)
            elif tag == ElandLane.END.lower():
                self.end = int(element.text)
            elif tag == ElandLane.GENOME_MAP.lower():
                for child in element:
                    self.genome_map[child.attrib['name']] = child.attrib['value']
            elif tag == ElandLane.MAPPED_READS.lower():
                for child in element:
                    self._mapped_reads[child.attrib['name']] = int(child.attrib['value'])
            elif tag == ElandLane.MATCH_CODES.lower():
                for child in element:
                    self._match_codes[child.attrib['name']] = int(child.attrib['value'])
            elif tag == ElandLane.READS.lower():
                self._reads = int(element.text)
            else:
                logging.warning("ElandLane unrecognized tag %s" % (element.tag,))
307
class ELAND(object):
    """
    Summarize information from eland files

    ``results`` is a two element list, one dictionary per read end; each
    dictionary maps a lane id to that lane's ElandLane summary.
    """
    XML_VERSION = 2

    ELAND = 'ElandCollection'
    LANE = 'Lane'
    LANE_ID = 'id'
    END = 'end'

    def __init__(self, xml=None):
        """
        :param xml: optional ElandCollection element to restore state from
        """
        self.results = [{},{}]

        if xml is not None:
            self.set_elements(xml)

    def get_elements(self):
        """
        Return an ElandCollection element holding every lane summary.

        Each lane element is tagged with the index of the end it was
        stored under and with its lane id.
        """
        root = ElementTree.Element(ELAND.ELAND,
                                   {'version': unicode(ELAND.XML_VERSION)})
        for end, end_results in enumerate(self.results):
            for lane_id, lane in end_results.items():
                eland_lane = lane.get_elements()
                eland_lane.attrib[ELAND.END] = unicode(end)
                eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
                root.append(eland_lane)
        return root

    def set_elements(self, tree):
        """
        Load lane summaries from an ElandCollection element.

        Lanes without an end attribute are stored under end index 0.
        Raises ValueError if ``tree`` is not an ElandCollection element
        (the tag is compared case-insensitively).
        """
        if tree.tag.lower() != ELAND.ELAND.lower():
            # format the message; passing the name as a second argument
            # left the '%s' placeholder unfilled
            raise ValueError('Expecting %s' % (ELAND.ELAND,))
        for element in list(tree):
            lane_id = int(element.attrib[ELAND.LANE_ID])
            end = int(element.attrib.get(ELAND.END, 0))
            lane = ElandLane(xml=element)
            self.results[end][lane_id] = lane
346
def check_for_eland_file(basedir, pattern, lane_id, end):
    """
    Return the path of a lane's eland file if it exists, otherwise None.

    ``pattern`` is a %-format string with one placeholder; it is filled
    with ``lane_id``, or with ``<lane_id>_<end>`` when ``end`` is given,
    and the result is looked up inside ``basedir``.
    """
    full_lane_id = lane_id if end is None else "%d_%d" % (lane_id, end)
    candidate = os.path.join(basedir, pattern % (full_lane_id,))
    return candidate if os.path.exists(candidate) else None
359
def eland(basedir, gerald=None, genome_maps=None):
    """
    Collect the eland files found under basedir into an ELAND object.

    Lanes 1-8 are searched for unpaired files and for end 1 / end 2
    files.  The genome map for a lane comes from ``genome_maps`` when
    given, otherwise it is built from the lane's eland_genome directory
    in ``gerald``, otherwise it is empty.
    """
    collection = ELAND()

    # 1.1rc1 moves most of the files we've historically cared about into
    # basedir/Temp, so prefer that directory when it exists.
    # we should look into what the official 'result' files are.
    temp_dir = os.path.join(basedir, 'Temp')
    if os.path.isdir(temp_dir):
        basedir = temp_dir

    # earlier patterns are preferred over later ones
    patterns = ['s_%s_eland_result.txt',
                's_%s_eland_result.txt.bz2',
                's_%s_eland_result.txt.gz',
                's_%s_eland_extended.txt',
                's_%s_eland_extended.txt.bz2',
                's_%s_eland_extended.txt.gz',
                's_%s_eland_multi.txt',
                's_%s_eland_multi.txt.bz2',
                's_%s_eland_multi.txt.gz',]

    for end in [None, 1, 2]:
        for lane_id in range(1, 9):
            # stop probing at the first pattern that names an existing file
            pathname = None
            for pattern in patterns:
                pathname = check_for_eland_file(basedir, pattern, lane_id, end)
                if pathname is not None:
                    break
            if pathname is None:
                continue

            logging.info("Adding eland file %s" %(os.path.split(pathname)[1],))

            if genome_maps is not None:
                genome_map = genome_maps[lane_id]
            elif gerald is not None:
                genome_map = build_genome_fasta_map(
                    gerald.lanes[lane_id].eland_genome)
            else:
                genome_map = {}

            # the lane id and end are handed to ElandLane so they persist
            # alongside the sample name for the runfolder summary report
            lane_summary = ElandLane(pathname, lane_id, end, genome_map)
            # unpaired files share slot 0 with end 1
            slot = 0 if end is None else end - 1
            collection.results[slot][lane_id] = lane_summary
    return collection
418
def build_genome_fasta_map(genome_dir):
    """
    Build a map from sequence file names to genome-qualified names.

    Every ``*.vld`` file in ``genome_dir`` contributes one entry keyed by
    its resolved file name without the ``.vld`` extension.  Entries found
    through a symbolic link map to the bare name; all others map to
    ``<genome>/<name>``, where genome is the last component of
    ``genome_dir``.
    """
    # build fasta to fasta file map
    logging.info("Building genome map")
    # drop trailing separators first, otherwise "path/to/genome/" would
    # yield an empty genome name
    genome = genome_dir.rstrip(os.path.sep).split(os.path.sep)[-1]
    fasta_map = {}
    for vld_file in glob(os.path.join(genome_dir, '*.vld')):
        # must be checked before realpath() resolves the link away
        is_link = os.path.islink(vld_file)
        vld_name = os.path.basename(os.path.realpath(vld_file))
        name, ext = os.path.splitext(vld_name)
        if is_link:
            fasta_map[name] = name
        else:
            fasta_map[name] = os.path.join(genome, name)
    return fasta_map
436
437
def extract_eland_sequence(instream, outstream, start, end):
    """
    Extract a chunk of sequence out of an eland file

    For every line of ``instream`` the sequence is sliced to
    ``[start:end]`` and written to ``outstream``.  Lines with more than
    one whitespace separated field are written as the first field, a tab
    and the sliced second field; single field lines are written as the
    sliced field alone.  Blank lines are skipped.
    """
    for line in instream:
        record = line.split()
        if not record:
            # nothing to slice on a blank line
            continue
        if len(record) > 1:
            result = [record[0], record[1][start:end]]
        else:
            result = [record[0][start:end]]
        outstream.write("\t".join(result))
        # text streams translate "\n" themselves; writing os.linesep
        # would produce "\r\r\n" on platforms where it is "\r\n"
        outstream.write("\n")