The summary parsing code now seems to handle paired end runs
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10
11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
14
15 class ElandLane(object):
16     """
17     Process an eland result file
18     """
19     XML_VERSION = 1
20     LANE = 'ElandLane'
21     SAMPLE_NAME = 'SampleName'
22     LANE_ID = 'LaneID'
23     GENOME_MAP = 'GenomeMap'
24     GENOME_ITEM = 'GenomeItem'
25     MAPPED_READS = 'MappedReads'
26     MAPPED_ITEM = 'MappedItem'
27     MATCH_CODES = 'MatchCodes'
28     MATCH_ITEM = 'Code'
29     READS = 'Reads'
30
31     ELAND_SINGLE = 0
32     ELAND_MULTI = 1
33     ELAND_EXTENDED = 2
34     ELAND_EXPORT = 3
35
36     def __init__(self, pathname=None, genome_map=None, eland_type=None, xml=None):
37         self.pathname = pathname
38         self._sample_name = None
39         self._lane_id = None
40         self._reads = None
41         self._mapped_reads = None
42         self._match_codes = None
43         if genome_map is None:
44             genome_map = {}
45         self.genome_map = genome_map
46         self.eland_type = None
47
48         if xml is not None:
49             self.set_elements(xml)
50
51     def _guess_eland_type(self, pathname):
52         if self.eland_type is None:
53           # attempt autodetect eland file type
54           pathn, name = os.path.split(pathname)
55           if re.search('result', name):
56             self.eland_type = ElandLane.ELAND_SINGLE
57           elif re.search('multi', name):
58             self.eland_type = ElandLane.ELAND_MULTI
59           elif re.search('extended', name):
60             self.eland_type = ElandLane.ELAND_EXTENDED
61           elif re.search('export', name):
62             self.eland_type = ElandLane.ELAND_EXPORT
63           else:
64             self.eland_type = ElandLane.ELAND_SINGLE
65
66     def _update(self):
67         """
68         Actually read the file and actually count the reads
69         """
70         # can't do anything if we don't have a file to process
71         if self.pathname is None:
72             return
73         self._guess_eland_type(self.pathname)
74
75         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
76             raise RuntimeError("Eland isn't done, try again later.")
77
78         logging.info("summarizing results for %s" % (self.pathname))
79
80         if self.eland_type == ElandLane.ELAND_SINGLE:
81           result = self._update_eland_result(self.pathname)
82         elif self.eland_type == ElandLane.ELAND_MULTI or \
83              self.eland_type == ElandLane.ELAND_EXTENDED:
84           result = self._update_eland_multi(self.pathname)
85         else:
86           raise NotImplementedError("Only support single/multi/extended eland files")
87         self._match_codes, self._mapped_reads, self._reads = result
88
89     def _update_eland_result(self, pathname):
90         reads = 0
91         mapped_reads = {}
92
93         match_codes = {'NM':0, 'QC':0, 'RM':0,
94                        'U0':0, 'U1':0, 'U2':0,
95                        'R0':0, 'R1':0, 'R2':0,
96                       }
97         for line in autoopen(pathname,'r'):
98             reads += 1
99             fields = line.split()
100             # code = fields[2]
101             # match_codes[code] = match_codes.setdefault(code, 0) + 1
102             # the QC/NM etc codes are in the 3rd field and always present
103             match_codes[fields[2]] += 1
104             # ignore lines that don't have a fasta filename
105             if len(fields) < 7:
106                 continue
107             fasta = self.genome_map.get(fields[6], fields[6])
108             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
109         return match_codes, mapped_reads, reads
110
111     def _update_eland_multi(self, pathname):
112         reads = 0
113         mapped_reads = {}
114
115         match_codes = {'NM':0, 'QC':0, 'RM':0,
116                        'U0':0, 'U1':0, 'U2':0,
117                        'R0':0, 'R1':0, 'R2':0,
118                       }
119         match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
120         for line in autoopen(pathname,'r'):
121             reads += 1
122             fields = line.split()
123             # fields[2] = QC/NM/or number of matches
124             groups = match_counts_re.match(fields[2])
125             if groups is None:
126                 match_codes[fields[2]] += 1
127             else:
128                 # when there are too many hit, eland writes a - where
129                 # it would have put the list of hits
130                 if fields[3] == '-':
131                   continue
132                 zero_mismatches = int(groups.group(1))
133                 if zero_mismatches == 1:
134                   match_codes['U0'] += 1
135                 elif zero_mismatches < 255:
136                   match_codes['R0'] += zero_mismatches
137
138                 one_mismatches = int(groups.group(2))
139                 if one_mismatches == 1:
140                   match_codes['U1'] += 1
141                 elif one_mismatches < 255:
142                   match_codes['R1'] += one_mismatches
143
144                 two_mismatches = int(groups.group(3))
145                 if two_mismatches == 1:
146                   match_codes['U2'] += 1
147                 elif two_mismatches < 255:
148                   match_codes['R2'] += two_mismatches
149
150                 chromo = None
151                 for match in fields[3].split(','):
152                   match_fragment = match.split(':')
153                   if len(match_fragment) == 2:
154                       chromo = match_fragment[0]
155                       pos = match_fragment[1]
156
157                   fasta = self.genome_map.get(chromo, chromo)
158                   assert fasta is not None
159                   mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
160         return match_codes, mapped_reads, reads
161
162     def _update_name(self):
163         # extract the sample name
164         if self.pathname is None:
165             return
166
167         path, name = os.path.split(self.pathname)
168         split_name = name.split('_')
169         self._sample_name = split_name[0]
170         self._lane_id = int(split_name[1])
171
172     def _get_sample_name(self):
173         if self._sample_name is None:
174             self._update_name()
175         return self._sample_name
176     sample_name = property(_get_sample_name)
177
178     def _get_lane_id(self):
179         if self._lane_id is None:
180             self._update_name()
181         return self._lane_id
182     lane_id = property(_get_lane_id)
183
184     def _get_reads(self):
185         if self._reads is None:
186             self._update()
187         return self._reads
188     reads = property(_get_reads)
189
190     def _get_mapped_reads(self):
191         if self._mapped_reads is None:
192             self._update()
193         return self._mapped_reads
194     mapped_reads = property(_get_mapped_reads)
195
196     def _get_match_codes(self):
197         if self._match_codes is None:
198             self._update()
199         return self._match_codes
200     match_codes = property(_get_match_codes)
201
202     def get_elements(self):
203         lane = ElementTree.Element(ElandLane.LANE,
204                                    {'version':
205                                     unicode(ElandLane.XML_VERSION)})
206         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
207         sample_tag.text = self.sample_name
208         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
209         lane_tag.text = str(self.lane_id)
210         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
211         for k, v in self.genome_map.items():
212             item = ElementTree.SubElement(
213                 genome_map, ElandLane.GENOME_ITEM,
214                 {'name':k, 'value':unicode(v)})
215         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
216         for k, v in self.mapped_reads.items():
217             item = ElementTree.SubElement(
218                 mapped_reads, ElandLane.MAPPED_ITEM,
219                 {'name':k, 'value':unicode(v)})
220         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
221         for k, v in self.match_codes.items():
222             item = ElementTree.SubElement(
223                 match_codes, ElandLane.MATCH_ITEM,
224                 {'name':k, 'value':unicode(v)})
225         reads = ElementTree.SubElement(lane, ElandLane.READS)
226         reads.text = unicode(self.reads)
227
228         return lane
229
230     def set_elements(self, tree):
231         if tree.tag != ElandLane.LANE:
232             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
233
234         # reset dictionaries
235         self._mapped_reads = {}
236         self._match_codes = {}
237
238         for element in tree:
239             tag = element.tag.lower()
240             if tag == ElandLane.SAMPLE_NAME.lower():
241                 self._sample_name = element.text
242             elif tag == ElandLane.LANE_ID.lower():
243                 self._lane_id = int(element.text)
244             elif tag == ElandLane.GENOME_MAP.lower():
245                 for child in element:
246                     name = child.attrib['name']
247                     value = child.attrib['value']
248                     self.genome_map[name] = value
249             elif tag == ElandLane.MAPPED_READS.lower():
250                 for child in element:
251                     name = child.attrib['name']
252                     value = child.attrib['value']
253                     self._mapped_reads[name] = int(value)
254             elif tag == ElandLane.MATCH_CODES.lower():
255                 for child in element:
256                     name = child.attrib['name']
257                     value = int(child.attrib['value'])
258                     self._match_codes[name] = value
259             elif tag == ElandLane.READS.lower():
260                 self._reads = int(element.text)
261             else:
262                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
263
264 class ELAND(object):
265     """
266     Summarize information from eland files
267     """
268     XML_VERSION = 1
269
270     ELAND = 'ElandCollection'
271     LANE = 'Lane'
272     LANE_ID = 'id'
273
274     def __init__(self, xml=None):
275         # we need information from the gerald config.xml
276         self.results = {}
277
278         if xml is not None:
279             self.set_elements(xml)
280
281     def __len__(self):
282         return len(self.results)
283
284     def keys(self):
285         return self.results.keys()
286
287     def values(self):
288         return self.results.values()
289
290     def items(self):
291         return self.results.items()
292
293     def __getitem__(self, key):
294         return self.results[key]
295
296     def get_elements(self):
297         root = ElementTree.Element(ELAND.ELAND,
298                                    {'version': unicode(ELAND.XML_VERSION)})
299         for lane_id, lane in self.results.items():
300             eland_lane = lane.get_elements()
301             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
302             root.append(eland_lane)
303         return root
304
305     def set_elements(self, tree):
306         if tree.tag.lower() != ELAND.ELAND.lower():
307             raise ValueError('Expecting %s', ELAND.ELAND)
308         for element in list(tree):
309             lane_id = int(element.attrib[ELAND.LANE_ID])
310             lane = ElandLane(xml=element)
311             self.results[lane_id] = lane
312
313 def check_for_eland_file(basedir, lane_id, pattern):
314    basename = pattern % (lane_id,)
315    pathname = os.path.join(basedir, basename)
316    if os.path.exists(pathname):
317        return pathname
318    else:
319        return None
320
321 def eland(basedir, gerald=None, genome_maps=None):
322     e = ELAND()
323
324     file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
325     if len(file_list) == 0:
326         # lets handle compressed eland files too
327         file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
328
329     lane_ids = range(1,9)
330     # the order in patterns determines the preference for what
331     # will be found.
332     patterns = ['s_%d_eland_result.txt',
333                 's_%d_eland_result.txt.bz2',
334                 's_%d_eland_result.txt.gz',
335                 's_%d_eland_extended.txt',
336                 's_%d_eland_extended.txt.bz2',
337                 's_%d_eland_extended.txt.gz',
338                 's_%d_eland_multi.txt',
339                 's_%d_eland_multi.txt.bz2',
340                 's_%d_eland_multi.txt.gz',]
341
342     for lane_id in lane_ids:
343         for p in patterns:
344             pathname = check_for_eland_file(basedir, lane_id, p)
345             if pathname is not None:
346               break
347         else:
348             continue
349         # yes the lane_id is also being computed in ElandLane._update
350         # I didn't want to clutter up my constructor
351         # but I needed to persist the sample_name/lane_id for
352         # runfolder summary_report
353         path, name = os.path.split(pathname)
354         logging.info("Adding eland file %s" %(name,))
355         split_name = name.split('_')
356         lane_id = int(split_name[1])
357
358         if genome_maps is not None:
359             genome_map = genome_maps[lane_id]
360         elif gerald is not None:
361             genome_dir = gerald.lanes[lane_id].eland_genome
362             genome_map = build_genome_fasta_map(genome_dir)
363         else:
364             genome_map = {}
365
366         eland_result = ElandLane(pathname, genome_map)
367         e.results[lane_id] = eland_result
368     return e
369
370 def build_genome_fasta_map(genome_dir):
371     # build fasta to fasta file map
372     logging.info("Building genome map")
373     genome = genome_dir.split(os.path.sep)[-1]
374     fasta_map = {}
375     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
376         is_link = False
377         if os.path.islink(vld_file):
378             is_link = True
379         vld_file = os.path.realpath(vld_file)
380         path, vld_name = os.path.split(vld_file)
381         name, ext = os.path.splitext(vld_name)
382         if is_link:
383             fasta_map[name] = name
384         else:
385             fasta_map[name] = os.path.join(genome, name)
386     return fasta_map
387
388
389 def extract_eland_sequence(instream, outstream, start, end):
390     """
391     Extract a chunk of sequence out of an eland file
392     """
393     for line in instream:
394         record = line.split()
395         if len(record) > 1:
396             result = [record[0], record[1][start:end]]
397         else:
398             result = [record[0][start:end]]
399         outstream.write("\t".join(result))
400         outstream.write(os.linesep)