559a2a27837dcba42f3467e6a985ccf353e8a5f2
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10
11 from htsworkflow.pipelines.runfolder import ElementTree
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
14
15 SAMPLE_NAME = 'SampleName'
16 LANE_ID = 'LaneID'
17 END = 'End'
18 READS = 'Reads'
19
20 GENOME_MAP = 'GenomeMap'
21 GENOME_ITEM = 'GenomeItem'
22 MAPPED_READS = 'MappedReads'
23 MAPPED_ITEM = 'MappedItem'
24 MATCH_CODES = 'MatchCodes'
25 MATCH_ITEM = 'Code'
26 READS = 'Reads'
27
28 ELAND_SINGLE = 0
29 ELAND_MULTI = 1
30 ELAND_EXTENDED = 2
31 ELAND_EXPORT = 3
32
33
34 class ResultLane(object):
35     """
36     Base class for result lanes
37     """
38     XML_VERSION = 2
39     LANE = 'ResultLane'
40
41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42         self.pathname = pathname
43         self._sample_name = None
44         self.lane_id = lane_id
45         self.end = end
46         self._reads = None
47         
48         if xml is not None:
49             self.set_elements(xml)
50
51     def _update(self):
52         """
53         Actually read the file and actually count the reads
54         """
55         raise NotImplementedError("Can't count abstract classes")
56
57     def _update_name(self):
58         # extract the sample name
59         if self.pathname is None:
60             return
61
62         path, name = os.path.split(self.pathname)
63         split_name = name.split('_')
64         self._sample_name = split_name[0]
65
66     def _get_sample_name(self):
67         if self._sample_name is None:
68             self._update_name()
69         return self._sample_name
70     sample_name = property(_get_sample_name)
71
72     def _get_reads(self):
73         if self._reads is None:
74             self._update()
75         return self._reads
76     reads = property(_get_reads)
77
78
79 class ElandLane(ResultLane):
80     """
81     Process an eland result file
82     """
83     XML_VERSION = 2
84     LANE = "ElandLane"
85
86     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
87         super(ElandLane, self).__init__(pathname, lane_id, end)
88
89         self._mapped_reads = None
90         self._match_codes = None
91         if genome_map is None:
92             genome_map = {}
93         self.genome_map = genome_map
94         self.eland_type = None
95
96         if xml is not None:
97             self.set_elements(xml)
98
99     def _guess_eland_type(self, pathname):
100         if self.eland_type is None:
101           # attempt autodetect eland file type
102           pathn, name = os.path.split(pathname)
103           if re.search('result', name):
104             self.eland_type = ELAND_SINGLE
105           elif re.search('multi', name):
106             self.eland_type = ELAND_MULTI
107           elif re.search('extended', name):
108             self.eland_type = ELAND_EXTENDED
109           elif re.search('export', name):
110             self.eland_type = ELAND_EXPORT
111           else:
112             self.eland_type = ELAND_SINGLE
113
114     def _update(self):
115         """
116         Actually read the file and actually count the reads
117         """
118         # can't do anything if we don't have a file to process
119         if self.pathname is None:
120             return
121         self._guess_eland_type(self.pathname)
122
123         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
124             raise RuntimeError("Eland isn't done, try again later.")
125
126         logging.info("summarizing results for %s" % (self.pathname))
127
128         if self.eland_type == ELAND_SINGLE:
129           result = self._update_eland_result(self.pathname)
130         elif self.eland_type == ELAND_MULTI or \
131              self.eland_type == ELAND_EXTENDED:
132           result = self._update_eland_multi(self.pathname)
133         else:
134           raise NotImplementedError("Only support single/multi/extended eland files")
135         self._match_codes, self._mapped_reads, self._reads = result
136
137     def _update_eland_result(self, pathname):
138         reads = 0
139         mapped_reads = {}
140
141         match_codes = {'NM':0, 'QC':0, 'RM':0,
142                        'U0':0, 'U1':0, 'U2':0,
143                        'R0':0, 'R1':0, 'R2':0,
144                       }
145         for line in autoopen(pathname,'r'):
146             reads += 1
147             fields = line.split()
148             # code = fields[2]
149             # match_codes[code] = match_codes.setdefault(code, 0) + 1
150             # the QC/NM etc codes are in the 3rd field and always present
151             match_codes[fields[2]] += 1
152             # ignore lines that don't have a fasta filename
153             if len(fields) < 7:
154                 continue
155             fasta = self.genome_map.get(fields[6], fields[6])
156             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
157         return match_codes, mapped_reads, reads
158
159     def _update_eland_multi(self, pathname):
160         reads = 0
161         mapped_reads = {}
162
163         match_codes = {'NM':0, 'QC':0, 'RM':0,
164                        'U0':0, 'U1':0, 'U2':0,
165                        'R0':0, 'R1':0, 'R2':0,
166                       }
167         match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
168         for line in autoopen(pathname,'r'):
169             reads += 1
170             fields = line.split()
171             # fields[2] = QC/NM/or number of matches
172             groups = match_counts_re.match(fields[2])
173             if groups is None:
174                 match_codes[fields[2]] += 1
175             else:
176                 # when there are too many hit, eland  writes a - where
177                 # it would have put the list of hits.
178                 # or in a different version of eland, it just leaves
179                 # that column blank, and only outputs 3 fields.     
180                 if len(fields) < 4 or fields[3] == '-':
181                   continue
182                 zero_mismatches = int(groups.group(1))
183                 if zero_mismatches == 1:
184                   match_codes['U0'] += 1
185                 elif zero_mismatches < 255:
186                   match_codes['R0'] += zero_mismatches
187
188                 one_mismatches = int(groups.group(2))
189                 if one_mismatches == 1:
190                   match_codes['U1'] += 1
191                 elif one_mismatches < 255:
192                   match_codes['R1'] += one_mismatches
193
194                 two_mismatches = int(groups.group(3))
195                 if two_mismatches == 1:
196                   match_codes['U2'] += 1
197                 elif two_mismatches < 255:
198                   match_codes['R2'] += two_mismatches
199
200                 chromo = None
201                 for match in fields[3].split(','):
202                   match_fragment = match.split(':')
203                   if len(match_fragment) == 2:
204                       chromo = match_fragment[0]
205                       pos = match_fragment[1]
206
207                   fasta = self.genome_map.get(chromo, chromo)
208                   assert fasta is not None
209                   mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
210         return match_codes, mapped_reads, reads
211
212     def _get_mapped_reads(self):
213         if self._mapped_reads is None:
214             self._update()
215         return self._mapped_reads
216     mapped_reads = property(_get_mapped_reads)
217
218     def _get_match_codes(self):
219         if self._match_codes is None:
220             self._update()
221         return self._match_codes
222     match_codes = property(_get_match_codes)
223
224     def _get_no_match(self):
225         if self._mapped_reads is None:
226             self._update()  
227         return self._match_codes['NM']
228     no_match = property(_get_no_match, 
229                         doc="total reads that didn't match the target genome.")
230
231     def _get_no_match_percent(self):
232         return float(self.no_match)/self.reads * 100 
233     no_match_percent = property(_get_no_match_percent,
234                                 doc="no match reads as percent of total")
235
236     def _get_qc_failed(self):
237         if self._mapped_reads is None:
238             self._update()  
239         return self._match_codes['QC']
240     qc_failed = property(_get_qc_failed,
241                         doc="total reads that didn't match the target genome.")
242
243     def _get_qc_failed_percent(self):
244         return float(self.qc_failed)/self.reads * 100 
245     qc_failed_percent = property(_get_qc_failed_percent,
246                                  doc="QC failed reads as percent of total")
247
248     def _get_unique_reads(self):
249         if self._mapped_reads is None:
250            self._update()
251         sum = 0
252         for code in ['U0','U1','U2']:
253             sum += self._match_codes[code]
254         return sum
255     unique_reads = property(_get_unique_reads,
256                             doc="total unique reads")
257
258     def _get_repeat_reads(self):
259         if self._mapped_reads is None:
260            self._update()
261         sum = 0
262         for code in ['R0','R1','R2']:
263             sum += self._match_codes[code]
264         return sum
265     repeat_reads = property(_get_repeat_reads,
266                             doc="total repeat reads")
267     
268     def get_elements(self):
269         lane = ElementTree.Element(ElandLane.LANE,
270                                    {'version':
271                                     unicode(ElandLane.XML_VERSION)})
272         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
273         sample_tag.text = self.sample_name
274         lane_tag = ElementTree.SubElement(lane, LANE_ID)
275         lane_tag.text = str(self.lane_id)
276         if self.end is not None:
277             end_tag = ElementTree.SubElement(lane, END)
278             end_tag.text = str(self.end)
279         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
280         for k, v in self.genome_map.items():
281             item = ElementTree.SubElement(
282                 genome_map, GENOME_ITEM,
283                 {'name':k, 'value':unicode(v)})
284         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
285         for k, v in self.mapped_reads.items():
286             item = ElementTree.SubElement(
287                 mapped_reads, MAPPED_ITEM,
288                 {'name':k, 'value':unicode(v)})
289         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
290         for k, v in self.match_codes.items():
291             item = ElementTree.SubElement(
292                 match_codes, MATCH_ITEM,
293                 {'name':k, 'value':unicode(v)})
294         reads = ElementTree.SubElement(lane, READS)
295         reads.text = unicode(self.reads)
296
297         return lane
298
299     def set_elements(self, tree):
300         if tree.tag != ElandLane.LANE:
301             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
302
303         # reset dictionaries
304         self._mapped_reads = {}
305         self._match_codes = {}
306
307         for element in tree:
308             tag = element.tag.lower()
309             if tag == SAMPLE_NAME.lower():
310                 self._sample_name = element.text
311             elif tag == LANE_ID.lower():
312                 self.lane_id = int(element.text)
313             elif tag == END.lower():
314                 self.end = int(element.text)
315             elif tag == GENOME_MAP.lower():
316                 for child in element:
317                     name = child.attrib['name']
318                     value = child.attrib['value']
319                     self.genome_map[name] = value
320             elif tag == MAPPED_READS.lower():
321                 for child in element:
322                     name = child.attrib['name']
323                     value = child.attrib['value']
324                     self._mapped_reads[name] = int(value)
325             elif tag == MATCH_CODES.lower():
326                 for child in element:
327                     name = child.attrib['name']
328                     value = int(child.attrib['value'])
329                     self._match_codes[name] = value
330             elif tag == READS.lower():
331                 self._reads = int(element.text)
332             else:
333                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
334
335 class SequenceLane(ResultLane):
336     XML_VERSION=1
337     LANE = 'SequenceLane'
338     SEQUENCE_TYPE = 'SequenceType'
339
340     NONE_TYPE = None
341     SCARF_TYPE = 1
342     FASTQ_TYPE = 2
343     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
344
345     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
346         self.sequence_type = None
347         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
348
349     def _guess_sequence_type(self, pathname):
350         """
351         Determine if we have a scarf or fastq sequence file
352         """
353         f = open(pathname,'r')
354         l = f.readline()
355         f.close()
356
357         if l[0] == '@':
358             # fastq starts with a @
359             self.sequence_type = SequenceLane.FASTQ_TYPE
360         else:
361             self.sequence_type = SequenceLane.SCARF_TYPE
362         return self.sequence_type
363
364     def _update(self):
365         """
366         Actually read the file and actually count the reads
367         """
368         # can't do anything if we don't have a file to process
369         if self.pathname is None:
370             return
371
372         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
373             raise RuntimeError("Sequencing isn't done, try again later.")
374
375         self._guess_sequence_type(self.pathname)
376
377         logging.info("summarizing results for %s" % (self.pathname))
378         lines = 0
379         f = open(self.pathname)
380         for l in f.xreadlines():
381             lines += 1
382         f.close()
383
384         if self.sequence_type == SequenceLane.SCARF_TYPE:
385             self._reads = lines
386         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
387             self._reads = lines / 4
388         else:
389           raise NotImplementedError("This only supports scarf or fastq squence files")
390
391     def get_elements(self):
392         lane = ElementTree.Element(SequenceLane.LANE,
393                                    {'version':
394                                     unicode(SequenceLane.XML_VERSION)})
395         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
396         sample_tag.text = self.sample_name
397         lane_tag = ElementTree.SubElement(lane, LANE_ID)
398         lane_tag.text = str(self.lane_id)
399         if self.end is not None:
400             end_tag = ElementTree.SubElement(lane, END)
401             end_tag.text = str(self.end)
402         reads = ElementTree.SubElement(lane, READS)
403         reads.text = unicode(self.reads)
404         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
405         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
406
407         return lane
408
409     def set_elements(self, tree):
410         if tree.tag != SequenceLane.LANE:
411             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
412         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
413
414         for element in tree:
415             tag = element.tag.lower()
416             if tag == SAMPLE_NAME.lower():
417                 self._sample_name = element.text
418             elif tag == LANE_ID.lower():
419                 self.lane_id = int(element.text)
420             elif tag == END.lower():
421                 self.end = int(element.text)
422             elif tag == READS.lower():
423                 self._reads = int(element.text)
424             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
425                 self.sequence_type = lookup_sequence_type.get(element.text, None)
426                 print self.sequence_type
427             else:
428                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
429
430 class ELAND(object):
431     """
432     Summarize information from eland files
433     """
434     XML_VERSION = 3
435
436     ELAND = 'ElandCollection'
437     LANE = 'Lane'
438     LANE_ID = 'id'
439     END = 'end'
440
441     def __init__(self, xml=None):
442         # we need information from the gerald config.xml
443         self.results = [{},{}]
444
445         if xml is not None:
446             self.set_elements(xml)
447
448     def get_elements(self):
449         root = ElementTree.Element(ELAND.ELAND,
450                                    {'version': unicode(ELAND.XML_VERSION)})
451         for end in range(len(self.results)):
452            end_results = self.results[end]
453            for lane_id, lane in end_results.items():
454                 eland_lane = lane.get_elements()
455                 eland_lane.attrib[ELAND.END] = unicode (end)
456                 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
457                 root.append(eland_lane)
458         return root
459
460     def set_elements(self, tree):
461         if tree.tag.lower() != ELAND.ELAND.lower():
462             raise ValueError('Expecting %s', ELAND.ELAND)
463         for element in list(tree):
464             lane_id = int(element.attrib[ELAND.LANE_ID])
465             end = int(element.attrib.get(ELAND.END, 0)) 
466             if element.tag.lower() == ElandLane.LANE.lower():
467                 lane = ElandLane(xml=element)
468             elif element.tag.lower() == SequenceLane.LANE.lower():
469                 lane = SequenceLane(xml=element)
470
471             self.results[end][lane_id] = lane
472
473 def check_for_eland_file(basedir, pattern, lane_id, end):
474    if end is None:
475       full_lane_id = lane_id
476    else:
477       full_lane_id = "%d_%d" % ( lane_id, end )
478
479    basename = pattern % (full_lane_id,)
480    pathname = os.path.join(basedir, basename)
481    if os.path.exists(pathname):
482        logging.info('found eland file in %s' % (pathname,))
483        return pathname
484    else:
485        return None
486
487 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
488     # yes the lane_id is also being computed in ElandLane._update
489     # I didn't want to clutter up my constructor
490     # but I needed to persist the sample_name/lane_id for
491     # runfolder summary_report
492     path, name = os.path.split(pathname)
493     logging.info("Adding eland file %s" %(name,))
494     # split_name = name.split('_')
495     # lane_id = int(split_name[1])
496
497     if genome_maps is not None:
498         genome_map = genome_maps[lane_id]
499     elif gerald is not None:
500         genome_dir = gerald.lanes[lane_id].eland_genome
501         genome_map = build_genome_fasta_map(genome_dir)
502     else:
503         genome_map = {}
504
505     lane = ElandLane(pathname, lane_id, end, genome_map)
506     
507     if end is None:
508         effective_end =  0
509     else:
510         effective_end = end - 1
511
512     results[effective_end][lane_id] = lane
513
514 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
515     result = SequenceLane(pathname, lane_id, end)
516
517     if end is None:
518         effective_end =  0
519     else:
520         effective_end = end - 1
521
522     results[effective_end][lane_id] = result
523
524
525 def eland(gerald_dir, gerald=None, genome_maps=None):
526     e = ELAND()
527
528     lane_ids = range(1,9)
529     ends = [None, 1, 2]
530
531     basedirs = [gerald_dir]
532
533     # if there is a basedir/Temp change basedir to point to the temp
534     # directory, as 1.1rc1 moves most of the files we've historically
535     # cared about to that subdirectory.
536     # we should look into what the official 'result' files are.
537     # and 1.3 moves them back
538     basedir_temp = os.path.join(gerald_dir, 'Temp')
539     if os.path.isdir(basedir_temp):
540         basedirs.append(basedir_temp)
541
542    
543     # the order in patterns determines the preference for what
544     # will be found.
545     MAPPED_ELAND = 0
546     SEQUENCE = 1
547     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
548                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
549                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
550                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
551                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
552                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
553                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
554                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
555                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
556                 ('s_%s_sequence.txt', SEQUENCE),]
557
558     for basedir in basedirs:
559         for end in ends:
560             for lane_id in lane_ids:
561                 for p in patterns:
562                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
563                     if pathname is not None:
564                         if p[1] == MAPPED_ELAND:
565                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
566                         elif p[1] == SEQUENCE:
567                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
568                         break
569                 else:
570                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
571                     continue
572
573     return e
574
575 def build_genome_fasta_map(genome_dir):
576     # build fasta to fasta file map
577     logging.info("Building genome map")
578     genome = genome_dir.split(os.path.sep)[-1]
579     fasta_map = {}
580     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
581         is_link = False
582         if os.path.islink(vld_file):
583             is_link = True
584         vld_file = os.path.realpath(vld_file)
585         path, vld_name = os.path.split(vld_file)
586         name, ext = os.path.splitext(vld_name)
587         if is_link:
588             fasta_map[name] = name
589         else:
590             fasta_map[name] = os.path.join(genome, name)
591     return fasta_map
592
593
594 def extract_eland_sequence(instream, outstream, start, end):
595     """
596     Extract a chunk of sequence out of an eland file
597     """
598     for line in instream:
599         record = line.split()
600         if len(record) > 1:
601             result = [record[0], record[1][start:end]]
602         else:
603             result = [record[0][start:end]]
604         outstream.write("\t".join(result))
605         outstream.write(os.linesep)