Loading really old run xml files caused the website to crash
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10
11 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
12 from htsworkflow.util.ethelp import indent, flatten
13 from htsworkflow.util.opener import autoopen
14
15 SAMPLE_NAME = 'SampleName'
16 LANE_ID = 'LaneID'
17 END = 'End'
18 READS = 'Reads'
19
20 GENOME_MAP = 'GenomeMap'
21 GENOME_ITEM = 'GenomeItem'
22 MAPPED_READS = 'MappedReads'
23 MAPPED_ITEM = 'MappedItem'
24 MATCH_CODES = 'MatchCodes'
25 MATCH_ITEM = 'Code'
26 READS = 'Reads'
27
28 ELAND_SINGLE = 0
29 ELAND_MULTI = 1
30 ELAND_EXTENDED = 2
31 ELAND_EXPORT = 3
32
33
34 class ResultLane(object):
35     """
36     Base class for result lanes
37     """
38     XML_VERSION = 2
39     LANE = 'ResultLane'
40
41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42         self.pathname = pathname
43         self._sample_name = None
44         self.lane_id = lane_id
45         self.end = end
46         self._reads = None
47         
48         if xml is not None:
49             self.set_elements(xml)
50
51     def _update(self):
52         """
53         Actually read the file and actually count the reads
54         """
55         raise NotImplementedError("Can't count abstract classes")
56
57     def _update_name(self):
58         # extract the sample name
59         if self.pathname is None:
60             return
61
62         path, name = os.path.split(self.pathname)
63         split_name = name.split('_')
64         self._sample_name = split_name[0]
65
66     def _get_sample_name(self):
67         if self._sample_name is None:
68             self._update_name()
69         return self._sample_name
70     sample_name = property(_get_sample_name)
71
72     def _get_reads(self):
73         if self._reads is None:
74             self._update()
75         return self._reads
76     reads = property(_get_reads)
77
78
79 class ElandLane(ResultLane):
80     """
81     Process an eland result file
82     """
83     XML_VERSION = 2
84     LANE = "ElandLane"
85
86     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
87         super(ElandLane, self).__init__(pathname, lane_id, end)
88
89         self._mapped_reads = None
90         self._match_codes = None
91         if genome_map is None:
92             genome_map = {}
93         self.genome_map = genome_map
94         self.eland_type = None
95
96         if xml is not None:
97             self.set_elements(xml)
98
99     def _guess_eland_type(self, pathname):
100         if self.eland_type is None:
101           # attempt autodetect eland file type
102           pathn, name = os.path.split(pathname)
103           if re.search('result', name):
104             self.eland_type = ELAND_SINGLE
105           elif re.search('multi', name):
106             self.eland_type = ELAND_MULTI
107           elif re.search('extended', name):
108             self.eland_type = ELAND_EXTENDED
109           elif re.search('export', name):
110             self.eland_type = ELAND_EXPORT
111           else:
112             self.eland_type = ELAND_SINGLE
113
114     def _update(self):
115         """
116         Actually read the file and actually count the reads
117         """
118         # can't do anything if we don't have a file to process
119         if self.pathname is None:
120             return
121         self._guess_eland_type(self.pathname)
122
123         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
124             raise RuntimeError("Eland isn't done, try again later.")
125
126         logging.info("summarizing results for %s" % (self.pathname))
127
128         if self.eland_type == ELAND_SINGLE:
129           result = self._update_eland_result(self.pathname)
130         elif self.eland_type == ELAND_MULTI or \
131              self.eland_type == ELAND_EXTENDED:
132           result = self._update_eland_multi(self.pathname)
133         else:
134           raise NotImplementedError("Only support single/multi/extended eland files")
135         self._match_codes, self._mapped_reads, self._reads = result
136
137     def _update_eland_result(self, pathname):
138         reads = 0
139         mapped_reads = {}
140
141         match_codes = {'NM':0, 'QC':0, 'RM':0,
142                        'U0':0, 'U1':0, 'U2':0,
143                        'R0':0, 'R1':0, 'R2':0,
144                       }
145         for line in autoopen(pathname,'r'):
146             reads += 1
147             fields = line.split()
148             # code = fields[2]
149             # match_codes[code] = match_codes.setdefault(code, 0) + 1
150             # the QC/NM etc codes are in the 3rd field and always present
151             match_codes[fields[2]] += 1
152             # ignore lines that don't have a fasta filename
153             if len(fields) < 7:
154                 continue
155             fasta = self.genome_map.get(fields[6], fields[6])
156             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
157         return match_codes, mapped_reads, reads
158
159     def _update_eland_multi(self, pathname):
160         reads = 0
161         mapped_reads = {}
162
163         match_codes = {'NM':0, 'QC':0, 'RM':0,
164                        'U0':0, 'U1':0, 'U2':0,
165                        'R0':0, 'R1':0, 'R2':0,
166                       }
167         match_counts_re = re.compile("([\d]+):([\d]+):([\d]+)")
168         for line in autoopen(pathname,'r'):
169             reads += 1
170             fields = line.split()
171             # fields[2] = QC/NM/or number of matches
172             groups = match_counts_re.match(fields[2])
173             if groups is None:
174                 match_codes[fields[2]] += 1
175             else:
176                 # when there are too many hit, eland  writes a - where
177                 # it would have put the list of hits.
178                 # or in a different version of eland, it just leaves
179                 # that column blank, and only outputs 3 fields.     
180                 if len(fields) < 4 or fields[3] == '-':
181                   continue
182                 zero_mismatches = int(groups.group(1))
183                 if zero_mismatches == 1:
184                   match_codes['U0'] += 1
185                 elif zero_mismatches < 255:
186                   match_codes['R0'] += zero_mismatches
187
188                 one_mismatches = int(groups.group(2))
189                 if one_mismatches == 1:
190                   match_codes['U1'] += 1
191                 elif one_mismatches < 255:
192                   match_codes['R1'] += one_mismatches
193
194                 two_mismatches = int(groups.group(3))
195                 if two_mismatches == 1:
196                   match_codes['U2'] += 1
197                 elif two_mismatches < 255:
198                   match_codes['R2'] += two_mismatches
199
200                 chromo = None
201                 for match in fields[3].split(','):
202                   match_fragment = match.split(':')
203                   if len(match_fragment) == 2:
204                       chromo = match_fragment[0]
205                       pos = match_fragment[1]
206
207                   fasta = self.genome_map.get(chromo, chromo)
208                   assert fasta is not None
209                   mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
210         return match_codes, mapped_reads, reads
211
212     def _get_mapped_reads(self):
213         if self._mapped_reads is None:
214             self._update()
215         return self._mapped_reads
216     mapped_reads = property(_get_mapped_reads)
217
218     def _get_match_codes(self):
219         if self._match_codes is None:
220             self._update()
221         return self._match_codes
222     match_codes = property(_get_match_codes)
223
224     def _get_no_match(self):
225         if self._mapped_reads is None:
226             self._update()  
227         return self._match_codes['NM']
228     no_match = property(_get_no_match, 
229                         doc="total reads that didn't match the target genome.")
230
231     def _get_no_match_percent(self):
232         return float(self.no_match)/self.reads * 100 
233     no_match_percent = property(_get_no_match_percent,
234                                 doc="no match reads as percent of total")
235
236     def _get_qc_failed(self):
237         if self._mapped_reads is None:
238             self._update()  
239         return self._match_codes['QC']
240     qc_failed = property(_get_qc_failed,
241                         doc="total reads that didn't match the target genome.")
242
243     def _get_qc_failed_percent(self):
244         return float(self.qc_failed)/self.reads * 100 
245     qc_failed_percent = property(_get_qc_failed_percent,
246                                  doc="QC failed reads as percent of total")
247
248     def _get_unique_reads(self):
249         if self._mapped_reads is None:
250            self._update()
251         sum = 0
252         for code in ['U0','U1','U2']:
253             sum += self._match_codes[code]
254         return sum
255     unique_reads = property(_get_unique_reads,
256                             doc="total unique reads")
257
258     def _get_repeat_reads(self):
259         if self._mapped_reads is None:
260            self._update()
261         sum = 0
262         for code in ['R0','R1','R2']:
263             sum += self._match_codes[code]
264         return sum
265     repeat_reads = property(_get_repeat_reads,
266                             doc="total repeat reads")
267     
268     def get_elements(self):
269         lane = ElementTree.Element(ElandLane.LANE,
270                                    {'version':
271                                     unicode(ElandLane.XML_VERSION)})
272         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
273         sample_tag.text = self.sample_name
274         lane_tag = ElementTree.SubElement(lane, LANE_ID)
275         lane_tag.text = str(self.lane_id)
276         if self.end is not None:
277             end_tag = ElementTree.SubElement(lane, END)
278             end_tag.text = str(self.end)
279         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
280         for k, v in self.genome_map.items():
281             item = ElementTree.SubElement(
282                 genome_map, GENOME_ITEM,
283                 {'name':k, 'value':unicode(v)})
284         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
285         for k, v in self.mapped_reads.items():
286             item = ElementTree.SubElement(
287                 mapped_reads, MAPPED_ITEM,
288                 {'name':k, 'value':unicode(v)})
289         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
290         for k, v in self.match_codes.items():
291             item = ElementTree.SubElement(
292                 match_codes, MATCH_ITEM,
293                 {'name':k, 'value':unicode(v)})
294         reads = ElementTree.SubElement(lane, READS)
295         reads.text = unicode(self.reads)
296
297         return lane
298
299     def set_elements(self, tree):
300         if tree.tag != ElandLane.LANE:
301             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
302
303         # reset dictionaries
304         self._mapped_reads = {}
305         self._match_codes = {}
306
307         for element in tree:
308             tag = element.tag.lower()
309             if tag == SAMPLE_NAME.lower():
310                 self._sample_name = element.text
311             elif tag == LANE_ID.lower():
312                 self.lane_id = int(element.text)
313             elif tag == END.lower():
314                 self.end = int(element.text)
315             elif tag == GENOME_MAP.lower():
316                 for child in element:
317                     name = child.attrib['name']
318                     value = child.attrib['value']
319                     self.genome_map[name] = value
320             elif tag == MAPPED_READS.lower():
321                 for child in element:
322                     name = child.attrib['name']
323                     value = child.attrib['value']
324                     self._mapped_reads[name] = int(value)
325             elif tag == MATCH_CODES.lower():
326                 for child in element:
327                     name = child.attrib['name']
328                     value = int(child.attrib['value'])
329                     self._match_codes[name] = value
330             elif tag == READS.lower():
331                 self._reads = int(element.text)
332             else:
333                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
334
335 class SequenceLane(ResultLane):
336     XML_VERSION=1
337     LANE = 'SequenceLane'
338     SEQUENCE_TYPE = 'SequenceType'
339
340     NONE_TYPE = None
341     SCARF_TYPE = 1
342     FASTQ_TYPE = 2
343     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
344
345     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
346         self.sequence_type = None
347         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
348
349     def _guess_sequence_type(self, pathname):
350         """
351         Determine if we have a scarf or fastq sequence file
352         """
353         f = open(pathname,'r')
354         l = f.readline()
355         f.close()
356
357         if l[0] == '@':
358             # fastq starts with a @
359             self.sequence_type = SequenceLane.FASTQ_TYPE
360         else:
361             self.sequence_type = SequenceLane.SCARF_TYPE
362         return self.sequence_type
363
364     def _update(self):
365         """
366         Actually read the file and actually count the reads
367         """
368         # can't do anything if we don't have a file to process
369         if self.pathname is None:
370             return
371
372         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
373             raise RuntimeError("Sequencing isn't done, try again later.")
374
375         self._guess_sequence_type(self.pathname)
376
377         logging.info("summarizing results for %s" % (self.pathname))
378         lines = 0
379         f = open(self.pathname)
380         for l in f.xreadlines():
381             lines += 1
382         f.close()
383
384         if self.sequence_type == SequenceLane.SCARF_TYPE:
385             self._reads = lines
386         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
387             self._reads = lines / 4
388         else:
389           raise NotImplementedError("This only supports scarf or fastq squence files")
390
391     def get_elements(self):
392         lane = ElementTree.Element(SequenceLane.LANE,
393                                    {'version':
394                                     unicode(SequenceLane.XML_VERSION)})
395         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
396         sample_tag.text = self.sample_name
397         lane_tag = ElementTree.SubElement(lane, LANE_ID)
398         lane_tag.text = str(self.lane_id)
399         if self.end is not None:
400             end_tag = ElementTree.SubElement(lane, END)
401             end_tag.text = str(self.end)
402         reads = ElementTree.SubElement(lane, READS)
403         reads.text = unicode(self.reads)
404         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
405         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
406
407         return lane
408
409     def set_elements(self, tree):
410         if tree.tag != SequenceLane.LANE:
411             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
412         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
413
414         for element in tree:
415             tag = element.tag.lower()
416             if tag == SAMPLE_NAME.lower():
417                 self._sample_name = element.text
418             elif tag == LANE_ID.lower():
419                 self.lane_id = int(element.text)
420             elif tag == END.lower():
421                 self.end = int(element.text)
422             elif tag == READS.lower():
423                 self._reads = int(element.text)
424             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
425                 self.sequence_type = lookup_sequence_type.get(element.text, None)
426             else:
427                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
428
429 class ELAND(object):
430     """
431     Summarize information from eland files
432     """
433     XML_VERSION = 3
434
435     ELAND = 'ElandCollection'
436     LANE = 'Lane'
437     LANE_ID = 'id'
438     END = 'end'
439
440     def __init__(self, xml=None):
441         # we need information from the gerald config.xml
442         self.results = [{},{}]
443
444         if xml is not None:
445             self.set_elements(xml)
446
447         if len(self.results[0]) == 0:
448             # Initialize our eland object with meaningless junk
449             for l in  LANE_LIST:
450                 self.results[0][l] = ResultLane(lane_id=l, end=0)
451
452
453     def get_elements(self):
454         root = ElementTree.Element(ELAND.ELAND,
455                                    {'version': unicode(ELAND.XML_VERSION)})
456         for end in range(len(self.results)):
457            end_results = self.results[end]
458            for lane_id, lane in end_results.items():
459                 eland_lane = lane.get_elements()
460                 eland_lane.attrib[ELAND.END] = unicode (end)
461                 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
462                 root.append(eland_lane)
463         return root
464
465     def set_elements(self, tree):
466         if tree.tag.lower() != ELAND.ELAND.lower():
467             raise ValueError('Expecting %s', ELAND.ELAND)
468         for element in list(tree):
469             lane_id = int(element.attrib[ELAND.LANE_ID])
470             end = int(element.attrib.get(ELAND.END, 0)) 
471             if element.tag.lower() == ElandLane.LANE.lower():
472                 lane = ElandLane(xml=element)
473             elif element.tag.lower() == SequenceLane.LANE.lower():
474                 lane = SequenceLane(xml=element)
475
476             self.results[end][lane_id] = lane
477
478 def check_for_eland_file(basedir, pattern, lane_id, end):
479    if end is None:
480       full_lane_id = lane_id
481    else:
482       full_lane_id = "%d_%d" % ( lane_id, end )
483
484    basename = pattern % (full_lane_id,)
485    pathname = os.path.join(basedir, basename)
486    if os.path.exists(pathname):
487        logging.info('found eland file in %s' % (pathname,))
488        return pathname
489    else:
490        return None
491
492 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
493     # yes the lane_id is also being computed in ElandLane._update
494     # I didn't want to clutter up my constructor
495     # but I needed to persist the sample_name/lane_id for
496     # runfolder summary_report
497     path, name = os.path.split(pathname)
498     logging.info("Adding eland file %s" %(name,))
499     # split_name = name.split('_')
500     # lane_id = int(split_name[1])
501
502     if genome_maps is not None:
503         genome_map = genome_maps[lane_id]
504     elif gerald is not None:
505         genome_dir = gerald.lanes[lane_id].eland_genome
506         genome_map = build_genome_fasta_map(genome_dir)
507     else:
508         genome_map = {}
509
510     lane = ElandLane(pathname, lane_id, end, genome_map)
511     
512     if end is None:
513         effective_end =  0
514     else:
515         effective_end = end - 1
516
517     results[effective_end][lane_id] = lane
518
519 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
520     result = SequenceLane(pathname, lane_id, end)
521
522     if end is None:
523         effective_end =  0
524     else:
525         effective_end = end - 1
526
527     results[effective_end][lane_id] = result
528
529
530 def eland(gerald_dir, gerald=None, genome_maps=None):
531     e = ELAND()
532
533     lane_ids = range(1,9)
534     ends = [None, 1, 2]
535
536     basedirs = [gerald_dir]
537
538     # if there is a basedir/Temp change basedir to point to the temp
539     # directory, as 1.1rc1 moves most of the files we've historically
540     # cared about to that subdirectory.
541     # we should look into what the official 'result' files are.
542     # and 1.3 moves them back
543     basedir_temp = os.path.join(gerald_dir, 'Temp')
544     if os.path.isdir(basedir_temp):
545         basedirs.append(basedir_temp)
546
547    
548     # the order in patterns determines the preference for what
549     # will be found.
550     MAPPED_ELAND = 0
551     SEQUENCE = 1
552     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
553                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
554                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
555                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
556                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
557                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
558                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
559                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
560                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
561                 ('s_%s_sequence.txt', SEQUENCE),]
562
563     for basedir in basedirs:
564         for end in ends:
565             for lane_id in lane_ids:
566                 for p in patterns:
567                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
568                     if pathname is not None:
569                         if p[1] == MAPPED_ELAND:
570                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
571                         elif p[1] == SEQUENCE:
572                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
573                         break
574                 else:
575                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
576                     continue
577
578     return e
579
580 def build_genome_fasta_map(genome_dir):
581     # build fasta to fasta file map
582     logging.info("Building genome map")
583     genome = genome_dir.split(os.path.sep)[-1]
584     fasta_map = {}
585     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
586         is_link = False
587         if os.path.islink(vld_file):
588             is_link = True
589         vld_file = os.path.realpath(vld_file)
590         path, vld_name = os.path.split(vld_file)
591         name, ext = os.path.splitext(vld_name)
592         if is_link:
593             fasta_map[name] = name
594         else:
595             fasta_map[name] = os.path.join(genome, name)
596     return fasta_map
597
598
599 def extract_eland_sequence(instream, outstream, start, end):
600     """
601     Extract a chunk of sequence out of an eland file
602     """
603     for line in instream:
604         record = line.split()
605         if len(record) > 1:
606             result = [record[0], record[1][start:end]]
607         else:
608             result = [record[0][start:end]]
609         outstream.write("\t".join(result))
610         outstream.write(os.linesep)