acc805a889d3000ffbe4a1f73cedec2930e47bbf
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 from __future__ import print_function
5
6 import collections
7 from glob import glob
8 import logging
9 import os
10 import re
11 import stat
12 import sys
13 import types
14
15 from htsworkflow.pipelines import ElementTree, LANE_LIST
16 from htsworkflow.pipelines.samplekey import SampleKey
17 from htsworkflow.pipelines.genomemap import GenomeMap
18 from htsworkflow.util.ethelp import indent, flatten
19 from htsworkflow.util.opener import autoopen
20
21 LOGGER = logging.getLogger(__name__)
22
23 SAMPLE_NAME = 'SampleName'
24 LANE_ID = 'LaneID'
25 END = 'End'
26 READS = 'Reads'
27
28 GENOME_MAP = 'GenomeMap'
29 GENOME_ITEM = 'GenomeItem'
30 MAPPED_READS = 'MappedReads'
31 MAPPED_ITEM = 'MappedItem'
32 MATCH_CODES = 'MatchCodes'
33 MATCH_ITEM = 'Code'
34 READS = 'Reads'
35
36 ELAND_SINGLE = 0
37 ELAND_MULTI = 1
38 ELAND_EXTENDED = 2
39 ELAND_EXPORT = 3
40
41 class ResultLane(object):
42     """
43     Base class for result lanes
44     """
45     XML_VERSION = 2
46     LANE = 'ResultLane'
47
48     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
49                  xml=None):
50         self.pathnames = pathnames
51         self.sample_name = sample
52         self.lane_id = lane_id
53         self.end = end
54         self._reads = None
55
56         if xml is not None:
57             self.set_elements(xml)
58
59     def _update(self):
60         """
61         Actually read the file and actually count the reads
62         """
63         pass
64
65     def _get_reads(self):
66         if self._reads is None:
67             self._update()
68         return self._reads
69     reads = property(_get_reads)
70
71     def get_elements(self):
72         return None
73
74     def __repr__(self):
75         name = []
76
77         name.append('L%s' % (self.lane_id,))
78         name.append('R%s' % (self.end,))
79         name.append('S%s' % (self.sample_name,))
80
81         return '<ResultLane(' + ",".join(name) + ')>'
82
83 class ElandLane(ResultLane):
84     """
85     Process an eland result file
86     """
87     XML_VERSION = 2
88     LANE = "ElandLane"
89     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92     SCORE_UNRECOGNIZED = 0
93     SCORE_QC = 1
94     SCORE_READ = 2
95
96     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
97                  genome_map=None, eland_type=None, xml=None):
98         super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
99
100         self._mapped_reads = None
101         self._match_codes = None
102         self._reads = None
103         self.genome_map = GenomeMap(genome_map)
104         self.eland_type = None
105
106         if xml is not None:
107             self.set_elements(xml)
108
109     def __repr__(self):
110         name = []
111
112         name.append('L%s' % (self.lane_id,))
113         name.append('R%s' % (self.end,))
114         name.append('S%s' % (self.sample_name,))
115
116         reads = str(self._reads) if self._reads is not None else 'Uncounted'
117         return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
118
119     def _guess_eland_type(self, pathname):
120         if self.eland_type is None:
121           # attempt autodetect eland file type
122           pathn, name = os.path.split(pathname)
123           if re.search('result', name):
124             self.eland_type = ELAND_SINGLE
125           elif re.search('multi', name):
126             self.eland_type = ELAND_MULTI
127           elif re.search('extended', name):
128             self.eland_type = ELAND_EXTENDED
129           elif re.search('export', name):
130             self.eland_type = ELAND_EXPORT
131           else:
132             self.eland_type = ELAND_SINGLE
133
134     def _update(self):
135         """
136         Actually read the file and actually count the reads
137         """
138         # can't do anything if we don't have a file to process
139         if self.pathnames is None or len(self.pathnames) == 0:
140             return
141         pathname = self.pathnames[-1]
142         self._guess_eland_type(pathname)
143
144         if os.stat(pathname)[stat.ST_SIZE] == 0:
145             raise RuntimeError("Eland isn't done, try again later.")
146
147         LOGGER.debug("summarizing results for %s" % (pathname))
148         self._match_codes = MatchCodes()
149         self._mapped_reads = MappedReads()
150         self._reads = 0
151
152         for pathname in self.pathnames:
153             stream = autoopen(pathname, 'r')
154             if self.eland_type == ELAND_SINGLE:
155                 result = self._update_eland_result(stream)
156             elif self.eland_type == ELAND_MULTI or \
157                  self.eland_type == ELAND_EXTENDED:
158                 result = self._update_eland_multi(stream)
159             elif self.eland_type == ELAND_EXPORT:
160                 result = self._update_eland_export(stream)
161             else:
162                 errmsg = "Only support single/multi/extended eland files"
163                 raise NotImplementedError(errmsg)
164             stream.close()
165
166             match, mapped, reads = result
167             self._match_codes += match
168             self._mapped_reads += mapped
169             self._reads += reads
170
171     def _update_eland_result(self, instream):
172         reads = 0
173         mapped_reads = MappedReads()
174         match_codes = MatchCodes()
175
176         for line in instream:
177             reads += 1
178             fields = line.split()
179             # code = fields[2]
180             # match_codes[code] = match_codes.setdefault(code, 0) + 1
181             # the QC/NM etc codes are in the 3rd field and always present
182             match_codes[fields[2]] += 1
183             # ignore lines that don't have a fasta filename
184             if len(fields) < 7:
185                 continue
186             fasta = self.genome_map.get(fields[6], fields[6])
187             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
188         return match_codes, mapped_reads, reads
189
190     def _update_eland_multi(self, instream):
191         """Summarize an eland_extend."""
192         MATCH_INDEX = 2
193         LOCATION_INDEX = 3
194         reads = 0
195         mapped_reads = MappedReads()
196         match_codes = MatchCodes()
197
198         for line in instream:
199             reads += 1
200             fields = line.split()
201             # fields[2] = QC/NM/or number of matches
202             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
203                                                        match_codes)
204             if score_type == ElandLane.SCORE_READ:
205                 # when there are too many hits, eland  writes a - where
206                 # it would have put the list of hits.
207                 # or in a different version of eland, it just leaves
208                 # that column blank, and only outputs 3 fields.
209                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
210                   continue
211
212                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
213
214         return match_codes, mapped_reads, reads
215
216     def _update_eland_export(self, instream):
217         """Summarize a gerald export file."""
218         MATCH_INDEX = 10
219         LOCATION_INDEX = 10
220         DESCRIPTOR_INDEX= 13
221         reads = 0
222         mapped_reads = MappedReads()
223         match_codes = MatchCodes()
224
225         for line in instream:
226             reads += 1
227             fields = line.split()
228             # fields[2] = QC/NM/or number of matches
229             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
230                                                        match_codes)
231             if score_type == ElandLane.SCORE_UNRECOGNIZED:
232                 # export files have three states for the match field
233                 # QC code, count of multi-reads, or a single
234                 # read location. The score_mapped_mismatches function
235                 # only understands the first two types.
236                 # if we get unrecognized, that implies the field is probably
237                 # a location.
238                 code = self._count_mapped_export(mapped_reads,
239                                                  fields[LOCATION_INDEX],
240                                                  fields[DESCRIPTOR_INDEX])
241                 match_codes[code] += 1
242
243         return match_codes, mapped_reads, reads
244
245
246     def _score_mapped_mismatches(self, match, match_codes):
247         """Update match_codes with eland map counts, or failure code.
248
249         Returns True if the read mapped, false if it was an error code.
250         """
251         groups = ElandLane.MATCH_COUNTS_RE.match(match)
252         if groups is None:
253             # match is not of the form [\d]+:[\d]+:[\d]+
254             if match in match_codes:
255                 # match is one quality control codes QC/NM etc
256                 match_codes[match] += 1
257                 return ElandLane.SCORE_QC
258             else:
259                 return ElandLane.SCORE_UNRECOGNIZED
260         else:
261             # match is of the form [\d]+:[\d]+:[\d]+
262             # AKA Multiread
263             zero_mismatches = int(groups.group(1))
264             one_mismatches = int(groups.group(2))
265             two_mismatches = int(groups.group(3))
266
267             if zero_mismatches == 1:
268                 match_codes['U0'] += 1
269             elif zero_mismatches < 255:
270                 match_codes['R0'] += zero_mismatches
271
272             if one_mismatches == 1:
273                 match_codes['U1'] += 1
274             elif one_mismatches < 255:
275                 match_codes['R1'] += one_mismatches
276
277             if two_mismatches == 1:
278                 match_codes['U2'] += 1
279             elif two_mismatches < 255:
280                 match_codes['R2'] += two_mismatches
281
282             return ElandLane.SCORE_READ
283
284
285     def _count_mapped_multireads(self, mapped_reads, match_string):
286         chromo = None
287         for match in match_string.split(','):
288             match_fragment = match.split(':')
289             if len(match_fragment) == 2:
290                 chromo = match_fragment[0]
291                 pos = match_fragment[1]
292
293             fasta = self.genome_map.get(chromo, chromo)
294             assert fasta is not None
295             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
296
297
298     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
299         """Count a read as defined in an export file
300
301         match_string contains the chromosome
302         descriptor contains the an ecoding of bases that match, mismatch,
303                    and have indels.
304         returns the "best" match code
305
306         Currently "best" match code is ignoring the possibility of in-dels
307         """
308         chromo = match_string
309         fasta = self.genome_map.get(chromo, chromo)
310         assert fasta is not None
311         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
312
313         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
314         if len(mismatch_bases) == 0:
315             return 'U0'
316         elif len(mismatch_bases) == 1:
317             return 'U1'
318         else:
319             return 'U2'
320
321
322     def _get_mapped_reads(self):
323         if self._mapped_reads is None:
324             self._update()
325         return self._mapped_reads
326     mapped_reads = property(_get_mapped_reads)
327
328     def _get_match_codes(self):
329         if self._match_codes is None:
330             self._update()
331         return self._match_codes
332     match_codes = property(_get_match_codes)
333
334     def _get_no_match(self):
335         if self._mapped_reads is None:
336             self._update()
337         return self._match_codes['NM']
338     no_match = property(_get_no_match,
339                         doc="total reads that didn't match the target genome.")
340
341     def _get_no_match_percent(self):
342         return float(self.no_match)/self.reads * 100
343     no_match_percent = property(_get_no_match_percent,
344                                 doc="no match reads as percent of total")
345
346     def _get_qc_failed(self):
347         if self._mapped_reads is None:
348             self._update()
349         return self._match_codes['QC']
350     qc_failed = property(_get_qc_failed,
351                         doc="total reads that didn't match the target genome.")
352
353     def _get_qc_failed_percent(self):
354         return float(self.qc_failed)/self.reads * 100
355     qc_failed_percent = property(_get_qc_failed_percent,
356                                  doc="QC failed reads as percent of total")
357
358     def _get_unique_reads(self):
359         if self._mapped_reads is None:
360            self._update()
361         sum = 0
362         for code in ['U0','U1','U2']:
363             sum += self._match_codes[code]
364         return sum
365     unique_reads = property(_get_unique_reads,
366                             doc="total unique reads")
367
368     def _get_repeat_reads(self):
369         if self._mapped_reads is None:
370            self._update()
371         sum = 0
372         for code in ['R0','R1','R2']:
373             sum += self._match_codes[code]
374         return sum
375     repeat_reads = property(_get_repeat_reads,
376                             doc="total repeat reads")
377
378     def get_elements(self):
379         lane = ElementTree.Element(ElandLane.LANE,
380                                    {'version':
381                                     unicode(ElandLane.XML_VERSION)})
382         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
383         sample_tag.text = self.sample_name
384         lane_tag = ElementTree.SubElement(lane, LANE_ID)
385         lane_tag.text = str(self.lane_id)
386         if self.end is not None:
387             end_tag = ElementTree.SubElement(lane, END)
388             end_tag.text = str(self.end)
389         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
390         for k, v in self.genome_map.items():
391             item = ElementTree.SubElement(
392                 genome_map, GENOME_ITEM,
393                 {'name':k, 'value':unicode(v)})
394         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
395         for k, v in self.mapped_reads.items():
396             item = ElementTree.SubElement(
397                 mapped_reads, MAPPED_ITEM,
398                 {'name':k, 'value':unicode(v)})
399         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
400         for k, v in self.match_codes.items():
401             item = ElementTree.SubElement(
402                 match_codes, MATCH_ITEM,
403                 {'name':k, 'value':unicode(v)})
404         reads = ElementTree.SubElement(lane, READS)
405         reads.text = unicode(self.reads)
406
407         return lane
408
409     def set_elements(self, tree):
410         if tree.tag != ElandLane.LANE:
411             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
412
413         # reset dictionaries
414         self._mapped_reads = {}
415         self._match_codes = {}
416
417         for element in tree:
418             tag = element.tag.lower()
419             if tag == SAMPLE_NAME.lower():
420                 self.sample_name = element.text
421             elif tag == LANE_ID.lower():
422                 self.lane_id = int(element.text)
423             elif tag == END.lower():
424                 self.end = int(element.text)
425             elif tag == GENOME_MAP.lower():
426                 for child in element:
427                     name = child.attrib['name']
428                     value = child.attrib['value']
429                     self.genome_map[name] = value
430             elif tag == MAPPED_READS.lower():
431                 for child in element:
432                     name = child.attrib['name']
433                     value = child.attrib['value']
434                     self._mapped_reads[name] = int(value)
435             elif tag == MATCH_CODES.lower():
436                 for child in element:
437                     name = child.attrib['name']
438                     value = int(child.attrib['value'])
439                     self._match_codes[name] = value
440             elif tag == READS.lower():
441                 self._reads = int(element.text)
442             else:
443                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
444
445
446 class MatchCodes(collections.MutableMapping):
447     """Mapping to hold match counts -
448     supports combining two match count sets together
449     """
450     def __init__(self, initializer=None):
451         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
452                             'U0':0, 'U1':0, 'U2':0,
453                             'R0':0, 'R1':0, 'R2':0,
454                             }
455
456         if initializer is not None:
457             if not isinstance(initializer, collections.Mapping):
458                 raise ValueError("Expected dictionary like class")
459             for key in initializer:
460                 if key not in self.match_codes:
461                     errmsg = "Initializer can only contain: %s"
462                     raise ValueError(errmsg % (",".join(self.match_codes.keys())))
463                 self.match_codes[key] += initializer[key]
464
465     def __iter__(self):
466         return iter(self.match_codes)
467
468     def __delitem__(self, key):
469         raise RuntimeError("delete not allowed")
470
471     def __getitem__(self, key):
472         return self.match_codes[key]
473
474     def __setitem__(self, key, value):
475         if key not in self.match_codes:
476             errmsg = "Unrecognized key, allowed values are: %s"
477             raise ValueError(errmsg % (",".join(self.match_codes.keys())))
478         self.match_codes[key] = value
479
480     def __len__(self):
481         return len(self.match_codes)
482
483     def __add__(self, other):
484         if not isinstance(other, MatchCodes):
485             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
486
487         newobj = MatchCodes(self)
488         for key, value in other.items():
489             newobj[key] = self.get(key, 0) + other[key]
490
491         return newobj
492
493
494 class MappedReads(collections.MutableMapping):
495     """Mapping to hold mapped reads -
496     supports combining two mapped read sets together
497     """
498     def __init__(self, initializer=None):
499         self.mapped_reads = {}
500
501         if initializer is not None:
502             if not isinstance(initializer, collections.Mapping):
503                 raise ValueError("Expected dictionary like class")
504             for key in initializer:
505                 self[key] = self.setdefault(key, 0) + initializer[key]
506
507     def __iter__(self):
508         return iter(self.mapped_reads)
509
510     def __delitem__(self, key):
511         del self.mapped_reads[key]
512
513     def __getitem__(self, key):
514         return self.mapped_reads[key]
515
516     def __setitem__(self, key, value):
517         self.mapped_reads[key] = value
518
519     def __len__(self):
520         return len(self.mapped_reads)
521
522     def __add__(self, other):
523         if not isinstance(other, MappedReads):
524             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
525
526         newobj = MappedReads(self)
527         for key in other:
528             newobj[key] = self.get(key, 0) + other[key]
529
530         return newobj
531
532 class SequenceLane(ResultLane):
533     XML_VERSION=1
534     LANE = 'SequenceLane'
535     SEQUENCE_TYPE = 'SequenceType'
536
537     NONE_TYPE = None
538     SCARF_TYPE = 1
539     FASTQ_TYPE = 2
540     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
541
542     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
543                  xml=None):
544         self.sequence_type = None
545         super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
546
547     def _guess_sequence_type(self, pathname):
548         """
549         Determine if we have a scarf or fastq sequence file
550         """
551         f = open(pathname,'r')
552         l = f.readline()
553         f.close()
554
555         if l[0] == '@':
556             # fastq starts with a @
557             self.sequence_type = SequenceLane.FASTQ_TYPE
558         else:
559             self.sequence_type = SequenceLane.SCARF_TYPE
560         return self.sequence_type
561
562     def _update(self):
563         """
564         Actually read the file and actually count the reads
565         """
566         # can't do anything if we don't have a file to process
567         if self.pathnames is None:
568             return
569
570         pathname = self.pathnames[-1]
571         if os.stat(pathname)[stat.ST_SIZE] == 0:
572             raise RuntimeError("Sequencing isn't done, try again later.")
573
574         self._guess_sequence_type(pathname)
575
576         LOGGER.info("summarizing results for %s" % (pathname))
577         lines = 0
578         f = open(pathname)
579         for l in f.xreadlines():
580             lines += 1
581         f.close()
582
583         if self.sequence_type == SequenceLane.SCARF_TYPE:
584             self._reads = lines
585         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
586             self._reads = lines / 4
587         else:
588             errmsg = "This only supports scarf or fastq squence files"
589             raise NotImplementedError(errmsg)
590
591     def get_elements(self):
592         lane = ElementTree.Element(SequenceLane.LANE,
593                                    {'version':
594                                     unicode(SequenceLane.XML_VERSION)})
595         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
596         sample_tag.text = self.sample_name
597         lane_tag = ElementTree.SubElement(lane, LANE_ID)
598         lane_tag.text = str(self.lane_id)
599         if self.end is not None:
600             end_tag = ElementTree.SubElement(lane, END)
601             end_tag.text = str(self.end)
602         reads = ElementTree.SubElement(lane, READS)
603         reads.text = unicode(self.reads)
604         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
605         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
606
607         return lane
608
609     def set_elements(self, tree):
610         if tree.tag != SequenceLane.LANE:
611             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
612         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
613
614         for element in tree:
615             tag = element.tag.lower()
616             if tag == SAMPLE_NAME.lower():
617                 self.sample_name = element.text
618             elif tag == LANE_ID.lower():
619                 self.lane_id = int(element.text)
620             elif tag == END.lower():
621                 self.end = int(element.text)
622             elif tag == READS.lower():
623                 self._reads = int(element.text)
624             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
625                 self.sequence_type = lookup_sequence_type.get(element.text, None)
626             else:
627                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
628
629 class ELAND(collections.MutableMapping):
630     """
631     Summarize information from eland files
632     """
633     XML_VERSION = 3
634
635     ELAND = 'ElandCollection'
636     LANE = 'Lane'
637     LANE_ID = 'id'
638     SAMPLE = 'sample'
639     END = 'end'
640
641     def __init__(self, xml=None):
642         # we need information from the gerald config.xml
643         self.results = {}
644
645         if xml is not None:
646             self.set_elements(xml)
647
648     def __getitem__(self, key):
649         if not isinstance(key, SampleKey):
650             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
651         return self.results[key]
652
653     def __setitem__(self, key, value):
654         if not isinstance(key, SampleKey):
655             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
656         self.results[key] = value
657
658     def __delitem__(self, key):
659         del self.result[key]
660
661     def __iter__(self):
662         keys = self.results.iterkeys()
663         for k in sorted(keys):
664             yield k
665
666     def __len__(self):
667         return len(self.results)
668
669     def find_keys(self, search):
670         """Return results that match key"""
671         if not isinstance(search, SampleKey):
672             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
673         if not search.iswild:
674             yield self[search]
675         for key in self.keys():
676             if key.matches(search): yield key
677
678     def get_elements(self):
679         root = ElementTree.Element(ELAND.ELAND,
680                                    {'version': unicode(ELAND.XML_VERSION)})
681
682         for key in self:
683             eland_lane = self[key].get_elements()
684             eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
685             eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
686             eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
687             root.append(eland_lane)
688         return root
689         return root
690
691     def set_elements(self, tree):
692         if tree.tag.lower() != ELAND.ELAND.lower():
693             raise ValueError('Expecting %s', ELAND.ELAND)
694         for element in list(tree):
695             lane_id = int(element.attrib[ELAND.LANE_ID])
696             end = int(element.attrib.get(ELAND.END, 0))
697             sample = element.attrib.get(ELAND.SAMPLE, 's')
698             if element.tag.lower() == ElandLane.LANE.lower():
699                 lane = ElandLane(xml=element)
700             elif element.tag.lower() == SequenceLane.LANE.lower():
701                 lane = SequenceLane(xml=element)
702
703             key = SampleKey(lane=lane_id, read=end+1, sample=sample)
704             self.results[key] = lane
705
706
707     def update_result_with_eland(self, gerald, key, pathnames,
708                                  genome_maps):
709         # yes the lane_id is also being computed in ElandLane._update
710         # I didn't want to clutter up my constructor
711         # but I needed to persist the sample_name/lane_id for
712         # runfolder summary_report
713         names = [ os.path.split(p)[1] for p in pathnames]
714         LOGGER.info("Adding eland files %s" %(",".join(names),))
715         basedir = os.path.split(pathnames[0])[0]
716         gs_template = "{0}_*_L{1:03}_genomesize.xml"
717         genomesize = glob(
718             os.path.join(basedir,
719                          gs_template.format(key.sample, key.lane)))
720
721
722         genome_map = GenomeMap()
723         if genome_maps is not None:
724             genome_map = GenomeMap(genome_maps[key.lane])
725         elif len(genomesize) > 0:
726             LOGGER.info("Found {0}".format(genomesize))
727             genome_map.parse_genomesize(genomesize[0])
728         elif gerald is not None:
729             genome_dir = gerald.lanes[key].eland_genome
730             if genome_dir is not None:
731                 genome_map.scan_genome_dir(genome_dir)
732
733         lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
734
735         self.results[key] = lane
736
737     def update_result_with_sequence(self, gerald, key, pathnames,
738                                     genome_maps=None):
739         self.results[key] = SequenceLane(pathnames,
740                                          key.sample, key.lane, key.read)
741
742
743 def eland(gerald_dir, gerald=None, genome_maps=None):
744     e = ELAND()
745     eland_files = ElandMatches(e)
746     # collect
747     for path, dirnames, filenames in os.walk(gerald_dir):
748         for filename in filenames:
749             pathname = os.path.abspath(os.path.join(path, filename))
750             eland_files.add(pathname)
751     for key in eland_files:
752         eland_files.count(key, gerald, genome_maps)
753     return e
754
755
756 class ElandMatches(collections.MutableMapping):
757     def __init__(self, eland_container):
758         # the order in patterns determines the preference for what
759         # will be found.
760         self.eland_container = eland_container
761         MAPPED = eland_container.update_result_with_eland
762         SEQUENCE = eland_container.update_result_with_sequence
763
764         sample = '(?P<sample>[^_]+)'
765         hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
766         hiLane = '_L(?P<lane>[\d]+)'
767         gaLane = '_(?P<lane>[\d]+)'
768         hiRead = '_R(?P<read>[\d]+)'
769         gaRead = '(_(?P<read>[\d])+)?'
770         part = '_(?P<part>[\d]+)'
771         ext = '(?P<extention>(\.bz2|\.gz)?)'
772
773         hiPrefix = sample + hiIndex + hiLane + hiRead + part
774         gaPrefix = sample + gaLane + gaRead
775         P = collections.namedtuple('Patterns', 'pattern counter priority')
776         self.patterns = [
777             P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
778             P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
779             P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
780             P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
781             P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
782             P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
783             ]
784         self.file_sets = {}
785         self.file_priority = {}
786         self.file_counter = {}
787
788     def add(self, pathname):
789         """Add pathname to our set of files
790         """
791         path, filename = os.path.split(pathname)
792
793         for pattern, counter, priority in self.patterns:
794             rematch = re.match(pattern, filename)
795             if rematch is not None:
796                 m = ElandMatch(pathname, counter, **rematch.groupdict())
797                 key = m.make_samplekey()
798                 old_priority = self.file_priority.get(key, 0)
799                 if priority > old_priority:
800                     self.file_sets[key] = set((m,))
801                     self.file_counter[key] = counter
802                     self.file_priority[key] = priority
803                 elif priority == old_priority:
804                     self.file_sets[key].add(m)
805
806     def count(self, key, gerald=None, genome_maps=None):
807         #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
808         counter = self.file_counter[key]
809         file_set = self.file_sets[key]
810         filenames = [ f.filename for f in file_set ]
811         return counter(gerald, key,
812                        filenames, genome_maps)
813
814     def __iter__(self):
815         return iter(self.file_sets)
816
817     def __len__(self):
818         return len(self.file_sets)
819
820     def __getitem__(self, key):
821         return self.file_sets[key]
822
823     def __setitem__(self, key, value):
824         if not isintance(value, set):
825             raise ValueError("Expected set for value")
826         self.file_sets[key] = value
827
828     def __delitem__(self, key):
829         del self.file_sets[key]
830
831 class ElandMatch(object):
832     def __init__(self, pathname, counter,
833                  lane=None, read=None, extension=None,
834                  sample=None, index=None, part=None, **kwargs):
835         self.filename = pathname
836         self.counter = counter
837         self._lane = lane
838         self._read = read
839         self.extension = extension
840         self.sample = sample
841         self.index = index
842         self._part = part
843         LOGGER.info("Found %s: L%s R%s Samp%s" % (
844             self.filename, self._lane, self._read, self.sample))
845
846     def make_samplekey(self):
847         read = self._read if self._read is not None else 1
848         return SampleKey(lane=self.lane, read=read, sample=self.sample)
849
850     def _get_lane(self):
851         if self._lane is not None:
852             return int(self._lane)
853         return self._lane
854     lane = property(_get_lane)
855
856     def _get_read(self):
857         if self._read is not None:
858             return int(self._read)
859         return self._read
860     read = property(_get_read)
861
862     def _get_part(self):
863         if self._part is not None:
864             return int(self._part)
865         return self._part
866     part = property(_get_part)
867
868     def __repr__(self):
869         name = []
870         if self.sample is not None: name.append(self.sample)
871         if self._lane is not None: name.append('L%s' % (self.lane,))
872         if self._read is not None: name.append('R%s' % (self.read,))
873         if self._part is not None: name.append('P%s' % (self.part,))
874         return '<ElandMatch(' + "_".join(name) + ')>'
875
876
877 def extract_eland_sequence(instream, outstream, start, end):
878     """
879     Extract a chunk of sequence out of an eland file
880     """
881     for line in instream:
882         record = line.split()
883         if len(record) > 1:
884             result = [record[0], record[1][start:end]]
885         else:
886             result = [record[0][start:end]]
887         outstream.write("\t".join(result))
888         outstream.write(os.linesep)
889
890
891 def main(cmdline=None):
892     """Run eland extraction against the specified gerald directory"""
893     from optparse import OptionParser
894     parser = OptionParser("%prog: <gerald dir>+")
895     opts, args = parser.parse_args(cmdline)
896     logging.basicConfig(level=logging.DEBUG)
897     for a in args:
898         LOGGER.info("Starting scan of %s" % (a,))
899         e = eland(a)
900         print(ElementTree.tostring(e.get_elements()))
901     return
902
903
904 if __name__ == "__main__":
905     main(sys.argv[1:])