Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 import collections
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11 import types
12
13 from htsworkflow.pipelines import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.pipelines.genomemap import GenomeMap
16 from htsworkflow.util.ethelp import indent, flatten
17 from htsworkflow.util.opener import autoopen
18
19 LOGGER = logging.getLogger(__name__)
20
21 SAMPLE_NAME = 'SampleName'
22 LANE_ID = 'LaneID'
23 END = 'End'
24 READS = 'Reads'
25
26 GENOME_MAP = 'GenomeMap'
27 GENOME_ITEM = 'GenomeItem'
28 MAPPED_READS = 'MappedReads'
29 MAPPED_ITEM = 'MappedItem'
30 MATCH_CODES = 'MatchCodes'
31 MATCH_ITEM = 'Code'
32 READS = 'Reads'
33
34 ELAND_SINGLE = 0
35 ELAND_MULTI = 1
36 ELAND_EXTENDED = 2
37 ELAND_EXPORT = 3
38
39 class ResultLane(object):
40     """
41     Base class for result lanes
42     """
43     XML_VERSION = 2
44     LANE = 'ResultLane'
45
46     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
47                  xml=None):
48         self.pathnames = pathnames
49         self.sample_name = sample
50         self.lane_id = lane_id
51         self.end = end
52         self._reads = None
53
54         if xml is not None:
55             self.set_elements(xml)
56
57     def _update(self):
58         """
59         Actually read the file and actually count the reads
60         """
61         pass
62
63     def _get_reads(self):
64         if self._reads is None:
65             self._update()
66         return self._reads
67     reads = property(_get_reads)
68
69     def get_elements(self):
70         return None
71
72     def __repr__(self):
73         name = []
74
75         name.append('L%s' % (self.lane_id,))
76         name.append('R%s' % (self.end,))
77         name.append('S%s' % (self.sample_name,))
78
79         return '<ResultLane(' + ",".join(name) + ')>'
80
81 class ElandLane(ResultLane):
82     """
83     Process an eland result file
84     """
85     XML_VERSION = 2
86     LANE = "ElandLane"
87     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
88     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
89     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
90     SCORE_UNRECOGNIZED = 0
91     SCORE_QC = 1
92     SCORE_READ = 2
93
94     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
95                  genome_map=None, eland_type=None, xml=None):
96         super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
97
98         self._mapped_reads = None
99         self._match_codes = None
100         self._reads = None
101         self.genome_map = GenomeMap(genome_map)
102         self.eland_type = None
103
104         if xml is not None:
105             self.set_elements(xml)
106
107     def __repr__(self):
108         name = []
109
110         name.append('L%s' % (self.lane_id,))
111         name.append('R%s' % (self.end,))
112         name.append('S%s' % (self.sample_name,))
113
114         reads = str(self._reads) if self._reads is not None else 'Uncounted'
115         return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
116
117     def _guess_eland_type(self, pathname):
118         if self.eland_type is None:
119           # attempt autodetect eland file type
120           pathn, name = os.path.split(pathname)
121           if re.search('result', name):
122             self.eland_type = ELAND_SINGLE
123           elif re.search('multi', name):
124             self.eland_type = ELAND_MULTI
125           elif re.search('extended', name):
126             self.eland_type = ELAND_EXTENDED
127           elif re.search('export', name):
128             self.eland_type = ELAND_EXPORT
129           else:
130             self.eland_type = ELAND_SINGLE
131
132     def _update(self):
133         """
134         Actually read the file and actually count the reads
135         """
136         # can't do anything if we don't have a file to process
137         if self.pathnames is None or len(self.pathnames) == 0:
138             return
139         pathname = self.pathnames[-1]
140         self._guess_eland_type(pathname)
141
142         if os.stat(pathname)[stat.ST_SIZE] == 0:
143             raise RuntimeError("Eland isn't done, try again later.")
144
145         LOGGER.debug("summarizing results for %s" % (pathname))
146         self._match_codes = MatchCodes()
147         self._mapped_reads = MappedReads()
148         self._reads = 0
149
150         for pathname in self.pathnames:
151             stream = autoopen(pathname, 'r')
152             if self.eland_type == ELAND_SINGLE:
153                 result = self._update_eland_result(stream)
154             elif self.eland_type == ELAND_MULTI or \
155                  self.eland_type == ELAND_EXTENDED:
156                 result = self._update_eland_multi(stream)
157             elif self.eland_type == ELAND_EXPORT:
158                 result = self._update_eland_export(stream)
159             else:
160                 errmsg = "Only support single/multi/extended eland files"
161                 raise NotImplementedError(errmsg)
162             stream.close()
163
164             match, mapped, reads = result
165             self._match_codes += match
166             self._mapped_reads += mapped
167             self._reads += reads
168
169     def _update_eland_result(self, instream):
170         reads = 0
171         mapped_reads = MappedReads()
172         match_codes = MatchCodes()
173
174         for line in instream:
175             reads += 1
176             fields = line.split()
177             # code = fields[2]
178             # match_codes[code] = match_codes.setdefault(code, 0) + 1
179             # the QC/NM etc codes are in the 3rd field and always present
180             match_codes[fields[2]] += 1
181             # ignore lines that don't have a fasta filename
182             if len(fields) < 7:
183                 continue
184             fasta = self.genome_map.get(fields[6], fields[6])
185             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
186         return match_codes, mapped_reads, reads
187
188     def _update_eland_multi(self, instream):
189         """Summarize an eland_extend."""
190         MATCH_INDEX = 2
191         LOCATION_INDEX = 3
192         reads = 0
193         mapped_reads = MappedReads()
194         match_codes = MatchCodes()
195
196         for line in instream:
197             reads += 1
198             fields = line.split()
199             # fields[2] = QC/NM/or number of matches
200             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
201                                                        match_codes)
202             if score_type == ElandLane.SCORE_READ:
203                 # when there are too many hits, eland  writes a - where
204                 # it would have put the list of hits.
205                 # or in a different version of eland, it just leaves
206                 # that column blank, and only outputs 3 fields.
207                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
208                   continue
209
210                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
211
212         return match_codes, mapped_reads, reads
213
214     def _update_eland_export(self, instream):
215         """Summarize a gerald export file."""
216         MATCH_INDEX = 10
217         LOCATION_INDEX = 10
218         DESCRIPTOR_INDEX= 13
219         reads = 0
220         mapped_reads = MappedReads()
221         match_codes = MatchCodes()
222
223         for line in instream:
224             reads += 1
225             fields = line.split()
226             # fields[2] = QC/NM/or number of matches
227             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
228                                                        match_codes)
229             if score_type == ElandLane.SCORE_UNRECOGNIZED:
230                 # export files have three states for the match field
231                 # QC code, count of multi-reads, or a single
232                 # read location. The score_mapped_mismatches function
233                 # only understands the first two types.
234                 # if we get unrecognized, that implies the field is probably
235                 # a location.
236                 code = self._count_mapped_export(mapped_reads,
237                                                  fields[LOCATION_INDEX],
238                                                  fields[DESCRIPTOR_INDEX])
239                 match_codes[code] += 1
240
241         return match_codes, mapped_reads, reads
242
243
244     def _score_mapped_mismatches(self, match, match_codes):
245         """Update match_codes with eland map counts, or failure code.
246
247         Returns True if the read mapped, false if it was an error code.
248         """
249         groups = ElandLane.MATCH_COUNTS_RE.match(match)
250         if groups is None:
251             # match is not of the form [\d]+:[\d]+:[\d]+
252             if match in match_codes:
253                 # match is one quality control codes QC/NM etc
254                 match_codes[match] += 1
255                 return ElandLane.SCORE_QC
256             else:
257                 return ElandLane.SCORE_UNRECOGNIZED
258         else:
259             # match is of the form [\d]+:[\d]+:[\d]+
260             # AKA Multiread
261             zero_mismatches = int(groups.group(1))
262             one_mismatches = int(groups.group(2))
263             two_mismatches = int(groups.group(3))
264
265             if zero_mismatches == 1:
266                 match_codes['U0'] += 1
267             elif zero_mismatches < 255:
268                 match_codes['R0'] += zero_mismatches
269
270             if one_mismatches == 1:
271                 match_codes['U1'] += 1
272             elif one_mismatches < 255:
273                 match_codes['R1'] += one_mismatches
274
275             if two_mismatches == 1:
276                 match_codes['U2'] += 1
277             elif two_mismatches < 255:
278                 match_codes['R2'] += two_mismatches
279
280             return ElandLane.SCORE_READ
281
282
283     def _count_mapped_multireads(self, mapped_reads, match_string):
284         chromo = None
285         for match in match_string.split(','):
286             match_fragment = match.split(':')
287             if len(match_fragment) == 2:
288                 chromo = match_fragment[0]
289                 pos = match_fragment[1]
290
291             fasta = self.genome_map.get(chromo, chromo)
292             assert fasta is not None
293             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
294
295
296     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
297         """Count a read as defined in an export file
298
299         match_string contains the chromosome
300         descriptor contains the an ecoding of bases that match, mismatch,
301                    and have indels.
302         returns the "best" match code
303
304         Currently "best" match code is ignoring the possibility of in-dels
305         """
306         chromo = match_string
307         fasta = self.genome_map.get(chromo, chromo)
308         assert fasta is not None
309         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
310
311         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
312         if len(mismatch_bases) == 0:
313             return 'U0'
314         elif len(mismatch_bases) == 1:
315             return 'U1'
316         else:
317             return 'U2'
318
319
320     def _get_mapped_reads(self):
321         if self._mapped_reads is None:
322             self._update()
323         return self._mapped_reads
324     mapped_reads = property(_get_mapped_reads)
325
326     def _get_match_codes(self):
327         if self._match_codes is None:
328             self._update()
329         return self._match_codes
330     match_codes = property(_get_match_codes)
331
332     def _get_no_match(self):
333         if self._mapped_reads is None:
334             self._update()
335         return self._match_codes['NM']
336     no_match = property(_get_no_match,
337                         doc="total reads that didn't match the target genome.")
338
339     def _get_no_match_percent(self):
340         return float(self.no_match)/self.reads * 100
341     no_match_percent = property(_get_no_match_percent,
342                                 doc="no match reads as percent of total")
343
344     def _get_qc_failed(self):
345         if self._mapped_reads is None:
346             self._update()
347         return self._match_codes['QC']
348     qc_failed = property(_get_qc_failed,
349                         doc="total reads that didn't match the target genome.")
350
351     def _get_qc_failed_percent(self):
352         return float(self.qc_failed)/self.reads * 100
353     qc_failed_percent = property(_get_qc_failed_percent,
354                                  doc="QC failed reads as percent of total")
355
356     def _get_unique_reads(self):
357         if self._mapped_reads is None:
358            self._update()
359         sum = 0
360         for code in ['U0','U1','U2']:
361             sum += self._match_codes[code]
362         return sum
363     unique_reads = property(_get_unique_reads,
364                             doc="total unique reads")
365
366     def _get_repeat_reads(self):
367         if self._mapped_reads is None:
368            self._update()
369         sum = 0
370         for code in ['R0','R1','R2']:
371             sum += self._match_codes[code]
372         return sum
373     repeat_reads = property(_get_repeat_reads,
374                             doc="total repeat reads")
375
376     def get_elements(self):
377         lane = ElementTree.Element(ElandLane.LANE,
378                                    {'version':
379                                     str(ElandLane.XML_VERSION)})
380         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
381         sample_tag.text = self.sample_name
382         lane_tag = ElementTree.SubElement(lane, LANE_ID)
383         lane_tag.text = str(self.lane_id)
384         if self.end is not None:
385             end_tag = ElementTree.SubElement(lane, END)
386             end_tag.text = str(self.end)
387         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
388         for k, v in list(self.genome_map.items()):
389             item = ElementTree.SubElement(
390                 genome_map, GENOME_ITEM,
391                 {'name':k, 'value':str(v)})
392         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
393         for k, v in list(self.mapped_reads.items()):
394             item = ElementTree.SubElement(
395                 mapped_reads, MAPPED_ITEM,
396                 {'name':k, 'value':str(v)})
397         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
398         for k, v in list(self.match_codes.items()):
399             item = ElementTree.SubElement(
400                 match_codes, MATCH_ITEM,
401                 {'name':k, 'value':str(v)})
402         reads = ElementTree.SubElement(lane, READS)
403         reads.text = str(self.reads)
404
405         return lane
406
407     def set_elements(self, tree):
408         if tree.tag != ElandLane.LANE:
409             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
410
411         # reset dictionaries
412         self._mapped_reads = {}
413         self._match_codes = {}
414
415         for element in tree:
416             tag = element.tag.lower()
417             if tag == SAMPLE_NAME.lower():
418                 self.sample_name = element.text
419             elif tag == LANE_ID.lower():
420                 self.lane_id = int(element.text)
421             elif tag == END.lower():
422                 self.end = int(element.text)
423             elif tag == GENOME_MAP.lower():
424                 for child in element:
425                     name = child.attrib['name']
426                     value = child.attrib['value']
427                     self.genome_map[name] = value
428             elif tag == MAPPED_READS.lower():
429                 for child in element:
430                     name = child.attrib['name']
431                     value = child.attrib['value']
432                     self._mapped_reads[name] = int(value)
433             elif tag == MATCH_CODES.lower():
434                 for child in element:
435                     name = child.attrib['name']
436                     value = int(child.attrib['value'])
437                     self._match_codes[name] = value
438             elif tag == READS.lower():
439                 self._reads = int(element.text)
440             else:
441                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
442
443
444 class MatchCodes(collections.MutableMapping):
445     """Mapping to hold match counts -
446     supports combining two match count sets together
447     """
448     def __init__(self, initializer=None):
449         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
450                             'U0':0, 'U1':0, 'U2':0,
451                             'R0':0, 'R1':0, 'R2':0,
452                             }
453
454         if initializer is not None:
455             if not isinstance(initializer, collections.Mapping):
456                 raise ValueError("Expected dictionary like class")
457             for key in initializer:
458                 if key not in self.match_codes:
459                     errmsg = "Initializer can only contain: %s"
460                     raise ValueError(errmsg % (",".join(list(self.match_codes.keys()))))
461                 self.match_codes[key] += initializer[key]
462
463     def __iter__(self):
464         return iter(self.match_codes)
465
466     def __delitem__(self, key):
467         raise RuntimeError("delete not allowed")
468
469     def __getitem__(self, key):
470         return self.match_codes[key]
471
472     def __setitem__(self, key, value):
473         if key not in self.match_codes:
474             errmsg = "Unrecognized key, allowed values are: %s"
475             raise ValueError(errmsg % (",".join(list(self.match_codes.keys()))))
476         self.match_codes[key] = value
477
478     def __len__(self):
479         return len(self.match_codes)
480
481     def __add__(self, other):
482         if not isinstance(other, MatchCodes):
483             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
484
485         newobj = MatchCodes(self)
486         for key, value in list(other.items()):
487             newobj[key] = self.get(key, 0) + other[key]
488
489         return newobj
490
491
492 class MappedReads(collections.MutableMapping):
493     """Mapping to hold mapped reads -
494     supports combining two mapped read sets together
495     """
496     def __init__(self, initializer=None):
497         self.mapped_reads = {}
498
499         if initializer is not None:
500             if not isinstance(initializer, collections.Mapping):
501                 raise ValueError("Expected dictionary like class")
502             for key in initializer:
503                 self[key] = self.setdefault(key, 0) + initializer[key]
504
505     def __iter__(self):
506         return iter(self.mapped_reads)
507
508     def __delitem__(self, key):
509         del self.mapped_reads[key]
510
511     def __getitem__(self, key):
512         return self.mapped_reads[key]
513
514     def __setitem__(self, key, value):
515         self.mapped_reads[key] = value
516
517     def __len__(self):
518         return len(self.mapped_reads)
519
520     def __add__(self, other):
521         if not isinstance(other, MappedReads):
522             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
523
524         newobj = MappedReads(self)
525         for key in other:
526             newobj[key] = self.get(key, 0) + other[key]
527
528         return newobj
529
530 class SequenceLane(ResultLane):
531     XML_VERSION=1
532     LANE = 'SequenceLane'
533     SEQUENCE_TYPE = 'SequenceType'
534
535     NONE_TYPE = None
536     SCARF_TYPE = 1
537     FASTQ_TYPE = 2
538     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
539
540     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
541                  xml=None):
542         self.sequence_type = None
543         super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
544
545     def _guess_sequence_type(self, pathname):
546         """
547         Determine if we have a scarf or fastq sequence file
548         """
549         f = open(pathname,'r')
550         l = f.readline()
551         f.close()
552
553         if l[0] == '@':
554             # fastq starts with a @
555             self.sequence_type = SequenceLane.FASTQ_TYPE
556         else:
557             self.sequence_type = SequenceLane.SCARF_TYPE
558         return self.sequence_type
559
560     def _update(self):
561         """
562         Actually read the file and actually count the reads
563         """
564         # can't do anything if we don't have a file to process
565         if self.pathnames is None:
566             return
567
568         pathname = self.pathnames[-1]
569         if os.stat(pathname)[stat.ST_SIZE] == 0:
570             raise RuntimeError("Sequencing isn't done, try again later.")
571
572         self._guess_sequence_type(pathname)
573
574         LOGGER.info("summarizing results for %s" % (pathname))
575         lines = 0
576         f = open(pathname)
577         for l in f:
578             lines += 1
579         f.close()
580
581         if self.sequence_type == SequenceLane.SCARF_TYPE:
582             self._reads = lines
583         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
584             self._reads = lines / 4
585         else:
586             errmsg = "This only supports scarf or fastq squence files"
587             raise NotImplementedError(errmsg)
588
589     def get_elements(self):
590         lane = ElementTree.Element(SequenceLane.LANE,
591                                    {'version':
592                                     str(SequenceLane.XML_VERSION)})
593         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
594         sample_tag.text = self.sample_name
595         lane_tag = ElementTree.SubElement(lane, LANE_ID)
596         lane_tag.text = str(self.lane_id)
597         if self.end is not None:
598             end_tag = ElementTree.SubElement(lane, END)
599             end_tag.text = str(self.end)
600         reads = ElementTree.SubElement(lane, READS)
601         reads.text = str(self.reads)
602         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
603         sequence_type.text = str(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
604
605         return lane
606
607     def set_elements(self, tree):
608         if tree.tag != SequenceLane.LANE:
609             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
610         lookup_sequence_type = dict([ (v,k) for k,v in list(SequenceLane.SEQUENCE_DESCRIPTION.items())])
611
612         for element in tree:
613             tag = element.tag.lower()
614             if tag == SAMPLE_NAME.lower():
615                 self.sample_name = element.text
616             elif tag == LANE_ID.lower():
617                 self.lane_id = int(element.text)
618             elif tag == END.lower():
619                 self.end = int(element.text)
620             elif tag == READS.lower():
621                 self._reads = int(element.text)
622             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
623                 self.sequence_type = lookup_sequence_type.get(element.text, None)
624             else:
625                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
626
627 class ELAND(collections.MutableMapping):
628     """
629     Summarize information from eland files
630     """
631     XML_VERSION = 3
632
633     ELAND = 'ElandCollection'
634     LANE = 'Lane'
635     LANE_ID = 'id'
636     SAMPLE = 'sample'
637     END = 'end'
638
639     def __init__(self, xml=None):
640         # we need information from the gerald config.xml
641         self.results = {}
642
643         if xml is not None:
644             self.set_elements(xml)
645
646     def __getitem__(self, key):
647         if not isinstance(key, SampleKey):
648             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
649         return self.results[key]
650
651     def __setitem__(self, key, value):
652         if not isinstance(key, SampleKey):
653             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
654         self.results[key] = value
655
656     def __delitem__(self, key):
657         del self.result[key]
658
659     def __iter__(self):
660         keys = iter(self.results.keys())
661         for k in sorted(keys):
662             yield k
663
664     def __len__(self):
665         return len(self.results)
666
667     def find_keys(self, search):
668         """Return results that match key"""
669         if not isinstance(search, SampleKey):
670             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
671         if not search.iswild:
672             yield self[search]
673         for key in list(self.keys()):
674             if key.matches(search): yield key
675
676     def get_elements(self):
677         root = ElementTree.Element(ELAND.ELAND,
678                                    {'version': str(ELAND.XML_VERSION)})
679
680         for key in self:
681             eland_lane = self[key].get_elements()
682             eland_lane.attrib[ELAND.END] = str(self[key].end-1)
683             eland_lane.attrib[ELAND.LANE_ID] = str(self[key].lane_id)
684             eland_lane.attrib[ELAND.SAMPLE] = str(self[key].sample_name)
685             root.append(eland_lane)
686         return root
687         return root
688
689     def set_elements(self, tree):
690         if tree.tag.lower() != ELAND.ELAND.lower():
691             raise ValueError('Expecting %s', ELAND.ELAND)
692         for element in list(tree):
693             lane_id = int(element.attrib[ELAND.LANE_ID])
694             end = int(element.attrib.get(ELAND.END, 0))
695             sample = element.attrib.get(ELAND.SAMPLE, 's')
696             if element.tag.lower() == ElandLane.LANE.lower():
697                 lane = ElandLane(xml=element)
698             elif element.tag.lower() == SequenceLane.LANE.lower():
699                 lane = SequenceLane(xml=element)
700
701             key = SampleKey(lane=lane_id, read=end+1, sample=sample)
702             self.results[key] = lane
703
704
705     def update_result_with_eland(self, gerald, key, pathnames,
706                                  genome_maps):
707         # yes the lane_id is also being computed in ElandLane._update
708         # I didn't want to clutter up my constructor
709         # but I needed to persist the sample_name/lane_id for
710         # runfolder summary_report
711         names = [ os.path.split(p)[1] for p in pathnames]
712         LOGGER.info("Adding eland files %s" %(",".join(names),))
713         basedir = os.path.split(pathnames[0])[0]
714         gs_template = "{0}_*_L{1:03}_genomesize.xml"
715         genomesize = glob(
716             os.path.join(basedir,
717                          gs_template.format(key.sample, key.lane)))
718
719
720         genome_map = GenomeMap()
721         if genome_maps is not None:
722             genome_map = GenomeMap(genome_maps[key.lane])
723         elif len(genomesize) > 0:
724             LOGGER.info("Found {0}".format(genomesize))
725             genome_map.parse_genomesize(genomesize[0])
726         elif gerald is not None:
727             genome_dir = gerald.lanes[key].eland_genome
728             if genome_dir is not None:
729                 genome_map.scan_genome_dir(genome_dir)
730
731         lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
732
733         self.results[key] = lane
734
735     def update_result_with_sequence(self, gerald, key, pathnames,
736                                     genome_maps=None):
737         self.results[key] = SequenceLane(pathnames,
738                                          key.sample, key.lane, key.read)
739
740
741 def eland(gerald_dir, gerald=None, genome_maps=None):
742     e = ELAND()
743     eland_files = ElandMatches(e)
744     # collect
745     for path, dirnames, filenames in os.walk(gerald_dir):
746         for filename in filenames:
747             pathname = os.path.abspath(os.path.join(path, filename))
748             eland_files.add(pathname)
749     for key in eland_files:
750         eland_files.count(key, gerald, genome_maps)
751     return e
752
753
754 class ElandMatches(collections.MutableMapping):
755     def __init__(self, eland_container):
756         # the order in patterns determines the preference for what
757         # will be found.
758         self.eland_container = eland_container
759         MAPPED = eland_container.update_result_with_eland
760         SEQUENCE = eland_container.update_result_with_sequence
761
762         sample = '(?P<sample>[^_]+)'
763         hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
764         hiLane = '_L(?P<lane>[\d]+)'
765         gaLane = '_(?P<lane>[\d]+)'
766         hiRead = '_R(?P<read>[\d]+)'
767         gaRead = '(_(?P<read>[\d])+)?'
768         part = '_(?P<part>[\d]+)'
769         ext = '(?P<extention>(\.bz2|\.gz)?)'
770
771         hiPrefix = sample + hiIndex + hiLane + hiRead + part
772         gaPrefix = sample + gaLane + gaRead
773         P = collections.namedtuple('Patterns', 'pattern counter priority')
774         self.patterns = [
775             P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
776             P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
777             P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
778             P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
779             P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
780             P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
781             ]
782         self.file_sets = {}
783         self.file_priority = {}
784         self.file_counter = {}
785
786     def add(self, pathname):
787         """Add pathname to our set of files
788         """
789         path, filename = os.path.split(pathname)
790
791         for pattern, counter, priority in self.patterns:
792             rematch = re.match(pattern, filename)
793             if rematch is not None:
794                 m = ElandMatch(pathname, counter, **rematch.groupdict())
795                 key = m.make_samplekey()
796                 old_priority = self.file_priority.get(key, 0)
797                 if priority > old_priority:
798                     self.file_sets[key] = set((m,))
799                     self.file_counter[key] = counter
800                     self.file_priority[key] = priority
801                 elif priority == old_priority:
802                     self.file_sets[key].add(m)
803
804     def count(self, key, gerald=None, genome_maps=None):
805         #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
806         counter = self.file_counter[key]
807         file_set = self.file_sets[key]
808         filenames = [ f.filename for f in file_set ]
809         return counter(gerald, key,
810                        filenames, genome_maps)
811
812     def __iter__(self):
813         return iter(self.file_sets)
814
815     def __len__(self):
816         return len(self.file_sets)
817
818     def __getitem__(self, key):
819         return self.file_sets[key]
820
821     def __setitem__(self, key, value):
822         if not isintance(value, set):
823             raise ValueError("Expected set for value")
824         self.file_sets[key] = value
825
826     def __delitem__(self, key):
827         del self.file_sets[key]
828
829 class ElandMatch(object):
830     def __init__(self, pathname, counter,
831                  lane=None, read=None, extension=None,
832                  sample=None, index=None, part=None, **kwargs):
833         self.filename = pathname
834         self.counter = counter
835         self._lane = lane
836         self._read = read
837         self.extension = extension
838         self.sample = sample
839         self.index = index
840         self._part = part
841         LOGGER.info("Found %s: L%s R%s Samp%s" % (
842             self.filename, self._lane, self._read, self.sample))
843
844     def make_samplekey(self):
845         read = self._read if self._read is not None else 1
846         return SampleKey(lane=self.lane, read=read, sample=self.sample)
847
848     def _get_lane(self):
849         if self._lane is not None:
850             return int(self._lane)
851         return self._lane
852     lane = property(_get_lane)
853
854     def _get_read(self):
855         if self._read is not None:
856             return int(self._read)
857         return self._read
858     read = property(_get_read)
859
860     def _get_part(self):
861         if self._part is not None:
862             return int(self._part)
863         return self._part
864     part = property(_get_part)
865
866     def __repr__(self):
867         name = []
868         if self.sample is not None: name.append(self.sample)
869         if self._lane is not None: name.append('L%s' % (self.lane,))
870         if self._read is not None: name.append('R%s' % (self.read,))
871         if self._part is not None: name.append('P%s' % (self.part,))
872         return '<ElandMatch(' + "_".join(name) + ')>'
873
874
875 def extract_eland_sequence(instream, outstream, start, end):
876     """
877     Extract a chunk of sequence out of an eland file
878     """
879     for line in instream:
880         record = line.split()
881         if len(record) > 1:
882             result = [record[0], record[1][start:end]]
883         else:
884             result = [record[0][start:end]]
885         outstream.write("\t".join(result))
886         outstream.write(os.linesep)
887
888
889 def main(cmdline=None):
890     """Run eland extraction against the specified gerald directory"""
891     from optparse import OptionParser
892     parser = OptionParser("%prog: <gerald dir>+")
893     opts, args = parser.parse_args(cmdline)
894     logging.basicConfig(level=logging.DEBUG)
895     for a in args:
896         LOGGER.info("Starting scan of %s" % (a,))
897         e = eland(a)
898         print(ElementTree.tostring(e.get_elements()))
899     return
900
901
902 if __name__ == "__main__":
903     main(sys.argv[1:])