Specify text vs binary mode for opening files.
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 from __future__ import print_function
5
6 import collections
7 from glob import glob
8 import logging
9 import os
10 import re
11 import stat
12 import sys
13 import types
14
15 from htsworkflow.pipelines import ElementTree, LANE_LIST
16 from htsworkflow.pipelines.samplekey import SampleKey
17 from htsworkflow.pipelines.genomemap import GenomeMap
18 from htsworkflow.util.ethelp import indent, flatten
19 from htsworkflow.util.opener import autoopen
20
21 LOGGER = logging.getLogger(__name__)
22
23 SAMPLE_NAME = 'SampleName'
24 LANE_ID = 'LaneID'
25 END = 'End'
26 READS = 'Reads'
27
28 GENOME_MAP = 'GenomeMap'
29 GENOME_ITEM = 'GenomeItem'
30 MAPPED_READS = 'MappedReads'
31 MAPPED_ITEM = 'MappedItem'
32 MATCH_CODES = 'MatchCodes'
33 MATCH_ITEM = 'Code'
34 READS = 'Reads'
35
36 ELAND_SINGLE = 0
37 ELAND_MULTI = 1
38 ELAND_EXTENDED = 2
39 ELAND_EXPORT = 3
40
41 class ResultLane(object):
42     """
43     Base class for result lanes
44     """
45     XML_VERSION = 2
46     LANE = 'ResultLane'
47
48     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
49                  xml=None):
50         self.pathnames = pathnames
51         self.sample_name = sample
52         self.lane_id = lane_id
53         self.end = end
54         self._reads = None
55
56         if xml is not None:
57             self.set_elements(xml)
58
59     def _update(self):
60         """
61         Actually read the file and actually count the reads
62         """
63         pass
64
65     def _get_reads(self):
66         if self._reads is None:
67             self._update()
68         return self._reads
69     reads = property(_get_reads)
70
71     def get_elements(self):
72         return None
73
74     def __repr__(self):
75         name = []
76
77         name.append('L%s' % (self.lane_id,))
78         name.append('R%s' % (self.end,))
79         name.append('S%s' % (self.sample_name,))
80
81         return '<ResultLane(' + ",".join(name) + ')>'
82
83 class ElandLane(ResultLane):
84     """
85     Process an eland result file
86     """
87     XML_VERSION = 2
88     LANE = "ElandLane"
89     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92     SCORE_UNRECOGNIZED = 0
93     SCORE_QC = 1
94     SCORE_READ = 2
95
96     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
97                  genome_map=None, eland_type=None, xml=None):
98         super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
99
100         self._mapped_reads = None
101         self._match_codes = None
102         self._reads = None
103         self.genome_map = GenomeMap(genome_map)
104         self.eland_type = None
105
106         if xml is not None:
107             self.set_elements(xml)
108
109     def __repr__(self):
110         name = []
111
112         name.append('L%s' % (self.lane_id,))
113         name.append('R%s' % (self.end,))
114         name.append('S%s' % (self.sample_name,))
115
116         reads = str(self._reads) if self._reads is not None else 'Uncounted'
117         return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
118
119     def _guess_eland_type(self, pathname):
120         if self.eland_type is None:
121           # attempt autodetect eland file type
122           pathn, name = os.path.split(pathname)
123           if re.search('result', name):
124             self.eland_type = ELAND_SINGLE
125           elif re.search('multi', name):
126             self.eland_type = ELAND_MULTI
127           elif re.search('extended', name):
128             self.eland_type = ELAND_EXTENDED
129           elif re.search('export', name):
130             self.eland_type = ELAND_EXPORT
131           else:
132             self.eland_type = ELAND_SINGLE
133
134     def _update(self):
135         """
136         Actually read the file and actually count the reads
137         """
138         # can't do anything if we don't have a file to process
139         if self.pathnames is None or len(self.pathnames) == 0:
140             return
141         pathname = self.pathnames[-1]
142         self._guess_eland_type(pathname)
143
144         if os.stat(pathname)[stat.ST_SIZE] == 0:
145             raise RuntimeError("Eland isn't done, try again later.")
146
147         LOGGER.debug("summarizing results for %s" % (pathname))
148         self._match_codes = MatchCodes()
149         self._mapped_reads = MappedReads()
150         self._reads = 0
151
152         for pathname in self.pathnames:
153             stream = autoopen(pathname, 'rt')
154             if self.eland_type == ELAND_SINGLE:
155                 result = self._update_eland_result(stream)
156             elif self.eland_type == ELAND_MULTI or \
157                  self.eland_type == ELAND_EXTENDED:
158                 result = self._update_eland_multi(stream)
159             elif self.eland_type == ELAND_EXPORT:
160                 result = self._update_eland_export(stream)
161             else:
162                 errmsg = "Only support single/multi/extended eland files"
163                 raise NotImplementedError(errmsg)
164             stream.close()
165
166             match, mapped, reads = result
167             self._match_codes += match
168             self._mapped_reads += mapped
169             self._reads += reads
170
171     def _update_eland_result(self, instream):
172         reads = 0
173         mapped_reads = MappedReads()
174         match_codes = MatchCodes()
175
176         for line in instream:
177             reads += 1
178             fields = line.split()
179             # code = fields[2]
180             # match_codes[code] = match_codes.setdefault(code, 0) + 1
181             # the QC/NM etc codes are in the 3rd field and always present
182             match_codes[fields[2]] += 1
183             # ignore lines that don't have a fasta filename
184             if len(fields) < 7:
185                 continue
186             fasta = self.genome_map.get(fields[6], fields[6])
187             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
188         return match_codes, mapped_reads, reads
189
190     def _update_eland_multi(self, instream):
191         """Summarize an eland_extend."""
192         MATCH_INDEX = 2
193         LOCATION_INDEX = 3
194         reads = 0
195         mapped_reads = MappedReads()
196         match_codes = MatchCodes()
197
198         for line in instream:
199             reads += 1
200             fields = line.split()
201             # fields[2] = QC/NM/or number of matches
202             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
203                                                        match_codes)
204             if score_type == ElandLane.SCORE_READ:
205                 # when there are too many hits, eland  writes a - where
206                 # it would have put the list of hits.
207                 # or in a different version of eland, it just leaves
208                 # that column blank, and only outputs 3 fields.
209                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
210                   continue
211
212                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
213
214         return match_codes, mapped_reads, reads
215
216     def _update_eland_export(self, instream):
217         """Summarize a gerald export file."""
218         MATCH_INDEX = 10
219         LOCATION_INDEX = 10
220         DESCRIPTOR_INDEX= 13
221         reads = 0
222         mapped_reads = MappedReads()
223         match_codes = MatchCodes()
224
225         for line in instream:
226             reads += 1
227             fields = line.split()
228             # fields[2] = QC/NM/or number of matches
229             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
230                                                        match_codes)
231             if score_type == ElandLane.SCORE_UNRECOGNIZED:
232                 # export files have three states for the match field
233                 # QC code, count of multi-reads, or a single
234                 # read location. The score_mapped_mismatches function
235                 # only understands the first two types.
236                 # if we get unrecognized, that implies the field is probably
237                 # a location.
238                 code = self._count_mapped_export(mapped_reads,
239                                                  fields[LOCATION_INDEX],
240                                                  fields[DESCRIPTOR_INDEX])
241                 match_codes[code] += 1
242
243         return match_codes, mapped_reads, reads
244
245
246     def _score_mapped_mismatches(self, match, match_codes):
247         """Update match_codes with eland map counts, or failure code.
248
249         Returns True if the read mapped, false if it was an error code.
250         """
251         groups = ElandLane.MATCH_COUNTS_RE.match(match)
252         if groups is None:
253             # match is not of the form [\d]+:[\d]+:[\d]+
254             if match in match_codes:
255                 # match is one quality control codes QC/NM etc
256                 match_codes[match] += 1
257                 return ElandLane.SCORE_QC
258             else:
259                 return ElandLane.SCORE_UNRECOGNIZED
260         else:
261             # match is of the form [\d]+:[\d]+:[\d]+
262             # AKA Multiread
263             zero_mismatches = int(groups.group(1))
264             one_mismatches = int(groups.group(2))
265             two_mismatches = int(groups.group(3))
266
267             if zero_mismatches == 1:
268                 match_codes['U0'] += 1
269             elif zero_mismatches < 255:
270                 match_codes['R0'] += zero_mismatches
271
272             if one_mismatches == 1:
273                 match_codes['U1'] += 1
274             elif one_mismatches < 255:
275                 match_codes['R1'] += one_mismatches
276
277             if two_mismatches == 1:
278                 match_codes['U2'] += 1
279             elif two_mismatches < 255:
280                 match_codes['R2'] += two_mismatches
281
282             return ElandLane.SCORE_READ
283
284
285     def _count_mapped_multireads(self, mapped_reads, match_string):
286         chromo = None
287         for match in match_string.split(','):
288             match_fragment = match.split(':')
289             if len(match_fragment) == 2:
290                 chromo = match_fragment[0]
291                 pos = match_fragment[1]
292
293             fasta = self.genome_map.get(chromo, chromo)
294             assert fasta is not None
295             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
296
297
298     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
299         """Count a read as defined in an export file
300
301         match_string contains the chromosome
302         descriptor contains the an ecoding of bases that match, mismatch,
303                    and have indels.
304         returns the "best" match code
305
306         Currently "best" match code is ignoring the possibility of in-dels
307         """
308         chromo = match_string
309         fasta = self.genome_map.get(chromo, chromo)
310         assert fasta is not None
311         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
312
313         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
314         if len(mismatch_bases) == 0:
315             return 'U0'
316         elif len(mismatch_bases) == 1:
317             return 'U1'
318         else:
319             return 'U2'
320
321
322     def _get_mapped_reads(self):
323         if self._mapped_reads is None:
324             self._update()
325         return self._mapped_reads
326     mapped_reads = property(_get_mapped_reads)
327
328     def _get_match_codes(self):
329         if self._match_codes is None:
330             self._update()
331         return self._match_codes
332     match_codes = property(_get_match_codes)
333
334     def _get_no_match(self):
335         if self._mapped_reads is None:
336             self._update()
337         return self._match_codes['NM']
338     no_match = property(_get_no_match,
339                         doc="total reads that didn't match the target genome.")
340
341     def _get_no_match_percent(self):
342         return float(self.no_match)/self.reads * 100
343     no_match_percent = property(_get_no_match_percent,
344                                 doc="no match reads as percent of total")
345
346     def _get_qc_failed(self):
347         if self._mapped_reads is None:
348             self._update()
349         return self._match_codes['QC']
350     qc_failed = property(_get_qc_failed,
351                         doc="total reads that didn't match the target genome.")
352
353     def _get_qc_failed_percent(self):
354         return float(self.qc_failed)/self.reads * 100
355     qc_failed_percent = property(_get_qc_failed_percent,
356                                  doc="QC failed reads as percent of total")
357
358     def _get_unique_reads(self):
359         if self._mapped_reads is None:
360            self._update()
361         sum = 0
362         for code in ['U0','U1','U2']:
363             sum += self._match_codes[code]
364         return sum
365     unique_reads = property(_get_unique_reads,
366                             doc="total unique reads")
367
368     def _get_repeat_reads(self):
369         if self._mapped_reads is None:
370            self._update()
371         sum = 0
372         for code in ['R0','R1','R2']:
373             sum += self._match_codes[code]
374         return sum
375     repeat_reads = property(_get_repeat_reads,
376                             doc="total repeat reads")
377
378     def get_elements(self):
379         lane = ElementTree.Element(ElandLane.LANE,
380                                    {'version':
381                                     str(ElandLane.XML_VERSION)})
382         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
383         sample_tag.text = self.sample_name
384         lane_tag = ElementTree.SubElement(lane, LANE_ID)
385         lane_tag.text = str(self.lane_id)
386         if self.end is not None:
387             end_tag = ElementTree.SubElement(lane, END)
388             end_tag.text = str(self.end)
389         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
390         for k, v in self.genome_map.items():
391             item = ElementTree.SubElement(
392                 genome_map, GENOME_ITEM,
393                 {'name':k, 'value':str(v)})
394         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
395         for k, v in self.mapped_reads.items():
396             item = ElementTree.SubElement(
397                 mapped_reads, MAPPED_ITEM,
398                 {'name':k, 'value':str(v)})
399         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
400         for k, v in self.match_codes.items():
401             item = ElementTree.SubElement(
402                 match_codes, MATCH_ITEM,
403                 {'name':k, 'value':str(v)})
404         reads = ElementTree.SubElement(lane, READS)
405         reads.text = str(self.reads)
406
407         return lane
408
409     def set_elements(self, tree):
410         if tree.tag != ElandLane.LANE:
411             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
412
413         # reset dictionaries
414         self._mapped_reads = {}
415         self._match_codes = {}
416
417         for element in tree:
418             tag = element.tag.lower()
419             if tag == SAMPLE_NAME.lower():
420                 self.sample_name = element.text
421             elif tag == LANE_ID.lower():
422                 self.lane_id = int(element.text)
423             elif tag == END.lower():
424                 self.end = int(element.text)
425             elif tag == GENOME_MAP.lower():
426                 for child in element:
427                     name = child.attrib['name']
428                     value = child.attrib['value']
429                     self.genome_map[name] = value
430             elif tag == MAPPED_READS.lower():
431                 for child in element:
432                     name = child.attrib['name']
433                     value = child.attrib['value']
434                     self._mapped_reads[name] = int(value)
435             elif tag == MATCH_CODES.lower():
436                 for child in element:
437                     name = child.attrib['name']
438                     value = int(child.attrib['value'])
439                     self._match_codes[name] = value
440             elif tag == READS.lower():
441                 self._reads = int(element.text)
442             else:
443                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
444
445
446 class MatchCodes(collections.MutableMapping):
447     """Mapping to hold match counts -
448     supports combining two match count sets together
449     """
450     def __init__(self, initializer=None):
451         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
452                             'U0':0, 'U1':0, 'U2':0,
453                             'R0':0, 'R1':0, 'R2':0,
454                             }
455
456         if initializer is not None:
457             if not isinstance(initializer, collections.Mapping):
458                 raise ValueError("Expected dictionary like class")
459             for key in initializer:
460                 if key not in self.match_codes:
461                     errmsg = "Initializer can only contain: %s"
462                     raise ValueError(errmsg % (",".join(self.match_codes.keys())))
463                 self.match_codes[key] += initializer[key]
464
465     def __iter__(self):
466         return iter(self.match_codes)
467
468     def __delitem__(self, key):
469         raise RuntimeError("delete not allowed")
470
471     def __getitem__(self, key):
472         return self.match_codes[key]
473
474     def __setitem__(self, key, value):
475         if key not in self.match_codes:
476             errmsg = "Unrecognized key, allowed values are: %s"
477             raise ValueError(errmsg % (",".join(self.match_codes.keys())))
478         self.match_codes[key] = value
479
480     def __len__(self):
481         return len(self.match_codes)
482
483     def __add__(self, other):
484         if not isinstance(other, MatchCodes):
485             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
486
487         newobj = MatchCodes(self)
488         for key, value in other.items():
489             newobj[key] = self.get(key, 0) + other[key]
490
491         return newobj
492
493
494 class MappedReads(collections.MutableMapping):
495     """Mapping to hold mapped reads -
496     supports combining two mapped read sets together
497     """
498     def __init__(self, initializer=None):
499         self.mapped_reads = {}
500
501         if initializer is not None:
502             if not isinstance(initializer, collections.Mapping):
503                 raise ValueError("Expected dictionary like class")
504             for key in initializer:
505                 self[key] = self.setdefault(key, 0) + initializer[key]
506
507     def __iter__(self):
508         return iter(self.mapped_reads)
509
510     def __delitem__(self, key):
511         del self.mapped_reads[key]
512
513     def __getitem__(self, key):
514         return self.mapped_reads[key]
515
516     def __setitem__(self, key, value):
517         self.mapped_reads[key] = value
518
519     def __len__(self):
520         return len(self.mapped_reads)
521
522     def __add__(self, other):
523         if not isinstance(other, MappedReads):
524             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
525
526         newobj = MappedReads(self)
527         for key in other:
528             newobj[key] = self.get(key, 0) + other[key]
529
530         return newobj
531
532 class SequenceLane(ResultLane):
533     XML_VERSION=1
534     LANE = 'SequenceLane'
535     SEQUENCE_TYPE = 'SequenceType'
536
537     NONE_TYPE = None
538     SCARF_TYPE = 1
539     FASTQ_TYPE = 2
540     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
541
542     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
543                  xml=None):
544         self.sequence_type = None
545         super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
546
547     def _guess_sequence_type(self, pathname):
548         """
549         Determine if we have a scarf or fastq sequence file
550         """
551         f = open(pathname,'rt')
552         l = f.readline()
553         f.close()
554
555         if l[0] == '@':
556             # fastq starts with a @
557             self.sequence_type = SequenceLane.FASTQ_TYPE
558         else:
559             self.sequence_type = SequenceLane.SCARF_TYPE
560         return self.sequence_type
561
562     def _update(self):
563         """
564         Actually read the file and actually count the reads
565         """
566         # can't do anything if we don't have a file to process
567         if self.pathnames is None:
568             return
569
570         pathname = self.pathnames[-1]
571         if os.stat(pathname)[stat.ST_SIZE] == 0:
572             raise RuntimeError("Sequencing isn't done, try again later.")
573
574         self._guess_sequence_type(pathname)
575
576         LOGGER.info("summarizing results for %s" % (pathname))
577         lines = 0
578         f = open(pathname, 'rt')
579         for l in f:
580             lines += 1
581         f.close()
582
583         if self.sequence_type == SequenceLane.SCARF_TYPE:
584             self._reads = lines
585         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
586             self._reads = lines / 4
587         else:
588             errmsg = "This only supports scarf or fastq squence files"
589             raise NotImplementedError(errmsg)
590
591     def get_elements(self):
592         lane = ElementTree.Element(SequenceLane.LANE,
593                                    {'version':
594                                     str(SequenceLane.XML_VERSION)})
595         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
596         sample_tag.text = self.sample_name
597         lane_tag = ElementTree.SubElement(lane, LANE_ID)
598         lane_tag.text = str(self.lane_id)
599         if self.end is not None:
600             end_tag = ElementTree.SubElement(lane, END)
601             end_tag.text = str(self.end)
602         reads = ElementTree.SubElement(lane, READS)
603         reads.text = str(self.reads)
604         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
605         sequence_type.text = str(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
606
607         return lane
608
609     def set_elements(self, tree):
610         if tree.tag != SequenceLane.LANE:
611             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
612         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
613
614         for element in tree:
615             tag = element.tag.lower()
616             if tag == SAMPLE_NAME.lower():
617                 self.sample_name = element.text
618             elif tag == LANE_ID.lower():
619                 self.lane_id = int(element.text)
620             elif tag == END.lower():
621                 self.end = int(element.text)
622             elif tag == READS.lower():
623                 self._reads = int(element.text)
624             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
625                 self.sequence_type = lookup_sequence_type.get(element.text, None)
626             else:
627                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
628
629 class ELAND(collections.MutableMapping):
630     """
631     Summarize information from eland files
632     """
633     XML_VERSION = 3
634
635     ELAND = 'ElandCollection'
636     LANE = 'Lane'
637     LANE_ID = 'id'
638     SAMPLE = 'sample'
639     END = 'end'
640
641     def __init__(self, xml=None):
642         # we need information from the gerald config.xml
643         self.results = {}
644
645         if xml is not None:
646             self.set_elements(xml)
647
648     def __getitem__(self, key):
649         if not isinstance(key, SampleKey):
650             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
651         return self.results[key]
652
653     def __setitem__(self, key, value):
654         if not isinstance(key, SampleKey):
655             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
656         self.results[key] = value
657
658     def __delitem__(self, key):
659         del self.result[key]
660
661     def __iter__(self):
662         for k in sorted(self.results):
663             yield k
664
665     def __len__(self):
666         return len(self.results)
667
668     def find_keys(self, search):
669         """Return results that match key"""
670         if not isinstance(search, SampleKey):
671             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
672         if not search.iswild:
673             yield self[search]
674         for key in self.keys():
675             if key.matches(search): yield key
676
677     def get_elements(self):
678         root = ElementTree.Element(ELAND.ELAND,
679                                    {'version': str(ELAND.XML_VERSION)})
680
681         for key in self:
682             eland_lane = self[key].get_elements()
683             eland_lane.attrib[ELAND.END] = str(self[key].end-1)
684             eland_lane.attrib[ELAND.LANE_ID] = str(self[key].lane_id)
685             eland_lane.attrib[ELAND.SAMPLE] = str(self[key].sample_name)
686             root.append(eland_lane)
687         return root
688         return root
689
690     def set_elements(self, tree):
691         if tree.tag.lower() != ELAND.ELAND.lower():
692             raise ValueError('Expecting %s', ELAND.ELAND)
693         for element in list(tree):
694             lane_id = int(element.attrib[ELAND.LANE_ID])
695             end = int(element.attrib.get(ELAND.END, 0))
696             sample = element.attrib.get(ELAND.SAMPLE, 's')
697             if element.tag.lower() == ElandLane.LANE.lower():
698                 lane = ElandLane(xml=element)
699             elif element.tag.lower() == SequenceLane.LANE.lower():
700                 lane = SequenceLane(xml=element)
701
702             key = SampleKey(lane=lane_id, read=end+1, sample=sample)
703             self.results[key] = lane
704
705
706     def update_result_with_eland(self, gerald, key, pathnames,
707                                  genome_maps):
708         # yes the lane_id is also being computed in ElandLane._update
709         # I didn't want to clutter up my constructor
710         # but I needed to persist the sample_name/lane_id for
711         # runfolder summary_report
712         names = [ os.path.split(p)[1] for p in pathnames]
713         LOGGER.info("Adding eland files %s" %(",".join(names),))
714         basedir = os.path.split(pathnames[0])[0]
715         gs_template = "{0}_*_L{1:03}_genomesize.xml"
716         genomesize = glob(
717             os.path.join(basedir,
718                          gs_template.format(key.sample, key.lane)))
719
720
721         genome_map = GenomeMap()
722         if genome_maps is not None:
723             genome_map = GenomeMap(genome_maps[key.lane])
724         elif len(genomesize) > 0:
725             LOGGER.info("Found {0}".format(genomesize))
726             genome_map.parse_genomesize(genomesize[0])
727         elif gerald is not None:
728             genome_dir = gerald.lanes[key].eland_genome
729             if genome_dir is not None:
730                 genome_map.scan_genome_dir(genome_dir)
731
732         lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
733
734         self.results[key] = lane
735
736     def update_result_with_sequence(self, gerald, key, pathnames,
737                                     genome_maps=None):
738         self.results[key] = SequenceLane(pathnames,
739                                          key.sample, key.lane, key.read)
740
741
742 def eland(gerald_dir, gerald=None, genome_maps=None):
743     e = ELAND()
744     eland_files = ElandMatches(e)
745     # collect
746     for path, dirnames, filenames in os.walk(gerald_dir):
747         for filename in filenames:
748             pathname = os.path.abspath(os.path.join(path, filename))
749             eland_files.add(pathname)
750     for key in eland_files:
751         eland_files.count(key, gerald, genome_maps)
752     return e
753
754
755 class ElandMatches(collections.MutableMapping):
756     def __init__(self, eland_container):
757         # the order in patterns determines the preference for what
758         # will be found.
759         self.eland_container = eland_container
760         MAPPED = eland_container.update_result_with_eland
761         SEQUENCE = eland_container.update_result_with_sequence
762
763         sample = '(?P<sample>[^_]+)'
764         hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
765         hiLane = '_L(?P<lane>[\d]+)'
766         gaLane = '_(?P<lane>[\d]+)'
767         hiRead = '_R(?P<read>[\d]+)'
768         gaRead = '(_(?P<read>[\d])+)?'
769         part = '_(?P<part>[\d]+)'
770         ext = '(?P<extention>(\.bz2|\.gz)?)'
771
772         hiPrefix = sample + hiIndex + hiLane + hiRead + part
773         gaPrefix = sample + gaLane + gaRead
774         P = collections.namedtuple('Patterns', 'pattern counter priority')
775         self.patterns = [
776             P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
777             P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
778             P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
779             P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
780             P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
781             P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
782             ]
783         self.file_sets = {}
784         self.file_priority = {}
785         self.file_counter = {}
786
787     def add(self, pathname):
788         """Add pathname to our set of files
789         """
790         path, filename = os.path.split(pathname)
791
792         for pattern, counter, priority in self.patterns:
793             rematch = re.match(pattern, filename)
794             if rematch is not None:
795                 m = ElandMatch(pathname, counter, **rematch.groupdict())
796                 key = m.make_samplekey()
797                 old_priority = self.file_priority.get(key, 0)
798                 if priority > old_priority:
799                     self.file_sets[key] = set((m,))
800                     self.file_counter[key] = counter
801                     self.file_priority[key] = priority
802                 elif priority == old_priority:
803                     self.file_sets[key].add(m)
804
805     def count(self, key, gerald=None, genome_maps=None):
806         #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
807         counter = self.file_counter[key]
808         file_set = self.file_sets[key]
809         filenames = [ f.filename for f in file_set ]
810         return counter(gerald, key,
811                        filenames, genome_maps)
812
813     def __iter__(self):
814         return iter(self.file_sets)
815
816     def __len__(self):
817         return len(self.file_sets)
818
819     def __getitem__(self, key):
820         return self.file_sets[key]
821
822     def __setitem__(self, key, value):
823         if not isintance(value, set):
824             raise ValueError("Expected set for value")
825         self.file_sets[key] = value
826
827     def __delitem__(self, key):
828         del self.file_sets[key]
829
830 class ElandMatch(object):
831     def __init__(self, pathname, counter,
832                  lane=None, read=None, extension=None,
833                  sample=None, index=None, part=None, **kwargs):
834         self.filename = pathname
835         self.counter = counter
836         self._lane = lane
837         self._read = read
838         self.extension = extension
839         self.sample = sample
840         self.index = index
841         self._part = part
842         LOGGER.info("Found %s: L%s R%s Samp%s" % (
843             self.filename, self._lane, self._read, self.sample))
844
845     def make_samplekey(self):
846         read = self._read if self._read is not None else 1
847         return SampleKey(lane=self.lane, read=read, sample=self.sample)
848
849     def _get_lane(self):
850         if self._lane is not None:
851             return int(self._lane)
852         return self._lane
853     lane = property(_get_lane)
854
855     def _get_read(self):
856         if self._read is not None:
857             return int(self._read)
858         return self._read
859     read = property(_get_read)
860
861     def _get_part(self):
862         if self._part is not None:
863             return int(self._part)
864         return self._part
865     part = property(_get_part)
866
867     def __repr__(self):
868         name = []
869         if self.sample is not None: name.append(self.sample)
870         if self._lane is not None: name.append('L%s' % (self.lane,))
871         if self._read is not None: name.append('R%s' % (self.read,))
872         if self._part is not None: name.append('P%s' % (self.part,))
873         return '<ElandMatch(' + "_".join(name) + ')>'
874
875
876 def extract_eland_sequence(instream, outstream, start, end):
877     """
878     Extract a chunk of sequence out of an eland file
879     """
880     for line in instream:
881         record = line.split()
882         if len(record) > 1:
883             result = [record[0], record[1][start:end]]
884         else:
885             result = [record[0][start:end]]
886         outstream.write("\t".join(result))
887         outstream.write(os.linesep)
888
889
890 def main(cmdline=None):
891     """Run eland extraction against the specified gerald directory"""
892     from optparse import OptionParser
893     parser = OptionParser("%prog: <gerald dir>+")
894     opts, args = parser.parse_args(cmdline)
895     logging.basicConfig(level=logging.DEBUG)
896     for a in args:
897         LOGGER.info("Starting scan of %s" % (a,))
898         e = eland(a)
899         print(ElementTree.tostring(e.get_elements()))
900     return
901
902
903 if __name__ == "__main__":
904     main(sys.argv[1:])