Use sample keys when looking up lane parameters.
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 import collections
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11 import types
12
13 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.util.ethelp import indent, flatten
16 from htsworkflow.util.opener import autoopen
17
18 LOGGER = logging.getLogger(__name__)
19
20 SAMPLE_NAME = 'SampleName'
21 LANE_ID = 'LaneID'
22 END = 'End'
23 READS = 'Reads'
24
25 GENOME_MAP = 'GenomeMap'
26 GENOME_ITEM = 'GenomeItem'
27 MAPPED_READS = 'MappedReads'
28 MAPPED_ITEM = 'MappedItem'
29 MATCH_CODES = 'MatchCodes'
30 MATCH_ITEM = 'Code'
31 READS = 'Reads'
32
33 ELAND_SINGLE = 0
34 ELAND_MULTI = 1
35 ELAND_EXTENDED = 2
36 ELAND_EXPORT = 3
37
38 class ResultLane(object):
39     """
40     Base class for result lanes
41     """
42     XML_VERSION = 2
43     LANE = 'ResultLane'
44
45     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
46                  xml=None):
47         self.pathnames = pathnames
48         self.sample_name = sample
49         self.lane_id = lane_id
50         self.end = end
51         self._reads = None
52
53         if xml is not None:
54             self.set_elements(xml)
55
56     def _update(self):
57         """
58         Actually read the file and actually count the reads
59         """
60         pass
61
62     def _get_reads(self):
63         if self._reads is None:
64             self._update()
65         return self._reads
66     reads = property(_get_reads)
67
68     def get_elements(self):
69         return None
70
71     def __repr__(self):
72         name = []
73
74         name.append('L%s' % (self.lane_id,))
75         name.append('R%s' % (self.end,))
76         name.append('S%s' % (self.sample_name,))
77
78         return '<ResultLane(' + ",".join(name) + ')>'
79
80 class ElandLane(ResultLane):
81     """
82     Process an eland result file
83     """
84     XML_VERSION = 2
85     LANE = "ElandLane"
86     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
87     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
88     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
89     SCORE_UNRECOGNIZED = 0
90     SCORE_QC = 1
91     SCORE_READ = 2
92
93     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
94                  genome_map=None, eland_type=None, xml=None):
95         super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
96
97         self._mapped_reads = None
98         self._match_codes = None
99         self._reads = None
100         if genome_map is None:
101             genome_map = {}
102         self.genome_map = genome_map
103         self.eland_type = None
104
105         if xml is not None:
106             self.set_elements(xml)
107
108     def __repr__(self):
109         name = []
110
111         name.append('L%s' % (self.lane_id,))
112         name.append('R%s' % (self.end,))
113         name.append('S%s' % (self.sample_name,))
114
115         reads = str(self._reads) if self._reads is not None else 'Uncounted'
116         return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
117
118     def _guess_eland_type(self, pathname):
119         if self.eland_type is None:
120           # attempt autodetect eland file type
121           pathn, name = os.path.split(pathname)
122           if re.search('result', name):
123             self.eland_type = ELAND_SINGLE
124           elif re.search('multi', name):
125             self.eland_type = ELAND_MULTI
126           elif re.search('extended', name):
127             self.eland_type = ELAND_EXTENDED
128           elif re.search('export', name):
129             self.eland_type = ELAND_EXPORT
130           else:
131             self.eland_type = ELAND_SINGLE
132
133     def _update(self):
134         """
135         Actually read the file and actually count the reads
136         """
137         # can't do anything if we don't have a file to process
138         if self.pathnames is None or len(self.pathnames) == 0:
139             return
140         pathname = self.pathnames[-1]
141         self._guess_eland_type(pathname)
142
143         if os.stat(pathname)[stat.ST_SIZE] == 0:
144             raise RuntimeError("Eland isn't done, try again later.")
145
146         LOGGER.debug("summarizing results for %s" % (pathname))
147         self._match_codes = MatchCodes()
148         self._mapped_reads = MappedReads()
149         self._reads = 0
150
151         for pathname in self.pathnames:
152             stream = autoopen(pathname, 'r')
153             if self.eland_type == ELAND_SINGLE:
154                 result = self._update_eland_result(stream)
155             elif self.eland_type == ELAND_MULTI or \
156                  self.eland_type == ELAND_EXTENDED:
157                 result = self._update_eland_multi(stream)
158             elif self.eland_type == ELAND_EXPORT:
159                 result = self._update_eland_export(stream)
160             else:
161                 errmsg = "Only support single/multi/extended eland files"
162                 raise NotImplementedError(errmsg)
163             stream.close()
164
165             match, mapped, reads = result
166             self._match_codes += match
167             self._mapped_reads += mapped
168             self._reads += reads
169
170     def _update_eland_result(self, instream):
171         reads = 0
172         mapped_reads = MappedReads()
173         match_codes = MatchCodes()
174
175         for line in instream:
176             reads += 1
177             fields = line.split()
178             # code = fields[2]
179             # match_codes[code] = match_codes.setdefault(code, 0) + 1
180             # the QC/NM etc codes are in the 3rd field and always present
181             match_codes[fields[2]] += 1
182             # ignore lines that don't have a fasta filename
183             if len(fields) < 7:
184                 continue
185             fasta = self.genome_map.get(fields[6], fields[6])
186             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
187         return match_codes, mapped_reads, reads
188
189     def _update_eland_multi(self, instream):
190         """Summarize an eland_extend."""
191         MATCH_INDEX = 2
192         LOCATION_INDEX = 3
193         reads = 0
194         mapped_reads = MappedReads()
195         match_codes = MatchCodes()
196
197         for line in instream:
198             reads += 1
199             fields = line.split()
200             # fields[2] = QC/NM/or number of matches
201             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
202                                                        match_codes)
203             if score_type == ElandLane.SCORE_READ:
204                 # when there are too many hits, eland  writes a - where
205                 # it would have put the list of hits.
206                 # or in a different version of eland, it just leaves
207                 # that column blank, and only outputs 3 fields.
208                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
209                   continue
210
211                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
212
213         return match_codes, mapped_reads, reads
214
215     def _update_eland_export(self, instream):
216         """Summarize a gerald export file."""
217         MATCH_INDEX = 10
218         LOCATION_INDEX = 10
219         DESCRIPTOR_INDEX= 13
220         reads = 0
221         mapped_reads = MappedReads()
222         match_codes = MatchCodes()
223
224         for line in instream:
225             reads += 1
226             fields = line.split()
227             # fields[2] = QC/NM/or number of matches
228             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
229                                                        match_codes)
230             if score_type == ElandLane.SCORE_UNRECOGNIZED:
231                 # export files have three states for the match field
232                 # QC code, count of multi-reads, or a single
233                 # read location. The score_mapped_mismatches function
234                 # only understands the first two types.
235                 # if we get unrecognized, that implies the field is probably
236                 # a location.
237                 code = self._count_mapped_export(mapped_reads,
238                                                  fields[LOCATION_INDEX],
239                                                  fields[DESCRIPTOR_INDEX])
240                 match_codes[code] += 1
241
242         return match_codes, mapped_reads, reads
243
244
245     def _score_mapped_mismatches(self, match, match_codes):
246         """Update match_codes with eland map counts, or failure code.
247
248         Returns True if the read mapped, false if it was an error code.
249         """
250         groups = ElandLane.MATCH_COUNTS_RE.match(match)
251         if groups is None:
252             # match is not of the form [\d]+:[\d]+:[\d]+
253             if match in match_codes:
254                 # match is one quality control codes QC/NM etc
255                 match_codes[match] += 1
256                 return ElandLane.SCORE_QC
257             else:
258                 return ElandLane.SCORE_UNRECOGNIZED
259         else:
260             # match is of the form [\d]+:[\d]+:[\d]+
261             # AKA Multiread
262             zero_mismatches = int(groups.group(1))
263             one_mismatches = int(groups.group(2))
264             two_mismatches = int(groups.group(3))
265
266             if zero_mismatches == 1:
267                 match_codes['U0'] += 1
268             elif zero_mismatches < 255:
269                 match_codes['R0'] += zero_mismatches
270
271             if one_mismatches == 1:
272                 match_codes['U1'] += 1
273             elif one_mismatches < 255:
274                 match_codes['R1'] += one_mismatches
275
276             if two_mismatches == 1:
277                 match_codes['U2'] += 1
278             elif two_mismatches < 255:
279                 match_codes['R2'] += two_mismatches
280
281             return ElandLane.SCORE_READ
282
283
284     def _count_mapped_multireads(self, mapped_reads, match_string):
285         chromo = None
286         for match in match_string.split(','):
287             match_fragment = match.split(':')
288             if len(match_fragment) == 2:
289                 chromo = match_fragment[0]
290                 pos = match_fragment[1]
291
292             fasta = self.genome_map.get(chromo, chromo)
293             assert fasta is not None
294             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
295
296
297     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
298         """Count a read as defined in an export file
299
300         match_string contains the chromosome
301         descriptor contains the an ecoding of bases that match, mismatch,
302                    and have indels.
303         returns the "best" match code
304
305         Currently "best" match code is ignoring the possibility of in-dels
306         """
307         chromo = match_string
308         fasta = self.genome_map.get(chromo, chromo)
309         assert fasta is not None
310         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
311
312         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
313         if len(mismatch_bases) == 0:
314             return 'U0'
315         elif len(mismatch_bases) == 1:
316             return 'U1'
317         else:
318             return 'U2'
319
320
321     def _get_mapped_reads(self):
322         if self._mapped_reads is None:
323             self._update()
324         return self._mapped_reads
325     mapped_reads = property(_get_mapped_reads)
326
327     def _get_match_codes(self):
328         if self._match_codes is None:
329             self._update()
330         return self._match_codes
331     match_codes = property(_get_match_codes)
332
333     def _get_no_match(self):
334         if self._mapped_reads is None:
335             self._update()
336         return self._match_codes['NM']
337     no_match = property(_get_no_match,
338                         doc="total reads that didn't match the target genome.")
339
340     def _get_no_match_percent(self):
341         return float(self.no_match)/self.reads * 100
342     no_match_percent = property(_get_no_match_percent,
343                                 doc="no match reads as percent of total")
344
345     def _get_qc_failed(self):
346         if self._mapped_reads is None:
347             self._update()
348         return self._match_codes['QC']
349     qc_failed = property(_get_qc_failed,
350                         doc="total reads that didn't match the target genome.")
351
352     def _get_qc_failed_percent(self):
353         return float(self.qc_failed)/self.reads * 100
354     qc_failed_percent = property(_get_qc_failed_percent,
355                                  doc="QC failed reads as percent of total")
356
357     def _get_unique_reads(self):
358         if self._mapped_reads is None:
359            self._update()
360         sum = 0
361         for code in ['U0','U1','U2']:
362             sum += self._match_codes[code]
363         return sum
364     unique_reads = property(_get_unique_reads,
365                             doc="total unique reads")
366
367     def _get_repeat_reads(self):
368         if self._mapped_reads is None:
369            self._update()
370         sum = 0
371         for code in ['R0','R1','R2']:
372             sum += self._match_codes[code]
373         return sum
374     repeat_reads = property(_get_repeat_reads,
375                             doc="total repeat reads")
376
377     def get_elements(self):
378         lane = ElementTree.Element(ElandLane.LANE,
379                                    {'version':
380                                     unicode(ElandLane.XML_VERSION)})
381         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
382         sample_tag.text = self.sample_name
383         lane_tag = ElementTree.SubElement(lane, LANE_ID)
384         lane_tag.text = str(self.lane_id)
385         if self.end is not None:
386             end_tag = ElementTree.SubElement(lane, END)
387             end_tag.text = str(self.end)
388         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
389         for k, v in self.genome_map.items():
390             item = ElementTree.SubElement(
391                 genome_map, GENOME_ITEM,
392                 {'name':k, 'value':unicode(v)})
393         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
394         for k, v in self.mapped_reads.items():
395             item = ElementTree.SubElement(
396                 mapped_reads, MAPPED_ITEM,
397                 {'name':k, 'value':unicode(v)})
398         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
399         for k, v in self.match_codes.items():
400             item = ElementTree.SubElement(
401                 match_codes, MATCH_ITEM,
402                 {'name':k, 'value':unicode(v)})
403         reads = ElementTree.SubElement(lane, READS)
404         reads.text = unicode(self.reads)
405
406         return lane
407
408     def set_elements(self, tree):
409         if tree.tag != ElandLane.LANE:
410             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
411
412         # reset dictionaries
413         self._mapped_reads = {}
414         self._match_codes = {}
415
416         for element in tree:
417             tag = element.tag.lower()
418             if tag == SAMPLE_NAME.lower():
419                 self.sample_name = element.text
420             elif tag == LANE_ID.lower():
421                 self.lane_id = int(element.text)
422             elif tag == END.lower():
423                 self.end = int(element.text)
424             elif tag == GENOME_MAP.lower():
425                 for child in element:
426                     name = child.attrib['name']
427                     value = child.attrib['value']
428                     self.genome_map[name] = value
429             elif tag == MAPPED_READS.lower():
430                 for child in element:
431                     name = child.attrib['name']
432                     value = child.attrib['value']
433                     self._mapped_reads[name] = int(value)
434             elif tag == MATCH_CODES.lower():
435                 for child in element:
436                     name = child.attrib['name']
437                     value = int(child.attrib['value'])
438                     self._match_codes[name] = value
439             elif tag == READS.lower():
440                 self._reads = int(element.text)
441             else:
442                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
443
444
445 class MatchCodes(collections.MutableMapping):
446     """Mapping to hold match counts -
447     supports combining two match count sets together
448     """
449     def __init__(self, initializer=None):
450         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
451                             'U0':0, 'U1':0, 'U2':0,
452                             'R0':0, 'R1':0, 'R2':0,
453                             }
454
455         if initializer is not None:
456             if not isinstance(initializer, collections.Mapping):
457                 raise ValueError("Expected dictionary like class")
458             for key in initializer:
459                 if key not in self.match_codes:
460                     errmsg = "Initializer can only contain: %s"
461                     raise ValueError(errmsg % (",".join(self.match_codes.keys())))
462                 self.match_codes[key] += initializer[key]
463
464     def __iter__(self):
465         return iter(self.match_codes)
466
467     def __delitem__(self, key):
468         raise RuntimeError("delete not allowed")
469
470     def __getitem__(self, key):
471         return self.match_codes[key]
472
473     def __setitem__(self, key, value):
474         if key not in self.match_codes:
475             errmsg = "Unrecognized key, allowed values are: %s"
476             raise ValueError(errmsg % (",".join(self.match_codes.keys())))
477         self.match_codes[key] = value
478
479     def __len__(self):
480         return len(self.match_codes)
481
482     def __add__(self, other):
483         if not isinstance(other, MatchCodes):
484             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
485
486         newobj = MatchCodes(self)
487         for key, value in other.items():
488             newobj[key] = self.get(key, 0) + other[key]
489
490         return newobj
491
492
493 class MappedReads(collections.MutableMapping):
494     """Mapping to hold mapped reads -
495     supports combining two mapped read sets together
496     """
497     def __init__(self, initializer=None):
498         self.mapped_reads = {}
499
500         if initializer is not None:
501             if not isinstance(initializer, collections.Mapping):
502                 raise ValueError("Expected dictionary like class")
503             for key in initializer:
504                 self[key] = self.setdefault(key, 0) + initializer[key]
505
506     def __iter__(self):
507         return iter(self.mapped_reads)
508
509     def __delitem__(self, key):
510         del self.mapped_reads[key]
511
512     def __getitem__(self, key):
513         return self.mapped_reads[key]
514
515     def __setitem__(self, key, value):
516         self.mapped_reads[key] = value
517
518     def __len__(self):
519         return len(self.mapped_reads)
520
521     def __add__(self, other):
522         if not isinstance(other, MappedReads):
523             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
524
525         newobj = MappedReads(self)
526         for key in other:
527             newobj[key] = self.get(key, 0) + other[key]
528
529         return newobj
530
531 class SequenceLane(ResultLane):
532     XML_VERSION=1
533     LANE = 'SequenceLane'
534     SEQUENCE_TYPE = 'SequenceType'
535
536     NONE_TYPE = None
537     SCARF_TYPE = 1
538     FASTQ_TYPE = 2
539     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
540
541     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
542                  xml=None):
543         self.sequence_type = None
544         super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
545
546     def _guess_sequence_type(self, pathname):
547         """
548         Determine if we have a scarf or fastq sequence file
549         """
550         f = open(pathname,'r')
551         l = f.readline()
552         f.close()
553
554         if l[0] == '@':
555             # fastq starts with a @
556             self.sequence_type = SequenceLane.FASTQ_TYPE
557         else:
558             self.sequence_type = SequenceLane.SCARF_TYPE
559         return self.sequence_type
560
561     def _update(self):
562         """
563         Actually read the file and actually count the reads
564         """
565         # can't do anything if we don't have a file to process
566         if self.pathnames is None:
567             return
568
569         pathname = self.pathnames[-1]
570         if os.stat(pathname)[stat.ST_SIZE] == 0:
571             raise RuntimeError("Sequencing isn't done, try again later.")
572
573         self._guess_sequence_type(pathname)
574
575         LOGGER.info("summarizing results for %s" % (pathname))
576         lines = 0
577         f = open(pathname)
578         for l in f.xreadlines():
579             lines += 1
580         f.close()
581
582         if self.sequence_type == SequenceLane.SCARF_TYPE:
583             self._reads = lines
584         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
585             self._reads = lines / 4
586         else:
587             errmsg = "This only supports scarf or fastq squence files"
588             raise NotImplementedError(errmsg)
589
590     def get_elements(self):
591         lane = ElementTree.Element(SequenceLane.LANE,
592                                    {'version':
593                                     unicode(SequenceLane.XML_VERSION)})
594         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
595         sample_tag.text = self.sample_name
596         lane_tag = ElementTree.SubElement(lane, LANE_ID)
597         lane_tag.text = str(self.lane_id)
598         if self.end is not None:
599             end_tag = ElementTree.SubElement(lane, END)
600             end_tag.text = str(self.end)
601         reads = ElementTree.SubElement(lane, READS)
602         reads.text = unicode(self.reads)
603         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
604         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
605
606         return lane
607
608     def set_elements(self, tree):
609         if tree.tag != SequenceLane.LANE:
610             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
611         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
612
613         for element in tree:
614             tag = element.tag.lower()
615             if tag == SAMPLE_NAME.lower():
616                 self.sample_name = element.text
617             elif tag == LANE_ID.lower():
618                 self.lane_id = int(element.text)
619             elif tag == END.lower():
620                 self.end = int(element.text)
621             elif tag == READS.lower():
622                 self._reads = int(element.text)
623             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
624                 self.sequence_type = lookup_sequence_type.get(element.text, None)
625             else:
626                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
627
628 class ELAND(collections.MutableMapping):
629     """
630     Summarize information from eland files
631     """
632     XML_VERSION = 3
633
634     ELAND = 'ElandCollection'
635     LANE = 'Lane'
636     LANE_ID = 'id'
637     SAMPLE = 'sample'
638     END = 'end'
639
640     def __init__(self, xml=None):
641         # we need information from the gerald config.xml
642         self.results = {}
643
644         if xml is not None:
645             self.set_elements(xml)
646
647     def __getitem__(self, key):
648         if not isinstance(key, SampleKey):
649             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
650         return self.results[key]
651
652     def __setitem__(self, key, value):
653         if not isinstance(key, SampleKey):
654             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
655         self.results[key] = value
656
657     def __delitem__(self, key):
658         del self.result[key]
659
660     def __iter__(self):
661         keys = self.results.iterkeys()
662         for k in sorted(keys):
663             yield k
664
665     def __len__(self):
666         return len(self.results)
667
668     def find_keys(self, search):
669         """Return results that match key"""
670         if not isinstance(search, SampleKey):
671             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
672         if not search.iswild:
673             yield self[search]
674         for key in self.keys():
675             if key.matches(search): yield key
676
677     def get_elements(self):
678         root = ElementTree.Element(ELAND.ELAND,
679                                    {'version': unicode(ELAND.XML_VERSION)})
680
681         for key in self:
682             eland_lane = self[key].get_elements()
683             eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
684             eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
685             eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
686             root.append(eland_lane)
687         return root
688         return root
689
690     def set_elements(self, tree):
691         if tree.tag.lower() != ELAND.ELAND.lower():
692             raise ValueError('Expecting %s', ELAND.ELAND)
693         for element in list(tree):
694             lane_id = int(element.attrib[ELAND.LANE_ID])
695             end = int(element.attrib.get(ELAND.END, 0))
696             sample = element.attrib.get(ELAND.SAMPLE, 's')
697             if element.tag.lower() == ElandLane.LANE.lower():
698                 lane = ElandLane(xml=element)
699             elif element.tag.lower() == SequenceLane.LANE.lower():
700                 lane = SequenceLane(xml=element)
701
702             key = SampleKey(lane=lane_id, read=end+1, sample=sample)
703             self.results[key] = lane
704
705
706     def update_result_with_eland(self, gerald, key, pathnames,
707                                  genome_maps):
708         # yes the lane_id is also being computed in ElandLane._update
709         # I didn't want to clutter up my constructor
710         # but I needed to persist the sample_name/lane_id for
711         # runfolder summary_report
712         names = [ os.path.split(p)[1] for p in pathnames]
713         LOGGER.info("Adding eland files %s" %(",".join(names),))
714
715         genome_map = {}
716         if genome_maps is not None:
717             genome_map = genome_maps[key.lane]
718         elif gerald is not None:
719             genome_dir = gerald.lanes[key].eland_genome
720             if genome_dir is not None:
721                 genome_map = build_genome_fasta_map(genome_dir)
722
723         lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
724
725         self.results[key] = lane
726
727     def update_result_with_sequence(self, gerald, key, pathnames,
728                                     genome_maps=None):
729         self.results[key] = SequenceLane(pathnames,
730                                          key.sample, key.lane, key.read)
731
732
733 def eland(gerald_dir, gerald=None, genome_maps=None):
734     e = ELAND()
735     eland_files = ElandMatches(e)
736     # collect
737     for path, dirnames, filenames in os.walk(gerald_dir):
738         for filename in filenames:
739             pathname = os.path.abspath(os.path.join(path, filename))
740             eland_files.add(pathname)
741     for key in eland_files:
742         eland_files.count(key, gerald, genome_maps)
743     return e
744
745
746 class ElandMatches(collections.MutableMapping):
747     def __init__(self, eland_container):
748         # the order in patterns determines the preference for what
749         # will be found.
750         self.eland_container = eland_container
751         MAPPED = eland_container.update_result_with_eland
752         SEQUENCE = eland_container.update_result_with_sequence
753
754         sample = '(?P<sample>[^_]+)'
755         hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
756         hiLane = '_L(?P<lane>[\d]+)'
757         gaLane = '_(?P<lane>[\d]+)'
758         hiRead = '_R(?P<read>[\d]+)'
759         gaRead = '(_(?P<read>[\d])+)?'
760         part = '_(?P<part>[\d]+)'
761         ext = '(?P<extention>(\.bz2|\.gz)?)'
762
763         hiPrefix = sample + hiIndex + hiLane + hiRead + part
764         gaPrefix = sample + gaLane + gaRead
765         P = collections.namedtuple('Patterns', 'pattern counter priority')
766         self.patterns = [
767             P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
768             P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
769             P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
770             P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
771             P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
772             P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
773             ]
774         self.file_sets = {}
775         self.file_priority = {}
776         self.file_counter = {}
777
778     def add(self, pathname):
779         """Add pathname to our set of files
780         """
781         path, filename = os.path.split(pathname)
782
783         for pattern, counter, priority in self.patterns:
784             rematch = re.match(pattern, filename)
785             if rematch is not None:
786                 m = ElandMatch(pathname, counter, **rematch.groupdict())
787                 key = m.make_samplekey()
788                 old_priority = self.file_priority.get(key, 0)
789                 if priority > old_priority:
790                     self.file_sets[key] = set((m,))
791                     self.file_counter[key] = counter
792                     self.file_priority[key] = priority
793                 elif priority == old_priority:
794                     self.file_sets[key].add(m)
795
796     def count(self, key, gerald=None, genome_maps=None):
797         #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
798         counter = self.file_counter[key]
799         file_set = self.file_sets[key]
800         filenames = [ f.filename for f in file_set ]
801         return counter(gerald, key,
802                        filenames, genome_maps)
803
804     def __iter__(self):
805         return iter(self.file_sets)
806
807     def __len__(self):
808         return len(self.file_sets)
809
810     def __getitem__(self, key):
811         return self.file_sets[key]
812
813     def __setitem__(self, key, value):
814         if not isintance(value, set):
815             raise ValueError("Expected set for value")
816         self.file_sets[key] = value
817
818     def __delitem__(self, key):
819         del self.file_sets[key]
820
821 class ElandMatch(object):
822     def __init__(self, pathname, counter,
823                  lane=None, read=None, extension=None,
824                  sample=None, index=None, part=None, **kwargs):
825         self.filename = pathname
826         self.counter = counter
827         self._lane = lane
828         self._read = read
829         self.extension = extension
830         self.sample = sample
831         self.index = index
832         self._part = part
833         LOGGER.info("Found %s: L%s R%s Samp%s" % (
834             self.filename, self._lane, self._read, self.sample))
835
836     def make_samplekey(self):
837         read = self._read if self._read is not None else 1
838         return SampleKey(lane=self.lane, read=read, sample=self.sample)
839
840     def _get_lane(self):
841         if self._lane is not None:
842             return int(self._lane)
843         return self._lane
844     lane = property(_get_lane)
845
846     def _get_read(self):
847         if self._read is not None:
848             return int(self._read)
849         return self._read
850     read = property(_get_read)
851
852     def _get_part(self):
853         if self._part is not None:
854             return int(self._part)
855         return self._part
856     part = property(_get_part)
857
858     def __repr__(self):
859         name = []
860         if self.sample is not None: name.append(self.sample)
861         if self._lane is not None: name.append('L%s' % (self.lane,))
862         if self._read is not None: name.append('R%s' % (self.read,))
863         if self._part is not None: name.append('P%s' % (self.part,))
864         return '<ElandMatch(' + "_".join(name) + ')>'
865
866 def build_genome_fasta_map(genome_dir):
867     # build fasta to fasta file map
868     LOGGER.info("Building genome map")
869     genome = genome_dir.split(os.path.sep)[-1]
870     fasta_map = {}
871     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
872         is_link = False
873         if os.path.islink(vld_file):
874             is_link = True
875         vld_file = os.path.realpath(vld_file)
876         path, vld_name = os.path.split(vld_file)
877         name, ext = os.path.splitext(vld_name)
878         if is_link:
879             fasta_map[name] = name
880         else:
881             fasta_map[name] = os.path.join(genome, name)
882     return fasta_map
883
884
885 def extract_eland_sequence(instream, outstream, start, end):
886     """
887     Extract a chunk of sequence out of an eland file
888     """
889     for line in instream:
890         record = line.split()
891         if len(record) > 1:
892             result = [record[0], record[1][start:end]]
893         else:
894             result = [record[0][start:end]]
895         outstream.write("\t".join(result))
896         outstream.write(os.linesep)
897
898
899 def main(cmdline=None):
900     """Run eland extraction against the specified gerald directory"""
901     from optparse import OptionParser
902     parser = OptionParser("%prog: <gerald dir>+")
903     opts, args = parser.parse_args(cmdline)
904     logging.basicConfig(level=logging.DEBUG)
905     for a in args:
906         LOGGER.info("Starting scan of %s" % (a,))
907         e = eland(a)
908         print ElementTree.tostring(e.get_elements())
909     return
910
911
912 if __name__ == "__main__":
913     main(sys.argv[1:])