26e6f56b7d74e1d8ace710df473c896ed9e17c51
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 import collections
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11 import types
12
13 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.util.ethelp import indent, flatten
16 from htsworkflow.util.opener import autoopen
17
18 LOGGER = logging.getLogger(__name__)
19
20 SAMPLE_NAME = 'SampleName'
21 LANE_ID = 'LaneID'
22 END = 'End'
23 READS = 'Reads'
24
25 GENOME_MAP = 'GenomeMap'
26 GENOME_ITEM = 'GenomeItem'
27 MAPPED_READS = 'MappedReads'
28 MAPPED_ITEM = 'MappedItem'
29 MATCH_CODES = 'MatchCodes'
30 MATCH_ITEM = 'Code'
31 READS = 'Reads'
32
33 ELAND_SINGLE = 0
34 ELAND_MULTI = 1
35 ELAND_EXTENDED = 2
36 ELAND_EXPORT = 3
37
38 class ResultLane(object):
39     """
40     Base class for result lanes
41     """
42     XML_VERSION = 2
43     LANE = 'ResultLane'
44
45     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
46                  xml=None):
47         self.pathnames = pathnames
48         self.sample_name = sample
49         self.lane_id = lane_id
50         self.end = end
51         self._reads = None
52
53         if xml is not None:
54             self.set_elements(xml)
55
56     def _update(self):
57         """
58         Actually read the file and actually count the reads
59         """
60         pass
61
62     def _get_reads(self):
63         if self._reads is None:
64             self._update()
65         return self._reads
66     reads = property(_get_reads)
67
68     def get_elements(self):
69         return None
70
71     def __repr__(self):
72         name = []
73
74         name.append('L%s' % (self.lane_id,))
75         name.append('R%s' % (self.end,))
76         name.append('S%s' % (self.sample_name,))
77
78         return '<ResultLane(' + ",".join(name) + ')>'
79
80 class ElandLane(ResultLane):
81     """
82     Process an eland result file
83     """
84     XML_VERSION = 2
85     LANE = "ElandLane"
86     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
87     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
88     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
89     SCORE_UNRECOGNIZED = 0
90     SCORE_QC = 1
91     SCORE_READ = 2
92
93     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
94                  genome_map=None, eland_type=None, xml=None):
95         super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
96
97         self._mapped_reads = None
98         self._match_codes = None
99         self._reads = None
100         if genome_map is None:
101             genome_map = {}
102         self.genome_map = genome_map
103         self.eland_type = None
104
105         if xml is not None:
106             self.set_elements(xml)
107
108     def __repr__(self):
109         name = []
110
111         name.append('L%s' % (self.lane_id,))
112         name.append('R%s' % (self.end,))
113         name.append('S%s' % (self.sample_name,))
114
115         reads = str(self._reads) if self._reads is not None else 'Uncounted'
116         return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
117
118     def _guess_eland_type(self, pathname):
119         if self.eland_type is None:
120           # attempt autodetect eland file type
121           pathn, name = os.path.split(pathname)
122           if re.search('result', name):
123             self.eland_type = ELAND_SINGLE
124           elif re.search('multi', name):
125             self.eland_type = ELAND_MULTI
126           elif re.search('extended', name):
127             self.eland_type = ELAND_EXTENDED
128           elif re.search('export', name):
129             self.eland_type = ELAND_EXPORT
130           else:
131             self.eland_type = ELAND_SINGLE
132
133     def _update(self):
134         """
135         Actually read the file and actually count the reads
136         """
137         # can't do anything if we don't have a file to process
138         if self.pathnames is None or len(self.pathnames) == 0:
139             return
140         pathname = self.pathnames[-1]
141         self._guess_eland_type(pathname)
142
143         if os.stat(pathname)[stat.ST_SIZE] == 0:
144             raise RuntimeError("Eland isn't done, try again later.")
145
146         LOGGER.debug("summarizing results for %s" % (pathname))
147         self._match_codes = MatchCodes()
148         self._mapped_reads = MappedReads()
149         self._reads = 0
150
151         for pathname in self.pathnames:
152             stream = autoopen(pathname, 'r')
153             if self.eland_type == ELAND_SINGLE:
154                 result = self._update_eland_result(stream)
155             elif self.eland_type == ELAND_MULTI or \
156                  self.eland_type == ELAND_EXTENDED:
157                 result = self._update_eland_multi(stream)
158             elif self.eland_type == ELAND_EXPORT:
159                 result = self._update_eland_export(stream)
160             else:
161                 errmsg = "Only support single/multi/extended eland files"
162                 raise NotImplementedError(errmsg)
163             stream.close()
164
165             match, mapped, reads = result
166             self._match_codes += match
167             self._mapped_reads += mapped
168             self._reads += reads
169
170     def _update_eland_result(self, instream):
171         reads = 0
172         mapped_reads = MappedReads()
173         match_codes = MatchCodes()
174
175         for line in instream:
176             reads += 1
177             fields = line.split()
178             # code = fields[2]
179             # match_codes[code] = match_codes.setdefault(code, 0) + 1
180             # the QC/NM etc codes are in the 3rd field and always present
181             match_codes[fields[2]] += 1
182             # ignore lines that don't have a fasta filename
183             if len(fields) < 7:
184                 continue
185             fasta = self.genome_map.get(fields[6], fields[6])
186             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
187         return match_codes, mapped_reads, reads
188
189     def _update_eland_multi(self, instream):
190         """Summarize an eland_extend."""
191         MATCH_INDEX = 2
192         LOCATION_INDEX = 3
193         reads = 0
194         mapped_reads = MappedReads()
195         match_codes = MatchCodes()
196
197         for line in instream:
198             reads += 1
199             fields = line.split()
200             # fields[2] = QC/NM/or number of matches
201             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
202                                                        match_codes)
203             if score_type == ElandLane.SCORE_READ:
204                 # when there are too many hits, eland  writes a - where
205                 # it would have put the list of hits.
206                 # or in a different version of eland, it just leaves
207                 # that column blank, and only outputs 3 fields.
208                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
209                   continue
210
211                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
212
213         return match_codes, mapped_reads, reads
214
215     def _update_eland_export(self, instream):
216         """Summarize a gerald export file."""
217         MATCH_INDEX = 10
218         LOCATION_INDEX = 10
219         DESCRIPTOR_INDEX= 13
220         reads = 0
221         mapped_reads = MappedReads()
222         match_codes = MatchCodes()
223
224         for line in instream:
225             reads += 1
226             fields = line.split()
227             # fields[2] = QC/NM/or number of matches
228             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
229                                                        match_codes)
230             if score_type == ElandLane.SCORE_UNRECOGNIZED:
231                 # export files have three states for the match field
232                 # QC code, count of multi-reads, or a single
233                 # read location. The score_mapped_mismatches function
234                 # only understands the first two types.
235                 # if we get unrecognized, that implies the field is probably
236                 # a location.
237                 code = self._count_mapped_export(mapped_reads,
238                                                  fields[LOCATION_INDEX],
239                                                  fields[DESCRIPTOR_INDEX])
240                 match_codes[code] += 1
241
242         return match_codes, mapped_reads, reads
243
244
245     def _score_mapped_mismatches(self, match, match_codes):
246         """Update match_codes with eland map counts, or failure code.
247
248         Returns True if the read mapped, false if it was an error code.
249         """
250         groups = ElandLane.MATCH_COUNTS_RE.match(match)
251         if groups is None:
252             # match is not of the form [\d]+:[\d]+:[\d]+
253             if match in match_codes:
254                 # match is one quality control codes QC/NM etc
255                 match_codes[match] += 1
256                 return ElandLane.SCORE_QC
257             else:
258                 return ElandLane.SCORE_UNRECOGNIZED
259         else:
260             # match is of the form [\d]+:[\d]+:[\d]+
261             # AKA Multiread
262             zero_mismatches = int(groups.group(1))
263             one_mismatches = int(groups.group(2))
264             two_mismatches = int(groups.group(3))
265
266             if zero_mismatches == 1:
267                 match_codes['U0'] += 1
268             elif zero_mismatches < 255:
269                 match_codes['R0'] += zero_mismatches
270
271             if one_mismatches == 1:
272                 match_codes['U1'] += 1
273             elif one_mismatches < 255:
274                 match_codes['R1'] += one_mismatches
275
276             if two_mismatches == 1:
277                 match_codes['U2'] += 1
278             elif two_mismatches < 255:
279                 match_codes['R2'] += two_mismatches
280
281             return ElandLane.SCORE_READ
282
283
284     def _count_mapped_multireads(self, mapped_reads, match_string):
285         chromo = None
286         for match in match_string.split(','):
287             match_fragment = match.split(':')
288             if len(match_fragment) == 2:
289                 chromo = match_fragment[0]
290                 pos = match_fragment[1]
291
292             fasta = self.genome_map.get(chromo, chromo)
293             assert fasta is not None
294             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
295
296
297     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
298         """Count a read as defined in an export file
299
300         match_string contains the chromosome
301         descriptor contains the an ecoding of bases that match, mismatch,
302                    and have indels.
303         returns the "best" match code
304
305         Currently "best" match code is ignoring the possibility of in-dels
306         """
307         chromo = match_string
308         fasta = self.genome_map.get(chromo, chromo)
309         assert fasta is not None
310         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
311
312         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
313         if len(mismatch_bases) == 0:
314             return 'U0'
315         elif len(mismatch_bases) == 1:
316             return 'U1'
317         else:
318             return 'U2'
319
320
321     def _get_mapped_reads(self):
322         if self._mapped_reads is None:
323             self._update()
324         return self._mapped_reads
325     mapped_reads = property(_get_mapped_reads)
326
327     def _get_match_codes(self):
328         if self._match_codes is None:
329             self._update()
330         return self._match_codes
331     match_codes = property(_get_match_codes)
332
333     def _get_no_match(self):
334         if self._mapped_reads is None:
335             self._update()
336         return self._match_codes['NM']
337     no_match = property(_get_no_match,
338                         doc="total reads that didn't match the target genome.")
339
340     def _get_no_match_percent(self):
341         return float(self.no_match)/self.reads * 100
342     no_match_percent = property(_get_no_match_percent,
343                                 doc="no match reads as percent of total")
344
345     def _get_qc_failed(self):
346         if self._mapped_reads is None:
347             self._update()
348         return self._match_codes['QC']
349     qc_failed = property(_get_qc_failed,
350                         doc="total reads that didn't match the target genome.")
351
352     def _get_qc_failed_percent(self):
353         return float(self.qc_failed)/self.reads * 100
354     qc_failed_percent = property(_get_qc_failed_percent,
355                                  doc="QC failed reads as percent of total")
356
357     def _get_unique_reads(self):
358         if self._mapped_reads is None:
359            self._update()
360         sum = 0
361         for code in ['U0','U1','U2']:
362             sum += self._match_codes[code]
363         return sum
364     unique_reads = property(_get_unique_reads,
365                             doc="total unique reads")
366
367     def _get_repeat_reads(self):
368         if self._mapped_reads is None:
369            self._update()
370         sum = 0
371         for code in ['R0','R1','R2']:
372             sum += self._match_codes[code]
373         return sum
374     repeat_reads = property(_get_repeat_reads,
375                             doc="total repeat reads")
376
377     def get_elements(self):
378         lane = ElementTree.Element(ElandLane.LANE,
379                                    {'version':
380                                     unicode(ElandLane.XML_VERSION)})
381         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
382         sample_tag.text = self.sample_name
383         lane_tag = ElementTree.SubElement(lane, LANE_ID)
384         lane_tag.text = str(self.lane_id)
385         if self.end is not None:
386             end_tag = ElementTree.SubElement(lane, END)
387             end_tag.text = str(self.end)
388         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
389         for k, v in self.genome_map.items():
390             item = ElementTree.SubElement(
391                 genome_map, GENOME_ITEM,
392                 {'name':k, 'value':unicode(v)})
393         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
394         for k, v in self.mapped_reads.items():
395             item = ElementTree.SubElement(
396                 mapped_reads, MAPPED_ITEM,
397                 {'name':k, 'value':unicode(v)})
398         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
399         for k, v in self.match_codes.items():
400             item = ElementTree.SubElement(
401                 match_codes, MATCH_ITEM,
402                 {'name':k, 'value':unicode(v)})
403         reads = ElementTree.SubElement(lane, READS)
404         reads.text = unicode(self.reads)
405
406         return lane
407
408     def set_elements(self, tree):
409         if tree.tag != ElandLane.LANE:
410             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
411
412         # reset dictionaries
413         self._mapped_reads = {}
414         self._match_codes = {}
415
416         for element in tree:
417             tag = element.tag.lower()
418             if tag == SAMPLE_NAME.lower():
419                 self.sample_name = element.text
420             elif tag == LANE_ID.lower():
421                 self.lane_id = int(element.text)
422             elif tag == END.lower():
423                 self.end = int(element.text)
424             elif tag == GENOME_MAP.lower():
425                 for child in element:
426                     name = child.attrib['name']
427                     value = child.attrib['value']
428                     self.genome_map[name] = value
429             elif tag == MAPPED_READS.lower():
430                 for child in element:
431                     name = child.attrib['name']
432                     value = child.attrib['value']
433                     self._mapped_reads[name] = int(value)
434             elif tag == MATCH_CODES.lower():
435                 for child in element:
436                     name = child.attrib['name']
437                     value = int(child.attrib['value'])
438                     self._match_codes[name] = value
439             elif tag == READS.lower():
440                 self._reads = int(element.text)
441             else:
442                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
443
444
445 class MatchCodes(collections.MutableMapping):
446     """Mapping to hold match counts -
447     supports combining two match count sets together
448     """
449     def __init__(self, initializer=None):
450         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
451                             'U0':0, 'U1':0, 'U2':0,
452                             'R0':0, 'R1':0, 'R2':0,
453                             }
454
455         if initializer is not None:
456             if not isinstance(initializer, collections.Mapping):
457                 raise ValueError("Expected dictionary like class")
458             for key in initializer:
459                 if key not in self.match_codes:
460                     errmsg = "Initializer can only contain: %s"
461                     raise ValueError(errmsg % (",".join(self.match_codes.keys())))
462                 self.match_codes[key] += initializer[key]
463
464     def __iter__(self):
465         return iter(self.match_codes)
466
467     def __delitem__(self, key):
468         raise RuntimeError("delete not allowed")
469
470     def __getitem__(self, key):
471         return self.match_codes[key]
472
473     def __setitem__(self, key, value):
474         if key not in self.match_codes:
475             errmsg = "Unrecognized key, allowed values are: %s"
476             raise ValueError(errmsg % (",".join(self.match_codes.keys())))
477         self.match_codes[key] = value
478
479     def __len__(self):
480         return len(self.match_codes)
481
482     def __add__(self, other):
483         if not isinstance(other, MatchCodes):
484             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
485
486         newobj = MatchCodes(self)
487         for key, value in other.items():
488             newobj[key] = self.get(key, 0) + other[key]
489
490         return newobj
491
492
493 class MappedReads(collections.MutableMapping):
494     """Mapping to hold mapped reads -
495     supports combining two mapped read sets together
496     """
497     def __init__(self, initializer=None):
498         self.mapped_reads = {}
499
500         if initializer is not None:
501             if not isinstance(initializer, collections.Mapping):
502                 raise ValueError("Expected dictionary like class")
503             for key in initializer:
504                 self[key] = self.setdefault(key, 0) + initializer[key]
505
506     def __iter__(self):
507         return iter(self.mapped_reads)
508
509     def __delitem__(self, key):
510         del self.mapped_reads[key]
511
512     def __getitem__(self, key):
513         return self.mapped_reads[key]
514
515     def __setitem__(self, key, value):
516         self.mapped_reads[key] = value
517
518     def __len__(self):
519         return len(self.mapped_reads)
520
521     def __add__(self, other):
522         if not isinstance(other, MappedReads):
523             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
524
525         newobj = MappedReads(self)
526         for key in other:
527             newobj[key] = self.get(key, 0) + other[key]
528
529         return newobj
530
531 class SequenceLane(ResultLane):
532     XML_VERSION=1
533     LANE = 'SequenceLane'
534     SEQUENCE_TYPE = 'SequenceType'
535
536     NONE_TYPE = None
537     SCARF_TYPE = 1
538     FASTQ_TYPE = 2
539     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
540
541     def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
542                  xml=None):
543         self.sequence_type = None
544         super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
545
546     def _guess_sequence_type(self, pathname):
547         """
548         Determine if we have a scarf or fastq sequence file
549         """
550         f = open(pathname,'r')
551         l = f.readline()
552         f.close()
553
554         if l[0] == '@':
555             # fastq starts with a @
556             self.sequence_type = SequenceLane.FASTQ_TYPE
557         else:
558             self.sequence_type = SequenceLane.SCARF_TYPE
559         return self.sequence_type
560
561     def _update(self):
562         """
563         Actually read the file and actually count the reads
564         """
565         # can't do anything if we don't have a file to process
566         if self.pathnames is None:
567             return
568
569         pathname = self.pathnames[-1]
570         if os.stat(pathname)[stat.ST_SIZE] == 0:
571             raise RuntimeError("Sequencing isn't done, try again later.")
572
573         self._guess_sequence_type(pathname)
574
575         LOGGER.info("summarizing results for %s" % (pathname))
576         lines = 0
577         f = open(pathname)
578         for l in f.xreadlines():
579             lines += 1
580         f.close()
581
582         if self.sequence_type == SequenceLane.SCARF_TYPE:
583             self._reads = lines
584         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
585             self._reads = lines / 4
586         else:
587             errmsg = "This only supports scarf or fastq squence files"
588             raise NotImplementedError(errmsg)
589
590     def get_elements(self):
591         lane = ElementTree.Element(SequenceLane.LANE,
592                                    {'version':
593                                     unicode(SequenceLane.XML_VERSION)})
594         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
595         sample_tag.text = self.sample_name
596         lane_tag = ElementTree.SubElement(lane, LANE_ID)
597         lane_tag.text = str(self.lane_id)
598         if self.end is not None:
599             end_tag = ElementTree.SubElement(lane, END)
600             end_tag.text = str(self.end)
601         reads = ElementTree.SubElement(lane, READS)
602         reads.text = unicode(self.reads)
603         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
604         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
605
606         return lane
607
608     def set_elements(self, tree):
609         if tree.tag != SequenceLane.LANE:
610             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
611         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
612
613         for element in tree:
614             tag = element.tag.lower()
615             if tag == SAMPLE_NAME.lower():
616                 self.sample_name = element.text
617             elif tag == LANE_ID.lower():
618                 self.lane_id = int(element.text)
619             elif tag == END.lower():
620                 self.end = int(element.text)
621             elif tag == READS.lower():
622                 self._reads = int(element.text)
623             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
624                 self.sequence_type = lookup_sequence_type.get(element.text, None)
625             else:
626                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
627
628 class ELAND(collections.MutableMapping):
629     """
630     Summarize information from eland files
631     """
632     XML_VERSION = 3
633
634     ELAND = 'ElandCollection'
635     LANE = 'Lane'
636     LANE_ID = 'id'
637     SAMPLE = 'sample'
638     END = 'end'
639
640     def __init__(self, xml=None):
641         # we need information from the gerald config.xml
642         self.results = {}
643
644         if xml is not None:
645             self.set_elements(xml)
646
647     def __getitem__(self, key):
648         if not isinstance(key, SampleKey):
649             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
650         return self.results[key]
651
652     def __setitem__(self, key, value):
653         if not isinstance(key, SampleKey):
654             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
655         self.results[key] = value
656
657     def __delitem__(self, key):
658         del self.result[key]
659
660     def __iter__(self):
661         keys = self.results.iterkeys()
662         for k in sorted(keys):
663             yield k
664
665     def __len__(self):
666         return len(self.results)
667
668     def find_keys(self, search):
669         """Return results that match key"""
670         if not isinstance(search, SampleKey):
671             raise ValueError("Key must be a %s" % (str(type(SampleKey))))
672         if not search.iswild:
673             yield self[search]
674         for key in self.keys():
675             if key.matches(search): yield key
676
677     def get_elements(self):
678         root = ElementTree.Element(ELAND.ELAND,
679                                    {'version': unicode(ELAND.XML_VERSION)})
680
681         for key in self:
682             eland_lane = self[key].get_elements()
683             eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
684             eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
685             eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
686             root.append(eland_lane)
687         return root
688         return root
689
690     def set_elements(self, tree):
691         if tree.tag.lower() != ELAND.ELAND.lower():
692             raise ValueError('Expecting %s', ELAND.ELAND)
693         for element in list(tree):
694             lane_id = int(element.attrib[ELAND.LANE_ID])
695             end = int(element.attrib.get(ELAND.END, 0))
696             sample = element.attrib.get(ELAND.SAMPLE, 's')
697             if element.tag.lower() == ElandLane.LANE.lower():
698                 lane = ElandLane(xml=element)
699             elif element.tag.lower() == SequenceLane.LANE.lower():
700                 lane = SequenceLane(xml=element)
701
702             key = SampleKey(lane=lane_id, read=end+1, sample=sample)
703             self.results[key] = lane
704
705
706     def update_result_with_eland(self, gerald, key, pathnames,
707                                  genome_maps):
708         # yes the lane_id is also being computed in ElandLane._update
709         # I didn't want to clutter up my constructor
710         # but I needed to persist the sample_name/lane_id for
711         # runfolder summary_report
712         names = [ os.path.split(p)[1] for p in pathnames]
713         LOGGER.info("Adding eland files %s" %(",".join(names),))
714         basedir = os.path.split(pathnames[0])[0]
715         gs_template = "{0}_*_L{1:03}_genomesize.xml"
716         genomesize = glob(
717             os.path.join(basedir,
718                          gs_template.format(key.sample, key.lane)))
719
720
721         genome_map = {}
722         if genome_maps is not None:
723             genome_map = genome_maps[key.lane]
724         elif len(genomesize) > 0:
725             print "Found {0}".format(genomesize)
726             genome_map = build_genome_size_map(genomesize[0])
727         elif gerald is not None:
728             genome_dir = gerald.lanes[key].eland_genome
729             if genome_dir is not None:
730                 genome_map = build_genome_fasta_map(genome_dir)
731
732         lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
733
734         self.results[key] = lane
735
736     def update_result_with_sequence(self, gerald, key, pathnames,
737                                     genome_maps=None):
738         self.results[key] = SequenceLane(pathnames,
739                                          key.sample, key.lane, key.read)
740
741
742 def eland(gerald_dir, gerald=None, genome_maps=None):
743     e = ELAND()
744     eland_files = ElandMatches(e)
745     # collect
746     for path, dirnames, filenames in os.walk(gerald_dir):
747         for filename in filenames:
748             pathname = os.path.abspath(os.path.join(path, filename))
749             eland_files.add(pathname)
750     for key in eland_files:
751         eland_files.count(key, gerald, genome_maps)
752     return e
753
754
755 class ElandMatches(collections.MutableMapping):
756     def __init__(self, eland_container):
757         # the order in patterns determines the preference for what
758         # will be found.
759         self.eland_container = eland_container
760         MAPPED = eland_container.update_result_with_eland
761         SEQUENCE = eland_container.update_result_with_sequence
762
763         sample = '(?P<sample>[^_]+)'
764         hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
765         hiLane = '_L(?P<lane>[\d]+)'
766         gaLane = '_(?P<lane>[\d]+)'
767         hiRead = '_R(?P<read>[\d]+)'
768         gaRead = '(_(?P<read>[\d])+)?'
769         part = '_(?P<part>[\d]+)'
770         ext = '(?P<extention>(\.bz2|\.gz)?)'
771
772         hiPrefix = sample + hiIndex + hiLane + hiRead + part
773         gaPrefix = sample + gaLane + gaRead
774         P = collections.namedtuple('Patterns', 'pattern counter priority')
775         self.patterns = [
776             P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
777             P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
778             P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
779             P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
780             P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
781             P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
782             ]
783         self.file_sets = {}
784         self.file_priority = {}
785         self.file_counter = {}
786
787     def add(self, pathname):
788         """Add pathname to our set of files
789         """
790         path, filename = os.path.split(pathname)
791
792         for pattern, counter, priority in self.patterns:
793             rematch = re.match(pattern, filename)
794             if rematch is not None:
795                 m = ElandMatch(pathname, counter, **rematch.groupdict())
796                 key = m.make_samplekey()
797                 old_priority = self.file_priority.get(key, 0)
798                 if priority > old_priority:
799                     self.file_sets[key] = set((m,))
800                     self.file_counter[key] = counter
801                     self.file_priority[key] = priority
802                 elif priority == old_priority:
803                     self.file_sets[key].add(m)
804
805     def count(self, key, gerald=None, genome_maps=None):
806         #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
807         counter = self.file_counter[key]
808         file_set = self.file_sets[key]
809         filenames = [ f.filename for f in file_set ]
810         return counter(gerald, key,
811                        filenames, genome_maps)
812
813     def __iter__(self):
814         return iter(self.file_sets)
815
816     def __len__(self):
817         return len(self.file_sets)
818
819     def __getitem__(self, key):
820         return self.file_sets[key]
821
822     def __setitem__(self, key, value):
823         if not isintance(value, set):
824             raise ValueError("Expected set for value")
825         self.file_sets[key] = value
826
827     def __delitem__(self, key):
828         del self.file_sets[key]
829
830 class ElandMatch(object):
831     def __init__(self, pathname, counter,
832                  lane=None, read=None, extension=None,
833                  sample=None, index=None, part=None, **kwargs):
834         self.filename = pathname
835         self.counter = counter
836         self._lane = lane
837         self._read = read
838         self.extension = extension
839         self.sample = sample
840         self.index = index
841         self._part = part
842         LOGGER.info("Found %s: L%s R%s Samp%s" % (
843             self.filename, self._lane, self._read, self.sample))
844
845     def make_samplekey(self):
846         read = self._read if self._read is not None else 1
847         return SampleKey(lane=self.lane, read=read, sample=self.sample)
848
849     def _get_lane(self):
850         if self._lane is not None:
851             return int(self._lane)
852         return self._lane
853     lane = property(_get_lane)
854
855     def _get_read(self):
856         if self._read is not None:
857             return int(self._read)
858         return self._read
859     read = property(_get_read)
860
861     def _get_part(self):
862         if self._part is not None:
863             return int(self._part)
864         return self._part
865     part = property(_get_part)
866
867     def __repr__(self):
868         name = []
869         if self.sample is not None: name.append(self.sample)
870         if self._lane is not None: name.append('L%s' % (self.lane,))
871         if self._read is not None: name.append('R%s' % (self.read,))
872         if self._part is not None: name.append('P%s' % (self.part,))
873         return '<ElandMatch(' + "_".join(name) + ')>'
874
875 def build_genome_fasta_map(genome_dir):
876     # build fasta to fasta file map
877     LOGGER.info("Building genome map")
878     genome = genome_dir.split(os.path.sep)[-1]
879     fasta_map = {}
880     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
881         is_link = False
882         if os.path.islink(vld_file):
883             is_link = True
884         vld_file = os.path.realpath(vld_file)
885         path, vld_name = os.path.split(vld_file)
886         name, ext = os.path.splitext(vld_name)
887         if is_link:
888             fasta_map[name] = name
889         else:
890             fasta_map[name] = os.path.join(genome, name)
891     return fasta_map
892
893 def build_genome_size_map(pathname):
894     """Guess what genome we're using"""
895     sizes = {}
896     tree = ElementTree.parse(pathname)
897     for element in tree.getroot():
898         name = element.attrib['contigName']
899         bases = int(element.attrib['totalBases'])
900         sizes[name] = bases
901
902     # guess genome names
903     if sizes.get('chr1', 0) == 197195432:
904         genome = 'mm9'
905     elif sizes.get('chr1', 0) == 247249719:
906         genome = 'hg19'
907     elif sizes.get('chrI', 0) == 230218:
908         genome = 'sacCer3'
909     elif len(sizes) == 1:
910         genome = os.path.splitext(sizes.keys()[0])[0]
911     else:
912         raise RuntimeError("Unrecognized genome type, update detection")
913
914     fasta_map = {}
915     for k,v in sizes.items():
916         fasta_map[k] = genome + '/' + k
917
918     return fasta_map
919
920 def extract_eland_sequence(instream, outstream, start, end):
921     """
922     Extract a chunk of sequence out of an eland file
923     """
924     for line in instream:
925         record = line.split()
926         if len(record) > 1:
927             result = [record[0], record[1][start:end]]
928         else:
929             result = [record[0][start:end]]
930         outstream.write("\t".join(result))
931         outstream.write(os.linesep)
932
933
934 def main(cmdline=None):
935     """Run eland extraction against the specified gerald directory"""
936     from optparse import OptionParser
937     parser = OptionParser("%prog: <gerald dir>+")
938     opts, args = parser.parse_args(cmdline)
939     logging.basicConfig(level=logging.DEBUG)
940     for a in args:
941         LOGGER.info("Starting scan of %s" % (a,))
942         e = eland(a)
943         print ElementTree.tostring(e.get_elements())
944     return
945
946
947 if __name__ == "__main__":
948     main(sys.argv[1:])