Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4 import collections
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11
12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
15
16 LOGGER = logging.getLogger(__name__)
17
18 SAMPLE_NAME = 'SampleName'
19 LANE_ID = 'LaneID'
20 END = 'End'
21 READS = 'Reads'
22
23 GENOME_MAP = 'GenomeMap'
24 GENOME_ITEM = 'GenomeItem'
25 MAPPED_READS = 'MappedReads'
26 MAPPED_ITEM = 'MappedItem'
27 MATCH_CODES = 'MatchCodes'
28 MATCH_ITEM = 'Code'
29 READS = 'Reads'
30
31 ELAND_SINGLE = 0
32 ELAND_MULTI = 1
33 ELAND_EXTENDED = 2
34 ELAND_EXPORT = 3
35
36 class ResultLane(object):
37     """
38     Base class for result lanes
39     """
40     XML_VERSION = 2
41     LANE = 'ResultLane'
42
43     def __init__(self, pathnames=None, lane_id=None, end=None, xml=None):
44         self.pathnames = pathnames
45         self._sample_name = None
46         self.lane_id = lane_id
47         self.end = end
48         self._reads = None
49
50         if xml is not None:
51             self.set_elements(xml)
52
53     def _update(self):
54         """
55         Actually read the file and actually count the reads
56         """
57         pass
58
59     def _update_name(self):
60         # extract the sample name
61         if self.pathnames is None or len(self.pathnames) == 0:
62             return
63
64         sample_names = set()
65         for pathname in self.pathnames:
66             path, name = os.path.split(pathname)
67             split_name = name.split('_')
68             sample_names.add(split_name[0])
69         if len(sample_names) > 1:
70             errmsg = "Attempting to update from more than one sample %s"
71             raise RuntimeError(errmsg % (",".join(sample_names)))
72         self._sample_name = sample_names.pop()
73         return self._sample_name
74
75     def _get_sample_name(self):
76         if self._sample_name is None:
77             self._update_name()
78         return self._sample_name
79     sample_name = property(_get_sample_name)
80
81     def _get_reads(self):
82         if self._reads is None:
83             self._update()
84         return self._reads
85     reads = property(_get_reads)
86
87     def get_elements(self):
88         return None
89
90 class ElandLane(ResultLane):
91     """
92     Process an eland result file
93     """
94     XML_VERSION = 2
95     LANE = "ElandLane"
96     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
97     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
98     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
99     SCORE_UNRECOGNIZED = 0
100     SCORE_QC = 1
101     SCORE_READ = 2
102
103     def __init__(self, pathnames=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
104         super(ElandLane, self).__init__(pathnames, lane_id, end)
105
106         self._mapped_reads = None
107         self._match_codes = None
108         if genome_map is None:
109             genome_map = {}
110         self.genome_map = genome_map
111         self.eland_type = None
112
113         if xml is not None:
114             self.set_elements(xml)
115
116     def _guess_eland_type(self, pathname):
117         if self.eland_type is None:
118           # attempt autodetect eland file type
119           pathn, name = os.path.split(pathname)
120           if re.search('result', name):
121             self.eland_type = ELAND_SINGLE
122           elif re.search('multi', name):
123             self.eland_type = ELAND_MULTI
124           elif re.search('extended', name):
125             self.eland_type = ELAND_EXTENDED
126           elif re.search('export', name):
127             self.eland_type = ELAND_EXPORT
128           else:
129             self.eland_type = ELAND_SINGLE
130
131     def _update(self):
132         """
133         Actually read the file and actually count the reads
134         """
135         # can't do anything if we don't have a file to process
136         if self.pathnames is None or len(self.pathnames) == 0:
137             return
138         pathname = self.pathnames[-1]
139         self._guess_eland_type(pathname)
140
141         if os.stat(pathname)[stat.ST_SIZE] == 0:
142             raise RuntimeError("Eland isn't done, try again later.")
143
144         LOGGER.debug("summarizing results for %s" % (pathname))
145         self._match_codes = MatchCodes()
146         self._mapped_reads = MappedReads()
147         self._reads = 0
148
149         for pathname in self.pathnames:
150             stream = autoopen(pathname, 'r')
151             if self.eland_type == ELAND_SINGLE:
152                 result = self._update_eland_result(stream)
153             elif self.eland_type == ELAND_MULTI or \
154                  self.eland_type == ELAND_EXTENDED:
155                 result = self._update_eland_multi(stream)
156             elif self.eland_type == ELAND_EXPORT:
157                 result = self._update_eland_export(stream)
158             else:
159                 errmsg = "Only support single/multi/extended eland files"
160                 raise NotImplementedError(errmsg)
161             stream.close()
162
163             match, mapped, reads = result
164             self._match_codes += match
165             self._mapped_reads += mapped
166             self._reads += reads
167
168     def _update_eland_result(self, instream):
169         reads = 0
170         mapped_reads = MappedReads()
171         match_codes = MatchCodes()
172
173         for line in instream:
174             reads += 1
175             fields = line.split()
176             # code = fields[2]
177             # match_codes[code] = match_codes.setdefault(code, 0) + 1
178             # the QC/NM etc codes are in the 3rd field and always present
179             match_codes[fields[2]] += 1
180             # ignore lines that don't have a fasta filename
181             if len(fields) < 7:
182                 continue
183             fasta = self.genome_map.get(fields[6], fields[6])
184             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
185         return match_codes, mapped_reads, reads
186
187     def _update_eland_multi(self, instream):
188         """Summarize an eland_extend."""
189         MATCH_INDEX = 2
190         LOCATION_INDEX = 3
191         reads = 0
192         mapped_reads = MappedReads()
193         match_codes = MatchCodes()
194
195         for line in instream:
196             reads += 1
197             fields = line.split()
198             # fields[2] = QC/NM/or number of matches
199             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
200                                                        match_codes)
201             if score_type == ElandLane.SCORE_READ:
202                 # when there are too many hits, eland  writes a - where
203                 # it would have put the list of hits.
204                 # or in a different version of eland, it just leaves
205                 # that column blank, and only outputs 3 fields.
206                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
207                   continue
208
209                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
210
211         return match_codes, mapped_reads, reads
212
213     def _update_eland_export(self, instream):
214         """Summarize a gerald export file."""
215         MATCH_INDEX = 10
216         LOCATION_INDEX = 10
217         DESCRIPTOR_INDEX= 13
218         reads = 0
219         mapped_reads = MappedReads()
220         match_codes = MatchCodes()
221
222         for line in instream:
223             reads += 1
224             fields = line.split()
225             # fields[2] = QC/NM/or number of matches
226             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
227                                                        match_codes)
228             if score_type == ElandLane.SCORE_UNRECOGNIZED:
229                 # export files have three states for the match field
230                 # QC code, count of multi-reads, or a single
231                 # read location. The score_mapped_mismatches function
232                 # only understands the first two types.
233                 # if we get unrecognized, that implies the field is probably
234                 # a location.
235                 code = self._count_mapped_export(mapped_reads,
236                                                  fields[LOCATION_INDEX],
237                                                  fields[DESCRIPTOR_INDEX])
238                 match_codes[code] += 1
239
240         return match_codes, mapped_reads, reads
241
242
243     def _score_mapped_mismatches(self, match, match_codes):
244         """Update match_codes with eland map counts, or failure code.
245
246         Returns True if the read mapped, false if it was an error code.
247         """
248         groups = ElandLane.MATCH_COUNTS_RE.match(match)
249         if groups is None:
250             # match is not of the form [\d]+:[\d]+:[\d]+
251             if match in match_codes:
252                 # match is one quality control codes QC/NM etc
253                 match_codes[match] += 1
254                 return ElandLane.SCORE_QC
255             else:
256                 return ElandLane.SCORE_UNRECOGNIZED
257         else:
258             # match is of the form [\d]+:[\d]+:[\d]+
259             # AKA Multiread
260             zero_mismatches = int(groups.group(1))
261             one_mismatches = int(groups.group(2))
262             two_mismatches = int(groups.group(3))
263
264             if zero_mismatches == 1:
265                 match_codes['U0'] += 1
266             elif zero_mismatches < 255:
267                 match_codes['R0'] += zero_mismatches
268
269             if one_mismatches == 1:
270                 match_codes['U1'] += 1
271             elif one_mismatches < 255:
272                 match_codes['R1'] += one_mismatches
273
274             if two_mismatches == 1:
275                 match_codes['U2'] += 1
276             elif two_mismatches < 255:
277                 match_codes['R2'] += two_mismatches
278
279             return ElandLane.SCORE_READ
280
281
282     def _count_mapped_multireads(self, mapped_reads, match_string):
283         chromo = None
284         for match in match_string.split(','):
285             match_fragment = match.split(':')
286             if len(match_fragment) == 2:
287                 chromo = match_fragment[0]
288                 pos = match_fragment[1]
289
290             fasta = self.genome_map.get(chromo, chromo)
291             assert fasta is not None
292             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
293
294
295     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
296         """Count a read as defined in an export file
297
298         match_string contains the chromosome
299         descriptor contains the an ecoding of bases that match, mismatch,
300                    and have indels.
301         returns the "best" match code
302
303         Currently "best" match code is ignoring the possibility of in-dels
304         """
305         chromo = match_string
306         fasta = self.genome_map.get(chromo, chromo)
307         assert fasta is not None
308         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
309
310         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
311         if len(mismatch_bases) == 0:
312             return 'U0'
313         elif len(mismatch_bases) == 1:
314             return 'U1'
315         else:
316             return 'U2'
317
318
319     def _get_mapped_reads(self):
320         if self._mapped_reads is None:
321             self._update()
322         return self._mapped_reads
323     mapped_reads = property(_get_mapped_reads)
324
325     def _get_match_codes(self):
326         if self._match_codes is None:
327             self._update()
328         return self._match_codes
329     match_codes = property(_get_match_codes)
330
331     def _get_no_match(self):
332         if self._mapped_reads is None:
333             self._update()
334         return self._match_codes['NM']
335     no_match = property(_get_no_match,
336                         doc="total reads that didn't match the target genome.")
337
338     def _get_no_match_percent(self):
339         return float(self.no_match)/self.reads * 100
340     no_match_percent = property(_get_no_match_percent,
341                                 doc="no match reads as percent of total")
342
343     def _get_qc_failed(self):
344         if self._mapped_reads is None:
345             self._update()
346         return self._match_codes['QC']
347     qc_failed = property(_get_qc_failed,
348                         doc="total reads that didn't match the target genome.")
349
350     def _get_qc_failed_percent(self):
351         return float(self.qc_failed)/self.reads * 100
352     qc_failed_percent = property(_get_qc_failed_percent,
353                                  doc="QC failed reads as percent of total")
354
355     def _get_unique_reads(self):
356         if self._mapped_reads is None:
357            self._update()
358         sum = 0
359         for code in ['U0','U1','U2']:
360             sum += self._match_codes[code]
361         return sum
362     unique_reads = property(_get_unique_reads,
363                             doc="total unique reads")
364
365     def _get_repeat_reads(self):
366         if self._mapped_reads is None:
367            self._update()
368         sum = 0
369         for code in ['R0','R1','R2']:
370             sum += self._match_codes[code]
371         return sum
372     repeat_reads = property(_get_repeat_reads,
373                             doc="total repeat reads")
374
375     def get_elements(self):
376         lane = ElementTree.Element(ElandLane.LANE,
377                                    {'version':
378                                     unicode(ElandLane.XML_VERSION)})
379         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
380         sample_tag.text = self.sample_name
381         lane_tag = ElementTree.SubElement(lane, LANE_ID)
382         lane_tag.text = str(self.lane_id)
383         if self.end is not None:
384             end_tag = ElementTree.SubElement(lane, END)
385             end_tag.text = str(self.end)
386         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
387         for k, v in self.genome_map.items():
388             item = ElementTree.SubElement(
389                 genome_map, GENOME_ITEM,
390                 {'name':k, 'value':unicode(v)})
391         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
392         for k, v in self.mapped_reads.items():
393             item = ElementTree.SubElement(
394                 mapped_reads, MAPPED_ITEM,
395                 {'name':k, 'value':unicode(v)})
396         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
397         for k, v in self.match_codes.items():
398             item = ElementTree.SubElement(
399                 match_codes, MATCH_ITEM,
400                 {'name':k, 'value':unicode(v)})
401         reads = ElementTree.SubElement(lane, READS)
402         reads.text = unicode(self.reads)
403
404         return lane
405
406     def set_elements(self, tree):
407         if tree.tag != ElandLane.LANE:
408             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
409
410         # reset dictionaries
411         self._mapped_reads = {}
412         self._match_codes = {}
413
414         for element in tree:
415             tag = element.tag.lower()
416             if tag == SAMPLE_NAME.lower():
417                 self._sample_name = element.text
418             elif tag == LANE_ID.lower():
419                 self.lane_id = int(element.text)
420             elif tag == END.lower():
421                 self.end = int(element.text)
422             elif tag == GENOME_MAP.lower():
423                 for child in element:
424                     name = child.attrib['name']
425                     value = child.attrib['value']
426                     self.genome_map[name] = value
427             elif tag == MAPPED_READS.lower():
428                 for child in element:
429                     name = child.attrib['name']
430                     value = child.attrib['value']
431                     self._mapped_reads[name] = int(value)
432             elif tag == MATCH_CODES.lower():
433                 for child in element:
434                     name = child.attrib['name']
435                     value = int(child.attrib['value'])
436                     self._match_codes[name] = value
437             elif tag == READS.lower():
438                 self._reads = int(element.text)
439             else:
440                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
441
442
443 class MatchCodes(collections.MutableMapping):
444     """Mapping to hold match counts -
445     supports combining two match count sets together
446     """
447     def __init__(self, initializer=None):
448         self.match_codes = {'NM':0, 'QC':0, 'RM':0,
449                             'U0':0, 'U1':0, 'U2':0,
450                             'R0':0, 'R1':0, 'R2':0,
451                             }
452
453         if initializer is not None:
454             if not isinstance(initializer, collections.Mapping):
455                 raise ValueError("Expected dictionary like class")
456             for key in initializer:
457                 if key not in self.match_codes:
458                     errmsg = "Initializer can only contain: %s"
459                     raise ValueError(errmsg % (",".join(self.match_codes.keys())))
460                 self.match_codes[key] += initializer[key]
461
462     def __iter__(self):
463         return iter(self.match_codes)
464
465     def __delitem__(self, key):
466         raise RuntimeError("delete not allowed")
467
468     def __getitem__(self, key):
469         return self.match_codes[key]
470
471     def __setitem__(self, key, value):
472         if key not in self.match_codes:
473             errmsg = "Unrecognized key, allowed values are: %s"
474             raise ValueError(errmsg % (",".join(self.match_codes.keys())))
475         self.match_codes[key] = value
476
477     def __len__(self):
478         return len(self.match_codes)
479
480     def __add__(self, other):
481         if not isinstance(other, MatchCodes):
482             raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
483
484         newobj = MatchCodes(self)
485         for key, value in other.items():
486             newobj[key] = self.get(key, 0) + other[key]
487
488         return newobj
489
490
491 class MappedReads(collections.MutableMapping):
492     """Mapping to hold mapped reads -
493     supports combining two mapped read sets together
494     """
495     def __init__(self, initializer=None):
496         self.mapped_reads = {}
497
498         if initializer is not None:
499             if not isinstance(initializer, collections.Mapping):
500                 raise ValueError("Expected dictionary like class")
501             for key in initializer:
502                 self[key] = self.setdefault(key, 0) + initializer[key]
503
504     def __iter__(self):
505         return iter(self.mapped_reads)
506
507     def __delitem__(self, key):
508         del self.mapped_reads[key]
509
510     def __getitem__(self, key):
511         return self.mapped_reads[key]
512
513     def __setitem__(self, key, value):
514         self.mapped_reads[key] = value
515
516     def __len__(self):
517         return len(self.mapped_reads)
518
519     def __add__(self, other):
520         if not isinstance(other, MappedReads):
521             raise ValueError("Expected a MappedReads, got %s", str(type(other)))
522
523         newobj = MappedReads(self)
524         for key in other:
525             newobj[key] = self.get(key, 0) + other[key]
526
527         return newobj
528
529 class SequenceLane(ResultLane):
530     XML_VERSION=1
531     LANE = 'SequenceLane'
532     SEQUENCE_TYPE = 'SequenceType'
533
534     NONE_TYPE = None
535     SCARF_TYPE = 1
536     FASTQ_TYPE = 2
537     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
538
539     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
540         self.sequence_type = None
541         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
542
543     def _guess_sequence_type(self, pathname):
544         """
545         Determine if we have a scarf or fastq sequence file
546         """
547         f = open(pathname,'r')
548         l = f.readline()
549         f.close()
550
551         if l[0] == '@':
552             # fastq starts with a @
553             self.sequence_type = SequenceLane.FASTQ_TYPE
554         else:
555             self.sequence_type = SequenceLane.SCARF_TYPE
556         return self.sequence_type
557
558     def _update(self):
559         """
560         Actually read the file and actually count the reads
561         """
562         # can't do anything if we don't have a file to process
563         if self.pathnames is None:
564             return
565
566         pathname = self.pathnames[-1]
567         if os.stat(pathname)[stat.ST_SIZE] == 0:
568             raise RuntimeError("Sequencing isn't done, try again later.")
569
570         self._guess_sequence_type(pathname)
571
572         LOGGER.info("summarizing results for %s" % (pathname))
573         lines = 0
574         f = open(pathname)
575         for l in f.xreadlines():
576             lines += 1
577         f.close()
578
579         if self.sequence_type == SequenceLane.SCARF_TYPE:
580             self._reads = lines
581         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
582             self._reads = lines / 4
583         else:
584             errmsg = "This only supports scarf or fastq squence files"
585             raise NotImplementedError(errmsg)
586
587     def get_elements(self):
588         lane = ElementTree.Element(SequenceLane.LANE,
589                                    {'version':
590                                     unicode(SequenceLane.XML_VERSION)})
591         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
592         sample_tag.text = self.sample_name
593         lane_tag = ElementTree.SubElement(lane, LANE_ID)
594         lane_tag.text = str(self.lane_id)
595         if self.end is not None:
596             end_tag = ElementTree.SubElement(lane, END)
597             end_tag.text = str(self.end)
598         reads = ElementTree.SubElement(lane, READS)
599         reads.text = unicode(self.reads)
600         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
601         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
602
603         return lane
604
605     def set_elements(self, tree):
606         if tree.tag != SequenceLane.LANE:
607             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
608         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
609
610         for element in tree:
611             tag = element.tag.lower()
612             if tag == SAMPLE_NAME.lower():
613                 self._sample_name = element.text
614             elif tag == LANE_ID.lower():
615                 self.lane_id = int(element.text)
616             elif tag == END.lower():
617                 self.end = int(element.text)
618             elif tag == READS.lower():
619                 self._reads = int(element.text)
620             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
621                 self.sequence_type = lookup_sequence_type.get(element.text, None)
622             else:
623                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
624
625 class ELAND(object):
626     """
627     Summarize information from eland files
628     """
629     XML_VERSION = 3
630
631     ELAND = 'ElandCollection'
632     LANE = 'Lane'
633     LANE_ID = 'id'
634     END = 'end'
635
636     def __init__(self, xml=None):
637         # we need information from the gerald config.xml
638         self.results = [{},{}]
639
640         if xml is not None:
641             self.set_elements(xml)
642
643         if len(self.results[0]) == 0:
644             # Initialize our eland object with meaningless junk
645             for l in  LANE_LIST:
646                 self.results[0][l] = ResultLane(lane_id=l, end=0)
647
648
649     def get_elements(self):
650         root = ElementTree.Element(ELAND.ELAND,
651                                    {'version': unicode(ELAND.XML_VERSION)})
652         for end in range(len(self.results)):
653            end_results = self.results[end]
654            for lane_id, lane in end_results.items():
655                 eland_lane = lane.get_elements()
656                 if eland_lane is not None:
657                     eland_lane.attrib[ELAND.END] = unicode (end)
658                     eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
659                     root.append(eland_lane)
660         return root
661
662     def set_elements(self, tree):
663         if tree.tag.lower() != ELAND.ELAND.lower():
664             raise ValueError('Expecting %s', ELAND.ELAND)
665         for element in list(tree):
666             lane_id = int(element.attrib[ELAND.LANE_ID])
667             end = int(element.attrib.get(ELAND.END, 0))
668             if element.tag.lower() == ElandLane.LANE.lower():
669                 lane = ElandLane(xml=element)
670             elif element.tag.lower() == SequenceLane.LANE.lower():
671                 lane = SequenceLane(xml=element)
672
673             self.results[end][lane_id] = lane
674
675 def check_for_eland_file(basedir, pattern, lane_id, end):
676    #if end is None:
677    #   full_lane_id = lane_id
678    #else:
679    #   full_lane_id = "%d_%d" % ( lane_id, end )
680    eland_files = []
681    eland_pattern = pattern % (lane_id, end)
682    eland_re = re.compile(eland_pattern)
683    LOGGER.debug("Eland pattern: %s" %(eland_pattern,))
684    for filename in os.listdir(basedir):
685        if eland_re.match(filename):
686            LOGGER.info('found eland file %s' % (filename,))
687            eland_files.append(os.path.join(basedir, filename))
688
689    return eland_files
690
691 def update_result_with_eland(gerald, results, lane_id, end, pathnames, genome_maps):
692     # yes the lane_id is also being computed in ElandLane._update
693     # I didn't want to clutter up my constructor
694     # but I needed to persist the sample_name/lane_id for
695     # runfolder summary_report
696     names = [ os.path.split(p)[1] for p in pathnames]
697     LOGGER.info("Adding eland files %s" %(",".join(names),))
698
699     genome_map = {}
700     if genome_maps is not None:
701         genome_map = genome_maps[lane_id]
702     elif gerald is not None:
703         genome_dir = gerald.lanes[lane_id].eland_genome
704         if genome_dir is not None:
705             genome_map = build_genome_fasta_map(genome_dir)
706
707     lane = ElandLane(pathnames, lane_id, end, genome_map)
708
709     if end is None:
710         effective_end =  0
711     else:
712         effective_end = end - 1
713
714     results[effective_end][lane_id] = lane
715
716 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
717     result = SequenceLane(pathname, lane_id, end)
718
719     if end is None:
720         effective_end =  0
721     else:
722         effective_end = end - 1
723
724     results[effective_end][lane_id] = result
725
726
727 def eland(gerald_dir, gerald=None, genome_maps=None):
728     e = ELAND()
729
730     lane_ids = range(1,9)
731     ends = [None, 1, 2]
732
733     basedirs = [gerald_dir]
734
735     # if there is a basedir/Temp change basedir to point to the temp
736     # directory, as 1.1rc1 moves most of the files we've historically
737     # cared about to that subdirectory.
738     # we should look into what the official 'result' files are.
739     # and 1.3 moves them back
740     basedir_temp = os.path.join(gerald_dir, 'Temp')
741     if os.path.isdir(basedir_temp):
742         basedirs.append(basedir_temp)
743
744     # So how about scanning for Project*/Sample* directories as well
745     sample_pattern = os.path.join(gerald_dir, 'Project_*', 'Sample_*')
746     basedirs.extend(glob(sample_pattern))
747
748     # the order in patterns determines the preference for what
749     # will be found.
750     MAPPED_ELAND = 0
751     SEQUENCE = 1
752     patterns = [
753         ('(?P<sampleId>[^_]+)_(?P<index>(NoIndex|[AGCT])+)_L00%s(_R%s)_(?P<part>[\d]+)_export.txt(?P<ext>(\.bz2|\.gz)?)', MAPPED_ELAND),
754         ('s_(?P<lane>%s)(_(?P<end>%s))?_eland_result.txt(?P<ext>(\.bz2|\.gz)?)',
755          MAPPED_ELAND),
756         ('s_(?P<lane>%s)(_(?P<end>%s))?_eland_extended.txt(?P<ext>(\.bz2|\.gz)?)',
757          MAPPED_ELAND),
758         ('s_(?P<lane>%s)(_(?P<end>%s))?_eland_multi.txt(?P<ext>(\.bz2|\.gz)?)',
759          MAPPED_ELAND),
760         ('s_(?P<lane>%s)(_(?P<end>%s))?_export.txt(?P<ext>(\.bz2|\.gz)?)',
761          MAPPED_ELAND),
762         ('s_(?P<lane>%s)(_(?P<end>%s))?_sequence.txt(?P<ext>(\.bz2|\.gz)?)',
763          SEQUENCE),
764
765         #('s_%s_eland_result.txt', MAPPED_ELAND),
766         #('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
767         #('s_%s_eland_result.txt.gz', MAPPED_ELAND),
768         #('s_%s_eland_extended.txt', MAPPED_ELAND),
769         #('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
770         #('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
771         #('s_%s_eland_multi.txt', MAPPED_ELAND),
772         #('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
773         #('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
774         #('s_%s_export.txt', MAPPED_ELAND),
775         #('s_%s_export.txt.bz2', MAPPED_ELAND),
776         #('s_%s_export.txt.gz', MAPPED_ELAND),
777         #('s_%s_sequence.txt', SEQUENCE),
778         ]
779
780     for basedir in basedirs:
781         for end in ends:
782             for lane_id in lane_ids:
783                 for p in patterns:
784                     pathnames = check_for_eland_file(basedir, p[0], lane_id, end)
785                     if len(pathnames) > 0:
786                         if p[1] == MAPPED_ELAND:
787                             update_result_with_eland(gerald, e.results, lane_id, end, pathnames, genome_maps)
788                         elif p[1] == SEQUENCE:
789                             update_result_with_sequence(gerald, e.results, lane_id, end, pathnames)
790                         break
791                 else:
792                     LOGGER.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
793                     continue
794
795     return e
796
797 def build_genome_fasta_map(genome_dir):
798     # build fasta to fasta file map
799     LOGGER.info("Building genome map")
800     genome = genome_dir.split(os.path.sep)[-1]
801     fasta_map = {}
802     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
803         is_link = False
804         if os.path.islink(vld_file):
805             is_link = True
806         vld_file = os.path.realpath(vld_file)
807         path, vld_name = os.path.split(vld_file)
808         name, ext = os.path.splitext(vld_name)
809         if is_link:
810             fasta_map[name] = name
811         else:
812             fasta_map[name] = os.path.join(genome, name)
813     return fasta_map
814
815
816 def extract_eland_sequence(instream, outstream, start, end):
817     """
818     Extract a chunk of sequence out of an eland file
819     """
820     for line in instream:
821         record = line.split()
822         if len(record) > 1:
823             result = [record[0], record[1][start:end]]
824         else:
825             result = [record[0][start:end]]
826         outstream.write("\t".join(result))
827         outstream.write(os.linesep)
828
829
830 def main(cmdline=None):
831     """Run eland extraction against the specified gerald directory"""
832     from optparse import OptionParser
833     parser = OptionParser("%prog: <gerald dir>+")
834     opts, args = parser.parse_args(cmdline)
835     logging.basicConfig(level=logging.DEBUG)
836     for a in args:
837         LOGGER.info("Starting scan of %s" % (a,))
838         e = eland(a)
839         print e.get_elements()
840
841     return
842
843
844 if __name__ == "__main__":
845     main(sys.argv[1:])