update summary.py to extract data from HiSeq runs.
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11
12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
15
16 LOGGER = logging.getLogger(__name__)
17
18 SAMPLE_NAME = 'SampleName'
19 LANE_ID = 'LaneID'
20 END = 'End'
21 READS = 'Reads'
22
23 GENOME_MAP = 'GenomeMap'
24 GENOME_ITEM = 'GenomeItem'
25 MAPPED_READS = 'MappedReads'
26 MAPPED_ITEM = 'MappedItem'
27 MATCH_CODES = 'MatchCodes'
28 MATCH_ITEM = 'Code'
29 READS = 'Reads'
30
31 ELAND_SINGLE = 0
32 ELAND_MULTI = 1
33 ELAND_EXTENDED = 2
34 ELAND_EXPORT = 3
35
36 class ResultLane(object):
37     """
38     Base class for result lanes
39     """
40     XML_VERSION = 2
41     LANE = 'ResultLane'
42
43     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
44         self.pathname = pathname
45         self._sample_name = None
46         self.lane_id = lane_id
47         self.end = end
48         self._reads = None
49
50         if xml is not None:
51             self.set_elements(xml)
52
53     def _update(self):
54         """
55         Actually read the file and actually count the reads
56         """
57         pass
58
59     def _update_name(self):
60         # extract the sample name
61         if self.pathname is None:
62             return
63
64         path, name = os.path.split(self.pathname)
65         split_name = name.split('_')
66         self._sample_name = split_name[0]
67
68     def _get_sample_name(self):
69         if self._sample_name is None:
70             self._update_name()
71         return self._sample_name
72     sample_name = property(_get_sample_name)
73
74     def _get_reads(self):
75         if self._reads is None:
76             self._update()
77         return self._reads
78     reads = property(_get_reads)
79
80     def get_elements(self):
81         return None
82
83 class ElandLane(ResultLane):
84     """
85     Process an eland result file
86     """
87     XML_VERSION = 2
88     LANE = "ElandLane"
89     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92     SCORE_UNRECOGNIZED = 0
93     SCORE_QC = 1
94     SCORE_READ = 2
95
96     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
97         super(ElandLane, self).__init__(pathname, lane_id, end)
98
99         self._mapped_reads = None
100         self._match_codes = None
101         if genome_map is None:
102             genome_map = {}
103         self.genome_map = genome_map
104         self.eland_type = None
105
106         if xml is not None:
107             self.set_elements(xml)
108
109     def _guess_eland_type(self, pathname):
110         if self.eland_type is None:
111           # attempt autodetect eland file type
112           pathn, name = os.path.split(pathname)
113           if re.search('result', name):
114             self.eland_type = ELAND_SINGLE
115           elif re.search('multi', name):
116             self.eland_type = ELAND_MULTI
117           elif re.search('extended', name):
118             self.eland_type = ELAND_EXTENDED
119           elif re.search('export', name):
120             self.eland_type = ELAND_EXPORT
121           else:
122             self.eland_type = ELAND_SINGLE
123
124     def _update(self):
125         """
126         Actually read the file and actually count the reads
127         """
128         # can't do anything if we don't have a file to process
129         if self.pathname is None:
130             return
131         self._guess_eland_type(self.pathname)
132
133         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
134             raise RuntimeError("Eland isn't done, try again later.")
135
136         LOGGER.debug("summarizing results for %s" % (self.pathname))
137
138         stream = autoopen(self.pathname, 'r')
139         if self.eland_type == ELAND_SINGLE:
140             result = self._update_eland_result(stream)
141         elif self.eland_type == ELAND_MULTI or \
142              self.eland_type == ELAND_EXTENDED:
143             result = self._update_eland_multi(stream)
144         elif self.eland_type == ELAND_EXPORT:
145             result = self._update_eland_export(stream)
146         else:
147           raise NotImplementedError("Only support single/multi/extended eland files")
148         self._match_codes, self._mapped_reads, self._reads = result
149
150     def _update_eland_result(self, instream):
151         reads = 0
152         mapped_reads = {}
153
154         match_codes = {'NM':0, 'QC':0, 'RM':0,
155                        'U0':0, 'U1':0, 'U2':0,
156                        'R0':0, 'R1':0, 'R2':0,
157                       }
158         for line in instream:
159             reads += 1
160             fields = line.split()
161             # code = fields[2]
162             # match_codes[code] = match_codes.setdefault(code, 0) + 1
163             # the QC/NM etc codes are in the 3rd field and always present
164             match_codes[fields[2]] += 1
165             # ignore lines that don't have a fasta filename
166             if len(fields) < 7:
167                 continue
168             fasta = self.genome_map.get(fields[6], fields[6])
169             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
170         return match_codes, mapped_reads, reads
171
172     def _update_eland_multi(self, instream):
173         """Summarize an eland_extend."""
174         MATCH_INDEX = 2
175         LOCATION_INDEX = 3
176         reads = 0
177         mapped_reads = {}
178
179         match_codes = {'NM':0, 'QC':0, 'RM':0,
180                        'U0':0, 'U1':0, 'U2':0,
181                        'R0':0, 'R1':0, 'R2':0,
182                       }
183         for line in instream:
184             reads += 1
185             fields = line.split()
186             # fields[2] = QC/NM/or number of matches
187             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
188                                                        match_codes)
189             if score_type == ElandLane.SCORE_READ:
190                 # when there are too many hits, eland  writes a - where
191                 # it would have put the list of hits.
192                 # or in a different version of eland, it just leaves
193                 # that column blank, and only outputs 3 fields.
194                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
195                   continue
196
197                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
198
199         return match_codes, mapped_reads, reads
200
201     def _update_eland_export(self, instream):
202         """Summarize a gerald export file."""
203         MATCH_INDEX = 10
204         LOCATION_INDEX = 10
205         DESCRIPTOR_INDEX= 13
206         reads = 0
207         mapped_reads = {}
208
209         match_codes = {'NM':0, 'QC':0, 'RM':0,
210                        'U0':0, 'U1':0, 'U2':0,
211                        'R0':0, 'R1':0, 'R2':0,
212                       }
213
214         for line in instream:
215             reads += 1
216             fields = line.split()
217             # fields[2] = QC/NM/or number of matches
218             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
219                                                        match_codes)
220             if score_type == ElandLane.SCORE_UNRECOGNIZED:
221                 # export files have three states for the match field
222                 # QC code, count of multi-reads, or a single
223                 # read location. The score_mapped_mismatches function
224                 # only understands the first two types.
225                 # if we get unrecognized, that implies the field is probably
226                 # a location.
227                 code = self._count_mapped_export(mapped_reads,
228                                                  fields[LOCATION_INDEX],
229                                                  fields[DESCRIPTOR_INDEX])
230                 match_codes[code] += 1
231
232         return match_codes, mapped_reads, reads
233
234
235     def _score_mapped_mismatches(self, match, match_codes):
236         """Update match_codes with eland map counts, or failure code.
237
238         Returns True if the read mapped, false if it was an error code.
239         """
240         groups = ElandLane.MATCH_COUNTS_RE.match(match)
241         if groups is None:
242             # match is not of the form [\d]+:[\d]+:[\d]+
243             if match_codes.has_key(match):
244                 # match is one quality control codes QC/NM etc
245                 match_codes[match] += 1
246                 return ElandLane.SCORE_QC
247             else:
248                 return ElandLane.SCORE_UNRECOGNIZED
249         else:
250             # match is of the form [\d]+:[\d]+:[\d]+
251             # AKA Multiread
252             zero_mismatches = int(groups.group(1))
253             one_mismatches = int(groups.group(2))
254             two_mismatches = int(groups.group(3))
255
256             if zero_mismatches == 1:
257                 match_codes['U0'] += 1
258             elif zero_mismatches < 255:
259                 match_codes['R0'] += zero_mismatches
260
261             if one_mismatches == 1:
262                 match_codes['U1'] += 1
263             elif one_mismatches < 255:
264                 match_codes['R1'] += one_mismatches
265
266             if two_mismatches == 1:
267                 match_codes['U2'] += 1
268             elif two_mismatches < 255:
269                 match_codes['R2'] += two_mismatches
270
271             return ElandLane.SCORE_READ
272
273
274     def _count_mapped_multireads(self, mapped_reads, match_string):
275         chromo = None
276         for match in match_string.split(','):
277             match_fragment = match.split(':')
278             if len(match_fragment) == 2:
279                 chromo = match_fragment[0]
280                 pos = match_fragment[1]
281
282             fasta = self.genome_map.get(chromo, chromo)
283             assert fasta is not None
284             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
285
286
287     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
288         """Count a read as defined in an export file
289
290         match_string contains the chromosome
291         descriptor contains the an ecoding of bases that match, mismatch,
292                    and have indels.
293         returns the "best" match code
294
295         Currently "best" match code is ignoring the possibility of in-dels
296         """
297         chromo = match_string
298         fasta = self.genome_map.get(chromo, chromo)
299         assert fasta is not None
300         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
301
302         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
303         if len(mismatch_bases) == 0:
304             return 'U0'
305         elif len(mismatch_bases) == 1:
306             return 'U1'
307         else:
308             return 'U2'
309
310
311     def _get_mapped_reads(self):
312         if self._mapped_reads is None:
313             self._update()
314         return self._mapped_reads
315     mapped_reads = property(_get_mapped_reads)
316
317     def _get_match_codes(self):
318         if self._match_codes is None:
319             self._update()
320         return self._match_codes
321     match_codes = property(_get_match_codes)
322
323     def _get_no_match(self):
324         if self._mapped_reads is None:
325             self._update()
326         return self._match_codes['NM']
327     no_match = property(_get_no_match,
328                         doc="total reads that didn't match the target genome.")
329
330     def _get_no_match_percent(self):
331         return float(self.no_match)/self.reads * 100
332     no_match_percent = property(_get_no_match_percent,
333                                 doc="no match reads as percent of total")
334
335     def _get_qc_failed(self):
336         if self._mapped_reads is None:
337             self._update()
338         return self._match_codes['QC']
339     qc_failed = property(_get_qc_failed,
340                         doc="total reads that didn't match the target genome.")
341
342     def _get_qc_failed_percent(self):
343         return float(self.qc_failed)/self.reads * 100
344     qc_failed_percent = property(_get_qc_failed_percent,
345                                  doc="QC failed reads as percent of total")
346
347     def _get_unique_reads(self):
348         if self._mapped_reads is None:
349            self._update()
350         sum = 0
351         for code in ['U0','U1','U2']:
352             sum += self._match_codes[code]
353         return sum
354     unique_reads = property(_get_unique_reads,
355                             doc="total unique reads")
356
357     def _get_repeat_reads(self):
358         if self._mapped_reads is None:
359            self._update()
360         sum = 0
361         for code in ['R0','R1','R2']:
362             sum += self._match_codes[code]
363         return sum
364     repeat_reads = property(_get_repeat_reads,
365                             doc="total repeat reads")
366
367     def get_elements(self):
368         lane = ElementTree.Element(ElandLane.LANE,
369                                    {'version':
370                                     unicode(ElandLane.XML_VERSION)})
371         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
372         sample_tag.text = self.sample_name
373         lane_tag = ElementTree.SubElement(lane, LANE_ID)
374         lane_tag.text = str(self.lane_id)
375         if self.end is not None:
376             end_tag = ElementTree.SubElement(lane, END)
377             end_tag.text = str(self.end)
378         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
379         for k, v in self.genome_map.items():
380             item = ElementTree.SubElement(
381                 genome_map, GENOME_ITEM,
382                 {'name':k, 'value':unicode(v)})
383         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
384         for k, v in self.mapped_reads.items():
385             item = ElementTree.SubElement(
386                 mapped_reads, MAPPED_ITEM,
387                 {'name':k, 'value':unicode(v)})
388         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
389         for k, v in self.match_codes.items():
390             item = ElementTree.SubElement(
391                 match_codes, MATCH_ITEM,
392                 {'name':k, 'value':unicode(v)})
393         reads = ElementTree.SubElement(lane, READS)
394         reads.text = unicode(self.reads)
395
396         return lane
397
398     def set_elements(self, tree):
399         if tree.tag != ElandLane.LANE:
400             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
401
402         # reset dictionaries
403         self._mapped_reads = {}
404         self._match_codes = {}
405
406         for element in tree:
407             tag = element.tag.lower()
408             if tag == SAMPLE_NAME.lower():
409                 self._sample_name = element.text
410             elif tag == LANE_ID.lower():
411                 self.lane_id = int(element.text)
412             elif tag == END.lower():
413                 self.end = int(element.text)
414             elif tag == GENOME_MAP.lower():
415                 for child in element:
416                     name = child.attrib['name']
417                     value = child.attrib['value']
418                     self.genome_map[name] = value
419             elif tag == MAPPED_READS.lower():
420                 for child in element:
421                     name = child.attrib['name']
422                     value = child.attrib['value']
423                     self._mapped_reads[name] = int(value)
424             elif tag == MATCH_CODES.lower():
425                 for child in element:
426                     name = child.attrib['name']
427                     value = int(child.attrib['value'])
428                     self._match_codes[name] = value
429             elif tag == READS.lower():
430                 self._reads = int(element.text)
431             else:
432                 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
433
434 class SequenceLane(ResultLane):
435     XML_VERSION=1
436     LANE = 'SequenceLane'
437     SEQUENCE_TYPE = 'SequenceType'
438
439     NONE_TYPE = None
440     SCARF_TYPE = 1
441     FASTQ_TYPE = 2
442     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
443
444     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
445         self.sequence_type = None
446         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
447
448     def _guess_sequence_type(self, pathname):
449         """
450         Determine if we have a scarf or fastq sequence file
451         """
452         f = open(pathname,'r')
453         l = f.readline()
454         f.close()
455
456         if l[0] == '@':
457             # fastq starts with a @
458             self.sequence_type = SequenceLane.FASTQ_TYPE
459         else:
460             self.sequence_type = SequenceLane.SCARF_TYPE
461         return self.sequence_type
462
463     def _update(self):
464         """
465         Actually read the file and actually count the reads
466         """
467         # can't do anything if we don't have a file to process
468         if self.pathname is None:
469             return
470
471         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
472             raise RuntimeError("Sequencing isn't done, try again later.")
473
474         self._guess_sequence_type(self.pathname)
475
476         LOGGER.info("summarizing results for %s" % (self.pathname))
477         lines = 0
478         f = open(self.pathname)
479         for l in f.xreadlines():
480             lines += 1
481         f.close()
482
483         if self.sequence_type == SequenceLane.SCARF_TYPE:
484             self._reads = lines
485         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
486             self._reads = lines / 4
487         else:
488           raise NotImplementedError("This only supports scarf or fastq squence files")
489
490     def get_elements(self):
491         lane = ElementTree.Element(SequenceLane.LANE,
492                                    {'version':
493                                     unicode(SequenceLane.XML_VERSION)})
494         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
495         sample_tag.text = self.sample_name
496         lane_tag = ElementTree.SubElement(lane, LANE_ID)
497         lane_tag.text = str(self.lane_id)
498         if self.end is not None:
499             end_tag = ElementTree.SubElement(lane, END)
500             end_tag.text = str(self.end)
501         reads = ElementTree.SubElement(lane, READS)
502         reads.text = unicode(self.reads)
503         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
504         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
505
506         return lane
507
508     def set_elements(self, tree):
509         if tree.tag != SequenceLane.LANE:
510             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
511         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
512
513         for element in tree:
514             tag = element.tag.lower()
515             if tag == SAMPLE_NAME.lower():
516                 self._sample_name = element.text
517             elif tag == LANE_ID.lower():
518                 self.lane_id = int(element.text)
519             elif tag == END.lower():
520                 self.end = int(element.text)
521             elif tag == READS.lower():
522                 self._reads = int(element.text)
523             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
524                 self.sequence_type = lookup_sequence_type.get(element.text, None)
525             else:
526                 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
527
528 class ELAND(object):
529     """
530     Summarize information from eland files
531     """
532     XML_VERSION = 3
533
534     ELAND = 'ElandCollection'
535     LANE = 'Lane'
536     LANE_ID = 'id'
537     END = 'end'
538
539     def __init__(self, xml=None):
540         # we need information from the gerald config.xml
541         self.results = [{},{}]
542
543         if xml is not None:
544             self.set_elements(xml)
545
546         if len(self.results[0]) == 0:
547             # Initialize our eland object with meaningless junk
548             for l in  LANE_LIST:
549                 self.results[0][l] = ResultLane(lane_id=l, end=0)
550
551
552     def get_elements(self):
553         root = ElementTree.Element(ELAND.ELAND,
554                                    {'version': unicode(ELAND.XML_VERSION)})
555         for end in range(len(self.results)):
556            end_results = self.results[end]
557            for lane_id, lane in end_results.items():
558                 eland_lane = lane.get_elements()
559                 if eland_lane is not None:
560                     eland_lane.attrib[ELAND.END] = unicode (end)
561                     eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
562                     root.append(eland_lane)
563         return root
564
565     def set_elements(self, tree):
566         if tree.tag.lower() != ELAND.ELAND.lower():
567             raise ValueError('Expecting %s', ELAND.ELAND)
568         for element in list(tree):
569             lane_id = int(element.attrib[ELAND.LANE_ID])
570             end = int(element.attrib.get(ELAND.END, 0))
571             if element.tag.lower() == ElandLane.LANE.lower():
572                 lane = ElandLane(xml=element)
573             elif element.tag.lower() == SequenceLane.LANE.lower():
574                 lane = SequenceLane(xml=element)
575
576             self.results[end][lane_id] = lane
577
578 def check_for_eland_file(basedir, pattern, lane_id, end):
579    if end is None:
580       full_lane_id = lane_id
581    else:
582       full_lane_id = "%d_%d" % ( lane_id, end )
583
584    basename = pattern % (full_lane_id,)
585    LOGGER.debug("Eland pattern: %s" %(basename,))
586    pathname = os.path.join(basedir, basename)
587    if os.path.exists(pathname):
588        LOGGER.info('found eland file in %s' % (pathname,))
589        return pathname
590    else:
591        return None
592
593 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
594     # yes the lane_id is also being computed in ElandLane._update
595     # I didn't want to clutter up my constructor
596     # but I needed to persist the sample_name/lane_id for
597     # runfolder summary_report
598     path, name = os.path.split(pathname)
599     LOGGER.info("Adding eland file %s" %(name,))
600     # split_name = name.split('_')
601     # lane_id = int(split_name[1])
602
603     genome_map = {}
604     if genome_maps is not None:
605         genome_map = genome_maps[lane_id]
606     elif gerald is not None:
607         genome_dir = gerald.lanes[lane_id].eland_genome
608         if genome_dir is not None:
609             genome_map = build_genome_fasta_map(genome_dir)
610
611     lane = ElandLane(pathname, lane_id, end, genome_map)
612
613     if end is None:
614         effective_end =  0
615     else:
616         effective_end = end - 1
617
618     results[effective_end][lane_id] = lane
619
620 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
621     result = SequenceLane(pathname, lane_id, end)
622
623     if end is None:
624         effective_end =  0
625     else:
626         effective_end = end - 1
627
628     results[effective_end][lane_id] = result
629
630
631 def eland(gerald_dir, gerald=None, genome_maps=None):
632     e = ELAND()
633
634     lane_ids = range(1,9)
635     ends = [None, 1, 2]
636
637     basedirs = [gerald_dir]
638
639     # if there is a basedir/Temp change basedir to point to the temp
640     # directory, as 1.1rc1 moves most of the files we've historically
641     # cared about to that subdirectory.
642     # we should look into what the official 'result' files are.
643     # and 1.3 moves them back
644     basedir_temp = os.path.join(gerald_dir, 'Temp')
645     if os.path.isdir(basedir_temp):
646         basedirs.append(basedir_temp)
647
648
649     # the order in patterns determines the preference for what
650     # will be found.
651     MAPPED_ELAND = 0
652     SEQUENCE = 1
653     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
654                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
655                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
656                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
657                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
658                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
659                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
660                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
661                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
662                 ('s_%s_export.txt', MAPPED_ELAND),
663                 ('s_%s_export.txt.bz2', MAPPED_ELAND),
664                 ('s_%s_export.txt.gz', MAPPED_ELAND),
665                 ('s_%s_sequence.txt', SEQUENCE),]
666
667     for basedir in basedirs:
668         for end in ends:
669             for lane_id in lane_ids:
670                 for p in patterns:
671                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
672                     if pathname is not None:
673                         if p[1] == MAPPED_ELAND:
674                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
675                         elif p[1] == SEQUENCE:
676                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
677                         break
678                 else:
679                     LOGGER.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
680                     continue
681
682     return e
683
684 def build_genome_fasta_map(genome_dir):
685     # build fasta to fasta file map
686     LOGGER.info("Building genome map")
687     genome = genome_dir.split(os.path.sep)[-1]
688     fasta_map = {}
689     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
690         is_link = False
691         if os.path.islink(vld_file):
692             is_link = True
693         vld_file = os.path.realpath(vld_file)
694         path, vld_name = os.path.split(vld_file)
695         name, ext = os.path.splitext(vld_name)
696         if is_link:
697             fasta_map[name] = name
698         else:
699             fasta_map[name] = os.path.join(genome, name)
700     return fasta_map
701
702
703 def extract_eland_sequence(instream, outstream, start, end):
704     """
705     Extract a chunk of sequence out of an eland file
706     """
707     for line in instream:
708         record = line.split()
709         if len(record) > 1:
710             result = [record[0], record[1][start:end]]
711         else:
712             result = [record[0][start:end]]
713         outstream.write("\t".join(result))
714         outstream.write(os.linesep)
715
716
717 def main(cmdline=None):
718     """Run eland extraction against the specified gerald directory"""
719     from optparse import OptionParser
720     parser = OptionParser("%prog: <gerald dir>+")
721     opts, args = parser.parse_args(cmdline)
722     LOGGER.basicConfig(level=logging.DEBUG)
723     for a in args:
724         LOGGER.info("Starting scan of %s" % (a,))
725         e = eland(a)
726         print e.get_elements()
727
728     return
729
730
731 if __name__ == "__main__":
732     main(sys.argv[1:])