Handle the case when a sequencing lane lacks any yield information.
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11
12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
15
16 SAMPLE_NAME = 'SampleName'
17 LANE_ID = 'LaneID'
18 END = 'End'
19 READS = 'Reads'
20
21 GENOME_MAP = 'GenomeMap'
22 GENOME_ITEM = 'GenomeItem'
23 MAPPED_READS = 'MappedReads'
24 MAPPED_ITEM = 'MappedItem'
25 MATCH_CODES = 'MatchCodes'
26 MATCH_ITEM = 'Code'
27 READS = 'Reads'
28
29 ELAND_SINGLE = 0
30 ELAND_MULTI = 1
31 ELAND_EXTENDED = 2
32 ELAND_EXPORT = 3
33
34 class ResultLane(object):
35     """
36     Base class for result lanes
37     """
38     XML_VERSION = 2
39     LANE = 'ResultLane'
40
41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42         self.pathname = pathname
43         self._sample_name = None
44         self.lane_id = lane_id
45         self.end = end
46         self._reads = None
47         
48         if xml is not None:
49             self.set_elements(xml)
50
51     def _update(self):
52         """
53         Actually read the file and actually count the reads
54         """
55         pass
56
57     def _update_name(self):
58         # extract the sample name
59         if self.pathname is None:
60             return
61
62         path, name = os.path.split(self.pathname)
63         split_name = name.split('_')
64         self._sample_name = split_name[0]
65
66     def _get_sample_name(self):
67         if self._sample_name is None:
68             self._update_name()
69         return self._sample_name
70     sample_name = property(_get_sample_name)
71
72     def _get_reads(self):
73         if self._reads is None:
74             self._update()
75         return self._reads
76     reads = property(_get_reads)
77
78     def get_elements(self):
79         return None
80
81 class ElandLane(ResultLane):
82     """
83     Process an eland result file
84     """
85     XML_VERSION = 2
86     LANE = "ElandLane"
87     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
88     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
89     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
90     SCORE_UNRECOGNIZED = 0
91     SCORE_QC = 1
92     SCORE_READ = 2
93
94     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
95         super(ElandLane, self).__init__(pathname, lane_id, end)
96
97         self._mapped_reads = None
98         self._match_codes = None
99         if genome_map is None:
100             genome_map = {}
101         self.genome_map = genome_map
102         self.eland_type = None
103
104         if xml is not None:
105             self.set_elements(xml)
106
107     def _guess_eland_type(self, pathname):
108         if self.eland_type is None:
109           # attempt autodetect eland file type
110           pathn, name = os.path.split(pathname)
111           if re.search('result', name):
112             self.eland_type = ELAND_SINGLE
113           elif re.search('multi', name):
114             self.eland_type = ELAND_MULTI
115           elif re.search('extended', name):
116             self.eland_type = ELAND_EXTENDED
117           elif re.search('export', name):
118             self.eland_type = ELAND_EXPORT
119           else:
120             self.eland_type = ELAND_SINGLE
121
122     def _update(self):
123         """
124         Actually read the file and actually count the reads
125         """
126         # can't do anything if we don't have a file to process
127         if self.pathname is None:
128             return
129         self._guess_eland_type(self.pathname)
130
131         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
132             raise RuntimeError("Eland isn't done, try again later.")
133
134         logging.info("summarizing results for %s" % (self.pathname))
135
136         stream = autoopen(self.pathname, 'r')
137         if self.eland_type == ELAND_SINGLE:
138             result = self._update_eland_result(stream)
139         elif self.eland_type == ELAND_MULTI or \
140              self.eland_type == ELAND_EXTENDED:
141             result = self._update_eland_multi(stream)
142         elif self.eland_type == ELAND_EXPORT:
143             result = self._update_eland_export(stream)
144         else:
145           raise NotImplementedError("Only support single/multi/extended eland files")
146         self._match_codes, self._mapped_reads, self._reads = result
147
148     def _update_eland_result(self, instream):
149         reads = 0
150         mapped_reads = {}
151
152         match_codes = {'NM':0, 'QC':0, 'RM':0,
153                        'U0':0, 'U1':0, 'U2':0,
154                        'R0':0, 'R1':0, 'R2':0,
155                       }
156         for line in instream:
157             reads += 1
158             fields = line.split()
159             # code = fields[2]
160             # match_codes[code] = match_codes.setdefault(code, 0) + 1
161             # the QC/NM etc codes are in the 3rd field and always present
162             match_codes[fields[2]] += 1
163             # ignore lines that don't have a fasta filename
164             if len(fields) < 7:
165                 continue
166             fasta = self.genome_map.get(fields[6], fields[6])
167             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
168         return match_codes, mapped_reads, reads
169
170     def _update_eland_multi(self, instream):
171         """Summarize an eland_extend."""
172         MATCH_INDEX = 2
173         LOCATION_INDEX = 3
174         reads = 0
175         mapped_reads = {}
176
177         match_codes = {'NM':0, 'QC':0, 'RM':0,
178                        'U0':0, 'U1':0, 'U2':0,
179                        'R0':0, 'R1':0, 'R2':0,
180                       }
181         for line in instream:
182             reads += 1
183             fields = line.split()
184             # fields[2] = QC/NM/or number of matches
185             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
186                                                        match_codes)
187             if score_type == ElandLane.SCORE_READ:
188                 # when there are too many hits, eland  writes a - where
189                 # it would have put the list of hits.
190                 # or in a different version of eland, it just leaves
191                 # that column blank, and only outputs 3 fields.     
192                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
193                   continue
194
195                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
196
197         return match_codes, mapped_reads, reads
198
199     def _update_eland_export(self, instream):
200         """Summarize a gerald export file."""
201         MATCH_INDEX = 10
202         LOCATION_INDEX = 10
203         DESCRIPTOR_INDEX= 13
204         reads = 0
205         mapped_reads = {}
206
207         match_codes = {'NM':0, 'QC':0, 'RM':0,
208                        'U0':0, 'U1':0, 'U2':0,
209                        'R0':0, 'R1':0, 'R2':0,
210                       }
211
212         for line in instream:
213             reads += 1
214             fields = line.split()
215             # fields[2] = QC/NM/or number of matches
216             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
217                                                        match_codes)
218             if score_type == ElandLane.SCORE_UNRECOGNIZED:
219                 # export files have three states for the match field
220                 # QC code, count of multi-reads, or a single 
221                 # read location. The score_mapped_mismatches function
222                 # only understands the first two types.
223                 # if we get unrecognized, that implies the field is probably
224                 # a location.
225                 code = self._count_mapped_export(mapped_reads, 
226                                                  fields[LOCATION_INDEX],
227                                                  fields[DESCRIPTOR_INDEX])
228                 match_codes[code] += 1
229
230         return match_codes, mapped_reads, reads
231
232
233     def _score_mapped_mismatches(self, match, match_codes):
234         """Update match_codes with eland map counts, or failure code.
235         
236         Returns True if the read mapped, false if it was an error code.
237         """
238         groups = ElandLane.MATCH_COUNTS_RE.match(match)
239         if groups is None:
240             # match is not of the form [\d]+:[\d]+:[\d]+
241             if match_codes.has_key(match):
242                 # match is one quality control codes QC/NM etc
243                 match_codes[match] += 1
244                 return ElandLane.SCORE_QC
245             else:
246                 return ElandLane.SCORE_UNRECOGNIZED
247         else:
248             # match is of the form [\d]+:[\d]+:[\d]+
249             # AKA Multiread
250             zero_mismatches = int(groups.group(1))
251             one_mismatches = int(groups.group(2))
252             two_mismatches = int(groups.group(3))
253
254             if zero_mismatches == 1:
255                 match_codes['U0'] += 1
256             elif zero_mismatches < 255:
257                 match_codes['R0'] += zero_mismatches
258
259             if one_mismatches == 1:
260                 match_codes['U1'] += 1
261             elif one_mismatches < 255:
262                 match_codes['R1'] += one_mismatches
263     
264             if two_mismatches == 1:
265                 match_codes['U2'] += 1
266             elif two_mismatches < 255:
267                 match_codes['R2'] += two_mismatches
268                 
269             return ElandLane.SCORE_READ
270
271
272     def _count_mapped_multireads(self, mapped_reads, match_string):
273         chromo = None
274         for match in match_string.split(','):
275             match_fragment = match.split(':')
276             if len(match_fragment) == 2:
277                 chromo = match_fragment[0]
278                 pos = match_fragment[1]
279
280             fasta = self.genome_map.get(chromo, chromo)
281             assert fasta is not None
282             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
283
284
285     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
286         """Count a read as defined in an export file
287         
288         match_string contains the chromosome
289         descriptor contains the an ecoding of bases that match, mismatch, 
290                    and have indels.
291         returns the "best" match code
292
293         Currently "best" match code is ignoring the possibility of in-dels
294         """
295         chromo = match_string
296         fasta = self.genome_map.get(chromo, chromo)
297         assert fasta is not None
298         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
299
300         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
301         if len(mismatch_bases) == 0:
302             return 'U0'
303         elif len(mismatch_bases) == 1:
304             return 'U1'
305         else:
306             return 'U2'
307
308
309     def _get_mapped_reads(self):
310         if self._mapped_reads is None:
311             self._update()
312         return self._mapped_reads
313     mapped_reads = property(_get_mapped_reads)
314
315     def _get_match_codes(self):
316         if self._match_codes is None:
317             self._update()
318         return self._match_codes
319     match_codes = property(_get_match_codes)
320
321     def _get_no_match(self):
322         if self._mapped_reads is None:
323             self._update()  
324         return self._match_codes['NM']
325     no_match = property(_get_no_match, 
326                         doc="total reads that didn't match the target genome.")
327
328     def _get_no_match_percent(self):
329         return float(self.no_match)/self.reads * 100 
330     no_match_percent = property(_get_no_match_percent,
331                                 doc="no match reads as percent of total")
332
333     def _get_qc_failed(self):
334         if self._mapped_reads is None:
335             self._update()  
336         return self._match_codes['QC']
337     qc_failed = property(_get_qc_failed,
338                         doc="total reads that didn't match the target genome.")
339
340     def _get_qc_failed_percent(self):
341         return float(self.qc_failed)/self.reads * 100 
342     qc_failed_percent = property(_get_qc_failed_percent,
343                                  doc="QC failed reads as percent of total")
344
345     def _get_unique_reads(self):
346         if self._mapped_reads is None:
347            self._update()
348         sum = 0
349         for code in ['U0','U1','U2']:
350             sum += self._match_codes[code]
351         return sum
352     unique_reads = property(_get_unique_reads,
353                             doc="total unique reads")
354
355     def _get_repeat_reads(self):
356         if self._mapped_reads is None:
357            self._update()
358         sum = 0
359         for code in ['R0','R1','R2']:
360             sum += self._match_codes[code]
361         return sum
362     repeat_reads = property(_get_repeat_reads,
363                             doc="total repeat reads")
364     
365     def get_elements(self):
366         lane = ElementTree.Element(ElandLane.LANE,
367                                    {'version':
368                                     unicode(ElandLane.XML_VERSION)})
369         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
370         sample_tag.text = self.sample_name
371         lane_tag = ElementTree.SubElement(lane, LANE_ID)
372         lane_tag.text = str(self.lane_id)
373         if self.end is not None:
374             end_tag = ElementTree.SubElement(lane, END)
375             end_tag.text = str(self.end)
376         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
377         for k, v in self.genome_map.items():
378             item = ElementTree.SubElement(
379                 genome_map, GENOME_ITEM,
380                 {'name':k, 'value':unicode(v)})
381         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
382         for k, v in self.mapped_reads.items():
383             item = ElementTree.SubElement(
384                 mapped_reads, MAPPED_ITEM,
385                 {'name':k, 'value':unicode(v)})
386         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
387         for k, v in self.match_codes.items():
388             item = ElementTree.SubElement(
389                 match_codes, MATCH_ITEM,
390                 {'name':k, 'value':unicode(v)})
391         reads = ElementTree.SubElement(lane, READS)
392         reads.text = unicode(self.reads)
393
394         return lane
395
396     def set_elements(self, tree):
397         if tree.tag != ElandLane.LANE:
398             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
399
400         # reset dictionaries
401         self._mapped_reads = {}
402         self._match_codes = {}
403
404         for element in tree:
405             tag = element.tag.lower()
406             if tag == SAMPLE_NAME.lower():
407                 self._sample_name = element.text
408             elif tag == LANE_ID.lower():
409                 self.lane_id = int(element.text)
410             elif tag == END.lower():
411                 self.end = int(element.text)
412             elif tag == GENOME_MAP.lower():
413                 for child in element:
414                     name = child.attrib['name']
415                     value = child.attrib['value']
416                     self.genome_map[name] = value
417             elif tag == MAPPED_READS.lower():
418                 for child in element:
419                     name = child.attrib['name']
420                     value = child.attrib['value']
421                     self._mapped_reads[name] = int(value)
422             elif tag == MATCH_CODES.lower():
423                 for child in element:
424                     name = child.attrib['name']
425                     value = int(child.attrib['value'])
426                     self._match_codes[name] = value
427             elif tag == READS.lower():
428                 self._reads = int(element.text)
429             else:
430                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
431
432 class SequenceLane(ResultLane):
433     XML_VERSION=1
434     LANE = 'SequenceLane'
435     SEQUENCE_TYPE = 'SequenceType'
436
437     NONE_TYPE = None
438     SCARF_TYPE = 1
439     FASTQ_TYPE = 2
440     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
441
442     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
443         self.sequence_type = None
444         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
445
446     def _guess_sequence_type(self, pathname):
447         """
448         Determine if we have a scarf or fastq sequence file
449         """
450         f = open(pathname,'r')
451         l = f.readline()
452         f.close()
453
454         if l[0] == '@':
455             # fastq starts with a @
456             self.sequence_type = SequenceLane.FASTQ_TYPE
457         else:
458             self.sequence_type = SequenceLane.SCARF_TYPE
459         return self.sequence_type
460
461     def _update(self):
462         """
463         Actually read the file and actually count the reads
464         """
465         # can't do anything if we don't have a file to process
466         if self.pathname is None:
467             return
468
469         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
470             raise RuntimeError("Sequencing isn't done, try again later.")
471
472         self._guess_sequence_type(self.pathname)
473
474         logging.info("summarizing results for %s" % (self.pathname))
475         lines = 0
476         f = open(self.pathname)
477         for l in f.xreadlines():
478             lines += 1
479         f.close()
480
481         if self.sequence_type == SequenceLane.SCARF_TYPE:
482             self._reads = lines
483         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
484             self._reads = lines / 4
485         else:
486           raise NotImplementedError("This only supports scarf or fastq squence files")
487
488     def get_elements(self):
489         lane = ElementTree.Element(SequenceLane.LANE,
490                                    {'version':
491                                     unicode(SequenceLane.XML_VERSION)})
492         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
493         sample_tag.text = self.sample_name
494         lane_tag = ElementTree.SubElement(lane, LANE_ID)
495         lane_tag.text = str(self.lane_id)
496         if self.end is not None:
497             end_tag = ElementTree.SubElement(lane, END)
498             end_tag.text = str(self.end)
499         reads = ElementTree.SubElement(lane, READS)
500         reads.text = unicode(self.reads)
501         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
502         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
503
504         return lane
505
506     def set_elements(self, tree):
507         if tree.tag != SequenceLane.LANE:
508             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
509         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
510
511         for element in tree:
512             tag = element.tag.lower()
513             if tag == SAMPLE_NAME.lower():
514                 self._sample_name = element.text
515             elif tag == LANE_ID.lower():
516                 self.lane_id = int(element.text)
517             elif tag == END.lower():
518                 self.end = int(element.text)
519             elif tag == READS.lower():
520                 self._reads = int(element.text)
521             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
522                 self.sequence_type = lookup_sequence_type.get(element.text, None)
523             else:
524                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
525
526 class ELAND(object):
527     """
528     Summarize information from eland files
529     """
530     XML_VERSION = 3
531
532     ELAND = 'ElandCollection'
533     LANE = 'Lane'
534     LANE_ID = 'id'
535     END = 'end'
536
537     def __init__(self, xml=None):
538         # we need information from the gerald config.xml
539         self.results = [{},{}]
540
541         if xml is not None:
542             self.set_elements(xml)
543
544         if len(self.results[0]) == 0:
545             # Initialize our eland object with meaningless junk
546             for l in  LANE_LIST:
547                 self.results[0][l] = ResultLane(lane_id=l, end=0)
548
549
550     def get_elements(self):
551         root = ElementTree.Element(ELAND.ELAND,
552                                    {'version': unicode(ELAND.XML_VERSION)})
553         for end in range(len(self.results)):
554            end_results = self.results[end]
555            for lane_id, lane in end_results.items():
556                 eland_lane = lane.get_elements()
557                 if eland_lane is not None:
558                     eland_lane.attrib[ELAND.END] = unicode (end)
559                     eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
560                     root.append(eland_lane)
561         return root
562
563     def set_elements(self, tree):
564         if tree.tag.lower() != ELAND.ELAND.lower():
565             raise ValueError('Expecting %s', ELAND.ELAND)
566         for element in list(tree):
567             lane_id = int(element.attrib[ELAND.LANE_ID])
568             end = int(element.attrib.get(ELAND.END, 0)) 
569             if element.tag.lower() == ElandLane.LANE.lower():
570                 lane = ElandLane(xml=element)
571             elif element.tag.lower() == SequenceLane.LANE.lower():
572                 lane = SequenceLane(xml=element)
573
574             self.results[end][lane_id] = lane
575
576 def check_for_eland_file(basedir, pattern, lane_id, end):
577    if end is None:
578       full_lane_id = lane_id
579    else:
580       full_lane_id = "%d_%d" % ( lane_id, end )
581
582    basename = pattern % (full_lane_id,)
583    logging.info("Eland pattern: %s" %(basename,))
584    pathname = os.path.join(basedir, basename)
585    if os.path.exists(pathname):
586        logging.info('found eland file in %s' % (pathname,))
587        return pathname
588    else:
589        return None
590
591 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
592     # yes the lane_id is also being computed in ElandLane._update
593     # I didn't want to clutter up my constructor
594     # but I needed to persist the sample_name/lane_id for
595     # runfolder summary_report
596     path, name = os.path.split(pathname)
597     logging.info("Adding eland file %s" %(name,))
598     # split_name = name.split('_')
599     # lane_id = int(split_name[1])
600
601     if genome_maps is not None:
602         genome_map = genome_maps[lane_id]
603     elif gerald is not None:
604         genome_dir = gerald.lanes[lane_id].eland_genome
605         genome_map = build_genome_fasta_map(genome_dir)
606     else:
607         genome_map = {}
608
609     lane = ElandLane(pathname, lane_id, end, genome_map)
610     
611     if end is None:
612         effective_end =  0
613     else:
614         effective_end = end - 1
615
616     results[effective_end][lane_id] = lane
617
618 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
619     result = SequenceLane(pathname, lane_id, end)
620
621     if end is None:
622         effective_end =  0
623     else:
624         effective_end = end - 1
625
626     results[effective_end][lane_id] = result
627
628
629 def eland(gerald_dir, gerald=None, genome_maps=None):
630     e = ELAND()
631
632     lane_ids = range(1,9)
633     ends = [None, 1, 2]
634
635     basedirs = [gerald_dir]
636
637     # if there is a basedir/Temp change basedir to point to the temp
638     # directory, as 1.1rc1 moves most of the files we've historically
639     # cared about to that subdirectory.
640     # we should look into what the official 'result' files are.
641     # and 1.3 moves them back
642     basedir_temp = os.path.join(gerald_dir, 'Temp')
643     if os.path.isdir(basedir_temp):
644         basedirs.append(basedir_temp)
645
646    
647     # the order in patterns determines the preference for what
648     # will be found.
649     MAPPED_ELAND = 0
650     SEQUENCE = 1
651     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
652                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
653                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
654                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
655                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
656                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
657                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
658                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
659                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
660                 ('s_%s_export.txt', MAPPED_ELAND),
661                 ('s_%s_export.txt.bz2', MAPPED_ELAND),
662                 ('s_%s_export.txt.gz', MAPPED_ELAND),
663                 ('s_%s_sequence.txt', SEQUENCE),]
664
665     for basedir in basedirs:
666         for end in ends:
667             for lane_id in lane_ids:
668                 for p in patterns:
669                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
670                     if pathname is not None:
671                         if p[1] == MAPPED_ELAND:
672                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
673                         elif p[1] == SEQUENCE:
674                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
675                         break
676                 else:
677                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
678                     continue
679
680     return e
681
682 def build_genome_fasta_map(genome_dir):
683     # build fasta to fasta file map
684     logging.info("Building genome map")
685     genome = genome_dir.split(os.path.sep)[-1]
686     fasta_map = {}
687     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
688         is_link = False
689         if os.path.islink(vld_file):
690             is_link = True
691         vld_file = os.path.realpath(vld_file)
692         path, vld_name = os.path.split(vld_file)
693         name, ext = os.path.splitext(vld_name)
694         if is_link:
695             fasta_map[name] = name
696         else:
697             fasta_map[name] = os.path.join(genome, name)
698     return fasta_map
699
700
701 def extract_eland_sequence(instream, outstream, start, end):
702     """
703     Extract a chunk of sequence out of an eland file
704     """
705     for line in instream:
706         record = line.split()
707         if len(record) > 1:
708             result = [record[0], record[1][start:end]]
709         else:
710             result = [record[0][start:end]]
711         outstream.write("\t".join(result))
712         outstream.write(os.linesep)
713
714
715 def main(cmdline=None):
716     """Run eland extraction against the specified gerald directory"""
717     from optparse import OptionParser
718     parser = OptionParser("%prog: <gerald dir>+")
719     opts, args = parser.parse_args(cmdline)
720     logging.basicConfig(level=logging.DEBUG)
721     for a in args:
722         logging.info("Starting scan of %s" % (a,))
723         e = eland(a)
724         print e.get_elements()
725
726     return 
727
728
729 if __name__ == "__main__":
730     main(sys.argv[1:])