Add support for CASAVA 1.7
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import re
9 import stat
10 import sys
11
12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
13 from htsworkflow.util.ethelp import indent, flatten
14 from htsworkflow.util.opener import autoopen
15
16 SAMPLE_NAME = 'SampleName'
17 LANE_ID = 'LaneID'
18 END = 'End'
19 READS = 'Reads'
20
21 GENOME_MAP = 'GenomeMap'
22 GENOME_ITEM = 'GenomeItem'
23 MAPPED_READS = 'MappedReads'
24 MAPPED_ITEM = 'MappedItem'
25 MATCH_CODES = 'MatchCodes'
26 MATCH_ITEM = 'Code'
27 READS = 'Reads'
28
29 ELAND_SINGLE = 0
30 ELAND_MULTI = 1
31 ELAND_EXTENDED = 2
32 ELAND_EXPORT = 3
33
34 class ResultLane(object):
35     """
36     Base class for result lanes
37     """
38     XML_VERSION = 2
39     LANE = 'ResultLane'
40
41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
42         self.pathname = pathname
43         self._sample_name = None
44         self.lane_id = lane_id
45         self.end = end
46         self._reads = None
47         
48         if xml is not None:
49             self.set_elements(xml)
50
51     def _update(self):
52         """
53         Actually read the file and actually count the reads
54         """
55         pass
56
57     def _update_name(self):
58         # extract the sample name
59         if self.pathname is None:
60             return
61
62         path, name = os.path.split(self.pathname)
63         split_name = name.split('_')
64         self._sample_name = split_name[0]
65
66     def _get_sample_name(self):
67         if self._sample_name is None:
68             self._update_name()
69         return self._sample_name
70     sample_name = property(_get_sample_name)
71
72     def _get_reads(self):
73         if self._reads is None:
74             self._update()
75         return self._reads
76     reads = property(_get_reads)
77
78
79 class ElandLane(ResultLane):
80     """
81     Process an eland result file
82     """
83     XML_VERSION = 2
84     LANE = "ElandLane"
85     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
86     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
87     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
88     SCORE_UNRECOGNIZED = 0
89     SCORE_QC = 1
90     SCORE_READ = 2
91
92     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
93         super(ElandLane, self).__init__(pathname, lane_id, end)
94
95         self._mapped_reads = None
96         self._match_codes = None
97         if genome_map is None:
98             genome_map = {}
99         self.genome_map = genome_map
100         self.eland_type = None
101
102         if xml is not None:
103             self.set_elements(xml)
104
105     def _guess_eland_type(self, pathname):
106         if self.eland_type is None:
107           # attempt autodetect eland file type
108           pathn, name = os.path.split(pathname)
109           if re.search('result', name):
110             self.eland_type = ELAND_SINGLE
111           elif re.search('multi', name):
112             self.eland_type = ELAND_MULTI
113           elif re.search('extended', name):
114             self.eland_type = ELAND_EXTENDED
115           elif re.search('export', name):
116             self.eland_type = ELAND_EXPORT
117           else:
118             self.eland_type = ELAND_SINGLE
119
120     def _update(self):
121         """
122         Actually read the file and actually count the reads
123         """
124         # can't do anything if we don't have a file to process
125         if self.pathname is None:
126             return
127         self._guess_eland_type(self.pathname)
128
129         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
130             raise RuntimeError("Eland isn't done, try again later.")
131
132         logging.info("summarizing results for %s" % (self.pathname))
133
134         stream = autoopen(self.pathname, 'r')
135         if self.eland_type == ELAND_SINGLE:
136             result = self._update_eland_result(stream)
137         elif self.eland_type == ELAND_MULTI or \
138              self.eland_type == ELAND_EXTENDED:
139             result = self._update_eland_multi(stream)
140         elif self.eland_type == ELAND_EXPORT:
141             result = self._update_eland_export(stream)
142         else:
143           raise NotImplementedError("Only support single/multi/extended eland files")
144         self._match_codes, self._mapped_reads, self._reads = result
145
146     def _update_eland_result(self, instream):
147         reads = 0
148         mapped_reads = {}
149
150         match_codes = {'NM':0, 'QC':0, 'RM':0,
151                        'U0':0, 'U1':0, 'U2':0,
152                        'R0':0, 'R1':0, 'R2':0,
153                       }
154         for line in instream:
155             reads += 1
156             fields = line.split()
157             # code = fields[2]
158             # match_codes[code] = match_codes.setdefault(code, 0) + 1
159             # the QC/NM etc codes are in the 3rd field and always present
160             match_codes[fields[2]] += 1
161             # ignore lines that don't have a fasta filename
162             if len(fields) < 7:
163                 continue
164             fasta = self.genome_map.get(fields[6], fields[6])
165             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
166         return match_codes, mapped_reads, reads
167
168     def _update_eland_multi(self, instream):
169         """Summarize an eland_extend."""
170         MATCH_INDEX = 2
171         LOCATION_INDEX = 3
172         reads = 0
173         mapped_reads = {}
174
175         match_codes = {'NM':0, 'QC':0, 'RM':0,
176                        'U0':0, 'U1':0, 'U2':0,
177                        'R0':0, 'R1':0, 'R2':0,
178                       }
179         for line in instream:
180             reads += 1
181             fields = line.split()
182             # fields[2] = QC/NM/or number of matches
183             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
184                                                        match_codes)
185             if score_type == ElandLane.SCORE_READ:
186                 # when there are too many hits, eland  writes a - where
187                 # it would have put the list of hits.
188                 # or in a different version of eland, it just leaves
189                 # that column blank, and only outputs 3 fields.     
190                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
191                   continue
192
193                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
194
195         return match_codes, mapped_reads, reads
196
197     def _update_eland_export(self, instream):
198         """Summarize a gerald export file."""
199         MATCH_INDEX = 10
200         LOCATION_INDEX = 10
201         DESCRIPTOR_INDEX= 13
202         reads = 0
203         mapped_reads = {}
204
205         match_codes = {'NM':0, 'QC':0, 'RM':0,
206                        'U0':0, 'U1':0, 'U2':0,
207                        'R0':0, 'R1':0, 'R2':0,
208                       }
209
210         for line in instream:
211             reads += 1
212             fields = line.split()
213             # fields[2] = QC/NM/or number of matches
214             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX], 
215                                                        match_codes)
216             if score_type == ElandLane.SCORE_UNRECOGNIZED:
217                 # export files have three states for the match field
218                 # QC code, count of multi-reads, or a single 
219                 # read location. The score_mapped_mismatches function
220                 # only understands the first two types.
221                 # if we get unrecognized, that implies the field is probably
222                 # a location.
223                 code = self._count_mapped_export(mapped_reads, 
224                                                  fields[LOCATION_INDEX],
225                                                  fields[DESCRIPTOR_INDEX])
226                 match_codes[code] += 1
227
228         return match_codes, mapped_reads, reads
229
230
231     def _score_mapped_mismatches(self, match, match_codes):
232         """Update match_codes with eland map counts, or failure code.
233         
234         Returns True if the read mapped, false if it was an error code.
235         """
236         groups = ElandLane.MATCH_COUNTS_RE.match(match)
237         if groups is None:
238             # match is not of the form [\d]+:[\d]+:[\d]+
239             if match_codes.has_key(match):
240                 # match is one quality control codes QC/NM etc
241                 match_codes[match] += 1
242                 return ElandLane.SCORE_QC
243             else:
244                 return ElandLane.SCORE_UNRECOGNIZED
245         else:
246             # match is of the form [\d]+:[\d]+:[\d]+
247             # AKA Multiread
248             zero_mismatches = int(groups.group(1))
249             one_mismatches = int(groups.group(2))
250             two_mismatches = int(groups.group(3))
251
252             if zero_mismatches == 1:
253                 match_codes['U0'] += 1
254             elif zero_mismatches < 255:
255                 match_codes['R0'] += zero_mismatches
256
257             if one_mismatches == 1:
258                 match_codes['U1'] += 1
259             elif one_mismatches < 255:
260                 match_codes['R1'] += one_mismatches
261     
262             if two_mismatches == 1:
263                 match_codes['U2'] += 1
264             elif two_mismatches < 255:
265                 match_codes['R2'] += two_mismatches
266                 
267             return ElandLane.SCORE_READ
268
269
270     def _count_mapped_multireads(self, mapped_reads, match_string):
271         chromo = None
272         for match in match_string.split(','):
273             match_fragment = match.split(':')
274             if len(match_fragment) == 2:
275                 chromo = match_fragment[0]
276                 pos = match_fragment[1]
277
278             fasta = self.genome_map.get(chromo, chromo)
279             assert fasta is not None
280             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
281
282
283     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
284         """Count a read as defined in an export file
285         
286         match_string contains the chromosome
287         descriptor contains the an ecoding of bases that match, mismatch, 
288                    and have indels.
289         returns the "best" match code
290
291         Currently "best" match code is ignoring the possibility of in-dels
292         """
293         chromo = match_string
294         fasta = self.genome_map.get(chromo, chromo)
295         assert fasta is not None
296         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
297
298         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
299         if len(mismatch_bases) == 0:
300             return 'U0'
301         elif len(mismatch_bases) == 1:
302             return 'U1'
303         else:
304             return 'U2'
305
306
307     def _get_mapped_reads(self):
308         if self._mapped_reads is None:
309             self._update()
310         return self._mapped_reads
311     mapped_reads = property(_get_mapped_reads)
312
313     def _get_match_codes(self):
314         if self._match_codes is None:
315             self._update()
316         return self._match_codes
317     match_codes = property(_get_match_codes)
318
319     def _get_no_match(self):
320         if self._mapped_reads is None:
321             self._update()  
322         return self._match_codes['NM']
323     no_match = property(_get_no_match, 
324                         doc="total reads that didn't match the target genome.")
325
326     def _get_no_match_percent(self):
327         return float(self.no_match)/self.reads * 100 
328     no_match_percent = property(_get_no_match_percent,
329                                 doc="no match reads as percent of total")
330
331     def _get_qc_failed(self):
332         if self._mapped_reads is None:
333             self._update()  
334         return self._match_codes['QC']
335     qc_failed = property(_get_qc_failed,
336                         doc="total reads that didn't match the target genome.")
337
338     def _get_qc_failed_percent(self):
339         return float(self.qc_failed)/self.reads * 100 
340     qc_failed_percent = property(_get_qc_failed_percent,
341                                  doc="QC failed reads as percent of total")
342
343     def _get_unique_reads(self):
344         if self._mapped_reads is None:
345            self._update()
346         sum = 0
347         for code in ['U0','U1','U2']:
348             sum += self._match_codes[code]
349         return sum
350     unique_reads = property(_get_unique_reads,
351                             doc="total unique reads")
352
353     def _get_repeat_reads(self):
354         if self._mapped_reads is None:
355            self._update()
356         sum = 0
357         for code in ['R0','R1','R2']:
358             sum += self._match_codes[code]
359         return sum
360     repeat_reads = property(_get_repeat_reads,
361                             doc="total repeat reads")
362     
363     def get_elements(self):
364         lane = ElementTree.Element(ElandLane.LANE,
365                                    {'version':
366                                     unicode(ElandLane.XML_VERSION)})
367         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
368         sample_tag.text = self.sample_name
369         lane_tag = ElementTree.SubElement(lane, LANE_ID)
370         lane_tag.text = str(self.lane_id)
371         if self.end is not None:
372             end_tag = ElementTree.SubElement(lane, END)
373             end_tag.text = str(self.end)
374         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
375         for k, v in self.genome_map.items():
376             item = ElementTree.SubElement(
377                 genome_map, GENOME_ITEM,
378                 {'name':k, 'value':unicode(v)})
379         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
380         for k, v in self.mapped_reads.items():
381             item = ElementTree.SubElement(
382                 mapped_reads, MAPPED_ITEM,
383                 {'name':k, 'value':unicode(v)})
384         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
385         for k, v in self.match_codes.items():
386             item = ElementTree.SubElement(
387                 match_codes, MATCH_ITEM,
388                 {'name':k, 'value':unicode(v)})
389         reads = ElementTree.SubElement(lane, READS)
390         reads.text = unicode(self.reads)
391
392         return lane
393
394     def set_elements(self, tree):
395         if tree.tag != ElandLane.LANE:
396             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
397
398         # reset dictionaries
399         self._mapped_reads = {}
400         self._match_codes = {}
401
402         for element in tree:
403             tag = element.tag.lower()
404             if tag == SAMPLE_NAME.lower():
405                 self._sample_name = element.text
406             elif tag == LANE_ID.lower():
407                 self.lane_id = int(element.text)
408             elif tag == END.lower():
409                 self.end = int(element.text)
410             elif tag == GENOME_MAP.lower():
411                 for child in element:
412                     name = child.attrib['name']
413                     value = child.attrib['value']
414                     self.genome_map[name] = value
415             elif tag == MAPPED_READS.lower():
416                 for child in element:
417                     name = child.attrib['name']
418                     value = child.attrib['value']
419                     self._mapped_reads[name] = int(value)
420             elif tag == MATCH_CODES.lower():
421                 for child in element:
422                     name = child.attrib['name']
423                     value = int(child.attrib['value'])
424                     self._match_codes[name] = value
425             elif tag == READS.lower():
426                 self._reads = int(element.text)
427             else:
428                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
429
430 class SequenceLane(ResultLane):
431     XML_VERSION=1
432     LANE = 'SequenceLane'
433     SEQUENCE_TYPE = 'SequenceType'
434
435     NONE_TYPE = None
436     SCARF_TYPE = 1
437     FASTQ_TYPE = 2
438     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
439
440     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
441         self.sequence_type = None
442         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
443
444     def _guess_sequence_type(self, pathname):
445         """
446         Determine if we have a scarf or fastq sequence file
447         """
448         f = open(pathname,'r')
449         l = f.readline()
450         f.close()
451
452         if l[0] == '@':
453             # fastq starts with a @
454             self.sequence_type = SequenceLane.FASTQ_TYPE
455         else:
456             self.sequence_type = SequenceLane.SCARF_TYPE
457         return self.sequence_type
458
459     def _update(self):
460         """
461         Actually read the file and actually count the reads
462         """
463         # can't do anything if we don't have a file to process
464         if self.pathname is None:
465             return
466
467         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
468             raise RuntimeError("Sequencing isn't done, try again later.")
469
470         self._guess_sequence_type(self.pathname)
471
472         logging.info("summarizing results for %s" % (self.pathname))
473         lines = 0
474         f = open(self.pathname)
475         for l in f.xreadlines():
476             lines += 1
477         f.close()
478
479         if self.sequence_type == SequenceLane.SCARF_TYPE:
480             self._reads = lines
481         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
482             self._reads = lines / 4
483         else:
484           raise NotImplementedError("This only supports scarf or fastq squence files")
485
486     def get_elements(self):
487         lane = ElementTree.Element(SequenceLane.LANE,
488                                    {'version':
489                                     unicode(SequenceLane.XML_VERSION)})
490         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
491         sample_tag.text = self.sample_name
492         lane_tag = ElementTree.SubElement(lane, LANE_ID)
493         lane_tag.text = str(self.lane_id)
494         if self.end is not None:
495             end_tag = ElementTree.SubElement(lane, END)
496             end_tag.text = str(self.end)
497         reads = ElementTree.SubElement(lane, READS)
498         reads.text = unicode(self.reads)
499         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
500         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
501
502         return lane
503
504     def set_elements(self, tree):
505         if tree.tag != SequenceLane.LANE:
506             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
507         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
508
509         for element in tree:
510             tag = element.tag.lower()
511             if tag == SAMPLE_NAME.lower():
512                 self._sample_name = element.text
513             elif tag == LANE_ID.lower():
514                 self.lane_id = int(element.text)
515             elif tag == END.lower():
516                 self.end = int(element.text)
517             elif tag == READS.lower():
518                 self._reads = int(element.text)
519             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
520                 self.sequence_type = lookup_sequence_type.get(element.text, None)
521             else:
522                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
523
524 class ELAND(object):
525     """
526     Summarize information from eland files
527     """
528     XML_VERSION = 3
529
530     ELAND = 'ElandCollection'
531     LANE = 'Lane'
532     LANE_ID = 'id'
533     END = 'end'
534
535     def __init__(self, xml=None):
536         # we need information from the gerald config.xml
537         self.results = [{},{}]
538
539         if xml is not None:
540             self.set_elements(xml)
541
542         if len(self.results[0]) == 0:
543             # Initialize our eland object with meaningless junk
544             for l in  LANE_LIST:
545                 self.results[0][l] = ResultLane(lane_id=l, end=0)
546
547
548     def get_elements(self):
549         root = ElementTree.Element(ELAND.ELAND,
550                                    {'version': unicode(ELAND.XML_VERSION)})
551         for end in range(len(self.results)):
552            end_results = self.results[end]
553            for lane_id, lane in end_results.items():
554                 eland_lane = lane.get_elements()
555                 eland_lane.attrib[ELAND.END] = unicode (end)
556                 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
557                 root.append(eland_lane)
558         return root
559
560     def set_elements(self, tree):
561         if tree.tag.lower() != ELAND.ELAND.lower():
562             raise ValueError('Expecting %s', ELAND.ELAND)
563         for element in list(tree):
564             lane_id = int(element.attrib[ELAND.LANE_ID])
565             end = int(element.attrib.get(ELAND.END, 0)) 
566             if element.tag.lower() == ElandLane.LANE.lower():
567                 lane = ElandLane(xml=element)
568             elif element.tag.lower() == SequenceLane.LANE.lower():
569                 lane = SequenceLane(xml=element)
570
571             self.results[end][lane_id] = lane
572
573 def check_for_eland_file(basedir, pattern, lane_id, end):
574    if end is None:
575       full_lane_id = lane_id
576    else:
577       full_lane_id = "%d_%d" % ( lane_id, end )
578
579    basename = pattern % (full_lane_id,)
580    logging.info("Eland pattern: %s" %(basename,))
581    pathname = os.path.join(basedir, basename)
582    if os.path.exists(pathname):
583        logging.info('found eland file in %s' % (pathname,))
584        return pathname
585    else:
586        return None
587
588 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
589     # yes the lane_id is also being computed in ElandLane._update
590     # I didn't want to clutter up my constructor
591     # but I needed to persist the sample_name/lane_id for
592     # runfolder summary_report
593     path, name = os.path.split(pathname)
594     logging.info("Adding eland file %s" %(name,))
595     # split_name = name.split('_')
596     # lane_id = int(split_name[1])
597
598     if genome_maps is not None:
599         genome_map = genome_maps[lane_id]
600     elif gerald is not None:
601         genome_dir = gerald.lanes[lane_id].eland_genome
602         genome_map = build_genome_fasta_map(genome_dir)
603     else:
604         genome_map = {}
605
606     lane = ElandLane(pathname, lane_id, end, genome_map)
607     
608     if end is None:
609         effective_end =  0
610     else:
611         effective_end = end - 1
612
613     results[effective_end][lane_id] = lane
614
615 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
616     result = SequenceLane(pathname, lane_id, end)
617
618     if end is None:
619         effective_end =  0
620     else:
621         effective_end = end - 1
622
623     results[effective_end][lane_id] = result
624
625
626 def eland(gerald_dir, gerald=None, genome_maps=None):
627     e = ELAND()
628
629     lane_ids = range(1,9)
630     ends = [None, 1, 2]
631
632     basedirs = [gerald_dir]
633
634     # if there is a basedir/Temp change basedir to point to the temp
635     # directory, as 1.1rc1 moves most of the files we've historically
636     # cared about to that subdirectory.
637     # we should look into what the official 'result' files are.
638     # and 1.3 moves them back
639     basedir_temp = os.path.join(gerald_dir, 'Temp')
640     if os.path.isdir(basedir_temp):
641         basedirs.append(basedir_temp)
642
643    
644     # the order in patterns determines the preference for what
645     # will be found.
646     MAPPED_ELAND = 0
647     SEQUENCE = 1
648     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
649                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
650                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
651                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
652                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
653                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
654                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
655                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
656                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
657                 ('s_%s_export.txt', MAPPED_ELAND),
658                 ('s_%s_export.txt.bz2', MAPPED_ELAND),
659                 ('s_%s_export.txt.gz', MAPPED_ELAND),
660                 ('s_%s_sequence.txt', SEQUENCE),]
661
662     for basedir in basedirs:
663         for end in ends:
664             for lane_id in lane_ids:
665                 for p in patterns:
666                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
667                     if pathname is not None:
668                         if p[1] == MAPPED_ELAND:
669                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
670                         elif p[1] == SEQUENCE:
671                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
672                         break
673                 else:
674                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
675                     continue
676
677     return e
678
679 def build_genome_fasta_map(genome_dir):
680     # build fasta to fasta file map
681     logging.info("Building genome map")
682     genome = genome_dir.split(os.path.sep)[-1]
683     fasta_map = {}
684     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
685         is_link = False
686         if os.path.islink(vld_file):
687             is_link = True
688         vld_file = os.path.realpath(vld_file)
689         path, vld_name = os.path.split(vld_file)
690         name, ext = os.path.splitext(vld_name)
691         if is_link:
692             fasta_map[name] = name
693         else:
694             fasta_map[name] = os.path.join(genome, name)
695     return fasta_map
696
697
698 def extract_eland_sequence(instream, outstream, start, end):
699     """
700     Extract a chunk of sequence out of an eland file
701     """
702     for line in instream:
703         record = line.split()
704         if len(record) > 1:
705             result = [record[0], record[1][start:end]]
706         else:
707             result = [record[0][start:end]]
708         outstream.write("\t".join(result))
709         outstream.write(os.linesep)
710
711
712 def main(cmdline=None):
713     """Run eland extraction against the specified gerald directory"""
714     from optparse import OptionParser
715     parser = OptionParser("%prog: <gerald dir>+")
716     opts, args = parser.parse_args(cmdline)
717     logging.basicConfig(level=logging.DEBUG)
718     for a in args:
719         logging.info("Starting scan of %s" % (a,))
720         e = eland(a)
721         print e.get_elements()
722
723     return 
724
725
726 if __name__ == "__main__":
727     main(sys.argv[1:])