htsworkflow/pipelines/eland.py

   1 """
   2 Analyze ELAND files
   3 """
   4
   5 from glob import glob
   6 import logging
   7 import os
   8 import re
   9 import stat
  10 import sys
  11
  12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
  13 from htsworkflow.util.ethelp import indent, flatten
  14 from htsworkflow.util.opener import autoopen
  15
  16 SAMPLE_NAME = 'SampleName'
  17 LANE_ID = 'LaneID'
  18 END = 'End'
  19 READS = 'Reads'
  20
  21 GENOME_MAP = 'GenomeMap'
  22 GENOME_ITEM = 'GenomeItem'
  23 MAPPED_READS = 'MappedReads'
  24 MAPPED_ITEM = 'MappedItem'
  25 MATCH_CODES = 'MatchCodes'
  26 MATCH_ITEM = 'Code'
  27 READS = 'Reads'
  28
  29 ELAND_SINGLE = 0
  30 ELAND_MULTI = 1
  31 ELAND_EXTENDED = 2
  32 ELAND_EXPORT = 3
  33
  34 class ResultLane(object):
  35     """
  36     Base class for result lanes
  37     """
  38     XML_VERSION = 2
  39     LANE = 'ResultLane'
  40
  41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
  42         self.pathname = pathname
  43         self._sample_name = None
  44         self.lane_id = lane_id
  45         self.end = end
  46         self._reads = None
  47
  48         if xml is not None:
  49             self.set_elements(xml)
  50
  51     def _update(self):
  52         """
  53         Actually read the file and actually count the reads
  54         """
  55         pass
  56
  57     def _update_name(self):
  58         # extract the sample name
  59         if self.pathname is None:
  60             return
  61
  62         path, name = os.path.split(self.pathname)
  63         split_name = name.split('_')
  64         self._sample_name = split_name[0]
  65
  66     def _get_sample_name(self):
  67         if self._sample_name is None:
  68             self._update_name()
  69         return self._sample_name
  70     sample_name = property(_get_sample_name)
  71
  72     def _get_reads(self):
  73         if self._reads is None:
  74             self._update()
  75         return self._reads
  76     reads = property(_get_reads)
  77
  78
  79 class ElandLane(ResultLane):
  80     """
  81     Process an eland result file
  82     """
  83     XML_VERSION = 2
  84     LANE = "ElandLane"
  85     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
  86     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
  87     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
  88     SCORE_UNRECOGNIZED = 0
  89     SCORE_QC = 1
  90     SCORE_READ = 2
  91
  92     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
  93         super(ElandLane, self).__init__(pathname, lane_id, end)
  94
  95         self._mapped_reads = None
  96         self._match_codes = None
  97         if genome_map is None:
  98             genome_map = {}
  99         self.genome_map = genome_map
 100         self.eland_type = None
 101
 102         if xml is not None:
 103             self.set_elements(xml)
 104
 105     def _guess_eland_type(self, pathname):
 106         if self.eland_type is None:
 107           # attempt autodetect eland file type
 108           pathn, name = os.path.split(pathname)
 109           if re.search('result', name):
 110             self.eland_type = ELAND_SINGLE
 111           elif re.search('multi', name):
 112             self.eland_type = ELAND_MULTI
 113           elif re.search('extended', name):
 114             self.eland_type = ELAND_EXTENDED
 115           elif re.search('export', name):
 116             self.eland_type = ELAND_EXPORT
 117           else:
 118             self.eland_type = ELAND_SINGLE
 119
 120     def _update(self):
 121         """
 122         Actually read the file and actually count the reads
 123         """
 124         # can't do anything if we don't have a file to process
 125         if self.pathname is None:
 126             return
 127         self._guess_eland_type(self.pathname)
 128
 129         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
 130             raise RuntimeError("Eland isn't done, try again later.")
 131
 132         logging.info("summarizing results for %s" % (self.pathname))
 133
 134         stream = autoopen(self.pathname, 'r')
 135         if self.eland_type == ELAND_SINGLE:
 136             result = self._update_eland_result(stream)
 137         elif self.eland_type == ELAND_MULTI or \
 138              self.eland_type == ELAND_EXTENDED:
 139             result = self._update_eland_multi(stream)
 140         elif self.eland_type == ELAND_EXPORT:
 141             result = self._update_eland_export(stream)
 142         else:
 143           raise NotImplementedError("Only support single/multi/extended eland files")
 144         self._match_codes, self._mapped_reads, self._reads = result
 145
 146     def _update_eland_result(self, instream):
 147         reads = 0
 148         mapped_reads = {}
 149
 150         match_codes = {'NM':0, 'QC':0, 'RM':0,
 151                        'U0':0, 'U1':0, 'U2':0,
 152                        'R0':0, 'R1':0, 'R2':0,
 153                       }
 154         for line in instream:
 155             reads += 1
 156             fields = line.split()
 157             # code = fields[2]
 158             # match_codes[code] = match_codes.setdefault(code, 0) + 1
 159             # the QC/NM etc codes are in the 3rd field and always present
 160             match_codes[fields[2]] += 1
 161             # ignore lines that don't have a fasta filename
 162             if len(fields) < 7:
 163                 continue
 164             fasta = self.genome_map.get(fields[6], fields[6])
 165             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 166         return match_codes, mapped_reads, reads
 167
 168     def _update_eland_multi(self, instream):
 169         """Summarize an eland_extend."""
 170         MATCH_INDEX = 2
 171         LOCATION_INDEX = 3
 172         reads = 0
 173         mapped_reads = {}
 174
 175         match_codes = {'NM':0, 'QC':0, 'RM':0,
 176                        'U0':0, 'U1':0, 'U2':0,
 177                        'R0':0, 'R1':0, 'R2':0,
 178                       }
 179         for line in instream:
 180             reads += 1
 181             fields = line.split()
 182             # fields[2] = QC/NM/or number of matches
 183             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
 184                                                        match_codes)
 185             if score_type == ElandLane.SCORE_READ:
 186                 # when there are too many hits, eland  writes a - where
 187                 # it would have put the list of hits.
 188                 # or in a different version of eland, it just leaves
 189                 # that column blank, and only outputs 3 fields.
 190                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
 191                   continue
 192
 193                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
 194
 195         return match_codes, mapped_reads, reads
 196
 197     def _update_eland_export(self, instream):
 198         """Summarize a gerald export file."""
 199         MATCH_INDEX = 10
 200         LOCATION_INDEX = 10
 201         DESCRIPTOR_INDEX= 13
 202         reads = 0
 203         mapped_reads = {}
 204
 205         match_codes = {'NM':0, 'QC':0, 'RM':0,
 206                        'U0':0, 'U1':0, 'U2':0,
 207                        'R0':0, 'R1':0, 'R2':0,
 208                       }
 209
 210         for line in instream:
 211             reads += 1
 212             fields = line.split()
 213             # fields[2] = QC/NM/or number of matches
 214             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
 215                                                        match_codes)
 216             if score_type == ElandLane.SCORE_UNRECOGNIZED:
 217                 # export files have three states for the match field
 218                 # QC code, count of multi-reads, or a single
 219                 # read location. The score_mapped_mismatches function
 220                 # only understands the first two types.
 221                 # if we get unrecognized, that implies the field is probably
 222                 # a location.
 223                 code = self._count_mapped_export(mapped_reads,
 224                                                  fields[LOCATION_INDEX],
 225                                                  fields[DESCRIPTOR_INDEX])
 226                 match_codes[code] += 1
 227
 228         return match_codes, mapped_reads, reads
 229
 230
 231     def _score_mapped_mismatches(self, match, match_codes):
 232         """Update match_codes with eland map counts, or failure code.
 233
 234         Returns True if the read mapped, false if it was an error code.
 235         """
 236         groups = ElandLane.MATCH_COUNTS_RE.match(match)
 237         if groups is None:
 238             # match is not of the form [\d]+:[\d]+:[\d]+
 239             if match_codes.has_key(match):
 240                 # match is one quality control codes QC/NM etc
 241                 match_codes[match] += 1
 242                 return ElandLane.SCORE_QC
 243             else:
 244                 return ElandLane.SCORE_UNRECOGNIZED
 245         else:
 246             # match is of the form [\d]+:[\d]+:[\d]+
 247             # AKA Multiread
 248             zero_mismatches = int(groups.group(1))
 249             one_mismatches = int(groups.group(2))
 250             two_mismatches = int(groups.group(3))
 251
 252             if zero_mismatches == 1:
 253                 match_codes['U0'] += 1
 254             elif zero_mismatches < 255:
 255                 match_codes['R0'] += zero_mismatches
 256
 257             if one_mismatches == 1:
 258                 match_codes['U1'] += 1
 259             elif one_mismatches < 255:
 260                 match_codes['R1'] += one_mismatches
 261
 262             if two_mismatches == 1:
 263                 match_codes['U2'] += 1
 264             elif two_mismatches < 255:
 265                 match_codes['R2'] += two_mismatches
 266
 267             return ElandLane.SCORE_READ
 268
 269
 270     def _count_mapped_multireads(self, mapped_reads, match_string):
 271         chromo = None
 272         for match in match_string.split(','):
 273             match_fragment = match.split(':')
 274             if len(match_fragment) == 2:
 275                 chromo = match_fragment[0]
 276                 pos = match_fragment[1]
 277
 278             fasta = self.genome_map.get(chromo, chromo)
 279             assert fasta is not None
 280             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 281
 282
 283     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
 284         """Count a read as defined in an export file
 285
 286         match_string contains the chromosome
 287         descriptor contains the an ecoding of bases that match, mismatch,
 288                    and have indels.
 289         returns the "best" match code
 290
 291         Currently "best" match code is ignoring the possibility of in-dels
 292         """
 293         chromo = match_string
 294         fasta = self.genome_map.get(chromo, chromo)
 295         assert fasta is not None
 296         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 297
 298         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
 299         if len(mismatch_bases) == 0:
 300             return 'U0'
 301         elif len(mismatch_bases) == 1:
 302             return 'U1'
 303         else:
 304             return 'U2'
 305
 306
 307     def _get_mapped_reads(self):
 308         if self._mapped_reads is None:
 309             self._update()
 310         return self._mapped_reads
 311     mapped_reads = property(_get_mapped_reads)
 312
 313     def _get_match_codes(self):
 314         if self._match_codes is None:
 315             self._update()
 316         return self._match_codes
 317     match_codes = property(_get_match_codes)
 318
 319     def _get_no_match(self):
 320         if self._mapped_reads is None:
 321             self._update()
 322         return self._match_codes['NM']
 323     no_match = property(_get_no_match,
 324                         doc="total reads that didn't match the target genome.")
 325
 326     def _get_no_match_percent(self):
 327         return float(self.no_match)/self.reads * 100
 328     no_match_percent = property(_get_no_match_percent,
 329                                 doc="no match reads as percent of total")
 330
 331     def _get_qc_failed(self):
 332         if self._mapped_reads is None:
 333             self._update()
 334         return self._match_codes['QC']
 335     qc_failed = property(_get_qc_failed,
 336                         doc="total reads that didn't match the target genome.")
 337
 338     def _get_qc_failed_percent(self):
 339         return float(self.qc_failed)/self.reads * 100
 340     qc_failed_percent = property(_get_qc_failed_percent,
 341                                  doc="QC failed reads as percent of total")
 342
 343     def _get_unique_reads(self):
 344         if self._mapped_reads is None:
 345            self._update()
 346         sum = 0
 347         for code in ['U0','U1','U2']:
 348             sum += self._match_codes[code]
 349         return sum
 350     unique_reads = property(_get_unique_reads,
 351                             doc="total unique reads")
 352
 353     def _get_repeat_reads(self):
 354         if self._mapped_reads is None:
 355            self._update()
 356         sum = 0
 357         for code in ['R0','R1','R2']:
 358             sum += self._match_codes[code]
 359         return sum
 360     repeat_reads = property(_get_repeat_reads,
 361                             doc="total repeat reads")
 362
 363     def get_elements(self):
 364         lane = ElementTree.Element(ElandLane.LANE,
 365                                    {'version':
 366                                     unicode(ElandLane.XML_VERSION)})
 367         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
 368         sample_tag.text = self.sample_name
 369         lane_tag = ElementTree.SubElement(lane, LANE_ID)
 370         lane_tag.text = str(self.lane_id)
 371         if self.end is not None:
 372             end_tag = ElementTree.SubElement(lane, END)
 373             end_tag.text = str(self.end)
 374         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
 375         for k, v in self.genome_map.items():
 376             item = ElementTree.SubElement(
 377                 genome_map, GENOME_ITEM,
 378                 {'name':k, 'value':unicode(v)})
 379         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
 380         for k, v in self.mapped_reads.items():
 381             item = ElementTree.SubElement(
 382                 mapped_reads, MAPPED_ITEM,
 383                 {'name':k, 'value':unicode(v)})
 384         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
 385         for k, v in self.match_codes.items():
 386             item = ElementTree.SubElement(
 387                 match_codes, MATCH_ITEM,
 388                 {'name':k, 'value':unicode(v)})
 389         reads = ElementTree.SubElement(lane, READS)
 390         reads.text = unicode(self.reads)
 391
 392         return lane
 393
 394     def set_elements(self, tree):
 395         if tree.tag != ElandLane.LANE:
 396             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
 397
 398         # reset dictionaries
 399         self._mapped_reads = {}
 400         self._match_codes = {}
 401
 402         for element in tree:
 403             tag = element.tag.lower()
 404             if tag == SAMPLE_NAME.lower():
 405                 self._sample_name = element.text
 406             elif tag == LANE_ID.lower():
 407                 self.lane_id = int(element.text)
 408             elif tag == END.lower():
 409                 self.end = int(element.text)
 410             elif tag == GENOME_MAP.lower():
 411                 for child in element:
 412                     name = child.attrib['name']
 413                     value = child.attrib['value']
 414                     self.genome_map[name] = value
 415             elif tag == MAPPED_READS.lower():
 416                 for child in element:
 417                     name = child.attrib['name']
 418                     value = child.attrib['value']
 419                     self._mapped_reads[name] = int(value)
 420             elif tag == MATCH_CODES.lower():
 421                 for child in element:
 422                     name = child.attrib['name']
 423                     value = int(child.attrib['value'])
 424                     self._match_codes[name] = value
 425             elif tag == READS.lower():
 426                 self._reads = int(element.text)
 427             else:
 428                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
 429
 430 class SequenceLane(ResultLane):
 431     XML_VERSION=1
 432     LANE = 'SequenceLane'
 433     SEQUENCE_TYPE = 'SequenceType'
 434
 435     NONE_TYPE = None
 436     SCARF_TYPE = 1
 437     FASTQ_TYPE = 2
 438     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
 439
 440     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
 441         self.sequence_type = None
 442         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
 443
 444     def _guess_sequence_type(self, pathname):
 445         """
 446         Determine if we have a scarf or fastq sequence file
 447         """
 448         f = open(pathname,'r')
 449         l = f.readline()
 450         f.close()
 451
 452         if l[0] == '@':
 453             # fastq starts with a @
 454             self.sequence_type = SequenceLane.FASTQ_TYPE
 455         else:
 456             self.sequence_type = SequenceLane.SCARF_TYPE
 457         return self.sequence_type
 458
 459     def _update(self):
 460         """
 461         Actually read the file and actually count the reads
 462         """
 463         # can't do anything if we don't have a file to process
 464         if self.pathname is None:
 465             return
 466
 467         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
 468             raise RuntimeError("Sequencing isn't done, try again later.")
 469
 470         self._guess_sequence_type(self.pathname)
 471
 472         logging.info("summarizing results for %s" % (self.pathname))
 473         lines = 0
 474         f = open(self.pathname)
 475         for l in f.xreadlines():
 476             lines += 1
 477         f.close()
 478
 479         if self.sequence_type == SequenceLane.SCARF_TYPE:
 480             self._reads = lines
 481         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
 482             self._reads = lines / 4
 483         else:
 484           raise NotImplementedError("This only supports scarf or fastq squence files")
 485
 486     def get_elements(self):
 487         lane = ElementTree.Element(SequenceLane.LANE,
 488                                    {'version':
 489                                     unicode(SequenceLane.XML_VERSION)})
 490         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
 491         sample_tag.text = self.sample_name
 492         lane_tag = ElementTree.SubElement(lane, LANE_ID)
 493         lane_tag.text = str(self.lane_id)
 494         if self.end is not None:
 495             end_tag = ElementTree.SubElement(lane, END)
 496             end_tag.text = str(self.end)
 497         reads = ElementTree.SubElement(lane, READS)
 498         reads.text = unicode(self.reads)
 499         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
 500         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
 501
 502         return lane
 503
 504     def set_elements(self, tree):
 505         if tree.tag != SequenceLane.LANE:
 506             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
 507         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
 508
 509         for element in tree:
 510             tag = element.tag.lower()
 511             if tag == SAMPLE_NAME.lower():
 512                 self._sample_name = element.text
 513             elif tag == LANE_ID.lower():
 514                 self.lane_id = int(element.text)
 515             elif tag == END.lower():
 516                 self.end = int(element.text)
 517             elif tag == READS.lower():
 518                 self._reads = int(element.text)
 519             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
 520                 self.sequence_type = lookup_sequence_type.get(element.text, None)
 521             else:
 522                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
 523
 524 class ELAND(object):
 525     """
 526     Summarize information from eland files
 527     """
 528     XML_VERSION = 3
 529
 530     ELAND = 'ElandCollection'
 531     LANE = 'Lane'
 532     LANE_ID = 'id'
 533     END = 'end'
 534
 535     def __init__(self, xml=None):
 536         # we need information from the gerald config.xml
 537         self.results = [{},{}]
 538
 539         if xml is not None:
 540             self.set_elements(xml)
 541
 542         if len(self.results[0]) == 0:
 543             # Initialize our eland object with meaningless junk
 544             for l in  LANE_LIST:
 545                 self.results[0][l] = ResultLane(lane_id=l, end=0)
 546
 547
 548     def get_elements(self):
 549         root = ElementTree.Element(ELAND.ELAND,
 550                                    {'version': unicode(ELAND.XML_VERSION)})
 551         for end in range(len(self.results)):
 552            end_results = self.results[end]
 553            for lane_id, lane in end_results.items():
 554                 eland_lane = lane.get_elements()
 555                 eland_lane.attrib[ELAND.END] = unicode (end)
 556                 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
 557                 root.append(eland_lane)
 558         return root
 559
 560     def set_elements(self, tree):
 561         if tree.tag.lower() != ELAND.ELAND.lower():
 562             raise ValueError('Expecting %s', ELAND.ELAND)
 563         for element in list(tree):
 564             lane_id = int(element.attrib[ELAND.LANE_ID])
 565             end = int(element.attrib.get(ELAND.END, 0))
 566             if element.tag.lower() == ElandLane.LANE.lower():
 567                 lane = ElandLane(xml=element)
 568             elif element.tag.lower() == SequenceLane.LANE.lower():
 569                 lane = SequenceLane(xml=element)
 570
 571             self.results[end][lane_id] = lane
 572
 573 def check_for_eland_file(basedir, pattern, lane_id, end):
 574    if end is None:
 575       full_lane_id = lane_id
 576    else:
 577       full_lane_id = "%d_%d" % ( lane_id, end )
 578
 579    basename = pattern % (full_lane_id,)
 580    logging.info("Eland pattern: %s" %(basename,))
 581    pathname = os.path.join(basedir, basename)
 582    if os.path.exists(pathname):
 583        logging.info('found eland file in %s' % (pathname,))
 584        return pathname
 585    else:
 586        return None
 587
 588 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
 589     # yes the lane_id is also being computed in ElandLane._update
 590     # I didn't want to clutter up my constructor
 591     # but I needed to persist the sample_name/lane_id for
 592     # runfolder summary_report
 593     path, name = os.path.split(pathname)
 594     logging.info("Adding eland file %s" %(name,))
 595     # split_name = name.split('_')
 596     # lane_id = int(split_name[1])
 597
 598     if genome_maps is not None:
 599         genome_map = genome_maps[lane_id]
 600     elif gerald is not None:
 601         genome_dir = gerald.lanes[lane_id].eland_genome
 602         genome_map = build_genome_fasta_map(genome_dir)
 603     else:
 604         genome_map = {}
 605
 606     lane = ElandLane(pathname, lane_id, end, genome_map)
 607
 608     if end is None:
 609         effective_end =  0
 610     else:
 611         effective_end = end - 1
 612
 613     results[effective_end][lane_id] = lane
 614
 615 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
 616     result = SequenceLane(pathname, lane_id, end)
 617
 618     if end is None:
 619         effective_end =  0
 620     else:
 621         effective_end = end - 1
 622
 623     results[effective_end][lane_id] = result
 624
 625
 626 def eland(gerald_dir, gerald=None, genome_maps=None):
 627     e = ELAND()
 628
 629     lane_ids = range(1,9)
 630     ends = [None, 1, 2]
 631
 632     basedirs = [gerald_dir]
 633
 634     # if there is a basedir/Temp change basedir to point to the temp
 635     # directory, as 1.1rc1 moves most of the files we've historically
 636     # cared about to that subdirectory.
 637     # we should look into what the official 'result' files are.
 638     # and 1.3 moves them back
 639     basedir_temp = os.path.join(gerald_dir, 'Temp')
 640     if os.path.isdir(basedir_temp):
 641         basedirs.append(basedir_temp)
 642
 643
 644     # the order in patterns determines the preference for what
 645     # will be found.
 646     MAPPED_ELAND = 0
 647     SEQUENCE = 1
 648     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
 649                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
 650                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
 651                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
 652                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
 653                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
 654                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
 655                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
 656                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
 657                 ('s_%s_export.txt', MAPPED_ELAND),
 658                 ('s_%s_export.txt.bz2', MAPPED_ELAND),
 659                 ('s_%s_export.txt.gz', MAPPED_ELAND),
 660                 ('s_%s_sequence.txt', SEQUENCE),]
 661
 662     for basedir in basedirs:
 663         for end in ends:
 664             for lane_id in lane_ids:
 665                 for p in patterns:
 666                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
 667                     if pathname is not None:
 668                         if p[1] == MAPPED_ELAND:
 669                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
 670                         elif p[1] == SEQUENCE:
 671                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
 672                         break
 673                 else:
 674                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
 675                     continue
 676
 677     return e
 678
 679 def build_genome_fasta_map(genome_dir):
 680     # build fasta to fasta file map
 681     logging.info("Building genome map")
 682     genome = genome_dir.split(os.path.sep)[-1]
 683     fasta_map = {}
 684     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
 685         is_link = False
 686         if os.path.islink(vld_file):
 687             is_link = True
 688         vld_file = os.path.realpath(vld_file)
 689         path, vld_name = os.path.split(vld_file)
 690         name, ext = os.path.splitext(vld_name)
 691         if is_link:
 692             fasta_map[name] = name
 693         else:
 694             fasta_map[name] = os.path.join(genome, name)
 695     return fasta_map
 696
 697
 698 def extract_eland_sequence(instream, outstream, start, end):
 699     """
 700     Extract a chunk of sequence out of an eland file
 701     """
 702     for line in instream:
 703         record = line.split()
 704         if len(record) > 1:
 705             result = [record[0], record[1][start:end]]
 706         else:
 707             result = [record[0][start:end]]
 708         outstream.write("\t".join(result))
 709         outstream.write(os.linesep)
 710
 711
 712 def main(cmdline=None):
 713     """Run eland extraction against the specified gerald directory"""
 714     from optparse import OptionParser
 715     parser = OptionParser("%prog: <gerald dir>+")
 716     opts, args = parser.parse_args(cmdline)
 717     logging.basicConfig(level=logging.DEBUG)
 718     for a in args:
 719         logging.info("Starting scan of %s" % (a,))
 720         e = eland(a)
 721         print e.get_elements()
 722
 723     return
 724
 725
 726 if __name__ == "__main__":
 727     main(sys.argv[1:])