htsworkflow/pipelines/eland.py

   1 """
   2 Analyze ELAND files
   3 """
   4
   5 from glob import glob
   6 import logging
   7 import os
   8 import re
   9 import stat
  10 import sys
  11
  12 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
  13 from htsworkflow.util.ethelp import indent, flatten
  14 from htsworkflow.util.opener import autoopen
  15
  16 SAMPLE_NAME = 'SampleName'
  17 LANE_ID = 'LaneID'
  18 END = 'End'
  19 READS = 'Reads'
  20
  21 GENOME_MAP = 'GenomeMap'
  22 GENOME_ITEM = 'GenomeItem'
  23 MAPPED_READS = 'MappedReads'
  24 MAPPED_ITEM = 'MappedItem'
  25 MATCH_CODES = 'MatchCodes'
  26 MATCH_ITEM = 'Code'
  27 READS = 'Reads'
  28
  29 ELAND_SINGLE = 0
  30 ELAND_MULTI = 1
  31 ELAND_EXTENDED = 2
  32 ELAND_EXPORT = 3
  33
  34 class ResultLane(object):
  35     """
  36     Base class for result lanes
  37     """
  38     XML_VERSION = 2
  39     LANE = 'ResultLane'
  40
  41     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
  42         self.pathname = pathname
  43         self._sample_name = None
  44         self.lane_id = lane_id
  45         self.end = end
  46         self._reads = None
  47
  48         if xml is not None:
  49             self.set_elements(xml)
  50
  51     def _update(self):
  52         """
  53         Actually read the file and actually count the reads
  54         """
  55         pass
  56
  57     def _update_name(self):
  58         # extract the sample name
  59         if self.pathname is None:
  60             return
  61
  62         path, name = os.path.split(self.pathname)
  63         split_name = name.split('_')
  64         self._sample_name = split_name[0]
  65
  66     def _get_sample_name(self):
  67         if self._sample_name is None:
  68             self._update_name()
  69         return self._sample_name
  70     sample_name = property(_get_sample_name)
  71
  72     def _get_reads(self):
  73         if self._reads is None:
  74             self._update()
  75         return self._reads
  76     reads = property(_get_reads)
  77
  78     def get_elements(self):
  79         return None
  80
  81 class ElandLane(ResultLane):
  82     """
  83     Process an eland result file
  84     """
  85     XML_VERSION = 2
  86     LANE = "ElandLane"
  87     MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
  88     DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
  89     DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
  90     SCORE_UNRECOGNIZED = 0
  91     SCORE_QC = 1
  92     SCORE_READ = 2
  93
  94     def __init__(self, pathname=None, lane_id=None, end=None, genome_map=None, eland_type=None, xml=None):
  95         super(ElandLane, self).__init__(pathname, lane_id, end)
  96
  97         self._mapped_reads = None
  98         self._match_codes = None
  99         if genome_map is None:
 100             genome_map = {}
 101         self.genome_map = genome_map
 102         self.eland_type = None
 103
 104         if xml is not None:
 105             self.set_elements(xml)
 106
 107     def _guess_eland_type(self, pathname):
 108         if self.eland_type is None:
 109           # attempt autodetect eland file type
 110           pathn, name = os.path.split(pathname)
 111           if re.search('result', name):
 112             self.eland_type = ELAND_SINGLE
 113           elif re.search('multi', name):
 114             self.eland_type = ELAND_MULTI
 115           elif re.search('extended', name):
 116             self.eland_type = ELAND_EXTENDED
 117           elif re.search('export', name):
 118             self.eland_type = ELAND_EXPORT
 119           else:
 120             self.eland_type = ELAND_SINGLE
 121
 122     def _update(self):
 123         """
 124         Actually read the file and actually count the reads
 125         """
 126         # can't do anything if we don't have a file to process
 127         if self.pathname is None:
 128             return
 129         self._guess_eland_type(self.pathname)
 130
 131         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
 132             raise RuntimeError("Eland isn't done, try again later.")
 133
 134         logging.info("summarizing results for %s" % (self.pathname))
 135
 136         stream = autoopen(self.pathname, 'r')
 137         if self.eland_type == ELAND_SINGLE:
 138             result = self._update_eland_result(stream)
 139         elif self.eland_type == ELAND_MULTI or \
 140              self.eland_type == ELAND_EXTENDED:
 141             result = self._update_eland_multi(stream)
 142         elif self.eland_type == ELAND_EXPORT:
 143             result = self._update_eland_export(stream)
 144         else:
 145           raise NotImplementedError("Only support single/multi/extended eland files")
 146         self._match_codes, self._mapped_reads, self._reads = result
 147
 148     def _update_eland_result(self, instream):
 149         reads = 0
 150         mapped_reads = {}
 151
 152         match_codes = {'NM':0, 'QC':0, 'RM':0,
 153                        'U0':0, 'U1':0, 'U2':0,
 154                        'R0':0, 'R1':0, 'R2':0,
 155                       }
 156         for line in instream:
 157             reads += 1
 158             fields = line.split()
 159             # code = fields[2]
 160             # match_codes[code] = match_codes.setdefault(code, 0) + 1
 161             # the QC/NM etc codes are in the 3rd field and always present
 162             match_codes[fields[2]] += 1
 163             # ignore lines that don't have a fasta filename
 164             if len(fields) < 7:
 165                 continue
 166             fasta = self.genome_map.get(fields[6], fields[6])
 167             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 168         return match_codes, mapped_reads, reads
 169
 170     def _update_eland_multi(self, instream):
 171         """Summarize an eland_extend."""
 172         MATCH_INDEX = 2
 173         LOCATION_INDEX = 3
 174         reads = 0
 175         mapped_reads = {}
 176
 177         match_codes = {'NM':0, 'QC':0, 'RM':0,
 178                        'U0':0, 'U1':0, 'U2':0,
 179                        'R0':0, 'R1':0, 'R2':0,
 180                       }
 181         for line in instream:
 182             reads += 1
 183             fields = line.split()
 184             # fields[2] = QC/NM/or number of matches
 185             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
 186                                                        match_codes)
 187             if score_type == ElandLane.SCORE_READ:
 188                 # when there are too many hits, eland  writes a - where
 189                 # it would have put the list of hits.
 190                 # or in a different version of eland, it just leaves
 191                 # that column blank, and only outputs 3 fields.
 192                 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
 193                   continue
 194
 195                 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
 196
 197         return match_codes, mapped_reads, reads
 198
 199     def _update_eland_export(self, instream):
 200         """Summarize a gerald export file."""
 201         MATCH_INDEX = 10
 202         LOCATION_INDEX = 10
 203         DESCRIPTOR_INDEX= 13
 204         reads = 0
 205         mapped_reads = {}
 206
 207         match_codes = {'NM':0, 'QC':0, 'RM':0,
 208                        'U0':0, 'U1':0, 'U2':0,
 209                        'R0':0, 'R1':0, 'R2':0,
 210                       }
 211
 212         for line in instream:
 213             reads += 1
 214             fields = line.split()
 215             # fields[2] = QC/NM/or number of matches
 216             score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
 217                                                        match_codes)
 218             if score_type == ElandLane.SCORE_UNRECOGNIZED:
 219                 # export files have three states for the match field
 220                 # QC code, count of multi-reads, or a single
 221                 # read location. The score_mapped_mismatches function
 222                 # only understands the first two types.
 223                 # if we get unrecognized, that implies the field is probably
 224                 # a location.
 225                 code = self._count_mapped_export(mapped_reads,
 226                                                  fields[LOCATION_INDEX],
 227                                                  fields[DESCRIPTOR_INDEX])
 228                 match_codes[code] += 1
 229
 230         return match_codes, mapped_reads, reads
 231
 232
 233     def _score_mapped_mismatches(self, match, match_codes):
 234         """Update match_codes with eland map counts, or failure code.
 235
 236         Returns True if the read mapped, false if it was an error code.
 237         """
 238         groups = ElandLane.MATCH_COUNTS_RE.match(match)
 239         if groups is None:
 240             # match is not of the form [\d]+:[\d]+:[\d]+
 241             if match_codes.has_key(match):
 242                 # match is one quality control codes QC/NM etc
 243                 match_codes[match] += 1
 244                 return ElandLane.SCORE_QC
 245             else:
 246                 return ElandLane.SCORE_UNRECOGNIZED
 247         else:
 248             # match is of the form [\d]+:[\d]+:[\d]+
 249             # AKA Multiread
 250             zero_mismatches = int(groups.group(1))
 251             one_mismatches = int(groups.group(2))
 252             two_mismatches = int(groups.group(3))
 253
 254             if zero_mismatches == 1:
 255                 match_codes['U0'] += 1
 256             elif zero_mismatches < 255:
 257                 match_codes['R0'] += zero_mismatches
 258
 259             if one_mismatches == 1:
 260                 match_codes['U1'] += 1
 261             elif one_mismatches < 255:
 262                 match_codes['R1'] += one_mismatches
 263
 264             if two_mismatches == 1:
 265                 match_codes['U2'] += 1
 266             elif two_mismatches < 255:
 267                 match_codes['R2'] += two_mismatches
 268
 269             return ElandLane.SCORE_READ
 270
 271
 272     def _count_mapped_multireads(self, mapped_reads, match_string):
 273         chromo = None
 274         for match in match_string.split(','):
 275             match_fragment = match.split(':')
 276             if len(match_fragment) == 2:
 277                 chromo = match_fragment[0]
 278                 pos = match_fragment[1]
 279
 280             fasta = self.genome_map.get(chromo, chromo)
 281             assert fasta is not None
 282             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 283
 284
 285     def _count_mapped_export(self, mapped_reads, match_string, descriptor):
 286         """Count a read as defined in an export file
 287
 288         match_string contains the chromosome
 289         descriptor contains the an ecoding of bases that match, mismatch,
 290                    and have indels.
 291         returns the "best" match code
 292
 293         Currently "best" match code is ignoring the possibility of in-dels
 294         """
 295         chromo = match_string
 296         fasta = self.genome_map.get(chromo, chromo)
 297         assert fasta is not None
 298         mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
 299
 300         mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
 301         if len(mismatch_bases) == 0:
 302             return 'U0'
 303         elif len(mismatch_bases) == 1:
 304             return 'U1'
 305         else:
 306             return 'U2'
 307
 308
 309     def _get_mapped_reads(self):
 310         if self._mapped_reads is None:
 311             self._update()
 312         return self._mapped_reads
 313     mapped_reads = property(_get_mapped_reads)
 314
 315     def _get_match_codes(self):
 316         if self._match_codes is None:
 317             self._update()
 318         return self._match_codes
 319     match_codes = property(_get_match_codes)
 320
 321     def _get_no_match(self):
 322         if self._mapped_reads is None:
 323             self._update()
 324         return self._match_codes['NM']
 325     no_match = property(_get_no_match,
 326                         doc="total reads that didn't match the target genome.")
 327
 328     def _get_no_match_percent(self):
 329         return float(self.no_match)/self.reads * 100
 330     no_match_percent = property(_get_no_match_percent,
 331                                 doc="no match reads as percent of total")
 332
 333     def _get_qc_failed(self):
 334         if self._mapped_reads is None:
 335             self._update()
 336         return self._match_codes['QC']
 337     qc_failed = property(_get_qc_failed,
 338                         doc="total reads that didn't match the target genome.")
 339
 340     def _get_qc_failed_percent(self):
 341         return float(self.qc_failed)/self.reads * 100
 342     qc_failed_percent = property(_get_qc_failed_percent,
 343                                  doc="QC failed reads as percent of total")
 344
 345     def _get_unique_reads(self):
 346         if self._mapped_reads is None:
 347            self._update()
 348         sum = 0
 349         for code in ['U0','U1','U2']:
 350             sum += self._match_codes[code]
 351         return sum
 352     unique_reads = property(_get_unique_reads,
 353                             doc="total unique reads")
 354
 355     def _get_repeat_reads(self):
 356         if self._mapped_reads is None:
 357            self._update()
 358         sum = 0
 359         for code in ['R0','R1','R2']:
 360             sum += self._match_codes[code]
 361         return sum
 362     repeat_reads = property(_get_repeat_reads,
 363                             doc="total repeat reads")
 364
 365     def get_elements(self):
 366         lane = ElementTree.Element(ElandLane.LANE,
 367                                    {'version':
 368                                     unicode(ElandLane.XML_VERSION)})
 369         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
 370         sample_tag.text = self.sample_name
 371         lane_tag = ElementTree.SubElement(lane, LANE_ID)
 372         lane_tag.text = str(self.lane_id)
 373         if self.end is not None:
 374             end_tag = ElementTree.SubElement(lane, END)
 375             end_tag.text = str(self.end)
 376         genome_map = ElementTree.SubElement(lane, GENOME_MAP)
 377         for k, v in self.genome_map.items():
 378             item = ElementTree.SubElement(
 379                 genome_map, GENOME_ITEM,
 380                 {'name':k, 'value':unicode(v)})
 381         mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
 382         for k, v in self.mapped_reads.items():
 383             item = ElementTree.SubElement(
 384                 mapped_reads, MAPPED_ITEM,
 385                 {'name':k, 'value':unicode(v)})
 386         match_codes = ElementTree.SubElement(lane, MATCH_CODES)
 387         for k, v in self.match_codes.items():
 388             item = ElementTree.SubElement(
 389                 match_codes, MATCH_ITEM,
 390                 {'name':k, 'value':unicode(v)})
 391         reads = ElementTree.SubElement(lane, READS)
 392         reads.text = unicode(self.reads)
 393
 394         return lane
 395
 396     def set_elements(self, tree):
 397         if tree.tag != ElandLane.LANE:
 398             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
 399
 400         # reset dictionaries
 401         self._mapped_reads = {}
 402         self._match_codes = {}
 403
 404         for element in tree:
 405             tag = element.tag.lower()
 406             if tag == SAMPLE_NAME.lower():
 407                 self._sample_name = element.text
 408             elif tag == LANE_ID.lower():
 409                 self.lane_id = int(element.text)
 410             elif tag == END.lower():
 411                 self.end = int(element.text)
 412             elif tag == GENOME_MAP.lower():
 413                 for child in element:
 414                     name = child.attrib['name']
 415                     value = child.attrib['value']
 416                     self.genome_map[name] = value
 417             elif tag == MAPPED_READS.lower():
 418                 for child in element:
 419                     name = child.attrib['name']
 420                     value = child.attrib['value']
 421                     self._mapped_reads[name] = int(value)
 422             elif tag == MATCH_CODES.lower():
 423                 for child in element:
 424                     name = child.attrib['name']
 425                     value = int(child.attrib['value'])
 426                     self._match_codes[name] = value
 427             elif tag == READS.lower():
 428                 self._reads = int(element.text)
 429             else:
 430                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
 431
 432 class SequenceLane(ResultLane):
 433     XML_VERSION=1
 434     LANE = 'SequenceLane'
 435     SEQUENCE_TYPE = 'SequenceType'
 436
 437     NONE_TYPE = None
 438     SCARF_TYPE = 1
 439     FASTQ_TYPE = 2
 440     SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
 441
 442     def __init__(self, pathname=None, lane_id=None, end=None, xml=None):
 443         self.sequence_type = None
 444         super(SequenceLane, self).__init__(pathname, lane_id, end, xml)
 445
 446     def _guess_sequence_type(self, pathname):
 447         """
 448         Determine if we have a scarf or fastq sequence file
 449         """
 450         f = open(pathname,'r')
 451         l = f.readline()
 452         f.close()
 453
 454         if l[0] == '@':
 455             # fastq starts with a @
 456             self.sequence_type = SequenceLane.FASTQ_TYPE
 457         else:
 458             self.sequence_type = SequenceLane.SCARF_TYPE
 459         return self.sequence_type
 460
 461     def _update(self):
 462         """
 463         Actually read the file and actually count the reads
 464         """
 465         # can't do anything if we don't have a file to process
 466         if self.pathname is None:
 467             return
 468
 469         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
 470             raise RuntimeError("Sequencing isn't done, try again later.")
 471
 472         self._guess_sequence_type(self.pathname)
 473
 474         logging.info("summarizing results for %s" % (self.pathname))
 475         lines = 0
 476         f = open(self.pathname)
 477         for l in f.xreadlines():
 478             lines += 1
 479         f.close()
 480
 481         if self.sequence_type == SequenceLane.SCARF_TYPE:
 482             self._reads = lines
 483         elif self.sequence_type == SequenceLane.FASTQ_TYPE:
 484             self._reads = lines / 4
 485         else:
 486           raise NotImplementedError("This only supports scarf or fastq squence files")
 487
 488     def get_elements(self):
 489         lane = ElementTree.Element(SequenceLane.LANE,
 490                                    {'version':
 491                                     unicode(SequenceLane.XML_VERSION)})
 492         sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
 493         sample_tag.text = self.sample_name
 494         lane_tag = ElementTree.SubElement(lane, LANE_ID)
 495         lane_tag.text = str(self.lane_id)
 496         if self.end is not None:
 497             end_tag = ElementTree.SubElement(lane, END)
 498             end_tag.text = str(self.end)
 499         reads = ElementTree.SubElement(lane, READS)
 500         reads.text = unicode(self.reads)
 501         sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
 502         sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
 503
 504         return lane
 505
 506     def set_elements(self, tree):
 507         if tree.tag != SequenceLane.LANE:
 508             raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
 509         lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
 510
 511         for element in tree:
 512             tag = element.tag.lower()
 513             if tag == SAMPLE_NAME.lower():
 514                 self._sample_name = element.text
 515             elif tag == LANE_ID.lower():
 516                 self.lane_id = int(element.text)
 517             elif tag == END.lower():
 518                 self.end = int(element.text)
 519             elif tag == READS.lower():
 520                 self._reads = int(element.text)
 521             elif tag == SequenceLane.SEQUENCE_TYPE.lower():
 522                 self.sequence_type = lookup_sequence_type.get(element.text, None)
 523             else:
 524                 logging.warn("SequenceLane unrecognized tag %s" % (element.tag,))
 525
 526 class ELAND(object):
 527     """
 528     Summarize information from eland files
 529     """
 530     XML_VERSION = 3
 531
 532     ELAND = 'ElandCollection'
 533     LANE = 'Lane'
 534     LANE_ID = 'id'
 535     END = 'end'
 536
 537     def __init__(self, xml=None):
 538         # we need information from the gerald config.xml
 539         self.results = [{},{}]
 540
 541         if xml is not None:
 542             self.set_elements(xml)
 543
 544         if len(self.results[0]) == 0:
 545             # Initialize our eland object with meaningless junk
 546             for l in  LANE_LIST:
 547                 self.results[0][l] = ResultLane(lane_id=l, end=0)
 548
 549
 550     def get_elements(self):
 551         root = ElementTree.Element(ELAND.ELAND,
 552                                    {'version': unicode(ELAND.XML_VERSION)})
 553         for end in range(len(self.results)):
 554            end_results = self.results[end]
 555            for lane_id, lane in end_results.items():
 556                 eland_lane = lane.get_elements()
 557                 if eland_lane is not None:
 558                     eland_lane.attrib[ELAND.END] = unicode (end)
 559                     eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
 560                     root.append(eland_lane)
 561         return root
 562
 563     def set_elements(self, tree):
 564         if tree.tag.lower() != ELAND.ELAND.lower():
 565             raise ValueError('Expecting %s', ELAND.ELAND)
 566         for element in list(tree):
 567             lane_id = int(element.attrib[ELAND.LANE_ID])
 568             end = int(element.attrib.get(ELAND.END, 0))
 569             if element.tag.lower() == ElandLane.LANE.lower():
 570                 lane = ElandLane(xml=element)
 571             elif element.tag.lower() == SequenceLane.LANE.lower():
 572                 lane = SequenceLane(xml=element)
 573
 574             self.results[end][lane_id] = lane
 575
 576 def check_for_eland_file(basedir, pattern, lane_id, end):
 577    if end is None:
 578       full_lane_id = lane_id
 579    else:
 580       full_lane_id = "%d_%d" % ( lane_id, end )
 581
 582    basename = pattern % (full_lane_id,)
 583    logging.info("Eland pattern: %s" %(basename,))
 584    pathname = os.path.join(basedir, basename)
 585    if os.path.exists(pathname):
 586        logging.info('found eland file in %s' % (pathname,))
 587        return pathname
 588    else:
 589        return None
 590
 591 def update_result_with_eland(gerald, results, lane_id, end, pathname, genome_maps):
 592     # yes the lane_id is also being computed in ElandLane._update
 593     # I didn't want to clutter up my constructor
 594     # but I needed to persist the sample_name/lane_id for
 595     # runfolder summary_report
 596     path, name = os.path.split(pathname)
 597     logging.info("Adding eland file %s" %(name,))
 598     # split_name = name.split('_')
 599     # lane_id = int(split_name[1])
 600
 601     if genome_maps is not None:
 602         genome_map = genome_maps[lane_id]
 603     elif gerald is not None:
 604         genome_dir = gerald.lanes[lane_id].eland_genome
 605         genome_map = build_genome_fasta_map(genome_dir)
 606     else:
 607         genome_map = {}
 608
 609     lane = ElandLane(pathname, lane_id, end, genome_map)
 610
 611     if end is None:
 612         effective_end =  0
 613     else:
 614         effective_end = end - 1
 615
 616     results[effective_end][lane_id] = lane
 617
 618 def update_result_with_sequence(gerald, results, lane_id, end, pathname):
 619     result = SequenceLane(pathname, lane_id, end)
 620
 621     if end is None:
 622         effective_end =  0
 623     else:
 624         effective_end = end - 1
 625
 626     results[effective_end][lane_id] = result
 627
 628
 629 def eland(gerald_dir, gerald=None, genome_maps=None):
 630     e = ELAND()
 631
 632     lane_ids = range(1,9)
 633     ends = [None, 1, 2]
 634
 635     basedirs = [gerald_dir]
 636
 637     # if there is a basedir/Temp change basedir to point to the temp
 638     # directory, as 1.1rc1 moves most of the files we've historically
 639     # cared about to that subdirectory.
 640     # we should look into what the official 'result' files are.
 641     # and 1.3 moves them back
 642     basedir_temp = os.path.join(gerald_dir, 'Temp')
 643     if os.path.isdir(basedir_temp):
 644         basedirs.append(basedir_temp)
 645
 646
 647     # the order in patterns determines the preference for what
 648     # will be found.
 649     MAPPED_ELAND = 0
 650     SEQUENCE = 1
 651     patterns = [('s_%s_eland_result.txt', MAPPED_ELAND),
 652                 ('s_%s_eland_result.txt.bz2', MAPPED_ELAND),
 653                 ('s_%s_eland_result.txt.gz', MAPPED_ELAND),
 654                 ('s_%s_eland_extended.txt', MAPPED_ELAND),
 655                 ('s_%s_eland_extended.txt.bz2', MAPPED_ELAND),
 656                 ('s_%s_eland_extended.txt.gz', MAPPED_ELAND),
 657                 ('s_%s_eland_multi.txt', MAPPED_ELAND),
 658                 ('s_%s_eland_multi.txt.bz2', MAPPED_ELAND),
 659                 ('s_%s_eland_multi.txt.gz', MAPPED_ELAND),
 660                 ('s_%s_export.txt', MAPPED_ELAND),
 661                 ('s_%s_export.txt.bz2', MAPPED_ELAND),
 662                 ('s_%s_export.txt.gz', MAPPED_ELAND),
 663                 ('s_%s_sequence.txt', SEQUENCE),]
 664
 665     for basedir in basedirs:
 666         for end in ends:
 667             for lane_id in lane_ids:
 668                 for p in patterns:
 669                     pathname = check_for_eland_file(basedir, p[0], lane_id, end)
 670                     if pathname is not None:
 671                         if p[1] == MAPPED_ELAND:
 672                             update_result_with_eland(gerald, e.results, lane_id, end, pathname, genome_maps)
 673                         elif p[1] == SEQUENCE:
 674                             update_result_with_sequence(gerald, e.results, lane_id, end, pathname)
 675                         break
 676                 else:
 677                     logging.debug("No eland file found in %s for lane %s and end %s" %(basedir, lane_id, end))
 678                     continue
 679
 680     return e
 681
 682 def build_genome_fasta_map(genome_dir):
 683     # build fasta to fasta file map
 684     logging.info("Building genome map")
 685     genome = genome_dir.split(os.path.sep)[-1]
 686     fasta_map = {}
 687     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
 688         is_link = False
 689         if os.path.islink(vld_file):
 690             is_link = True
 691         vld_file = os.path.realpath(vld_file)
 692         path, vld_name = os.path.split(vld_file)
 693         name, ext = os.path.splitext(vld_name)
 694         if is_link:
 695             fasta_map[name] = name
 696         else:
 697             fasta_map[name] = os.path.join(genome, name)
 698     return fasta_map
 699
 700
 701 def extract_eland_sequence(instream, outstream, start, end):
 702     """
 703     Extract a chunk of sequence out of an eland file
 704     """
 705     for line in instream:
 706         record = line.split()
 707         if len(record) > 1:
 708             result = [record[0], record[1][start:end]]
 709         else:
 710             result = [record[0][start:end]]
 711         outstream.write("\t".join(result))
 712         outstream.write(os.linesep)
 713
 714
 715 def main(cmdline=None):
 716     """Run eland extraction against the specified gerald directory"""
 717     from optparse import OptionParser
 718     parser = OptionParser("%prog: <gerald dir>+")
 719     opts, args = parser.parse_args(cmdline)
 720     logging.basicConfig(level=logging.DEBUG)
 721     for a in args:
 722         logging.info("Starting scan of %s" % (a,))
 723         e = eland(a)
 724         print e.get_elements()
 725
 726     return
 727
 728
 729 if __name__ == "__main__":
 730     main(sys.argv[1:])