13 from htsworkflow.pipelines import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.pipelines.genomemap import GenomeMap
16 from htsworkflow.util.ethelp import indent, flatten
17 from htsworkflow.util.opener import autoopen
19 LOGGER = logging.getLogger(__name__)
21 SAMPLE_NAME = 'SampleName'
26 GENOME_MAP = 'GenomeMap'
27 GENOME_ITEM = 'GenomeItem'
28 MAPPED_READS = 'MappedReads'
29 MAPPED_ITEM = 'MappedItem'
30 MATCH_CODES = 'MatchCodes'
39 class ResultLane(object):
41 Base class for result lanes
46 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
48 self.pathnames = pathnames
49 self.sample_name = sample
50 self.lane_id = lane_id
55 self.set_elements(xml)
59 Actually read the file and actually count the reads
64 if self._reads is None:
67 reads = property(_get_reads)
69 def get_elements(self):
75 name.append('L%s' % (self.lane_id,))
76 name.append('R%s' % (self.end,))
77 name.append('S%s' % (self.sample_name,))
79 return '<ResultLane(' + ",".join(name) + ')>'
81 class ElandLane(ResultLane):
83 Process an eland result file
87 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
88 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
89 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
90 SCORE_UNRECOGNIZED = 0
94 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
95 genome_map=None, eland_type=None, xml=None):
96 super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
98 self._mapped_reads = None
99 self._match_codes = None
101 self.genome_map = GenomeMap(genome_map)
102 self.eland_type = None
105 self.set_elements(xml)
110 name.append('L%s' % (self.lane_id,))
111 name.append('R%s' % (self.end,))
112 name.append('S%s' % (self.sample_name,))
114 reads = str(self._reads) if self._reads is not None else 'Uncounted'
115 return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
117 def _guess_eland_type(self, pathname):
118 if self.eland_type is None:
119 # attempt autodetect eland file type
120 pathn, name = os.path.split(pathname)
121 if re.search('result', name):
122 self.eland_type = ELAND_SINGLE
123 elif re.search('multi', name):
124 self.eland_type = ELAND_MULTI
125 elif re.search('extended', name):
126 self.eland_type = ELAND_EXTENDED
127 elif re.search('export', name):
128 self.eland_type = ELAND_EXPORT
130 self.eland_type = ELAND_SINGLE
134 Actually read the file and actually count the reads
136 # can't do anything if we don't have a file to process
137 if self.pathnames is None or len(self.pathnames) == 0:
139 pathname = self.pathnames[-1]
140 self._guess_eland_type(pathname)
142 if os.stat(pathname)[stat.ST_SIZE] == 0:
143 raise RuntimeError("Eland isn't done, try again later.")
145 LOGGER.debug("summarizing results for %s" % (pathname))
146 self._match_codes = MatchCodes()
147 self._mapped_reads = MappedReads()
150 for pathname in self.pathnames:
151 stream = autoopen(pathname, 'r')
152 if self.eland_type == ELAND_SINGLE:
153 result = self._update_eland_result(stream)
154 elif self.eland_type == ELAND_MULTI or \
155 self.eland_type == ELAND_EXTENDED:
156 result = self._update_eland_multi(stream)
157 elif self.eland_type == ELAND_EXPORT:
158 result = self._update_eland_export(stream)
160 errmsg = "Only support single/multi/extended eland files"
161 raise NotImplementedError(errmsg)
164 match, mapped, reads = result
165 self._match_codes += match
166 self._mapped_reads += mapped
169 def _update_eland_result(self, instream):
171 mapped_reads = MappedReads()
172 match_codes = MatchCodes()
174 for line in instream:
176 fields = line.split()
178 # match_codes[code] = match_codes.setdefault(code, 0) + 1
179 # the QC/NM etc codes are in the 3rd field and always present
180 match_codes[fields[2]] += 1
181 # ignore lines that don't have a fasta filename
184 fasta = self.genome_map.get(fields[6], fields[6])
185 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
186 return match_codes, mapped_reads, reads
188 def _update_eland_multi(self, instream):
189 """Summarize an eland_extend."""
193 mapped_reads = MappedReads()
194 match_codes = MatchCodes()
196 for line in instream:
198 fields = line.split()
199 # fields[2] = QC/NM/or number of matches
200 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
202 if score_type == ElandLane.SCORE_READ:
203 # when there are too many hits, eland writes a - where
204 # it would have put the list of hits.
205 # or in a different version of eland, it just leaves
206 # that column blank, and only outputs 3 fields.
207 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
210 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
212 return match_codes, mapped_reads, reads
214 def _update_eland_export(self, instream):
215 """Summarize a gerald export file."""
220 mapped_reads = MappedReads()
221 match_codes = MatchCodes()
223 for line in instream:
225 fields = line.split()
226 # fields[2] = QC/NM/or number of matches
227 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
229 if score_type == ElandLane.SCORE_UNRECOGNIZED:
230 # export files have three states for the match field
231 # QC code, count of multi-reads, or a single
232 # read location. The score_mapped_mismatches function
233 # only understands the first two types.
234 # if we get unrecognized, that implies the field is probably
236 code = self._count_mapped_export(mapped_reads,
237 fields[LOCATION_INDEX],
238 fields[DESCRIPTOR_INDEX])
239 match_codes[code] += 1
241 return match_codes, mapped_reads, reads
244 def _score_mapped_mismatches(self, match, match_codes):
245 """Update match_codes with eland map counts, or failure code.
247 Returns True if the read mapped, false if it was an error code.
249 groups = ElandLane.MATCH_COUNTS_RE.match(match)
251 # match is not of the form [\d]+:[\d]+:[\d]+
252 if match in match_codes:
253 # match is one quality control codes QC/NM etc
254 match_codes[match] += 1
255 return ElandLane.SCORE_QC
257 return ElandLane.SCORE_UNRECOGNIZED
259 # match is of the form [\d]+:[\d]+:[\d]+
261 zero_mismatches = int(groups.group(1))
262 one_mismatches = int(groups.group(2))
263 two_mismatches = int(groups.group(3))
265 if zero_mismatches == 1:
266 match_codes['U0'] += 1
267 elif zero_mismatches < 255:
268 match_codes['R0'] += zero_mismatches
270 if one_mismatches == 1:
271 match_codes['U1'] += 1
272 elif one_mismatches < 255:
273 match_codes['R1'] += one_mismatches
275 if two_mismatches == 1:
276 match_codes['U2'] += 1
277 elif two_mismatches < 255:
278 match_codes['R2'] += two_mismatches
280 return ElandLane.SCORE_READ
283 def _count_mapped_multireads(self, mapped_reads, match_string):
285 for match in match_string.split(','):
286 match_fragment = match.split(':')
287 if len(match_fragment) == 2:
288 chromo = match_fragment[0]
289 pos = match_fragment[1]
291 fasta = self.genome_map.get(chromo, chromo)
292 assert fasta is not None
293 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
296 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
297 """Count a read as defined in an export file
299 match_string contains the chromosome
300 descriptor contains the an ecoding of bases that match, mismatch,
302 returns the "best" match code
304 Currently "best" match code is ignoring the possibility of in-dels
306 chromo = match_string
307 fasta = self.genome_map.get(chromo, chromo)
308 assert fasta is not None
309 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
311 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
312 if len(mismatch_bases) == 0:
314 elif len(mismatch_bases) == 1:
320 def _get_mapped_reads(self):
321 if self._mapped_reads is None:
323 return self._mapped_reads
324 mapped_reads = property(_get_mapped_reads)
326 def _get_match_codes(self):
327 if self._match_codes is None:
329 return self._match_codes
330 match_codes = property(_get_match_codes)
332 def _get_no_match(self):
333 if self._mapped_reads is None:
335 return self._match_codes['NM']
336 no_match = property(_get_no_match,
337 doc="total reads that didn't match the target genome.")
339 def _get_no_match_percent(self):
340 return float(self.no_match)/self.reads * 100
341 no_match_percent = property(_get_no_match_percent,
342 doc="no match reads as percent of total")
344 def _get_qc_failed(self):
345 if self._mapped_reads is None:
347 return self._match_codes['QC']
348 qc_failed = property(_get_qc_failed,
349 doc="total reads that didn't match the target genome.")
351 def _get_qc_failed_percent(self):
352 return float(self.qc_failed)/self.reads * 100
353 qc_failed_percent = property(_get_qc_failed_percent,
354 doc="QC failed reads as percent of total")
356 def _get_unique_reads(self):
357 if self._mapped_reads is None:
360 for code in ['U0','U1','U2']:
361 sum += self._match_codes[code]
363 unique_reads = property(_get_unique_reads,
364 doc="total unique reads")
366 def _get_repeat_reads(self):
367 if self._mapped_reads is None:
370 for code in ['R0','R1','R2']:
371 sum += self._match_codes[code]
373 repeat_reads = property(_get_repeat_reads,
374 doc="total repeat reads")
376 def get_elements(self):
377 lane = ElementTree.Element(ElandLane.LANE,
379 str(ElandLane.XML_VERSION)})
380 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
381 sample_tag.text = self.sample_name
382 lane_tag = ElementTree.SubElement(lane, LANE_ID)
383 lane_tag.text = str(self.lane_id)
384 if self.end is not None:
385 end_tag = ElementTree.SubElement(lane, END)
386 end_tag.text = str(self.end)
387 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
388 for k, v in list(self.genome_map.items()):
389 item = ElementTree.SubElement(
390 genome_map, GENOME_ITEM,
391 {'name':k, 'value':str(v)})
392 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
393 for k, v in list(self.mapped_reads.items()):
394 item = ElementTree.SubElement(
395 mapped_reads, MAPPED_ITEM,
396 {'name':k, 'value':str(v)})
397 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
398 for k, v in list(self.match_codes.items()):
399 item = ElementTree.SubElement(
400 match_codes, MATCH_ITEM,
401 {'name':k, 'value':str(v)})
402 reads = ElementTree.SubElement(lane, READS)
403 reads.text = str(self.reads)
407 def set_elements(self, tree):
408 if tree.tag != ElandLane.LANE:
409 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
412 self._mapped_reads = {}
413 self._match_codes = {}
416 tag = element.tag.lower()
417 if tag == SAMPLE_NAME.lower():
418 self.sample_name = element.text
419 elif tag == LANE_ID.lower():
420 self.lane_id = int(element.text)
421 elif tag == END.lower():
422 self.end = int(element.text)
423 elif tag == GENOME_MAP.lower():
424 for child in element:
425 name = child.attrib['name']
426 value = child.attrib['value']
427 self.genome_map[name] = value
428 elif tag == MAPPED_READS.lower():
429 for child in element:
430 name = child.attrib['name']
431 value = child.attrib['value']
432 self._mapped_reads[name] = int(value)
433 elif tag == MATCH_CODES.lower():
434 for child in element:
435 name = child.attrib['name']
436 value = int(child.attrib['value'])
437 self._match_codes[name] = value
438 elif tag == READS.lower():
439 self._reads = int(element.text)
441 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
444 class MatchCodes(collections.MutableMapping):
445 """Mapping to hold match counts -
446 supports combining two match count sets together
448 def __init__(self, initializer=None):
449 self.match_codes = {'NM':0, 'QC':0, 'RM':0,
450 'U0':0, 'U1':0, 'U2':0,
451 'R0':0, 'R1':0, 'R2':0,
454 if initializer is not None:
455 if not isinstance(initializer, collections.Mapping):
456 raise ValueError("Expected dictionary like class")
457 for key in initializer:
458 if key not in self.match_codes:
459 errmsg = "Initializer can only contain: %s"
460 raise ValueError(errmsg % (",".join(list(self.match_codes.keys()))))
461 self.match_codes[key] += initializer[key]
464 return iter(self.match_codes)
466 def __delitem__(self, key):
467 raise RuntimeError("delete not allowed")
469 def __getitem__(self, key):
470 return self.match_codes[key]
472 def __setitem__(self, key, value):
473 if key not in self.match_codes:
474 errmsg = "Unrecognized key, allowed values are: %s"
475 raise ValueError(errmsg % (",".join(list(self.match_codes.keys()))))
476 self.match_codes[key] = value
479 return len(self.match_codes)
481 def __add__(self, other):
482 if not isinstance(other, MatchCodes):
483 raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
485 newobj = MatchCodes(self)
486 for key, value in list(other.items()):
487 newobj[key] = self.get(key, 0) + other[key]
492 class MappedReads(collections.MutableMapping):
493 """Mapping to hold mapped reads -
494 supports combining two mapped read sets together
496 def __init__(self, initializer=None):
497 self.mapped_reads = {}
499 if initializer is not None:
500 if not isinstance(initializer, collections.Mapping):
501 raise ValueError("Expected dictionary like class")
502 for key in initializer:
503 self[key] = self.setdefault(key, 0) + initializer[key]
506 return iter(self.mapped_reads)
508 def __delitem__(self, key):
509 del self.mapped_reads[key]
511 def __getitem__(self, key):
512 return self.mapped_reads[key]
514 def __setitem__(self, key, value):
515 self.mapped_reads[key] = value
518 return len(self.mapped_reads)
520 def __add__(self, other):
521 if not isinstance(other, MappedReads):
522 raise ValueError("Expected a MappedReads, got %s", str(type(other)))
524 newobj = MappedReads(self)
526 newobj[key] = self.get(key, 0) + other[key]
530 class SequenceLane(ResultLane):
532 LANE = 'SequenceLane'
533 SEQUENCE_TYPE = 'SequenceType'
538 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
540 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
542 self.sequence_type = None
543 super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
545 def _guess_sequence_type(self, pathname):
547 Determine if we have a scarf or fastq sequence file
549 f = open(pathname,'r')
554 # fastq starts with a @
555 self.sequence_type = SequenceLane.FASTQ_TYPE
557 self.sequence_type = SequenceLane.SCARF_TYPE
558 return self.sequence_type
562 Actually read the file and actually count the reads
564 # can't do anything if we don't have a file to process
565 if self.pathnames is None:
568 pathname = self.pathnames[-1]
569 if os.stat(pathname)[stat.ST_SIZE] == 0:
570 raise RuntimeError("Sequencing isn't done, try again later.")
572 self._guess_sequence_type(pathname)
574 LOGGER.info("summarizing results for %s" % (pathname))
581 if self.sequence_type == SequenceLane.SCARF_TYPE:
583 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
584 self._reads = lines / 4
586 errmsg = "This only supports scarf or fastq squence files"
587 raise NotImplementedError(errmsg)
589 def get_elements(self):
590 lane = ElementTree.Element(SequenceLane.LANE,
592 str(SequenceLane.XML_VERSION)})
593 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
594 sample_tag.text = self.sample_name
595 lane_tag = ElementTree.SubElement(lane, LANE_ID)
596 lane_tag.text = str(self.lane_id)
597 if self.end is not None:
598 end_tag = ElementTree.SubElement(lane, END)
599 end_tag.text = str(self.end)
600 reads = ElementTree.SubElement(lane, READS)
601 reads.text = str(self.reads)
602 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
603 sequence_type.text = str(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
607 def set_elements(self, tree):
608 if tree.tag != SequenceLane.LANE:
609 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
610 lookup_sequence_type = dict([ (v,k) for k,v in list(SequenceLane.SEQUENCE_DESCRIPTION.items())])
613 tag = element.tag.lower()
614 if tag == SAMPLE_NAME.lower():
615 self.sample_name = element.text
616 elif tag == LANE_ID.lower():
617 self.lane_id = int(element.text)
618 elif tag == END.lower():
619 self.end = int(element.text)
620 elif tag == READS.lower():
621 self._reads = int(element.text)
622 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
623 self.sequence_type = lookup_sequence_type.get(element.text, None)
625 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
627 class ELAND(collections.MutableMapping):
629 Summarize information from eland files
633 ELAND = 'ElandCollection'
639 def __init__(self, xml=None):
640 # we need information from the gerald config.xml
644 self.set_elements(xml)
646 def __getitem__(self, key):
647 if not isinstance(key, SampleKey):
648 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
649 return self.results[key]
651 def __setitem__(self, key, value):
652 if not isinstance(key, SampleKey):
653 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
654 self.results[key] = value
656 def __delitem__(self, key):
660 keys = iter(self.results.keys())
661 for k in sorted(keys):
665 return len(self.results)
667 def find_keys(self, search):
668 """Return results that match key"""
669 if not isinstance(search, SampleKey):
670 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
671 if not search.iswild:
673 for key in list(self.keys()):
674 if key.matches(search): yield key
676 def get_elements(self):
677 root = ElementTree.Element(ELAND.ELAND,
678 {'version': str(ELAND.XML_VERSION)})
681 eland_lane = self[key].get_elements()
682 eland_lane.attrib[ELAND.END] = str(self[key].end-1)
683 eland_lane.attrib[ELAND.LANE_ID] = str(self[key].lane_id)
684 eland_lane.attrib[ELAND.SAMPLE] = str(self[key].sample_name)
685 root.append(eland_lane)
689 def set_elements(self, tree):
690 if tree.tag.lower() != ELAND.ELAND.lower():
691 raise ValueError('Expecting %s', ELAND.ELAND)
692 for element in list(tree):
693 lane_id = int(element.attrib[ELAND.LANE_ID])
694 end = int(element.attrib.get(ELAND.END, 0))
695 sample = element.attrib.get(ELAND.SAMPLE, 's')
696 if element.tag.lower() == ElandLane.LANE.lower():
697 lane = ElandLane(xml=element)
698 elif element.tag.lower() == SequenceLane.LANE.lower():
699 lane = SequenceLane(xml=element)
701 key = SampleKey(lane=lane_id, read=end+1, sample=sample)
702 self.results[key] = lane
705 def update_result_with_eland(self, gerald, key, pathnames,
707 # yes the lane_id is also being computed in ElandLane._update
708 # I didn't want to clutter up my constructor
709 # but I needed to persist the sample_name/lane_id for
710 # runfolder summary_report
711 names = [ os.path.split(p)[1] for p in pathnames]
712 LOGGER.info("Adding eland files %s" %(",".join(names),))
713 basedir = os.path.split(pathnames[0])[0]
714 gs_template = "{0}_*_L{1:03}_genomesize.xml"
716 os.path.join(basedir,
717 gs_template.format(key.sample, key.lane)))
720 genome_map = GenomeMap()
721 if genome_maps is not None:
722 genome_map = GenomeMap(genome_maps[key.lane])
723 elif len(genomesize) > 0:
724 LOGGER.info("Found {0}".format(genomesize))
725 genome_map.parse_genomesize(genomesize[0])
726 elif gerald is not None:
727 genome_dir = gerald.lanes[key].eland_genome
728 if genome_dir is not None:
729 genome_map.scan_genome_dir(genome_dir)
731 lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
733 self.results[key] = lane
735 def update_result_with_sequence(self, gerald, key, pathnames,
737 self.results[key] = SequenceLane(pathnames,
738 key.sample, key.lane, key.read)
741 def eland(gerald_dir, gerald=None, genome_maps=None):
743 eland_files = ElandMatches(e)
745 for path, dirnames, filenames in os.walk(gerald_dir):
746 for filename in filenames:
747 pathname = os.path.abspath(os.path.join(path, filename))
748 eland_files.add(pathname)
749 for key in eland_files:
750 eland_files.count(key, gerald, genome_maps)
754 class ElandMatches(collections.MutableMapping):
755 def __init__(self, eland_container):
756 # the order in patterns determines the preference for what
758 self.eland_container = eland_container
759 MAPPED = eland_container.update_result_with_eland
760 SEQUENCE = eland_container.update_result_with_sequence
762 sample = '(?P<sample>[^_]+)'
763 hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
764 hiLane = '_L(?P<lane>[\d]+)'
765 gaLane = '_(?P<lane>[\d]+)'
766 hiRead = '_R(?P<read>[\d]+)'
767 gaRead = '(_(?P<read>[\d])+)?'
768 part = '_(?P<part>[\d]+)'
769 ext = '(?P<extention>(\.bz2|\.gz)?)'
771 hiPrefix = sample + hiIndex + hiLane + hiRead + part
772 gaPrefix = sample + gaLane + gaRead
773 P = collections.namedtuple('Patterns', 'pattern counter priority')
775 P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
776 P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
777 P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
778 P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
779 P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
780 P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
783 self.file_priority = {}
784 self.file_counter = {}
786 def add(self, pathname):
787 """Add pathname to our set of files
789 path, filename = os.path.split(pathname)
791 for pattern, counter, priority in self.patterns:
792 rematch = re.match(pattern, filename)
793 if rematch is not None:
794 m = ElandMatch(pathname, counter, **rematch.groupdict())
795 key = m.make_samplekey()
796 old_priority = self.file_priority.get(key, 0)
797 if priority > old_priority:
798 self.file_sets[key] = set((m,))
799 self.file_counter[key] = counter
800 self.file_priority[key] = priority
801 elif priority == old_priority:
802 self.file_sets[key].add(m)
804 def count(self, key, gerald=None, genome_maps=None):
805 #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
806 counter = self.file_counter[key]
807 file_set = self.file_sets[key]
808 filenames = [ f.filename for f in file_set ]
809 return counter(gerald, key,
810 filenames, genome_maps)
813 return iter(self.file_sets)
816 return len(self.file_sets)
818 def __getitem__(self, key):
819 return self.file_sets[key]
821 def __setitem__(self, key, value):
822 if not isintance(value, set):
823 raise ValueError("Expected set for value")
824 self.file_sets[key] = value
826 def __delitem__(self, key):
827 del self.file_sets[key]
829 class ElandMatch(object):
830 def __init__(self, pathname, counter,
831 lane=None, read=None, extension=None,
832 sample=None, index=None, part=None, **kwargs):
833 self.filename = pathname
834 self.counter = counter
837 self.extension = extension
841 LOGGER.info("Found %s: L%s R%s Samp%s" % (
842 self.filename, self._lane, self._read, self.sample))
844 def make_samplekey(self):
845 read = self._read if self._read is not None else 1
846 return SampleKey(lane=self.lane, read=read, sample=self.sample)
849 if self._lane is not None:
850 return int(self._lane)
852 lane = property(_get_lane)
855 if self._read is not None:
856 return int(self._read)
858 read = property(_get_read)
861 if self._part is not None:
862 return int(self._part)
864 part = property(_get_part)
868 if self.sample is not None: name.append(self.sample)
869 if self._lane is not None: name.append('L%s' % (self.lane,))
870 if self._read is not None: name.append('R%s' % (self.read,))
871 if self._part is not None: name.append('P%s' % (self.part,))
872 return '<ElandMatch(' + "_".join(name) + ')>'
875 def extract_eland_sequence(instream, outstream, start, end):
877 Extract a chunk of sequence out of an eland file
879 for line in instream:
880 record = line.split()
882 result = [record[0], record[1][start:end]]
884 result = [record[0][start:end]]
885 outstream.write("\t".join(result))
886 outstream.write(os.linesep)
889 def main(cmdline=None):
890 """Run eland extraction against the specified gerald directory"""
891 from optparse import OptionParser
892 parser = OptionParser("%prog: <gerald dir>+")
893 opts, args = parser.parse_args(cmdline)
894 logging.basicConfig(level=logging.DEBUG)
896 LOGGER.info("Starting scan of %s" % (a,))
898 print(ElementTree.tostring(e.get_elements()))
902 if __name__ == "__main__":