4 from __future__ import print_function
15 from htsworkflow.pipelines import ElementTree, LANE_LIST
16 from htsworkflow.pipelines.samplekey import SampleKey
17 from htsworkflow.pipelines.genomemap import GenomeMap
18 from htsworkflow.util.ethelp import indent, flatten
19 from htsworkflow.util.opener import autoopen
21 LOGGER = logging.getLogger(__name__)
23 SAMPLE_NAME = 'SampleName'
28 GENOME_MAP = 'GenomeMap'
29 GENOME_ITEM = 'GenomeItem'
30 MAPPED_READS = 'MappedReads'
31 MAPPED_ITEM = 'MappedItem'
32 MATCH_CODES = 'MatchCodes'
41 class ResultLane(object):
43 Base class for result lanes
48 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
50 self.pathnames = pathnames
51 self.sample_name = sample
52 self.lane_id = lane_id
57 self.set_elements(xml)
61 Actually read the file and actually count the reads
66 if self._reads is None:
69 reads = property(_get_reads)
71 def get_elements(self):
77 name.append('L%s' % (self.lane_id,))
78 name.append('R%s' % (self.end,))
79 name.append('S%s' % (self.sample_name,))
81 return '<ResultLane(' + ",".join(name) + ')>'
83 class ElandLane(ResultLane):
85 Process an eland result file
89 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92 SCORE_UNRECOGNIZED = 0
96 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
97 genome_map=None, eland_type=None, xml=None):
98 super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
100 self._mapped_reads = None
101 self._match_codes = None
103 self.genome_map = GenomeMap(genome_map)
104 self.eland_type = None
107 self.set_elements(xml)
112 name.append('L%s' % (self.lane_id,))
113 name.append('R%s' % (self.end,))
114 name.append('S%s' % (self.sample_name,))
116 reads = str(self._reads) if self._reads is not None else 'Uncounted'
117 return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
119 def _guess_eland_type(self, pathname):
120 if self.eland_type is None:
121 # attempt autodetect eland file type
122 pathn, name = os.path.split(pathname)
123 if re.search('result', name):
124 self.eland_type = ELAND_SINGLE
125 elif re.search('multi', name):
126 self.eland_type = ELAND_MULTI
127 elif re.search('extended', name):
128 self.eland_type = ELAND_EXTENDED
129 elif re.search('export', name):
130 self.eland_type = ELAND_EXPORT
132 self.eland_type = ELAND_SINGLE
136 Actually read the file and actually count the reads
138 # can't do anything if we don't have a file to process
139 if self.pathnames is None or len(self.pathnames) == 0:
141 pathname = self.pathnames[-1]
142 self._guess_eland_type(pathname)
144 if os.stat(pathname)[stat.ST_SIZE] == 0:
145 raise RuntimeError("Eland isn't done, try again later.")
147 LOGGER.debug("summarizing results for %s" % (pathname))
148 self._match_codes = MatchCodes()
149 self._mapped_reads = MappedReads()
152 for pathname in self.pathnames:
153 stream = autoopen(pathname, 'rt')
154 if self.eland_type == ELAND_SINGLE:
155 result = self._update_eland_result(stream)
156 elif self.eland_type == ELAND_MULTI or \
157 self.eland_type == ELAND_EXTENDED:
158 result = self._update_eland_multi(stream)
159 elif self.eland_type == ELAND_EXPORT:
160 result = self._update_eland_export(stream)
162 errmsg = "Only support single/multi/extended eland files"
163 raise NotImplementedError(errmsg)
166 match, mapped, reads = result
167 self._match_codes += match
168 self._mapped_reads += mapped
171 def _update_eland_result(self, instream):
173 mapped_reads = MappedReads()
174 match_codes = MatchCodes()
176 for line in instream:
178 fields = line.split()
180 # match_codes[code] = match_codes.setdefault(code, 0) + 1
181 # the QC/NM etc codes are in the 3rd field and always present
182 match_codes[fields[2]] += 1
183 # ignore lines that don't have a fasta filename
186 fasta = self.genome_map.get(fields[6], fields[6])
187 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
188 return match_codes, mapped_reads, reads
190 def _update_eland_multi(self, instream):
191 """Summarize an eland_extend."""
195 mapped_reads = MappedReads()
196 match_codes = MatchCodes()
198 for line in instream:
200 fields = line.split()
201 # fields[2] = QC/NM/or number of matches
202 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
204 if score_type == ElandLane.SCORE_READ:
205 # when there are too many hits, eland writes a - where
206 # it would have put the list of hits.
207 # or in a different version of eland, it just leaves
208 # that column blank, and only outputs 3 fields.
209 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
212 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
214 return match_codes, mapped_reads, reads
216 def _update_eland_export(self, instream):
217 """Summarize a gerald export file."""
222 mapped_reads = MappedReads()
223 match_codes = MatchCodes()
225 for line in instream:
227 fields = line.split()
228 # fields[2] = QC/NM/or number of matches
229 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
231 if score_type == ElandLane.SCORE_UNRECOGNIZED:
232 # export files have three states for the match field
233 # QC code, count of multi-reads, or a single
234 # read location. The score_mapped_mismatches function
235 # only understands the first two types.
236 # if we get unrecognized, that implies the field is probably
238 code = self._count_mapped_export(mapped_reads,
239 fields[LOCATION_INDEX],
240 fields[DESCRIPTOR_INDEX])
241 match_codes[code] += 1
243 return match_codes, mapped_reads, reads
246 def _score_mapped_mismatches(self, match, match_codes):
247 """Update match_codes with eland map counts, or failure code.
249 Returns True if the read mapped, false if it was an error code.
251 groups = ElandLane.MATCH_COUNTS_RE.match(match)
253 # match is not of the form [\d]+:[\d]+:[\d]+
254 if match in match_codes:
255 # match is one quality control codes QC/NM etc
256 match_codes[match] += 1
257 return ElandLane.SCORE_QC
259 return ElandLane.SCORE_UNRECOGNIZED
261 # match is of the form [\d]+:[\d]+:[\d]+
263 zero_mismatches = int(groups.group(1))
264 one_mismatches = int(groups.group(2))
265 two_mismatches = int(groups.group(3))
267 if zero_mismatches == 1:
268 match_codes['U0'] += 1
269 elif zero_mismatches < 255:
270 match_codes['R0'] += zero_mismatches
272 if one_mismatches == 1:
273 match_codes['U1'] += 1
274 elif one_mismatches < 255:
275 match_codes['R1'] += one_mismatches
277 if two_mismatches == 1:
278 match_codes['U2'] += 1
279 elif two_mismatches < 255:
280 match_codes['R2'] += two_mismatches
282 return ElandLane.SCORE_READ
285 def _count_mapped_multireads(self, mapped_reads, match_string):
287 for match in match_string.split(','):
288 match_fragment = match.split(':')
289 if len(match_fragment) == 2:
290 chromo = match_fragment[0]
291 pos = match_fragment[1]
293 fasta = self.genome_map.get(chromo, chromo)
294 assert fasta is not None
295 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
298 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
299 """Count a read as defined in an export file
301 match_string contains the chromosome
302 descriptor contains the an ecoding of bases that match, mismatch,
304 returns the "best" match code
306 Currently "best" match code is ignoring the possibility of in-dels
308 chromo = match_string
309 fasta = self.genome_map.get(chromo, chromo)
310 assert fasta is not None
311 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
313 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
314 if len(mismatch_bases) == 0:
316 elif len(mismatch_bases) == 1:
322 def _get_mapped_reads(self):
323 if self._mapped_reads is None:
325 return self._mapped_reads
326 mapped_reads = property(_get_mapped_reads)
328 def _get_match_codes(self):
329 if self._match_codes is None:
331 return self._match_codes
332 match_codes = property(_get_match_codes)
334 def _get_no_match(self):
335 if self._mapped_reads is None:
337 return self._match_codes['NM']
338 no_match = property(_get_no_match,
339 doc="total reads that didn't match the target genome.")
341 def _get_no_match_percent(self):
342 return float(self.no_match)/self.reads * 100
343 no_match_percent = property(_get_no_match_percent,
344 doc="no match reads as percent of total")
346 def _get_qc_failed(self):
347 if self._mapped_reads is None:
349 return self._match_codes['QC']
350 qc_failed = property(_get_qc_failed,
351 doc="total reads that didn't match the target genome.")
353 def _get_qc_failed_percent(self):
354 return float(self.qc_failed)/self.reads * 100
355 qc_failed_percent = property(_get_qc_failed_percent,
356 doc="QC failed reads as percent of total")
358 def _get_unique_reads(self):
359 if self._mapped_reads is None:
362 for code in ['U0','U1','U2']:
363 sum += self._match_codes[code]
365 unique_reads = property(_get_unique_reads,
366 doc="total unique reads")
368 def _get_repeat_reads(self):
369 if self._mapped_reads is None:
372 for code in ['R0','R1','R2']:
373 sum += self._match_codes[code]
375 repeat_reads = property(_get_repeat_reads,
376 doc="total repeat reads")
378 def get_elements(self):
379 lane = ElementTree.Element(ElandLane.LANE,
381 str(ElandLane.XML_VERSION)})
382 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
383 sample_tag.text = self.sample_name
384 lane_tag = ElementTree.SubElement(lane, LANE_ID)
385 lane_tag.text = str(self.lane_id)
386 if self.end is not None:
387 end_tag = ElementTree.SubElement(lane, END)
388 end_tag.text = str(self.end)
389 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
390 for k, v in self.genome_map.items():
391 item = ElementTree.SubElement(
392 genome_map, GENOME_ITEM,
393 {'name':k, 'value':str(v)})
394 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
395 for k, v in self.mapped_reads.items():
396 item = ElementTree.SubElement(
397 mapped_reads, MAPPED_ITEM,
398 {'name':k, 'value':str(v)})
399 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
400 for k, v in self.match_codes.items():
401 item = ElementTree.SubElement(
402 match_codes, MATCH_ITEM,
403 {'name':k, 'value':str(v)})
404 reads = ElementTree.SubElement(lane, READS)
405 reads.text = str(self.reads)
409 def set_elements(self, tree):
410 if tree.tag != ElandLane.LANE:
411 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
414 self._mapped_reads = {}
415 self._match_codes = {}
418 tag = element.tag.lower()
419 if tag == SAMPLE_NAME.lower():
420 self.sample_name = element.text
421 elif tag == LANE_ID.lower():
422 self.lane_id = int(element.text)
423 elif tag == END.lower():
424 self.end = int(element.text)
425 elif tag == GENOME_MAP.lower():
426 for child in element:
427 name = child.attrib['name']
428 value = child.attrib['value']
429 self.genome_map[name] = value
430 elif tag == MAPPED_READS.lower():
431 for child in element:
432 name = child.attrib['name']
433 value = child.attrib['value']
434 self._mapped_reads[name] = int(value)
435 elif tag == MATCH_CODES.lower():
436 for child in element:
437 name = child.attrib['name']
438 value = int(child.attrib['value'])
439 self._match_codes[name] = value
440 elif tag == READS.lower():
441 self._reads = int(element.text)
443 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
446 class MatchCodes(collections.MutableMapping):
447 """Mapping to hold match counts -
448 supports combining two match count sets together
450 def __init__(self, initializer=None):
451 self.match_codes = {'NM':0, 'QC':0, 'RM':0,
452 'U0':0, 'U1':0, 'U2':0,
453 'R0':0, 'R1':0, 'R2':0,
456 if initializer is not None:
457 if not isinstance(initializer, collections.Mapping):
458 raise ValueError("Expected dictionary like class")
459 for key in initializer:
460 if key not in self.match_codes:
461 errmsg = "Initializer can only contain: %s"
462 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
463 self.match_codes[key] += initializer[key]
466 return iter(self.match_codes)
468 def __delitem__(self, key):
469 raise RuntimeError("delete not allowed")
471 def __getitem__(self, key):
472 return self.match_codes[key]
474 def __setitem__(self, key, value):
475 if key not in self.match_codes:
476 errmsg = "Unrecognized key, allowed values are: %s"
477 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
478 self.match_codes[key] = value
481 return len(self.match_codes)
483 def __add__(self, other):
484 if not isinstance(other, MatchCodes):
485 raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
487 newobj = MatchCodes(self)
488 for key, value in other.items():
489 newobj[key] = self.get(key, 0) + other[key]
494 class MappedReads(collections.MutableMapping):
495 """Mapping to hold mapped reads -
496 supports combining two mapped read sets together
498 def __init__(self, initializer=None):
499 self.mapped_reads = {}
501 if initializer is not None:
502 if not isinstance(initializer, collections.Mapping):
503 raise ValueError("Expected dictionary like class")
504 for key in initializer:
505 self[key] = self.setdefault(key, 0) + initializer[key]
508 return iter(self.mapped_reads)
510 def __delitem__(self, key):
511 del self.mapped_reads[key]
513 def __getitem__(self, key):
514 return self.mapped_reads[key]
516 def __setitem__(self, key, value):
517 self.mapped_reads[key] = value
520 return len(self.mapped_reads)
522 def __add__(self, other):
523 if not isinstance(other, MappedReads):
524 raise ValueError("Expected a MappedReads, got %s", str(type(other)))
526 newobj = MappedReads(self)
528 newobj[key] = self.get(key, 0) + other[key]
532 class SequenceLane(ResultLane):
534 LANE = 'SequenceLane'
535 SEQUENCE_TYPE = 'SequenceType'
540 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
542 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
544 self.sequence_type = None
545 super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
547 def _guess_sequence_type(self, pathname):
549 Determine if we have a scarf or fastq sequence file
551 f = open(pathname,'rt')
556 # fastq starts with a @
557 self.sequence_type = SequenceLane.FASTQ_TYPE
559 self.sequence_type = SequenceLane.SCARF_TYPE
560 return self.sequence_type
564 Actually read the file and actually count the reads
566 # can't do anything if we don't have a file to process
567 if self.pathnames is None:
570 pathname = self.pathnames[-1]
571 if os.stat(pathname)[stat.ST_SIZE] == 0:
572 raise RuntimeError("Sequencing isn't done, try again later.")
574 self._guess_sequence_type(pathname)
576 LOGGER.info("summarizing results for %s" % (pathname))
578 f = open(pathname, 'rt')
583 if self.sequence_type == SequenceLane.SCARF_TYPE:
585 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
586 self._reads = lines / 4
588 errmsg = "This only supports scarf or fastq squence files"
589 raise NotImplementedError(errmsg)
591 def get_elements(self):
592 lane = ElementTree.Element(SequenceLane.LANE,
594 str(SequenceLane.XML_VERSION)})
595 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
596 sample_tag.text = self.sample_name
597 lane_tag = ElementTree.SubElement(lane, LANE_ID)
598 lane_tag.text = str(self.lane_id)
599 if self.end is not None:
600 end_tag = ElementTree.SubElement(lane, END)
601 end_tag.text = str(self.end)
602 reads = ElementTree.SubElement(lane, READS)
603 reads.text = str(self.reads)
604 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
605 sequence_type.text = str(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
609 def set_elements(self, tree):
610 if tree.tag != SequenceLane.LANE:
611 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
612 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
615 tag = element.tag.lower()
616 if tag == SAMPLE_NAME.lower():
617 self.sample_name = element.text
618 elif tag == LANE_ID.lower():
619 self.lane_id = int(element.text)
620 elif tag == END.lower():
621 self.end = int(element.text)
622 elif tag == READS.lower():
623 self._reads = int(element.text)
624 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
625 self.sequence_type = lookup_sequence_type.get(element.text, None)
627 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
629 class ELAND(collections.MutableMapping):
631 Summarize information from eland files
635 ELAND = 'ElandCollection'
641 def __init__(self, xml=None):
642 # we need information from the gerald config.xml
646 self.set_elements(xml)
648 def __getitem__(self, key):
649 if not isinstance(key, SampleKey):
650 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
651 return self.results[key]
653 def __setitem__(self, key, value):
654 if not isinstance(key, SampleKey):
655 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
656 self.results[key] = value
658 def __delitem__(self, key):
662 for k in sorted(self.results):
666 return len(self.results)
668 def find_keys(self, search):
669 """Return results that match key"""
670 if not isinstance(search, SampleKey):
671 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
672 if not search.iswild:
674 for key in self.keys():
675 if key.matches(search): yield key
677 def get_elements(self):
678 root = ElementTree.Element(ELAND.ELAND,
679 {'version': str(ELAND.XML_VERSION)})
682 eland_lane = self[key].get_elements()
683 eland_lane.attrib[ELAND.END] = str(self[key].end-1)
684 eland_lane.attrib[ELAND.LANE_ID] = str(self[key].lane_id)
685 eland_lane.attrib[ELAND.SAMPLE] = str(self[key].sample_name)
686 root.append(eland_lane)
690 def set_elements(self, tree):
691 if tree.tag.lower() != ELAND.ELAND.lower():
692 raise ValueError('Expecting %s', ELAND.ELAND)
693 for element in list(tree):
694 lane_id = int(element.attrib[ELAND.LANE_ID])
695 end = int(element.attrib.get(ELAND.END, 0))
696 sample = element.attrib.get(ELAND.SAMPLE, 's')
697 if element.tag.lower() == ElandLane.LANE.lower():
698 lane = ElandLane(xml=element)
699 elif element.tag.lower() == SequenceLane.LANE.lower():
700 lane = SequenceLane(xml=element)
702 key = SampleKey(lane=lane_id, read=end+1, sample=sample)
703 self.results[key] = lane
706 def update_result_with_eland(self, gerald, key, pathnames,
708 # yes the lane_id is also being computed in ElandLane._update
709 # I didn't want to clutter up my constructor
710 # but I needed to persist the sample_name/lane_id for
711 # runfolder summary_report
712 names = [ os.path.split(p)[1] for p in pathnames]
713 LOGGER.info("Adding eland files %s" %(",".join(names),))
714 basedir = os.path.split(pathnames[0])[0]
715 gs_template = "{0}_*_L{1:03}_genomesize.xml"
717 os.path.join(basedir,
718 gs_template.format(key.sample, key.lane)))
721 genome_map = GenomeMap()
722 if genome_maps is not None:
723 genome_map = GenomeMap(genome_maps[key.lane])
724 elif len(genomesize) > 0:
725 LOGGER.info("Found {0}".format(genomesize))
726 genome_map.parse_genomesize(genomesize[0])
727 elif gerald is not None:
728 genome_dir = gerald.lanes[key].eland_genome
729 if genome_dir is not None:
730 genome_map.scan_genome_dir(genome_dir)
732 lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
734 self.results[key] = lane
736 def update_result_with_sequence(self, gerald, key, pathnames,
738 self.results[key] = SequenceLane(pathnames,
739 key.sample, key.lane, key.read)
742 def eland(gerald_dir, gerald=None, genome_maps=None):
744 eland_files = ElandMatches(e)
746 for path, dirnames, filenames in os.walk(gerald_dir):
747 for filename in filenames:
748 pathname = os.path.abspath(os.path.join(path, filename))
749 eland_files.add(pathname)
750 for key in eland_files:
751 eland_files.count(key, gerald, genome_maps)
755 class ElandMatches(collections.MutableMapping):
756 def __init__(self, eland_container):
757 # the order in patterns determines the preference for what
759 self.eland_container = eland_container
760 MAPPED = eland_container.update_result_with_eland
761 SEQUENCE = eland_container.update_result_with_sequence
763 sample = '(?P<sample>[^_]+)'
764 hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
765 hiLane = '_L(?P<lane>[\d]+)'
766 gaLane = '_(?P<lane>[\d]+)'
767 hiRead = '_R(?P<read>[\d]+)'
768 gaRead = '(_(?P<read>[\d])+)?'
769 part = '_(?P<part>[\d]+)'
770 ext = '(?P<extention>(\.bz2|\.gz)?)'
772 hiPrefix = sample + hiIndex + hiLane + hiRead + part
773 gaPrefix = sample + gaLane + gaRead
774 P = collections.namedtuple('Patterns', 'pattern counter priority')
776 P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
777 P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
778 P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
779 P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
780 P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
781 P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
784 self.file_priority = {}
785 self.file_counter = {}
787 def add(self, pathname):
788 """Add pathname to our set of files
790 path, filename = os.path.split(pathname)
792 for pattern, counter, priority in self.patterns:
793 rematch = re.match(pattern, filename)
794 if rematch is not None:
795 m = ElandMatch(pathname, counter, **rematch.groupdict())
796 key = m.make_samplekey()
797 old_priority = self.file_priority.get(key, 0)
798 if priority > old_priority:
799 self.file_sets[key] = set((m,))
800 self.file_counter[key] = counter
801 self.file_priority[key] = priority
802 elif priority == old_priority:
803 self.file_sets[key].add(m)
805 def count(self, key, gerald=None, genome_maps=None):
806 #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
807 counter = self.file_counter[key]
808 file_set = self.file_sets[key]
809 filenames = [ f.filename for f in file_set ]
810 return counter(gerald, key,
811 filenames, genome_maps)
814 return iter(self.file_sets)
817 return len(self.file_sets)
819 def __getitem__(self, key):
820 return self.file_sets[key]
822 def __setitem__(self, key, value):
823 if not isintance(value, set):
824 raise ValueError("Expected set for value")
825 self.file_sets[key] = value
827 def __delitem__(self, key):
828 del self.file_sets[key]
830 class ElandMatch(object):
831 def __init__(self, pathname, counter,
832 lane=None, read=None, extension=None,
833 sample=None, index=None, part=None, **kwargs):
834 self.filename = pathname
835 self.counter = counter
838 self.extension = extension
842 LOGGER.info("Found %s: L%s R%s Samp%s" % (
843 self.filename, self._lane, self._read, self.sample))
845 def make_samplekey(self):
846 read = self._read if self._read is not None else 1
847 return SampleKey(lane=self.lane, read=read, sample=self.sample)
850 if self._lane is not None:
851 return int(self._lane)
853 lane = property(_get_lane)
856 if self._read is not None:
857 return int(self._read)
859 read = property(_get_read)
862 if self._part is not None:
863 return int(self._part)
865 part = property(_get_part)
869 if self.sample is not None: name.append(self.sample)
870 if self._lane is not None: name.append('L%s' % (self.lane,))
871 if self._read is not None: name.append('R%s' % (self.read,))
872 if self._part is not None: name.append('P%s' % (self.part,))
873 return '<ElandMatch(' + "_".join(name) + ')>'
876 def extract_eland_sequence(instream, outstream, start, end):
878 Extract a chunk of sequence out of an eland file
880 for line in instream:
881 record = line.split()
883 result = [record[0], record[1][start:end]]
885 result = [record[0][start:end]]
886 outstream.write("\t".join(result))
887 outstream.write(os.linesep)
890 def main(cmdline=None):
891 """Run eland extraction against the specified gerald directory"""
892 from optparse import OptionParser
893 parser = OptionParser("%prog: <gerald dir>+")
894 opts, args = parser.parse_args(cmdline)
895 logging.basicConfig(level=logging.DEBUG)
897 LOGGER.info("Starting scan of %s" % (a,))
899 print(ElementTree.tostring(e.get_elements()))
903 if __name__ == "__main__":