4 from __future__ import print_function
15 from htsworkflow.pipelines import ElementTree, LANE_LIST
16 from htsworkflow.pipelines.samplekey import SampleKey
17 from htsworkflow.pipelines.genomemap import GenomeMap
18 from htsworkflow.util.ethelp import indent, flatten
19 from htsworkflow.util.opener import autoopen
21 LOGGER = logging.getLogger(__name__)
23 SAMPLE_NAME = 'SampleName'
28 GENOME_MAP = 'GenomeMap'
29 GENOME_ITEM = 'GenomeItem'
30 MAPPED_READS = 'MappedReads'
31 MAPPED_ITEM = 'MappedItem'
32 MATCH_CODES = 'MatchCodes'
41 class ResultLane(object):
43 Base class for result lanes
48 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
50 self.pathnames = pathnames
51 self.sample_name = sample
52 self.lane_id = lane_id
57 self.set_elements(xml)
61 Actually read the file and actually count the reads
66 if self._reads is None:
69 reads = property(_get_reads)
71 def get_elements(self):
77 name.append('L%s' % (self.lane_id,))
78 name.append('R%s' % (self.end,))
79 name.append('S%s' % (self.sample_name,))
81 return '<ResultLane(' + ",".join(name) + ')>'
83 class ElandLane(ResultLane):
85 Process an eland result file
89 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
90 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
91 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
92 SCORE_UNRECOGNIZED = 0
96 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
97 genome_map=None, eland_type=None, xml=None):
98 super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
100 self._mapped_reads = None
101 self._match_codes = None
103 self.genome_map = GenomeMap(genome_map)
104 self.eland_type = None
107 self.set_elements(xml)
112 name.append('L%s' % (self.lane_id,))
113 name.append('R%s' % (self.end,))
114 name.append('S%s' % (self.sample_name,))
116 reads = str(self._reads) if self._reads is not None else 'Uncounted'
117 return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
119 def _guess_eland_type(self, pathname):
120 if self.eland_type is None:
121 # attempt autodetect eland file type
122 pathn, name = os.path.split(pathname)
123 if re.search('result', name):
124 self.eland_type = ELAND_SINGLE
125 elif re.search('multi', name):
126 self.eland_type = ELAND_MULTI
127 elif re.search('extended', name):
128 self.eland_type = ELAND_EXTENDED
129 elif re.search('export', name):
130 self.eland_type = ELAND_EXPORT
132 self.eland_type = ELAND_SINGLE
136 Actually read the file and actually count the reads
138 # can't do anything if we don't have a file to process
139 if self.pathnames is None or len(self.pathnames) == 0:
141 pathname = self.pathnames[-1]
142 self._guess_eland_type(pathname)
144 if os.stat(pathname)[stat.ST_SIZE] == 0:
145 raise RuntimeError("Eland isn't done, try again later.")
147 LOGGER.debug("summarizing results for %s" % (pathname))
148 self._match_codes = MatchCodes()
149 self._mapped_reads = MappedReads()
152 for pathname in self.pathnames:
153 stream = autoopen(pathname, 'r')
154 if self.eland_type == ELAND_SINGLE:
155 result = self._update_eland_result(stream)
156 elif self.eland_type == ELAND_MULTI or \
157 self.eland_type == ELAND_EXTENDED:
158 result = self._update_eland_multi(stream)
159 elif self.eland_type == ELAND_EXPORT:
160 result = self._update_eland_export(stream)
162 errmsg = "Only support single/multi/extended eland files"
163 raise NotImplementedError(errmsg)
166 match, mapped, reads = result
167 self._match_codes += match
168 self._mapped_reads += mapped
171 def _update_eland_result(self, instream):
173 mapped_reads = MappedReads()
174 match_codes = MatchCodes()
176 for line in instream:
178 fields = line.split()
180 # match_codes[code] = match_codes.setdefault(code, 0) + 1
181 # the QC/NM etc codes are in the 3rd field and always present
182 match_codes[fields[2]] += 1
183 # ignore lines that don't have a fasta filename
186 fasta = self.genome_map.get(fields[6], fields[6])
187 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
188 return match_codes, mapped_reads, reads
190 def _update_eland_multi(self, instream):
191 """Summarize an eland_extend."""
195 mapped_reads = MappedReads()
196 match_codes = MatchCodes()
198 for line in instream:
200 fields = line.split()
201 # fields[2] = QC/NM/or number of matches
202 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
204 if score_type == ElandLane.SCORE_READ:
205 # when there are too many hits, eland writes a - where
206 # it would have put the list of hits.
207 # or in a different version of eland, it just leaves
208 # that column blank, and only outputs 3 fields.
209 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
212 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
214 return match_codes, mapped_reads, reads
216 def _update_eland_export(self, instream):
217 """Summarize a gerald export file."""
222 mapped_reads = MappedReads()
223 match_codes = MatchCodes()
225 for line in instream:
227 fields = line.split()
228 # fields[2] = QC/NM/or number of matches
229 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
231 if score_type == ElandLane.SCORE_UNRECOGNIZED:
232 # export files have three states for the match field
233 # QC code, count of multi-reads, or a single
234 # read location. The score_mapped_mismatches function
235 # only understands the first two types.
236 # if we get unrecognized, that implies the field is probably
238 code = self._count_mapped_export(mapped_reads,
239 fields[LOCATION_INDEX],
240 fields[DESCRIPTOR_INDEX])
241 match_codes[code] += 1
243 return match_codes, mapped_reads, reads
246 def _score_mapped_mismatches(self, match, match_codes):
247 """Update match_codes with eland map counts, or failure code.
249 Returns True if the read mapped, false if it was an error code.
251 groups = ElandLane.MATCH_COUNTS_RE.match(match)
253 # match is not of the form [\d]+:[\d]+:[\d]+
254 if match in match_codes:
255 # match is one quality control codes QC/NM etc
256 match_codes[match] += 1
257 return ElandLane.SCORE_QC
259 return ElandLane.SCORE_UNRECOGNIZED
261 # match is of the form [\d]+:[\d]+:[\d]+
263 zero_mismatches = int(groups.group(1))
264 one_mismatches = int(groups.group(2))
265 two_mismatches = int(groups.group(3))
267 if zero_mismatches == 1:
268 match_codes['U0'] += 1
269 elif zero_mismatches < 255:
270 match_codes['R0'] += zero_mismatches
272 if one_mismatches == 1:
273 match_codes['U1'] += 1
274 elif one_mismatches < 255:
275 match_codes['R1'] += one_mismatches
277 if two_mismatches == 1:
278 match_codes['U2'] += 1
279 elif two_mismatches < 255:
280 match_codes['R2'] += two_mismatches
282 return ElandLane.SCORE_READ
285 def _count_mapped_multireads(self, mapped_reads, match_string):
287 for match in match_string.split(','):
288 match_fragment = match.split(':')
289 if len(match_fragment) == 2:
290 chromo = match_fragment[0]
291 pos = match_fragment[1]
293 fasta = self.genome_map.get(chromo, chromo)
294 assert fasta is not None
295 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
298 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
299 """Count a read as defined in an export file
301 match_string contains the chromosome
302 descriptor contains the an ecoding of bases that match, mismatch,
304 returns the "best" match code
306 Currently "best" match code is ignoring the possibility of in-dels
308 chromo = match_string
309 fasta = self.genome_map.get(chromo, chromo)
310 assert fasta is not None
311 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
313 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
314 if len(mismatch_bases) == 0:
316 elif len(mismatch_bases) == 1:
322 def _get_mapped_reads(self):
323 if self._mapped_reads is None:
325 return self._mapped_reads
326 mapped_reads = property(_get_mapped_reads)
328 def _get_match_codes(self):
329 if self._match_codes is None:
331 return self._match_codes
332 match_codes = property(_get_match_codes)
334 def _get_no_match(self):
335 if self._mapped_reads is None:
337 return self._match_codes['NM']
338 no_match = property(_get_no_match,
339 doc="total reads that didn't match the target genome.")
341 def _get_no_match_percent(self):
342 return float(self.no_match)/self.reads * 100
343 no_match_percent = property(_get_no_match_percent,
344 doc="no match reads as percent of total")
346 def _get_qc_failed(self):
347 if self._mapped_reads is None:
349 return self._match_codes['QC']
350 qc_failed = property(_get_qc_failed,
351 doc="total reads that didn't match the target genome.")
353 def _get_qc_failed_percent(self):
354 return float(self.qc_failed)/self.reads * 100
355 qc_failed_percent = property(_get_qc_failed_percent,
356 doc="QC failed reads as percent of total")
358 def _get_unique_reads(self):
359 if self._mapped_reads is None:
362 for code in ['U0','U1','U2']:
363 sum += self._match_codes[code]
365 unique_reads = property(_get_unique_reads,
366 doc="total unique reads")
368 def _get_repeat_reads(self):
369 if self._mapped_reads is None:
372 for code in ['R0','R1','R2']:
373 sum += self._match_codes[code]
375 repeat_reads = property(_get_repeat_reads,
376 doc="total repeat reads")
378 def get_elements(self):
379 lane = ElementTree.Element(ElandLane.LANE,
381 unicode(ElandLane.XML_VERSION)})
382 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
383 sample_tag.text = self.sample_name
384 lane_tag = ElementTree.SubElement(lane, LANE_ID)
385 lane_tag.text = str(self.lane_id)
386 if self.end is not None:
387 end_tag = ElementTree.SubElement(lane, END)
388 end_tag.text = str(self.end)
389 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
390 for k, v in self.genome_map.items():
391 item = ElementTree.SubElement(
392 genome_map, GENOME_ITEM,
393 {'name':k, 'value':unicode(v)})
394 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
395 for k, v in self.mapped_reads.items():
396 item = ElementTree.SubElement(
397 mapped_reads, MAPPED_ITEM,
398 {'name':k, 'value':unicode(v)})
399 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
400 for k, v in self.match_codes.items():
401 item = ElementTree.SubElement(
402 match_codes, MATCH_ITEM,
403 {'name':k, 'value':unicode(v)})
404 reads = ElementTree.SubElement(lane, READS)
405 reads.text = unicode(self.reads)
409 def set_elements(self, tree):
410 if tree.tag != ElandLane.LANE:
411 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
414 self._mapped_reads = {}
415 self._match_codes = {}
418 tag = element.tag.lower()
419 if tag == SAMPLE_NAME.lower():
420 self.sample_name = element.text
421 elif tag == LANE_ID.lower():
422 self.lane_id = int(element.text)
423 elif tag == END.lower():
424 self.end = int(element.text)
425 elif tag == GENOME_MAP.lower():
426 for child in element:
427 name = child.attrib['name']
428 value = child.attrib['value']
429 self.genome_map[name] = value
430 elif tag == MAPPED_READS.lower():
431 for child in element:
432 name = child.attrib['name']
433 value = child.attrib['value']
434 self._mapped_reads[name] = int(value)
435 elif tag == MATCH_CODES.lower():
436 for child in element:
437 name = child.attrib['name']
438 value = int(child.attrib['value'])
439 self._match_codes[name] = value
440 elif tag == READS.lower():
441 self._reads = int(element.text)
443 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
446 class MatchCodes(collections.MutableMapping):
447 """Mapping to hold match counts -
448 supports combining two match count sets together
450 def __init__(self, initializer=None):
451 self.match_codes = {'NM':0, 'QC':0, 'RM':0,
452 'U0':0, 'U1':0, 'U2':0,
453 'R0':0, 'R1':0, 'R2':0,
456 if initializer is not None:
457 if not isinstance(initializer, collections.Mapping):
458 raise ValueError("Expected dictionary like class")
459 for key in initializer:
460 if key not in self.match_codes:
461 errmsg = "Initializer can only contain: %s"
462 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
463 self.match_codes[key] += initializer[key]
466 return iter(self.match_codes)
468 def __delitem__(self, key):
469 raise RuntimeError("delete not allowed")
471 def __getitem__(self, key):
472 return self.match_codes[key]
474 def __setitem__(self, key, value):
475 if key not in self.match_codes:
476 errmsg = "Unrecognized key, allowed values are: %s"
477 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
478 self.match_codes[key] = value
481 return len(self.match_codes)
483 def __add__(self, other):
484 if not isinstance(other, MatchCodes):
485 raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
487 newobj = MatchCodes(self)
488 for key, value in other.items():
489 newobj[key] = self.get(key, 0) + other[key]
494 class MappedReads(collections.MutableMapping):
495 """Mapping to hold mapped reads -
496 supports combining two mapped read sets together
498 def __init__(self, initializer=None):
499 self.mapped_reads = {}
501 if initializer is not None:
502 if not isinstance(initializer, collections.Mapping):
503 raise ValueError("Expected dictionary like class")
504 for key in initializer:
505 self[key] = self.setdefault(key, 0) + initializer[key]
508 return iter(self.mapped_reads)
510 def __delitem__(self, key):
511 del self.mapped_reads[key]
513 def __getitem__(self, key):
514 return self.mapped_reads[key]
516 def __setitem__(self, key, value):
517 self.mapped_reads[key] = value
520 return len(self.mapped_reads)
522 def __add__(self, other):
523 if not isinstance(other, MappedReads):
524 raise ValueError("Expected a MappedReads, got %s", str(type(other)))
526 newobj = MappedReads(self)
528 newobj[key] = self.get(key, 0) + other[key]
532 class SequenceLane(ResultLane):
534 LANE = 'SequenceLane'
535 SEQUENCE_TYPE = 'SequenceType'
540 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
542 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
544 self.sequence_type = None
545 super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
547 def _guess_sequence_type(self, pathname):
549 Determine if we have a scarf or fastq sequence file
551 f = open(pathname,'r')
556 # fastq starts with a @
557 self.sequence_type = SequenceLane.FASTQ_TYPE
559 self.sequence_type = SequenceLane.SCARF_TYPE
560 return self.sequence_type
564 Actually read the file and actually count the reads
566 # can't do anything if we don't have a file to process
567 if self.pathnames is None:
570 pathname = self.pathnames[-1]
571 if os.stat(pathname)[stat.ST_SIZE] == 0:
572 raise RuntimeError("Sequencing isn't done, try again later.")
574 self._guess_sequence_type(pathname)
576 LOGGER.info("summarizing results for %s" % (pathname))
579 for l in f.xreadlines():
583 if self.sequence_type == SequenceLane.SCARF_TYPE:
585 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
586 self._reads = lines / 4
588 errmsg = "This only supports scarf or fastq squence files"
589 raise NotImplementedError(errmsg)
591 def get_elements(self):
592 lane = ElementTree.Element(SequenceLane.LANE,
594 unicode(SequenceLane.XML_VERSION)})
595 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
596 sample_tag.text = self.sample_name
597 lane_tag = ElementTree.SubElement(lane, LANE_ID)
598 lane_tag.text = str(self.lane_id)
599 if self.end is not None:
600 end_tag = ElementTree.SubElement(lane, END)
601 end_tag.text = str(self.end)
602 reads = ElementTree.SubElement(lane, READS)
603 reads.text = unicode(self.reads)
604 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
605 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
609 def set_elements(self, tree):
610 if tree.tag != SequenceLane.LANE:
611 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
612 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
615 tag = element.tag.lower()
616 if tag == SAMPLE_NAME.lower():
617 self.sample_name = element.text
618 elif tag == LANE_ID.lower():
619 self.lane_id = int(element.text)
620 elif tag == END.lower():
621 self.end = int(element.text)
622 elif tag == READS.lower():
623 self._reads = int(element.text)
624 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
625 self.sequence_type = lookup_sequence_type.get(element.text, None)
627 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
629 class ELAND(collections.MutableMapping):
631 Summarize information from eland files
635 ELAND = 'ElandCollection'
641 def __init__(self, xml=None):
642 # we need information from the gerald config.xml
646 self.set_elements(xml)
648 def __getitem__(self, key):
649 if not isinstance(key, SampleKey):
650 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
651 return self.results[key]
653 def __setitem__(self, key, value):
654 if not isinstance(key, SampleKey):
655 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
656 self.results[key] = value
658 def __delitem__(self, key):
662 keys = self.results.iterkeys()
663 for k in sorted(keys):
667 return len(self.results)
669 def find_keys(self, search):
670 """Return results that match key"""
671 if not isinstance(search, SampleKey):
672 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
673 if not search.iswild:
675 for key in self.keys():
676 if key.matches(search): yield key
678 def get_elements(self):
679 root = ElementTree.Element(ELAND.ELAND,
680 {'version': unicode(ELAND.XML_VERSION)})
683 eland_lane = self[key].get_elements()
684 eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
685 eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
686 eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
687 root.append(eland_lane)
691 def set_elements(self, tree):
692 if tree.tag.lower() != ELAND.ELAND.lower():
693 raise ValueError('Expecting %s', ELAND.ELAND)
694 for element in list(tree):
695 lane_id = int(element.attrib[ELAND.LANE_ID])
696 end = int(element.attrib.get(ELAND.END, 0))
697 sample = element.attrib.get(ELAND.SAMPLE, 's')
698 if element.tag.lower() == ElandLane.LANE.lower():
699 lane = ElandLane(xml=element)
700 elif element.tag.lower() == SequenceLane.LANE.lower():
701 lane = SequenceLane(xml=element)
703 key = SampleKey(lane=lane_id, read=end+1, sample=sample)
704 self.results[key] = lane
707 def update_result_with_eland(self, gerald, key, pathnames,
709 # yes the lane_id is also being computed in ElandLane._update
710 # I didn't want to clutter up my constructor
711 # but I needed to persist the sample_name/lane_id for
712 # runfolder summary_report
713 names = [ os.path.split(p)[1] for p in pathnames]
714 LOGGER.info("Adding eland files %s" %(",".join(names),))
715 basedir = os.path.split(pathnames[0])[0]
716 gs_template = "{0}_*_L{1:03}_genomesize.xml"
718 os.path.join(basedir,
719 gs_template.format(key.sample, key.lane)))
722 genome_map = GenomeMap()
723 if genome_maps is not None:
724 genome_map = GenomeMap(genome_maps[key.lane])
725 elif len(genomesize) > 0:
726 LOGGER.info("Found {0}".format(genomesize))
727 genome_map.parse_genomesize(genomesize[0])
728 elif gerald is not None:
729 genome_dir = gerald.lanes[key].eland_genome
730 if genome_dir is not None:
731 genome_map.scan_genome_dir(genome_dir)
733 lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
735 self.results[key] = lane
737 def update_result_with_sequence(self, gerald, key, pathnames,
739 self.results[key] = SequenceLane(pathnames,
740 key.sample, key.lane, key.read)
743 def eland(gerald_dir, gerald=None, genome_maps=None):
745 eland_files = ElandMatches(e)
747 for path, dirnames, filenames in os.walk(gerald_dir):
748 for filename in filenames:
749 pathname = os.path.abspath(os.path.join(path, filename))
750 eland_files.add(pathname)
751 for key in eland_files:
752 eland_files.count(key, gerald, genome_maps)
756 class ElandMatches(collections.MutableMapping):
757 def __init__(self, eland_container):
758 # the order in patterns determines the preference for what
760 self.eland_container = eland_container
761 MAPPED = eland_container.update_result_with_eland
762 SEQUENCE = eland_container.update_result_with_sequence
764 sample = '(?P<sample>[^_]+)'
765 hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
766 hiLane = '_L(?P<lane>[\d]+)'
767 gaLane = '_(?P<lane>[\d]+)'
768 hiRead = '_R(?P<read>[\d]+)'
769 gaRead = '(_(?P<read>[\d])+)?'
770 part = '_(?P<part>[\d]+)'
771 ext = '(?P<extention>(\.bz2|\.gz)?)'
773 hiPrefix = sample + hiIndex + hiLane + hiRead + part
774 gaPrefix = sample + gaLane + gaRead
775 P = collections.namedtuple('Patterns', 'pattern counter priority')
777 P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
778 P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
779 P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
780 P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
781 P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
782 P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
785 self.file_priority = {}
786 self.file_counter = {}
788 def add(self, pathname):
789 """Add pathname to our set of files
791 path, filename = os.path.split(pathname)
793 for pattern, counter, priority in self.patterns:
794 rematch = re.match(pattern, filename)
795 if rematch is not None:
796 m = ElandMatch(pathname, counter, **rematch.groupdict())
797 key = m.make_samplekey()
798 old_priority = self.file_priority.get(key, 0)
799 if priority > old_priority:
800 self.file_sets[key] = set((m,))
801 self.file_counter[key] = counter
802 self.file_priority[key] = priority
803 elif priority == old_priority:
804 self.file_sets[key].add(m)
806 def count(self, key, gerald=None, genome_maps=None):
807 #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
808 counter = self.file_counter[key]
809 file_set = self.file_sets[key]
810 filenames = [ f.filename for f in file_set ]
811 return counter(gerald, key,
812 filenames, genome_maps)
815 return iter(self.file_sets)
818 return len(self.file_sets)
820 def __getitem__(self, key):
821 return self.file_sets[key]
823 def __setitem__(self, key, value):
824 if not isintance(value, set):
825 raise ValueError("Expected set for value")
826 self.file_sets[key] = value
828 def __delitem__(self, key):
829 del self.file_sets[key]
831 class ElandMatch(object):
832 def __init__(self, pathname, counter,
833 lane=None, read=None, extension=None,
834 sample=None, index=None, part=None, **kwargs):
835 self.filename = pathname
836 self.counter = counter
839 self.extension = extension
843 LOGGER.info("Found %s: L%s R%s Samp%s" % (
844 self.filename, self._lane, self._read, self.sample))
846 def make_samplekey(self):
847 read = self._read if self._read is not None else 1
848 return SampleKey(lane=self.lane, read=read, sample=self.sample)
851 if self._lane is not None:
852 return int(self._lane)
854 lane = property(_get_lane)
857 if self._read is not None:
858 return int(self._read)
860 read = property(_get_read)
863 if self._part is not None:
864 return int(self._part)
866 part = property(_get_part)
870 if self.sample is not None: name.append(self.sample)
871 if self._lane is not None: name.append('L%s' % (self.lane,))
872 if self._read is not None: name.append('R%s' % (self.read,))
873 if self._part is not None: name.append('P%s' % (self.part,))
874 return '<ElandMatch(' + "_".join(name) + ')>'
877 def extract_eland_sequence(instream, outstream, start, end):
879 Extract a chunk of sequence out of an eland file
881 for line in instream:
882 record = line.split()
884 result = [record[0], record[1][start:end]]
886 result = [record[0][start:end]]
887 outstream.write("\t".join(result))
888 outstream.write(os.linesep)
891 def main(cmdline=None):
892 """Run eland extraction against the specified gerald directory"""
893 from optparse import OptionParser
894 parser = OptionParser("%prog: <gerald dir>+")
895 opts, args = parser.parse_args(cmdline)
896 logging.basicConfig(level=logging.DEBUG)
898 LOGGER.info("Starting scan of %s" % (a,))
900 print(ElementTree.tostring(e.get_elements()))
904 if __name__ == "__main__":