13 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.util.ethelp import indent, flatten
16 from htsworkflow.util.opener import autoopen
18 LOGGER = logging.getLogger(__name__)
20 SAMPLE_NAME = 'SampleName'
25 GENOME_MAP = 'GenomeMap'
26 GENOME_ITEM = 'GenomeItem'
27 MAPPED_READS = 'MappedReads'
28 MAPPED_ITEM = 'MappedItem'
29 MATCH_CODES = 'MatchCodes'
38 class ResultLane(object):
40 Base class for result lanes
45 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
47 self.pathnames = pathnames
48 self.sample_name = sample
49 self.lane_id = lane_id
54 self.set_elements(xml)
58 Actually read the file and actually count the reads
63 if self._reads is None:
66 reads = property(_get_reads)
68 def get_elements(self):
74 name.append('L%s' % (self.lane_id,))
75 name.append('R%s' % (self.end,))
76 name.append('S%s' % (self.sample_name,))
78 return '<ResultLane(' + ",".join(name) + ')>'
80 class ElandLane(ResultLane):
82 Process an eland result file
86 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
87 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
88 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
89 SCORE_UNRECOGNIZED = 0
93 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
94 genome_map=None, eland_type=None, xml=None):
95 super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
97 self._mapped_reads = None
98 self._match_codes = None
100 if genome_map is None:
102 self.genome_map = genome_map
103 self.eland_type = None
106 self.set_elements(xml)
111 name.append('L%s' % (self.lane_id,))
112 name.append('R%s' % (self.end,))
113 name.append('S%s' % (self.sample_name,))
115 reads = str(self._reads) if self._reads is not None else 'Uncounted'
116 return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
118 def _guess_eland_type(self, pathname):
119 if self.eland_type is None:
120 # attempt autodetect eland file type
121 pathn, name = os.path.split(pathname)
122 if re.search('result', name):
123 self.eland_type = ELAND_SINGLE
124 elif re.search('multi', name):
125 self.eland_type = ELAND_MULTI
126 elif re.search('extended', name):
127 self.eland_type = ELAND_EXTENDED
128 elif re.search('export', name):
129 self.eland_type = ELAND_EXPORT
131 self.eland_type = ELAND_SINGLE
135 Actually read the file and actually count the reads
137 # can't do anything if we don't have a file to process
138 if self.pathnames is None or len(self.pathnames) == 0:
140 pathname = self.pathnames[-1]
141 self._guess_eland_type(pathname)
143 if os.stat(pathname)[stat.ST_SIZE] == 0:
144 raise RuntimeError("Eland isn't done, try again later.")
146 LOGGER.debug("summarizing results for %s" % (pathname))
147 self._match_codes = MatchCodes()
148 self._mapped_reads = MappedReads()
151 for pathname in self.pathnames:
152 stream = autoopen(pathname, 'r')
153 if self.eland_type == ELAND_SINGLE:
154 result = self._update_eland_result(stream)
155 elif self.eland_type == ELAND_MULTI or \
156 self.eland_type == ELAND_EXTENDED:
157 result = self._update_eland_multi(stream)
158 elif self.eland_type == ELAND_EXPORT:
159 result = self._update_eland_export(stream)
161 errmsg = "Only support single/multi/extended eland files"
162 raise NotImplementedError(errmsg)
165 match, mapped, reads = result
166 self._match_codes += match
167 self._mapped_reads += mapped
170 def _update_eland_result(self, instream):
172 mapped_reads = MappedReads()
173 match_codes = MatchCodes()
175 for line in instream:
177 fields = line.split()
179 # match_codes[code] = match_codes.setdefault(code, 0) + 1
180 # the QC/NM etc codes are in the 3rd field and always present
181 match_codes[fields[2]] += 1
182 # ignore lines that don't have a fasta filename
185 fasta = self.genome_map.get(fields[6], fields[6])
186 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
187 return match_codes, mapped_reads, reads
189 def _update_eland_multi(self, instream):
190 """Summarize an eland_extend."""
194 mapped_reads = MappedReads()
195 match_codes = MatchCodes()
197 for line in instream:
199 fields = line.split()
200 # fields[2] = QC/NM/or number of matches
201 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
203 if score_type == ElandLane.SCORE_READ:
204 # when there are too many hits, eland writes a - where
205 # it would have put the list of hits.
206 # or in a different version of eland, it just leaves
207 # that column blank, and only outputs 3 fields.
208 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
211 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
213 return match_codes, mapped_reads, reads
215 def _update_eland_export(self, instream):
216 """Summarize a gerald export file."""
221 mapped_reads = MappedReads()
222 match_codes = MatchCodes()
224 for line in instream:
226 fields = line.split()
227 # fields[2] = QC/NM/or number of matches
228 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
230 if score_type == ElandLane.SCORE_UNRECOGNIZED:
231 # export files have three states for the match field
232 # QC code, count of multi-reads, or a single
233 # read location. The score_mapped_mismatches function
234 # only understands the first two types.
235 # if we get unrecognized, that implies the field is probably
237 code = self._count_mapped_export(mapped_reads,
238 fields[LOCATION_INDEX],
239 fields[DESCRIPTOR_INDEX])
240 match_codes[code] += 1
242 return match_codes, mapped_reads, reads
245 def _score_mapped_mismatches(self, match, match_codes):
246 """Update match_codes with eland map counts, or failure code.
248 Returns True if the read mapped, false if it was an error code.
250 groups = ElandLane.MATCH_COUNTS_RE.match(match)
252 # match is not of the form [\d]+:[\d]+:[\d]+
253 if match in match_codes:
254 # match is one quality control codes QC/NM etc
255 match_codes[match] += 1
256 return ElandLane.SCORE_QC
258 return ElandLane.SCORE_UNRECOGNIZED
260 # match is of the form [\d]+:[\d]+:[\d]+
262 zero_mismatches = int(groups.group(1))
263 one_mismatches = int(groups.group(2))
264 two_mismatches = int(groups.group(3))
266 if zero_mismatches == 1:
267 match_codes['U0'] += 1
268 elif zero_mismatches < 255:
269 match_codes['R0'] += zero_mismatches
271 if one_mismatches == 1:
272 match_codes['U1'] += 1
273 elif one_mismatches < 255:
274 match_codes['R1'] += one_mismatches
276 if two_mismatches == 1:
277 match_codes['U2'] += 1
278 elif two_mismatches < 255:
279 match_codes['R2'] += two_mismatches
281 return ElandLane.SCORE_READ
284 def _count_mapped_multireads(self, mapped_reads, match_string):
286 for match in match_string.split(','):
287 match_fragment = match.split(':')
288 if len(match_fragment) == 2:
289 chromo = match_fragment[0]
290 pos = match_fragment[1]
292 fasta = self.genome_map.get(chromo, chromo)
293 assert fasta is not None
294 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
297 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
298 """Count a read as defined in an export file
300 match_string contains the chromosome
301 descriptor contains the an ecoding of bases that match, mismatch,
303 returns the "best" match code
305 Currently "best" match code is ignoring the possibility of in-dels
307 chromo = match_string
308 fasta = self.genome_map.get(chromo, chromo)
309 assert fasta is not None
310 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
312 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
313 if len(mismatch_bases) == 0:
315 elif len(mismatch_bases) == 1:
321 def _get_mapped_reads(self):
322 if self._mapped_reads is None:
324 return self._mapped_reads
325 mapped_reads = property(_get_mapped_reads)
327 def _get_match_codes(self):
328 if self._match_codes is None:
330 return self._match_codes
331 match_codes = property(_get_match_codes)
333 def _get_no_match(self):
334 if self._mapped_reads is None:
336 return self._match_codes['NM']
337 no_match = property(_get_no_match,
338 doc="total reads that didn't match the target genome.")
340 def _get_no_match_percent(self):
341 return float(self.no_match)/self.reads * 100
342 no_match_percent = property(_get_no_match_percent,
343 doc="no match reads as percent of total")
345 def _get_qc_failed(self):
346 if self._mapped_reads is None:
348 return self._match_codes['QC']
349 qc_failed = property(_get_qc_failed,
350 doc="total reads that didn't match the target genome.")
352 def _get_qc_failed_percent(self):
353 return float(self.qc_failed)/self.reads * 100
354 qc_failed_percent = property(_get_qc_failed_percent,
355 doc="QC failed reads as percent of total")
357 def _get_unique_reads(self):
358 if self._mapped_reads is None:
361 for code in ['U0','U1','U2']:
362 sum += self._match_codes[code]
364 unique_reads = property(_get_unique_reads,
365 doc="total unique reads")
367 def _get_repeat_reads(self):
368 if self._mapped_reads is None:
371 for code in ['R0','R1','R2']:
372 sum += self._match_codes[code]
374 repeat_reads = property(_get_repeat_reads,
375 doc="total repeat reads")
377 def get_elements(self):
378 lane = ElementTree.Element(ElandLane.LANE,
380 unicode(ElandLane.XML_VERSION)})
381 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
382 sample_tag.text = self.sample_name
383 lane_tag = ElementTree.SubElement(lane, LANE_ID)
384 lane_tag.text = str(self.lane_id)
385 if self.end is not None:
386 end_tag = ElementTree.SubElement(lane, END)
387 end_tag.text = str(self.end)
388 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
389 for k, v in self.genome_map.items():
390 item = ElementTree.SubElement(
391 genome_map, GENOME_ITEM,
392 {'name':k, 'value':unicode(v)})
393 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
394 for k, v in self.mapped_reads.items():
395 item = ElementTree.SubElement(
396 mapped_reads, MAPPED_ITEM,
397 {'name':k, 'value':unicode(v)})
398 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
399 for k, v in self.match_codes.items():
400 item = ElementTree.SubElement(
401 match_codes, MATCH_ITEM,
402 {'name':k, 'value':unicode(v)})
403 reads = ElementTree.SubElement(lane, READS)
404 reads.text = unicode(self.reads)
408 def set_elements(self, tree):
409 if tree.tag != ElandLane.LANE:
410 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
413 self._mapped_reads = {}
414 self._match_codes = {}
417 tag = element.tag.lower()
418 if tag == SAMPLE_NAME.lower():
419 self.sample_name = element.text
420 elif tag == LANE_ID.lower():
421 self.lane_id = int(element.text)
422 elif tag == END.lower():
423 self.end = int(element.text)
424 elif tag == GENOME_MAP.lower():
425 for child in element:
426 name = child.attrib['name']
427 value = child.attrib['value']
428 self.genome_map[name] = value
429 elif tag == MAPPED_READS.lower():
430 for child in element:
431 name = child.attrib['name']
432 value = child.attrib['value']
433 self._mapped_reads[name] = int(value)
434 elif tag == MATCH_CODES.lower():
435 for child in element:
436 name = child.attrib['name']
437 value = int(child.attrib['value'])
438 self._match_codes[name] = value
439 elif tag == READS.lower():
440 self._reads = int(element.text)
442 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
445 class MatchCodes(collections.MutableMapping):
446 """Mapping to hold match counts -
447 supports combining two match count sets together
449 def __init__(self, initializer=None):
450 self.match_codes = {'NM':0, 'QC':0, 'RM':0,
451 'U0':0, 'U1':0, 'U2':0,
452 'R0':0, 'R1':0, 'R2':0,
455 if initializer is not None:
456 if not isinstance(initializer, collections.Mapping):
457 raise ValueError("Expected dictionary like class")
458 for key in initializer:
459 if key not in self.match_codes:
460 errmsg = "Initializer can only contain: %s"
461 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
462 self.match_codes[key] += initializer[key]
465 return iter(self.match_codes)
467 def __delitem__(self, key):
468 raise RuntimeError("delete not allowed")
470 def __getitem__(self, key):
471 return self.match_codes[key]
473 def __setitem__(self, key, value):
474 if key not in self.match_codes:
475 errmsg = "Unrecognized key, allowed values are: %s"
476 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
477 self.match_codes[key] = value
480 return len(self.match_codes)
482 def __add__(self, other):
483 if not isinstance(other, MatchCodes):
484 raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
486 newobj = MatchCodes(self)
487 for key, value in other.items():
488 newobj[key] = self.get(key, 0) + other[key]
493 class MappedReads(collections.MutableMapping):
494 """Mapping to hold mapped reads -
495 supports combining two mapped read sets together
497 def __init__(self, initializer=None):
498 self.mapped_reads = {}
500 if initializer is not None:
501 if not isinstance(initializer, collections.Mapping):
502 raise ValueError("Expected dictionary like class")
503 for key in initializer:
504 self[key] = self.setdefault(key, 0) + initializer[key]
507 return iter(self.mapped_reads)
509 def __delitem__(self, key):
510 del self.mapped_reads[key]
512 def __getitem__(self, key):
513 return self.mapped_reads[key]
515 def __setitem__(self, key, value):
516 self.mapped_reads[key] = value
519 return len(self.mapped_reads)
521 def __add__(self, other):
522 if not isinstance(other, MappedReads):
523 raise ValueError("Expected a MappedReads, got %s", str(type(other)))
525 newobj = MappedReads(self)
527 newobj[key] = self.get(key, 0) + other[key]
531 class SequenceLane(ResultLane):
533 LANE = 'SequenceLane'
534 SEQUENCE_TYPE = 'SequenceType'
539 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
541 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
543 self.sequence_type = None
544 super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
546 def _guess_sequence_type(self, pathname):
548 Determine if we have a scarf or fastq sequence file
550 f = open(pathname,'r')
555 # fastq starts with a @
556 self.sequence_type = SequenceLane.FASTQ_TYPE
558 self.sequence_type = SequenceLane.SCARF_TYPE
559 return self.sequence_type
563 Actually read the file and actually count the reads
565 # can't do anything if we don't have a file to process
566 if self.pathnames is None:
569 pathname = self.pathnames[-1]
570 if os.stat(pathname)[stat.ST_SIZE] == 0:
571 raise RuntimeError("Sequencing isn't done, try again later.")
573 self._guess_sequence_type(pathname)
575 LOGGER.info("summarizing results for %s" % (pathname))
578 for l in f.xreadlines():
582 if self.sequence_type == SequenceLane.SCARF_TYPE:
584 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
585 self._reads = lines / 4
587 errmsg = "This only supports scarf or fastq squence files"
588 raise NotImplementedError(errmsg)
590 def get_elements(self):
591 lane = ElementTree.Element(SequenceLane.LANE,
593 unicode(SequenceLane.XML_VERSION)})
594 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
595 sample_tag.text = self.sample_name
596 lane_tag = ElementTree.SubElement(lane, LANE_ID)
597 lane_tag.text = str(self.lane_id)
598 if self.end is not None:
599 end_tag = ElementTree.SubElement(lane, END)
600 end_tag.text = str(self.end)
601 reads = ElementTree.SubElement(lane, READS)
602 reads.text = unicode(self.reads)
603 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
604 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
608 def set_elements(self, tree):
609 if tree.tag != SequenceLane.LANE:
610 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
611 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
614 tag = element.tag.lower()
615 if tag == SAMPLE_NAME.lower():
616 self.sample_name = element.text
617 elif tag == LANE_ID.lower():
618 self.lane_id = int(element.text)
619 elif tag == END.lower():
620 self.end = int(element.text)
621 elif tag == READS.lower():
622 self._reads = int(element.text)
623 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
624 self.sequence_type = lookup_sequence_type.get(element.text, None)
626 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
628 class ELAND(collections.MutableMapping):
630 Summarize information from eland files
634 ELAND = 'ElandCollection'
640 def __init__(self, xml=None):
641 # we need information from the gerald config.xml
642 self.results = collections.OrderedDict()
645 self.set_elements(xml)
647 def __getitem__(self, key):
648 if not isinstance(key, SampleKey):
649 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
650 return self.results[key]
652 def __setitem__(self, key, value):
653 if not isinstance(key, SampleKey):
654 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
655 self.results[key] = value
657 def __delitem__(self, key):
661 return self.results.iterkeys()
664 return len(self.results)
666 def find_keys(self, search):
667 """Return results that match key"""
668 if not isinstance(search, SampleKey):
669 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
670 if not search.iswild:
672 for key in self.keys():
673 if key.matches(search): yield key
675 def get_elements(self):
676 root = ElementTree.Element(ELAND.ELAND,
677 {'version': unicode(ELAND.XML_VERSION)})
680 eland_lane = self[key].get_elements()
681 eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
682 eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
683 eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
684 root.append(eland_lane)
688 def set_elements(self, tree):
689 if tree.tag.lower() != ELAND.ELAND.lower():
690 raise ValueError('Expecting %s', ELAND.ELAND)
691 for element in list(tree):
692 lane_id = int(element.attrib[ELAND.LANE_ID])
693 end = int(element.attrib.get(ELAND.END, 0))
694 sample = element.attrib.get(ELAND.SAMPLE, 's')
695 if element.tag.lower() == ElandLane.LANE.lower():
696 lane = ElandLane(xml=element)
697 elif element.tag.lower() == SequenceLane.LANE.lower():
698 lane = SequenceLane(xml=element)
700 key = SampleKey(lane=lane_id, read=end+1, sample=sample)
701 self.results[key] = lane
704 def update_result_with_eland(self, gerald, key, pathnames,
706 # yes the lane_id is also being computed in ElandLane._update
707 # I didn't want to clutter up my constructor
708 # but I needed to persist the sample_name/lane_id for
709 # runfolder summary_report
710 names = [ os.path.split(p)[1] for p in pathnames]
711 LOGGER.info("Adding eland files %s" %(",".join(names),))
714 if genome_maps is not None:
715 genome_map = genome_maps[key.lane]
716 elif gerald is not None:
717 genome_dir = gerald.lanes[key.lane].eland_genome
718 if genome_dir is not None:
719 genome_map = build_genome_fasta_map(genome_dir)
721 lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
723 self.results[key] = lane
725 def update_result_with_sequence(self, gerald, key, pathnames,
727 self.results[key] = SequenceLane(pathnames,
728 key.sample, key.lane, key.read)
731 def eland(gerald_dir, gerald=None, genome_maps=None):
733 eland_files = ElandMatches(e)
735 for path, dirnames, filenames in os.walk(gerald_dir):
736 for filename in filenames:
737 pathname = os.path.abspath(os.path.join(path, filename))
738 eland_files.add(pathname)
739 for key in eland_files:
740 eland_files.count(key, gerald, genome_maps)
744 class ElandMatches(collections.MutableMapping):
745 def __init__(self, eland_container):
746 # the order in patterns determines the preference for what
748 self.eland_container = eland_container
749 MAPPED = eland_container.update_result_with_eland
750 SEQUENCE = eland_container.update_result_with_sequence
752 sample = '(?P<sample>[^_]+)'
753 hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
754 hiLane = '_L(?P<lane>[\d]+)'
755 gaLane = '_(?P<lane>[\d]+)'
756 hiRead = '_R(?P<read>[\d]+)'
757 gaRead = '(_(?P<read>[\d])+)?'
758 part = '_(?P<part>[\d]+)'
759 ext = '(?P<extention>(\.bz2|\.gz)?)'
761 hiPrefix = sample + hiIndex + hiLane + hiRead + part
762 gaPrefix = sample + gaLane + gaRead
763 P = collections.namedtuple('Patterns', 'pattern counter priority')
765 P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
766 P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
767 P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
768 P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
769 P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
770 P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
773 self.file_priority = {}
774 self.file_counter = {}
776 def add(self, pathname):
777 """Add pathname to our set of files
779 path, filename = os.path.split(pathname)
781 for pattern, counter, priority in self.patterns:
782 rematch = re.match(pattern, filename)
783 if rematch is not None:
784 m = ElandMatch(pathname, counter, **rematch.groupdict())
785 key = m.make_samplekey()
786 old_priority = self.file_priority.get(key, 0)
787 if priority > old_priority:
788 self.file_sets[key] = set((m,))
789 self.file_counter[key] = counter
790 self.file_priority[key] = priority
791 elif priority == old_priority:
792 self.file_sets[key].add(m)
794 def count(self, key, gerald=None, genome_maps=None):
795 #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
796 counter = self.file_counter[key]
797 file_set = self.file_sets[key]
798 filenames = [ f.filename for f in file_set ]
799 return counter(gerald, key,
800 filenames, genome_maps)
803 return iter(self.file_sets)
806 return len(self.file_sets)
808 def __getitem__(self, key):
809 return self.file_sets[key]
811 def __setitem__(self, key, value):
812 if not isintance(value, set):
813 raise ValueError("Expected set for value")
814 self.file_sets[key] = value
816 def __delitem__(self, key):
817 del self.file_sets[key]
819 class ElandMatch(object):
820 def __init__(self, pathname, counter,
821 lane=None, read=None, extension=None,
822 sample=None, index=None, part=None, **kwargs):
823 self.filename = pathname
824 self.counter = counter
827 self.extension = extension
831 LOGGER.info("Found %s: L%s R%s Samp%s" % (
832 self.filename, self._lane, self._read, self.sample))
834 def make_samplekey(self):
835 read = self._read if self._read is not None else 1
836 return SampleKey(lane=self.lane, read=read, sample=self.sample)
839 if self._lane is not None:
840 return int(self._lane)
842 lane = property(_get_lane)
845 if self._read is not None:
846 return int(self._read)
848 read = property(_get_read)
851 if self._part is not None:
852 return int(self._part)
854 part = property(_get_part)
858 if self.sample is not None: name.append(self.sample)
859 if self._lane is not None: name.append('L%s' % (self.lane,))
860 if self._read is not None: name.append('R%s' % (self.read,))
861 if self._part is not None: name.append('P%s' % (self.part,))
862 return '<ElandMatch(' + "_".join(name) + ')>'
864 def build_genome_fasta_map(genome_dir):
865 # build fasta to fasta file map
866 LOGGER.info("Building genome map")
867 genome = genome_dir.split(os.path.sep)[-1]
869 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
871 if os.path.islink(vld_file):
873 vld_file = os.path.realpath(vld_file)
874 path, vld_name = os.path.split(vld_file)
875 name, ext = os.path.splitext(vld_name)
877 fasta_map[name] = name
879 fasta_map[name] = os.path.join(genome, name)
883 def extract_eland_sequence(instream, outstream, start, end):
885 Extract a chunk of sequence out of an eland file
887 for line in instream:
888 record = line.split()
890 result = [record[0], record[1][start:end]]
892 result = [record[0][start:end]]
893 outstream.write("\t".join(result))
894 outstream.write(os.linesep)
897 def main(cmdline=None):
898 """Run eland extraction against the specified gerald directory"""
899 from optparse import OptionParser
900 parser = OptionParser("%prog: <gerald dir>+")
901 opts, args = parser.parse_args(cmdline)
902 logging.basicConfig(level=logging.DEBUG)
904 LOGGER.info("Starting scan of %s" % (a,))
906 print ElementTree.tostring(e.get_elements())
910 if __name__ == "__main__":