13 from htsworkflow.pipelines.runfolder import ElementTree, LANE_LIST
14 from htsworkflow.pipelines.samplekey import SampleKey
15 from htsworkflow.util.ethelp import indent, flatten
16 from htsworkflow.util.opener import autoopen
18 LOGGER = logging.getLogger(__name__)
20 SAMPLE_NAME = 'SampleName'
25 GENOME_MAP = 'GenomeMap'
26 GENOME_ITEM = 'GenomeItem'
27 MAPPED_READS = 'MappedReads'
28 MAPPED_ITEM = 'MappedItem'
29 MATCH_CODES = 'MatchCodes'
38 class ResultLane(object):
40 Base class for result lanes
45 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
47 self.pathnames = pathnames
48 self.sample_name = sample
49 self.lane_id = lane_id
54 self.set_elements(xml)
58 Actually read the file and actually count the reads
63 if self._reads is None:
66 reads = property(_get_reads)
68 def get_elements(self):
74 name.append('L%s' % (self.lane_id,))
75 name.append('R%s' % (self.end,))
76 name.append('S%s' % (self.sample_name,))
78 return '<ResultLane(' + ",".join(name) + ')>'
80 class ElandLane(ResultLane):
82 Process an eland result file
86 MATCH_COUNTS_RE = re.compile("([\d]+):([\d]+):([\d]+)")
87 DESCRIPTOR_MISMATCH_RE = re.compile("[AGCT]")
88 DESCRIPTOR_INDEL_RE = re.compile("^[\dAGCT]$")
89 SCORE_UNRECOGNIZED = 0
93 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
94 genome_map=None, eland_type=None, xml=None):
95 super(ElandLane, self).__init__(pathnames, sample, lane_id, end)
97 self._mapped_reads = None
98 self._match_codes = None
100 if genome_map is None:
102 self.genome_map = genome_map
103 self.eland_type = None
106 self.set_elements(xml)
111 name.append('L%s' % (self.lane_id,))
112 name.append('R%s' % (self.end,))
113 name.append('S%s' % (self.sample_name,))
115 reads = str(self._reads) if self._reads is not None else 'Uncounted'
116 return '<ElandLane(' + ",".join(name) + ' = '+ reads + ')>'
118 def _guess_eland_type(self, pathname):
119 if self.eland_type is None:
120 # attempt autodetect eland file type
121 pathn, name = os.path.split(pathname)
122 if re.search('result', name):
123 self.eland_type = ELAND_SINGLE
124 elif re.search('multi', name):
125 self.eland_type = ELAND_MULTI
126 elif re.search('extended', name):
127 self.eland_type = ELAND_EXTENDED
128 elif re.search('export', name):
129 self.eland_type = ELAND_EXPORT
131 self.eland_type = ELAND_SINGLE
135 Actually read the file and actually count the reads
137 # can't do anything if we don't have a file to process
138 if self.pathnames is None or len(self.pathnames) == 0:
140 pathname = self.pathnames[-1]
141 self._guess_eland_type(pathname)
143 if os.stat(pathname)[stat.ST_SIZE] == 0:
144 raise RuntimeError("Eland isn't done, try again later.")
146 LOGGER.debug("summarizing results for %s" % (pathname))
147 self._match_codes = MatchCodes()
148 self._mapped_reads = MappedReads()
151 for pathname in self.pathnames:
152 stream = autoopen(pathname, 'r')
153 if self.eland_type == ELAND_SINGLE:
154 result = self._update_eland_result(stream)
155 elif self.eland_type == ELAND_MULTI or \
156 self.eland_type == ELAND_EXTENDED:
157 result = self._update_eland_multi(stream)
158 elif self.eland_type == ELAND_EXPORT:
159 result = self._update_eland_export(stream)
161 errmsg = "Only support single/multi/extended eland files"
162 raise NotImplementedError(errmsg)
165 match, mapped, reads = result
166 self._match_codes += match
167 self._mapped_reads += mapped
170 def _update_eland_result(self, instream):
172 mapped_reads = MappedReads()
173 match_codes = MatchCodes()
175 for line in instream:
177 fields = line.split()
179 # match_codes[code] = match_codes.setdefault(code, 0) + 1
180 # the QC/NM etc codes are in the 3rd field and always present
181 match_codes[fields[2]] += 1
182 # ignore lines that don't have a fasta filename
185 fasta = self.genome_map.get(fields[6], fields[6])
186 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
187 return match_codes, mapped_reads, reads
189 def _update_eland_multi(self, instream):
190 """Summarize an eland_extend."""
194 mapped_reads = MappedReads()
195 match_codes = MatchCodes()
197 for line in instream:
199 fields = line.split()
200 # fields[2] = QC/NM/or number of matches
201 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
203 if score_type == ElandLane.SCORE_READ:
204 # when there are too many hits, eland writes a - where
205 # it would have put the list of hits.
206 # or in a different version of eland, it just leaves
207 # that column blank, and only outputs 3 fields.
208 if len(fields) < 4 or fields[LOCATION_INDEX] == '-':
211 self._count_mapped_multireads(mapped_reads, fields[LOCATION_INDEX])
213 return match_codes, mapped_reads, reads
215 def _update_eland_export(self, instream):
216 """Summarize a gerald export file."""
221 mapped_reads = MappedReads()
222 match_codes = MatchCodes()
224 for line in instream:
226 fields = line.split()
227 # fields[2] = QC/NM/or number of matches
228 score_type = self._score_mapped_mismatches(fields[MATCH_INDEX],
230 if score_type == ElandLane.SCORE_UNRECOGNIZED:
231 # export files have three states for the match field
232 # QC code, count of multi-reads, or a single
233 # read location. The score_mapped_mismatches function
234 # only understands the first two types.
235 # if we get unrecognized, that implies the field is probably
237 code = self._count_mapped_export(mapped_reads,
238 fields[LOCATION_INDEX],
239 fields[DESCRIPTOR_INDEX])
240 match_codes[code] += 1
242 return match_codes, mapped_reads, reads
245 def _score_mapped_mismatches(self, match, match_codes):
246 """Update match_codes with eland map counts, or failure code.
248 Returns True if the read mapped, false if it was an error code.
250 groups = ElandLane.MATCH_COUNTS_RE.match(match)
252 # match is not of the form [\d]+:[\d]+:[\d]+
253 if match in match_codes:
254 # match is one quality control codes QC/NM etc
255 match_codes[match] += 1
256 return ElandLane.SCORE_QC
258 return ElandLane.SCORE_UNRECOGNIZED
260 # match is of the form [\d]+:[\d]+:[\d]+
262 zero_mismatches = int(groups.group(1))
263 one_mismatches = int(groups.group(2))
264 two_mismatches = int(groups.group(3))
266 if zero_mismatches == 1:
267 match_codes['U0'] += 1
268 elif zero_mismatches < 255:
269 match_codes['R0'] += zero_mismatches
271 if one_mismatches == 1:
272 match_codes['U1'] += 1
273 elif one_mismatches < 255:
274 match_codes['R1'] += one_mismatches
276 if two_mismatches == 1:
277 match_codes['U2'] += 1
278 elif two_mismatches < 255:
279 match_codes['R2'] += two_mismatches
281 return ElandLane.SCORE_READ
284 def _count_mapped_multireads(self, mapped_reads, match_string):
286 for match in match_string.split(','):
287 match_fragment = match.split(':')
288 if len(match_fragment) == 2:
289 chromo = match_fragment[0]
290 pos = match_fragment[1]
292 fasta = self.genome_map.get(chromo, chromo)
293 assert fasta is not None
294 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
297 def _count_mapped_export(self, mapped_reads, match_string, descriptor):
298 """Count a read as defined in an export file
300 match_string contains the chromosome
301 descriptor contains the an ecoding of bases that match, mismatch,
303 returns the "best" match code
305 Currently "best" match code is ignoring the possibility of in-dels
307 chromo = match_string
308 fasta = self.genome_map.get(chromo, chromo)
309 assert fasta is not None
310 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
312 mismatch_bases = ElandLane.DESCRIPTOR_MISMATCH_RE.findall(descriptor)
313 if len(mismatch_bases) == 0:
315 elif len(mismatch_bases) == 1:
321 def _get_mapped_reads(self):
322 if self._mapped_reads is None:
324 return self._mapped_reads
325 mapped_reads = property(_get_mapped_reads)
327 def _get_match_codes(self):
328 if self._match_codes is None:
330 return self._match_codes
331 match_codes = property(_get_match_codes)
333 def _get_no_match(self):
334 if self._mapped_reads is None:
336 return self._match_codes['NM']
337 no_match = property(_get_no_match,
338 doc="total reads that didn't match the target genome.")
340 def _get_no_match_percent(self):
341 return float(self.no_match)/self.reads * 100
342 no_match_percent = property(_get_no_match_percent,
343 doc="no match reads as percent of total")
345 def _get_qc_failed(self):
346 if self._mapped_reads is None:
348 return self._match_codes['QC']
349 qc_failed = property(_get_qc_failed,
350 doc="total reads that didn't match the target genome.")
352 def _get_qc_failed_percent(self):
353 return float(self.qc_failed)/self.reads * 100
354 qc_failed_percent = property(_get_qc_failed_percent,
355 doc="QC failed reads as percent of total")
357 def _get_unique_reads(self):
358 if self._mapped_reads is None:
361 for code in ['U0','U1','U2']:
362 sum += self._match_codes[code]
364 unique_reads = property(_get_unique_reads,
365 doc="total unique reads")
367 def _get_repeat_reads(self):
368 if self._mapped_reads is None:
371 for code in ['R0','R1','R2']:
372 sum += self._match_codes[code]
374 repeat_reads = property(_get_repeat_reads,
375 doc="total repeat reads")
377 def get_elements(self):
378 lane = ElementTree.Element(ElandLane.LANE,
380 unicode(ElandLane.XML_VERSION)})
381 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
382 sample_tag.text = self.sample_name
383 lane_tag = ElementTree.SubElement(lane, LANE_ID)
384 lane_tag.text = str(self.lane_id)
385 if self.end is not None:
386 end_tag = ElementTree.SubElement(lane, END)
387 end_tag.text = str(self.end)
388 genome_map = ElementTree.SubElement(lane, GENOME_MAP)
389 for k, v in self.genome_map.items():
390 item = ElementTree.SubElement(
391 genome_map, GENOME_ITEM,
392 {'name':k, 'value':unicode(v)})
393 mapped_reads = ElementTree.SubElement(lane, MAPPED_READS)
394 for k, v in self.mapped_reads.items():
395 item = ElementTree.SubElement(
396 mapped_reads, MAPPED_ITEM,
397 {'name':k, 'value':unicode(v)})
398 match_codes = ElementTree.SubElement(lane, MATCH_CODES)
399 for k, v in self.match_codes.items():
400 item = ElementTree.SubElement(
401 match_codes, MATCH_ITEM,
402 {'name':k, 'value':unicode(v)})
403 reads = ElementTree.SubElement(lane, READS)
404 reads.text = unicode(self.reads)
408 def set_elements(self, tree):
409 if tree.tag != ElandLane.LANE:
410 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
413 self._mapped_reads = {}
414 self._match_codes = {}
417 tag = element.tag.lower()
418 if tag == SAMPLE_NAME.lower():
419 self.sample_name = element.text
420 elif tag == LANE_ID.lower():
421 self.lane_id = int(element.text)
422 elif tag == END.lower():
423 self.end = int(element.text)
424 elif tag == GENOME_MAP.lower():
425 for child in element:
426 name = child.attrib['name']
427 value = child.attrib['value']
428 self.genome_map[name] = value
429 elif tag == MAPPED_READS.lower():
430 for child in element:
431 name = child.attrib['name']
432 value = child.attrib['value']
433 self._mapped_reads[name] = int(value)
434 elif tag == MATCH_CODES.lower():
435 for child in element:
436 name = child.attrib['name']
437 value = int(child.attrib['value'])
438 self._match_codes[name] = value
439 elif tag == READS.lower():
440 self._reads = int(element.text)
442 LOGGER.warn("ElandLane unrecognized tag %s" % (element.tag,))
445 class MatchCodes(collections.MutableMapping):
446 """Mapping to hold match counts -
447 supports combining two match count sets together
449 def __init__(self, initializer=None):
450 self.match_codes = {'NM':0, 'QC':0, 'RM':0,
451 'U0':0, 'U1':0, 'U2':0,
452 'R0':0, 'R1':0, 'R2':0,
455 if initializer is not None:
456 if not isinstance(initializer, collections.Mapping):
457 raise ValueError("Expected dictionary like class")
458 for key in initializer:
459 if key not in self.match_codes:
460 errmsg = "Initializer can only contain: %s"
461 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
462 self.match_codes[key] += initializer[key]
465 return iter(self.match_codes)
467 def __delitem__(self, key):
468 raise RuntimeError("delete not allowed")
470 def __getitem__(self, key):
471 return self.match_codes[key]
473 def __setitem__(self, key, value):
474 if key not in self.match_codes:
475 errmsg = "Unrecognized key, allowed values are: %s"
476 raise ValueError(errmsg % (",".join(self.match_codes.keys())))
477 self.match_codes[key] = value
480 return len(self.match_codes)
482 def __add__(self, other):
483 if not isinstance(other, MatchCodes):
484 raise ValueError("Expected a MatchCodes, got %s", str(type(other)))
486 newobj = MatchCodes(self)
487 for key, value in other.items():
488 newobj[key] = self.get(key, 0) + other[key]
493 class MappedReads(collections.MutableMapping):
494 """Mapping to hold mapped reads -
495 supports combining two mapped read sets together
497 def __init__(self, initializer=None):
498 self.mapped_reads = {}
500 if initializer is not None:
501 if not isinstance(initializer, collections.Mapping):
502 raise ValueError("Expected dictionary like class")
503 for key in initializer:
504 self[key] = self.setdefault(key, 0) + initializer[key]
507 return iter(self.mapped_reads)
509 def __delitem__(self, key):
510 del self.mapped_reads[key]
512 def __getitem__(self, key):
513 return self.mapped_reads[key]
515 def __setitem__(self, key, value):
516 self.mapped_reads[key] = value
519 return len(self.mapped_reads)
521 def __add__(self, other):
522 if not isinstance(other, MappedReads):
523 raise ValueError("Expected a MappedReads, got %s", str(type(other)))
525 newobj = MappedReads(self)
527 newobj[key] = self.get(key, 0) + other[key]
531 class SequenceLane(ResultLane):
533 LANE = 'SequenceLane'
534 SEQUENCE_TYPE = 'SequenceType'
539 SEQUENCE_DESCRIPTION = { NONE_TYPE: 'None', SCARF_TYPE: 'SCARF', FASTQ_TYPE: 'FASTQ' }
541 def __init__(self, pathnames=None, sample=None, lane_id=None, end=None,
543 self.sequence_type = None
544 super(SequenceLane, self).__init__(pathnames, sample, lane_id, end, xml)
546 def _guess_sequence_type(self, pathname):
548 Determine if we have a scarf or fastq sequence file
550 f = open(pathname,'r')
555 # fastq starts with a @
556 self.sequence_type = SequenceLane.FASTQ_TYPE
558 self.sequence_type = SequenceLane.SCARF_TYPE
559 return self.sequence_type
563 Actually read the file and actually count the reads
565 # can't do anything if we don't have a file to process
566 if self.pathnames is None:
569 pathname = self.pathnames[-1]
570 if os.stat(pathname)[stat.ST_SIZE] == 0:
571 raise RuntimeError("Sequencing isn't done, try again later.")
573 self._guess_sequence_type(pathname)
575 LOGGER.info("summarizing results for %s" % (pathname))
578 for l in f.xreadlines():
582 if self.sequence_type == SequenceLane.SCARF_TYPE:
584 elif self.sequence_type == SequenceLane.FASTQ_TYPE:
585 self._reads = lines / 4
587 errmsg = "This only supports scarf or fastq squence files"
588 raise NotImplementedError(errmsg)
590 def get_elements(self):
591 lane = ElementTree.Element(SequenceLane.LANE,
593 unicode(SequenceLane.XML_VERSION)})
594 sample_tag = ElementTree.SubElement(lane, SAMPLE_NAME)
595 sample_tag.text = self.sample_name
596 lane_tag = ElementTree.SubElement(lane, LANE_ID)
597 lane_tag.text = str(self.lane_id)
598 if self.end is not None:
599 end_tag = ElementTree.SubElement(lane, END)
600 end_tag.text = str(self.end)
601 reads = ElementTree.SubElement(lane, READS)
602 reads.text = unicode(self.reads)
603 sequence_type = ElementTree.SubElement(lane, SequenceLane.SEQUENCE_TYPE)
604 sequence_type.text = unicode(SequenceLane.SEQUENCE_DESCRIPTION[self.sequence_type])
608 def set_elements(self, tree):
609 if tree.tag != SequenceLane.LANE:
610 raise ValueError('Exptecting %s' % (SequenceLane.LANE,))
611 lookup_sequence_type = dict([ (v,k) for k,v in SequenceLane.SEQUENCE_DESCRIPTION.items()])
614 tag = element.tag.lower()
615 if tag == SAMPLE_NAME.lower():
616 self.sample_name = element.text
617 elif tag == LANE_ID.lower():
618 self.lane_id = int(element.text)
619 elif tag == END.lower():
620 self.end = int(element.text)
621 elif tag == READS.lower():
622 self._reads = int(element.text)
623 elif tag == SequenceLane.SEQUENCE_TYPE.lower():
624 self.sequence_type = lookup_sequence_type.get(element.text, None)
626 LOGGER.warn("SequenceLane unrecognized tag %s" % (element.tag,))
628 class ELAND(collections.MutableMapping):
630 Summarize information from eland files
634 ELAND = 'ElandCollection'
640 def __init__(self, xml=None):
641 # we need information from the gerald config.xml
645 self.set_elements(xml)
647 def __getitem__(self, key):
648 if not isinstance(key, SampleKey):
649 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
650 return self.results[key]
652 def __setitem__(self, key, value):
653 if not isinstance(key, SampleKey):
654 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
655 self.results[key] = value
657 def __delitem__(self, key):
661 keys = self.results.iterkeys()
662 for k in sorted(keys):
666 return len(self.results)
668 def find_keys(self, search):
669 """Return results that match key"""
670 if not isinstance(search, SampleKey):
671 raise ValueError("Key must be a %s" % (str(type(SampleKey))))
672 if not search.iswild:
674 for key in self.keys():
675 if key.matches(search): yield key
677 def get_elements(self):
678 root = ElementTree.Element(ELAND.ELAND,
679 {'version': unicode(ELAND.XML_VERSION)})
682 eland_lane = self[key].get_elements()
683 eland_lane.attrib[ELAND.END] = unicode(self[key].end-1)
684 eland_lane.attrib[ELAND.LANE_ID] = unicode(self[key].lane_id)
685 eland_lane.attrib[ELAND.SAMPLE] = unicode(self[key].sample_name)
686 root.append(eland_lane)
690 def set_elements(self, tree):
691 if tree.tag.lower() != ELAND.ELAND.lower():
692 raise ValueError('Expecting %s', ELAND.ELAND)
693 for element in list(tree):
694 lane_id = int(element.attrib[ELAND.LANE_ID])
695 end = int(element.attrib.get(ELAND.END, 0))
696 sample = element.attrib.get(ELAND.SAMPLE, 's')
697 if element.tag.lower() == ElandLane.LANE.lower():
698 lane = ElandLane(xml=element)
699 elif element.tag.lower() == SequenceLane.LANE.lower():
700 lane = SequenceLane(xml=element)
702 key = SampleKey(lane=lane_id, read=end+1, sample=sample)
703 self.results[key] = lane
706 def update_result_with_eland(self, gerald, key, pathnames,
708 # yes the lane_id is also being computed in ElandLane._update
709 # I didn't want to clutter up my constructor
710 # but I needed to persist the sample_name/lane_id for
711 # runfolder summary_report
712 names = [ os.path.split(p)[1] for p in pathnames]
713 LOGGER.info("Adding eland files %s" %(",".join(names),))
714 basedir = os.path.split(pathnames[0])[0]
715 gs_template = "{0}_*_L{1:03}_genomesize.xml"
717 os.path.join(basedir,
718 gs_template.format(key.sample, key.lane)))
722 if genome_maps is not None:
723 genome_map = genome_maps[key.lane]
724 elif len(genomesize) > 0:
725 print "Found {0}".format(genomesize)
726 genome_map = build_genome_size_map(genomesize[0])
727 elif gerald is not None:
728 genome_dir = gerald.lanes[key].eland_genome
729 if genome_dir is not None:
730 genome_map = build_genome_fasta_map(genome_dir)
732 lane = ElandLane(pathnames, key.sample, key.lane, key.read, genome_map)
734 self.results[key] = lane
736 def update_result_with_sequence(self, gerald, key, pathnames,
738 self.results[key] = SequenceLane(pathnames,
739 key.sample, key.lane, key.read)
742 def eland(gerald_dir, gerald=None, genome_maps=None):
744 eland_files = ElandMatches(e)
746 for path, dirnames, filenames in os.walk(gerald_dir):
747 for filename in filenames:
748 pathname = os.path.abspath(os.path.join(path, filename))
749 eland_files.add(pathname)
750 for key in eland_files:
751 eland_files.count(key, gerald, genome_maps)
755 class ElandMatches(collections.MutableMapping):
756 def __init__(self, eland_container):
757 # the order in patterns determines the preference for what
759 self.eland_container = eland_container
760 MAPPED = eland_container.update_result_with_eland
761 SEQUENCE = eland_container.update_result_with_sequence
763 sample = '(?P<sample>[^_]+)'
764 hiIndex = '_(?P<index>(NoIndex|[AGCT])+)'
765 hiLane = '_L(?P<lane>[\d]+)'
766 gaLane = '_(?P<lane>[\d]+)'
767 hiRead = '_R(?P<read>[\d]+)'
768 gaRead = '(_(?P<read>[\d])+)?'
769 part = '_(?P<part>[\d]+)'
770 ext = '(?P<extention>(\.bz2|\.gz)?)'
772 hiPrefix = sample + hiIndex + hiLane + hiRead + part
773 gaPrefix = sample + gaLane + gaRead
774 P = collections.namedtuple('Patterns', 'pattern counter priority')
776 P(hiPrefix +'_export.txt' + ext, MAPPED, 6),
777 P(gaPrefix + '_eland_result.txt' + ext, MAPPED, 5),
778 P(gaPrefix + '_eland_extended.txt' + ext, MAPPED, 4),
779 P(gaPrefix + '_eland_multi.txt' + ext, MAPPED, 3),
780 P(gaPrefix + '_export.txt' + ext, MAPPED, 2),
781 P(gaPrefix + '_sequence.txt' + ext, SEQUENCE, 1),
784 self.file_priority = {}
785 self.file_counter = {}
787 def add(self, pathname):
788 """Add pathname to our set of files
790 path, filename = os.path.split(pathname)
792 for pattern, counter, priority in self.patterns:
793 rematch = re.match(pattern, filename)
794 if rematch is not None:
795 m = ElandMatch(pathname, counter, **rematch.groupdict())
796 key = m.make_samplekey()
797 old_priority = self.file_priority.get(key, 0)
798 if priority > old_priority:
799 self.file_sets[key] = set((m,))
800 self.file_counter[key] = counter
801 self.file_priority[key] = priority
802 elif priority == old_priority:
803 self.file_sets[key].add(m)
805 def count(self, key, gerald=None, genome_maps=None):
806 #previous sig: gerald, e.results, lane_id, end, pathnames, genome_maps
807 counter = self.file_counter[key]
808 file_set = self.file_sets[key]
809 filenames = [ f.filename for f in file_set ]
810 return counter(gerald, key,
811 filenames, genome_maps)
814 return iter(self.file_sets)
817 return len(self.file_sets)
819 def __getitem__(self, key):
820 return self.file_sets[key]
822 def __setitem__(self, key, value):
823 if not isintance(value, set):
824 raise ValueError("Expected set for value")
825 self.file_sets[key] = value
827 def __delitem__(self, key):
828 del self.file_sets[key]
830 class ElandMatch(object):
831 def __init__(self, pathname, counter,
832 lane=None, read=None, extension=None,
833 sample=None, index=None, part=None, **kwargs):
834 self.filename = pathname
835 self.counter = counter
838 self.extension = extension
842 LOGGER.info("Found %s: L%s R%s Samp%s" % (
843 self.filename, self._lane, self._read, self.sample))
845 def make_samplekey(self):
846 read = self._read if self._read is not None else 1
847 return SampleKey(lane=self.lane, read=read, sample=self.sample)
850 if self._lane is not None:
851 return int(self._lane)
853 lane = property(_get_lane)
856 if self._read is not None:
857 return int(self._read)
859 read = property(_get_read)
862 if self._part is not None:
863 return int(self._part)
865 part = property(_get_part)
869 if self.sample is not None: name.append(self.sample)
870 if self._lane is not None: name.append('L%s' % (self.lane,))
871 if self._read is not None: name.append('R%s' % (self.read,))
872 if self._part is not None: name.append('P%s' % (self.part,))
873 return '<ElandMatch(' + "_".join(name) + ')>'
875 def build_genome_fasta_map(genome_dir):
876 # build fasta to fasta file map
877 LOGGER.info("Building genome map")
878 genome = genome_dir.split(os.path.sep)[-1]
880 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
882 if os.path.islink(vld_file):
884 vld_file = os.path.realpath(vld_file)
885 path, vld_name = os.path.split(vld_file)
886 name, ext = os.path.splitext(vld_name)
888 fasta_map[name] = name
890 fasta_map[name] = os.path.join(genome, name)
893 def build_genome_size_map(pathname):
894 """Guess what genome we're using"""
896 tree = ElementTree.parse(pathname)
897 for element in tree.getroot():
898 name = element.attrib['contigName']
899 bases = int(element.attrib['totalBases'])
903 if sizes.get('chr1', 0) == 197195432:
905 elif sizes.get('chr1', 0) == 247249719:
907 elif sizes.get('chrI', 0) == 230218:
909 elif len(sizes) == 1:
910 genome = os.path.splitext(sizes.keys()[0])[0]
912 raise RuntimeError("Unrecognized genome type, update detection")
915 for k,v in sizes.items():
916 fasta_map[k] = genome + '/' + k
920 def extract_eland_sequence(instream, outstream, start, end):
922 Extract a chunk of sequence out of an eland file
924 for line in instream:
925 record = line.split()
927 result = [record[0], record[1][start:end]]
929 result = [record[0][start:end]]
930 outstream.write("\t".join(result))
931 outstream.write(os.linesep)
934 def main(cmdline=None):
935 """Run eland extraction against the specified gerald directory"""
936 from optparse import OptionParser
937 parser = OptionParser("%prog: <gerald dir>+")
938 opts, args = parser.parse_args(cmdline)
939 logging.basicConfig(level=logging.DEBUG)
941 LOGGER.info("Starting scan of %s" % (a,))
943 print ElementTree.tostring(e.get_elements())
947 if __name__ == "__main__":