1 """Convert chromosome names to genome/chromsome names
6 from htsworkflow.pipelines import ElementTree
8 vldInfo = collections.namedtuple('vldInfo', 'name is_link')
10 class GenomeMap(collections.MutableMapping):
11 def __init__(self, items=None):
14 self._contigs.update(items)
17 return len(self._contigs)
20 return self._contigs.iterkeys()
22 def __getitem__(self, name):
23 return self._contigs[name]
25 def __setitem__(self, name, value):
26 self._contigs[name] = value
28 def __delitem__(self, name):
29 del self._contigs[name]
31 def scan_genome_dir(self, genome_dir):
32 """Build map from a genome directory"""
33 genome = genome_dir.split(os.path.sep)[-1]
34 vld_files = glob(os.path.join(genome_dir, '*.vld'))
35 vld = [ vldInfo(f, os.path.islink(f)) for f in vld_files ]
36 self.build_map_from_dir(genome, vld)
38 def build_map_from_dir(self, genome_name, vld_list):
39 """Initialize contig genome map from list of vldInfo tuples
41 # build fasta to fasta file map
42 for vld_file, is_link in vld_list:
43 vld_file = os.path.realpath(vld_file)
44 path, vld_name = os.path.split(vld_file)
45 name, ext = os.path.splitext(vld_name)
47 self._contigs[name] = name
49 self._contigs[name] = genome_name + '/' + name
51 def parse_genomesize(self, pathname):
52 """Parse genomesize.xml file
54 tree = ElementTree.parse(pathname)
55 self.build_map_from_element(tree.getroot())
57 def build_map_from_element(self, root):
58 """Build genome map from a parsed HiSeq genomesizes.xml file
63 contig = element.attrib['contigName']
64 filename = element.attrib['fileName']
65 filenames[contig] = filename
66 bases = int(element.attrib['totalBases'])
69 genome = guess_genome(sizes)
71 for contig, basese in sizes.items():
72 name = filenames[contig]
73 self._contigs[name] = genome + '/' + name
75 def guess_genome(contig_sizes):
76 """Guess what genome we're using"""
79 genomes = {'chr1': {197195432: 'mm9',
84 'chrI': {230218: 'sacCer3',
85 15072421: 'elegans190'},
86 '1': {60348388: 'danRe6'},
87 'chr2L': { 23011544: 'dm3',
88 49364326: 'Anopheles_gambiae'},
93 size = contig_sizes.get(key, 0)
94 if size in genomes[key]:
95 return genomes[key][size]
97 if len(contig_sizes) == 1:
98 return os.path.splitext(contig_sizes.keys()[0])[0]
100 raise RuntimeError("Unrecognized genome type, update detection code.")