Create a class to convert contig names into genome/contig names
[htsworkflow.git] / htsworkflow / pipelines / genomemap.py
1 """Convert chromosome names to genome/chromsome names
2 """
3 from glob import glob
4 import os
5 import collections
6
7 vldInfo = collections.namedtuple('vldInfo', 'name is_link')
8
9 class GenomeMap(collections.MutableMapping):
10     def __init__(self, items=None):
11         self._contigs = {}
12         if items is not None:
13             self._contigs.update(items)
14
15     def __len__(self):
16         return len(self._contigs)
17
18     def __iter__(self):
19         return self._contigs.iterkeys()
20
21     def __getitem__(self, name):
22         return self._contigs[name]
23
24     def __setitem__(self, name, value):
25         self._contigs[name] = value
26
27     def __delitem__(self, name):
28         del self._contigs[name]
29
30     def scan_genome_dir(self, genome_dir):
31         """Build map from a genome directory"""
32         genome = genome_dir.split(os.path.sep)[-1]
33         vld_files = glob(os.path.join(genome_dir, '*.vld'))
34         vld = [ vldInfo(f, os.path.islink(f)) for f in vld_files ]
35         self.build_map_from_dir(genome, vld)
36
37     def build_map_from_dir(self, genome_name, vld_list):
38         """Initialize contig genome map from list of vldInfo tuples
39         """
40         # build fasta to fasta file map
41         for vld_file, is_link in vld_list:
42             vld_file = os.path.realpath(vld_file)
43             path, vld_name = os.path.split(vld_file)
44             name, ext = os.path.splitext(vld_name)
45             if is_link:
46                 self._contigs[name] = name
47             else:
48                 self._contigs[name] = genome_name + '/' + name
49
50     def parse_genomesize(self, pathname):
51         """Parse genomesize.xml file
52         """
53         tree = ElementTree.parse(pathname)
54         self.build_map_from_element(tree.getroot())
55
56     def build_map_from_element(self, root):
57         """Build genome map from a parsed HiSeq genomesizes.xml file
58         """
59         sizes = {}
60         filenames = {}
61         for element in root:
62             contig = element.attrib['contigName']
63             filename = element.attrib['fileName']
64             filenames[contig] = filename
65             bases = int(element.attrib['totalBases'])
66             sizes[contig] = bases
67
68         genome = guess_genome(sizes)
69
70         for contig, basese in sizes.items():
71             name = filenames[contig]
72             self._contigs[name] = genome + '/' + name
73
74 def guess_genome(contig_sizes):
75     """Guess what genome we're using"""
76     # guess genome names
77
78     genomes = {'chr1': {197195432: 'mm9',
79                         247249719: 'hg19',
80                         },
81                'chrI': {230218, 'sacCer3'},
82                }
83
84     for key in genomes:
85         size = contig_sizes.get(key, 0)
86         if size in genomes[key]:
87             return genomes[key][size]
88
89     if len(contig_sizes) == 1:
90         return os.path.splitext(contig_sizes.keys()[0])[0]
91
92     raise RuntimeError("Unrecognized genome type, update detection code.")