Merge branch 'master' into django1.4
[htsworkflow.git] / htsworkflow / pipelines / genomemap.py
1 """Convert chromosome names to genome/chromsome names
2 """
3 from glob import glob
4 import os
5 import collections
6 from htsworkflow.pipelines import ElementTree
7
8 vldInfo = collections.namedtuple('vldInfo', 'name is_link')
9
10 class GenomeMap(collections.MutableMapping):
11     def __init__(self, items=None):
12         self._contigs = {}
13         if items is not None:
14             self._contigs.update(items)
15
16     def __len__(self):
17         return len(self._contigs)
18
19     def __iter__(self):
20         return self._contigs.iterkeys()
21
22     def __getitem__(self, name):
23         return self._contigs[name]
24
25     def __setitem__(self, name, value):
26         self._contigs[name] = value
27
28     def __delitem__(self, name):
29         del self._contigs[name]
30
31     def scan_genome_dir(self, genome_dir):
32         """Build map from a genome directory"""
33         genome = genome_dir.split(os.path.sep)[-1]
34         vld_files = glob(os.path.join(genome_dir, '*.vld'))
35         vld = [ vldInfo(f, os.path.islink(f)) for f in vld_files ]
36         self.build_map_from_dir(genome, vld)
37
38     def build_map_from_dir(self, genome_name, vld_list):
39         """Initialize contig genome map from list of vldInfo tuples
40         """
41         # build fasta to fasta file map
42         for vld_file, is_link in vld_list:
43             vld_file = os.path.realpath(vld_file)
44             path, vld_name = os.path.split(vld_file)
45             name, ext = os.path.splitext(vld_name)
46             if is_link:
47                 self._contigs[name] = name
48             else:
49                 self._contigs[name] = genome_name + '/' + name
50
51     def parse_genomesize(self, pathname):
52         """Parse genomesize.xml file
53         """
54         tree = ElementTree.parse(pathname)
55         self.build_map_from_element(tree.getroot())
56
57     def build_map_from_element(self, root):
58         """Build genome map from a parsed HiSeq genomesizes.xml file
59         """
60         sizes = {}
61         filenames = {}
62         for element in root:
63             contig = element.attrib['contigName']
64             filename = element.attrib['fileName']
65             filenames[contig] = filename
66             bases = int(element.attrib['totalBases'])
67             sizes[contig] = bases
68
69         genome = guess_genome(sizes)
70
71         for contig, basese in sizes.items():
72             name = filenames[contig]
73             self._contigs[name] = genome + '/' + name
74
75 def guess_genome(contig_sizes):
76     """Guess what genome we're using"""
77     # guess genome names
78
79     genomes = {'chr1': {197195432: 'mm9',
80                         247249719: 'hg18',
81                         200994015: 'galGal3',
82                         230208: 'saccer1',
83                         },
84                'chrI': {230218: 'sacCer3',
85                         15072421: 'elegans190'},
86                '1': {60348388: 'danRe6'},
87                'chr2L': { 23011544: 'dm3',
88                           49364326: 'Anopheles_gambiae'},
89
90                }
91
92     for key in genomes:
93         size = contig_sizes.get(key, 0)
94         if size in genomes[key]:
95             return genomes[key][size]
96
97     if len(contig_sizes) == 1:
98         return os.path.splitext(contig_sizes.keys()[0])[0]
99
100     raise RuntimeError("Unrecognized genome type, update detection code.")