The htsworkflow.pipelines.gerald module was getting to large
[htsworkflow.git] / htsworkflow / pipelines / eland.py
1 """
2 Analyze ELAND files
3 """
4
5 from glob import glob
6 import logging
7 import os
8 import stat
9
10 from htsworkflow.pipelines.runfolder import ElementTree
11 from htsworkflow.util.ethelp import indent, flatten
12 from htsworkflow.util.opener import autoopen
13
14 class ElandLane(object):
15     """
16     Process an eland result file
17     """
18     XML_VERSION = 1
19     LANE = 'ElandLane'
20     SAMPLE_NAME = 'SampleName'
21     LANE_ID = 'LaneID'
22     GENOME_MAP = 'GenomeMap'
23     GENOME_ITEM = 'GenomeItem'
24     MAPPED_READS = 'MappedReads'
25     MAPPED_ITEM = 'MappedItem'
26     MATCH_CODES = 'MatchCodes'
27     MATCH_ITEM = 'Code'
28     READS = 'Reads'
29
30     def __init__(self, pathname=None, genome_map=None, xml=None):
31         self.pathname = pathname
32         self._sample_name = None
33         self._lane_id = None
34         self._reads = None
35         self._mapped_reads = None
36         self._match_codes = None
37         if genome_map is None:
38             genome_map = {}
39         self.genome_map = genome_map
40
41         if xml is not None:
42             self.set_elements(xml)
43
44     def _update(self):
45         """
46         Actually read the file and actually count the reads
47         """
48         # can't do anything if we don't have a file to process
49         if self.pathname is None:
50             return
51
52         if os.stat(self.pathname)[stat.ST_SIZE] == 0:
53             raise RuntimeError("Eland isn't done, try again later.")
54
55         logging.info("summarizing results for %s" % (self.pathname))
56         reads = 0
57         mapped_reads = {}
58
59         match_codes = {'NM':0, 'QC':0, 'RM':0,
60                        'U0':0, 'U1':0, 'U2':0,
61                        'R0':0, 'R1':0, 'R2':0,
62                       }
63         for line in autoopen(self.pathname,'r'):
64             reads += 1
65             fields = line.split()
66             # code = fields[2]
67             # match_codes[code] = match_codes.setdefault(code, 0) + 1
68             # the QC/NM etc codes are in the 3rd field and always present
69             match_codes[fields[2]] += 1
70             # ignore lines that don't have a fasta filename
71             if len(fields) < 7:
72                 continue
73             fasta = self.genome_map.get(fields[6], fields[6])
74             mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
75         self._match_codes = match_codes
76         self._mapped_reads = mapped_reads
77         self._reads = reads
78
79     def _update_name(self):
80         # extract the sample name
81         if self.pathname is None:
82             return
83
84         path, name = os.path.split(self.pathname)
85         split_name = name.split('_')
86         self._sample_name = split_name[0]
87         self._lane_id = split_name[1]
88
89     def _get_sample_name(self):
90         if self._sample_name is None:
91             self._update_name()
92         return self._sample_name
93     sample_name = property(_get_sample_name)
94
95     def _get_lane_id(self):
96         if self._lane_id is None:
97             self._update_name()
98         return self._lane_id
99     lane_id = property(_get_lane_id)
100
101     def _get_reads(self):
102         if self._reads is None:
103             self._update()
104         return self._reads
105     reads = property(_get_reads)
106
107     def _get_mapped_reads(self):
108         if self._mapped_reads is None:
109             self._update()
110         return self._mapped_reads
111     mapped_reads = property(_get_mapped_reads)
112
113     def _get_match_codes(self):
114         if self._match_codes is None:
115             self._update()
116         return self._match_codes
117     match_codes = property(_get_match_codes)
118
119     def get_elements(self):
120         lane = ElementTree.Element(ElandLane.LANE,
121                                    {'version':
122                                     unicode(ElandLane.XML_VERSION)})
123         sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
124         sample_tag.text = self.sample_name
125         lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
126         lane_tag.text = self.lane_id
127         genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
128         for k, v in self.genome_map.items():
129             item = ElementTree.SubElement(
130                 genome_map, ElandLane.GENOME_ITEM,
131                 {'name':k, 'value':unicode(v)})
132         mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
133         for k, v in self.mapped_reads.items():
134             item = ElementTree.SubElement(
135                 mapped_reads, ElandLane.MAPPED_ITEM,
136                 {'name':k, 'value':unicode(v)})
137         match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
138         for k, v in self.match_codes.items():
139             item = ElementTree.SubElement(
140                 match_codes, ElandLane.MATCH_ITEM,
141                 {'name':k, 'value':unicode(v)})
142         reads = ElementTree.SubElement(lane, ElandLane.READS)
143         reads.text = unicode(self.reads)
144
145         return lane
146
147     def set_elements(self, tree):
148         if tree.tag != ElandLane.LANE:
149             raise ValueError('Exptecting %s' % (ElandLane.LANE,))
150
151         # reset dictionaries
152         self._mapped_reads = {}
153         self._match_codes = {}
154
155         for element in tree:
156             tag = element.tag.lower()
157             if tag == ElandLane.SAMPLE_NAME.lower():
158                 self._sample_name = element.text
159             elif tag == ElandLane.LANE_ID.lower():
160                 self._lane_id = element.text
161             elif tag == ElandLane.GENOME_MAP.lower():
162                 for child in element:
163                     name = child.attrib['name']
164                     value = child.attrib['value']
165                     self.genome_map[name] = value
166             elif tag == ElandLane.MAPPED_READS.lower():
167                 for child in element:
168                     name = child.attrib['name']
169                     value = child.attrib['value']
170                     self._mapped_reads[name] = int(value)
171             elif tag == ElandLane.MATCH_CODES.lower():
172                 for child in element:
173                     name = child.attrib['name']
174                     value = int(child.attrib['value'])
175                     self._match_codes[name] = value
176             elif tag == ElandLane.READS.lower():
177                 self._reads = int(element.text)
178             else:
179                 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
180
181 class ELAND(object):
182     """
183     Summarize information from eland files
184     """
185     XML_VERSION = 1
186
187     ELAND = 'ElandCollection'
188     LANE = 'Lane'
189     LANE_ID = 'id'
190
191     def __init__(self, xml=None):
192         # we need information from the gerald config.xml
193         self.results = {}
194
195         if xml is not None:
196             self.set_elements(xml)
197
198     def __len__(self):
199         return len(self.results)
200
201     def keys(self):
202         return self.results.keys()
203
204     def values(self):
205         return self.results.values()
206
207     def items(self):
208         return self.results.items()
209
210     def __getitem__(self, key):
211         return self.results[key]
212
213     def get_elements(self):
214         root = ElementTree.Element(ELAND.ELAND,
215                                    {'version': unicode(ELAND.XML_VERSION)})
216         for lane_id, lane in self.results.items():
217             eland_lane = lane.get_elements()
218             eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
219             root.append(eland_lane)
220         return root
221
222     def set_elements(self, tree):
223         if tree.tag.lower() != ELAND.ELAND.lower():
224             raise ValueError('Expecting %s', ELAND.ELAND)
225         for element in list(tree):
226             lane_id = element.attrib[ELAND.LANE_ID]
227             lane = ElandLane(xml=element)
228             self.results[lane_id] = lane
229
230 def eland(basedir, gerald=None, genome_maps=None):
231     e = ELAND()
232
233     file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
234     if len(file_list) == 0:
235         # lets handle compressed eland files too
236         file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
237
238     for pathname in file_list:
239         # yes the lane_id is also being computed in ElandLane._update
240         # I didn't want to clutter up my constructor
241         # but I needed to persist the sample_name/lane_id for
242         # runfolder summary_report
243         path, name = os.path.split(pathname)
244         logging.info("Adding eland file %s" %(name,))
245         split_name = name.split('_')
246         lane_id = split_name[1]
247
248         if genome_maps is not None:
249             genome_map = genome_maps[lane_id]
250         elif gerald is not None:
251             genome_dir = gerald.lanes[lane_id].eland_genome
252             genome_map = build_genome_fasta_map(genome_dir)
253         else:
254             genome_map = {}
255
256         eland_result = ElandLane(pathname, genome_map)
257         e.results[lane_id] = eland_result
258     return e
259
260 def build_genome_fasta_map(genome_dir):
261     # build fasta to fasta file map
262     logging.info("Building genome map")
263     genome = genome_dir.split(os.path.sep)[-1]
264     fasta_map = {}
265     for vld_file in glob(os.path.join(genome_dir, '*.vld')):
266         is_link = False
267         if os.path.islink(vld_file):
268             is_link = True
269         vld_file = os.path.realpath(vld_file)
270         path, vld_name = os.path.split(vld_file)
271         name, ext = os.path.splitext(vld_name)
272         if is_link:
273             fasta_map[name] = name
274         else:
275             fasta_map[name] = os.path.join(genome, name)
276     return fasta_map
277
278
279 def extract_eland_sequence(instream, outstream, start, end):
280     """
281     Extract a chunk of sequence out of an eland file
282     """
283     for line in instream:
284         record = line.split()
285         if len(record) > 1:
286             result = [record[0], record[1][start:end]]
287         else:
288             result = [record[0][start:end]]
289         outstream.write("\t".join(result))
290         outstream.write(os.linesep)
291