10 from htsworkflow.pipelines.runfolder import ElementTree
11 from htsworkflow.util.ethelp import indent, flatten
12 from htsworkflow.util.opener import autoopen
14 class ElandLane(object):
16 Process an eland result file
20 SAMPLE_NAME = 'SampleName'
22 GENOME_MAP = 'GenomeMap'
23 GENOME_ITEM = 'GenomeItem'
24 MAPPED_READS = 'MappedReads'
25 MAPPED_ITEM = 'MappedItem'
26 MATCH_CODES = 'MatchCodes'
30 def __init__(self, pathname=None, genome_map=None, xml=None):
31 self.pathname = pathname
32 self._sample_name = None
35 self._mapped_reads = None
36 self._match_codes = None
37 if genome_map is None:
39 self.genome_map = genome_map
42 self.set_elements(xml)
46 Actually read the file and actually count the reads
48 # can't do anything if we don't have a file to process
49 if self.pathname is None:
52 if os.stat(self.pathname)[stat.ST_SIZE] == 0:
53 raise RuntimeError("Eland isn't done, try again later.")
55 logging.info("summarizing results for %s" % (self.pathname))
59 match_codes = {'NM':0, 'QC':0, 'RM':0,
60 'U0':0, 'U1':0, 'U2':0,
61 'R0':0, 'R1':0, 'R2':0,
63 for line in autoopen(self.pathname,'r'):
67 # match_codes[code] = match_codes.setdefault(code, 0) + 1
68 # the QC/NM etc codes are in the 3rd field and always present
69 match_codes[fields[2]] += 1
70 # ignore lines that don't have a fasta filename
73 fasta = self.genome_map.get(fields[6], fields[6])
74 mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
75 self._match_codes = match_codes
76 self._mapped_reads = mapped_reads
79 def _update_name(self):
80 # extract the sample name
81 if self.pathname is None:
84 path, name = os.path.split(self.pathname)
85 split_name = name.split('_')
86 self._sample_name = split_name[0]
87 self._lane_id = split_name[1]
89 def _get_sample_name(self):
90 if self._sample_name is None:
92 return self._sample_name
93 sample_name = property(_get_sample_name)
95 def _get_lane_id(self):
96 if self._lane_id is None:
99 lane_id = property(_get_lane_id)
101 def _get_reads(self):
102 if self._reads is None:
105 reads = property(_get_reads)
107 def _get_mapped_reads(self):
108 if self._mapped_reads is None:
110 return self._mapped_reads
111 mapped_reads = property(_get_mapped_reads)
113 def _get_match_codes(self):
114 if self._match_codes is None:
116 return self._match_codes
117 match_codes = property(_get_match_codes)
119 def get_elements(self):
120 lane = ElementTree.Element(ElandLane.LANE,
122 unicode(ElandLane.XML_VERSION)})
123 sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
124 sample_tag.text = self.sample_name
125 lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
126 lane_tag.text = self.lane_id
127 genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
128 for k, v in self.genome_map.items():
129 item = ElementTree.SubElement(
130 genome_map, ElandLane.GENOME_ITEM,
131 {'name':k, 'value':unicode(v)})
132 mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
133 for k, v in self.mapped_reads.items():
134 item = ElementTree.SubElement(
135 mapped_reads, ElandLane.MAPPED_ITEM,
136 {'name':k, 'value':unicode(v)})
137 match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
138 for k, v in self.match_codes.items():
139 item = ElementTree.SubElement(
140 match_codes, ElandLane.MATCH_ITEM,
141 {'name':k, 'value':unicode(v)})
142 reads = ElementTree.SubElement(lane, ElandLane.READS)
143 reads.text = unicode(self.reads)
147 def set_elements(self, tree):
148 if tree.tag != ElandLane.LANE:
149 raise ValueError('Exptecting %s' % (ElandLane.LANE,))
152 self._mapped_reads = {}
153 self._match_codes = {}
156 tag = element.tag.lower()
157 if tag == ElandLane.SAMPLE_NAME.lower():
158 self._sample_name = element.text
159 elif tag == ElandLane.LANE_ID.lower():
160 self._lane_id = element.text
161 elif tag == ElandLane.GENOME_MAP.lower():
162 for child in element:
163 name = child.attrib['name']
164 value = child.attrib['value']
165 self.genome_map[name] = value
166 elif tag == ElandLane.MAPPED_READS.lower():
167 for child in element:
168 name = child.attrib['name']
169 value = child.attrib['value']
170 self._mapped_reads[name] = int(value)
171 elif tag == ElandLane.MATCH_CODES.lower():
172 for child in element:
173 name = child.attrib['name']
174 value = int(child.attrib['value'])
175 self._match_codes[name] = value
176 elif tag == ElandLane.READS.lower():
177 self._reads = int(element.text)
179 logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
183 Summarize information from eland files
187 ELAND = 'ElandCollection'
191 def __init__(self, xml=None):
192 # we need information from the gerald config.xml
196 self.set_elements(xml)
199 return len(self.results)
202 return self.results.keys()
205 return self.results.values()
208 return self.results.items()
210 def __getitem__(self, key):
211 return self.results[key]
213 def get_elements(self):
214 root = ElementTree.Element(ELAND.ELAND,
215 {'version': unicode(ELAND.XML_VERSION)})
216 for lane_id, lane in self.results.items():
217 eland_lane = lane.get_elements()
218 eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
219 root.append(eland_lane)
222 def set_elements(self, tree):
223 if tree.tag.lower() != ELAND.ELAND.lower():
224 raise ValueError('Expecting %s', ELAND.ELAND)
225 for element in list(tree):
226 lane_id = element.attrib[ELAND.LANE_ID]
227 lane = ElandLane(xml=element)
228 self.results[lane_id] = lane
230 def eland(basedir, gerald=None, genome_maps=None):
233 file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
234 if len(file_list) == 0:
235 # lets handle compressed eland files too
236 file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
238 for pathname in file_list:
239 # yes the lane_id is also being computed in ElandLane._update
240 # I didn't want to clutter up my constructor
241 # but I needed to persist the sample_name/lane_id for
242 # runfolder summary_report
243 path, name = os.path.split(pathname)
244 logging.info("Adding eland file %s" %(name,))
245 split_name = name.split('_')
246 lane_id = split_name[1]
248 if genome_maps is not None:
249 genome_map = genome_maps[lane_id]
250 elif gerald is not None:
251 genome_dir = gerald.lanes[lane_id].eland_genome
252 genome_map = build_genome_fasta_map(genome_dir)
256 eland_result = ElandLane(pathname, genome_map)
257 e.results[lane_id] = eland_result
260 def build_genome_fasta_map(genome_dir):
261 # build fasta to fasta file map
262 logging.info("Building genome map")
263 genome = genome_dir.split(os.path.sep)[-1]
265 for vld_file in glob(os.path.join(genome_dir, '*.vld')):
267 if os.path.islink(vld_file):
269 vld_file = os.path.realpath(vld_file)
270 path, vld_name = os.path.split(vld_file)
271 name, ext = os.path.splitext(vld_name)
273 fasta_map[name] = name
275 fasta_map[name] = os.path.join(genome, name)
279 def extract_eland_sequence(instream, outstream, start, end):
281 Extract a chunk of sequence out of an eland file
283 for line in instream:
284 record = line.split()
286 result = [record[0], record[1][start:end]]
288 result = [record[0][start:end]]
289 outstream.write("\t".join(result))
290 outstream.write(os.linesep)