19962b1d0e6e42247c57e80bdca7117736aca890
[htsworkflow.git] / gaworkflow / pipeline / runfolder.py
1 """
2 Core information needed to inspect a runfolder.
3 """
4 from glob import glob
5 import logging
6 import os
7 import re
8 import stat
9 import sys
10 import time
11
12 try:
13   from xml.etree import ElementTree
14 except ImportError, e:
15   from elementtree import ElementTree
16
# strptime()/strftime() layout for day-month-year dates, e.g. "31-12-2008".
EUROPEAN_STRPTIME = "%d-%m-%Y"
# Regex group capturing a date written in the EUROPEAN_STRPTIME layout:
# 1-2 digit day, 1-2 digit month, 4 digit year, separated by dashes.
EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
# Regex group capturing a run of digits and dots, e.g. "1.0.2".
# Raw string: "\." is not a recognised Python string escape, so spell the
# backslash out explicitly instead of relying on it being passed through.
VERSION_RE = r"([0-9\.]+)"
# Regex group capturing a purely alphanumeric token (a user name, judging
# by the constant's name).
USER_RE = "([a-zA-Z0-9]+)"
# Number of lanes on a single flowcell.
LANES_PER_FLOWCELL = 8
22
23 from gaworkflow.util.alphanum import alphanum
24 from gaworkflow.util.ethelp import indent, flatten
25
26
class PipelineRun(object):
    """
    Capture "interesting" information about a pipeline run

    A run groups one firecrest, one bustard and one gerald component
    that belong to the same runfolder.  As far as this class is concerned
    each component only has to offer a get_elements() method and a
    time attribute.
    """
    XML_VERSION = 1
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
        """
        pathname  -- runfolder directory, or None when it is not known
                     (for instance when the run is restored from xml)
        firecrest, bustard, gerald -- the component objects of this run
        xml       -- optional element to initialise from, see set_elements()
        """
        # os.path.normpath() cannot digest None, and None is the default.
        if pathname is None:
            self.pathname = None
        else:
            self.pathname = os.path.normpath(pathname)
        self._name = None
        self._flowcell_id = None
        self.firecrest = firecrest
        self.bustard = bustard
        self.gerald = gerald

        if xml is not None:
            self.set_elements(xml)

    def _get_flowcell_id(self):
        """
        Return the flowcell id, working it out on first use.

        Config/FlowcellId.xml below the runfolder is preferred.  Without
        that file the id is guessed from the last '_' separated field of
        the runfolder name, falling back to 'unknown', and a warning is
        logged.
        """
        if self._flowcell_id is not None:
            return self._flowcell_id

        flowcell_id = None
        if self.pathname is not None:
            flowcell_id_path = os.path.join(
                self.pathname, 'Config', 'FlowcellId.xml')
            if os.path.exists(flowcell_id_path):
                flowcell_id_tree = ElementTree.parse(flowcell_id_path)
                self._flowcell_id = flowcell_id_tree.findtext('Text')
                return self._flowcell_id
            # guessing last element of the runfolder's own name; going
            # through basename keeps underscores in parent directories
            # from leaking into the id.
            flowcell_id = os.path.basename(self.pathname).split('_')[-1]

        if not flowcell_id:
            flowcell_id = 'unknown'

        logging.warning(
            "Flowcell id was not found, guessing %s" % (
                flowcell_id))
        self._flowcell_id = flowcell_id
        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def get_elements(self):
        """
        make one master xml file from all of our sub-components.

        Returns a PipelineRun element holding the flowcell id followed by
        the elements of the firecrest, bustard and gerald components.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.firecrest.get_elements())
        root.append(self.bustard.get_elements())
        root.append(self.gerald.get_elements())
        return root

    def set_elements(self, tree):
        """
        Initialise this run from a PipelineRun element.

        Tags are compared case-insensitively.  Unknown child tags are
        logged and skipped; a wrong root tag raises ValueError.
        """
        # this file gets imported by all the others,
        # so we need to hide the imports to avoid a cyclic imports
        from gaworkflow.pipeline import firecrest
        from gaworkflow.pipeline import bustard
        from gaworkflow.pipeline import gerald

        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
            raise ValueError('Pipeline Run Expecting %s got %s' % (
                PipelineRun.PIPELINE_RUN, tag))
        for element in tree:
            tag = element.tag.lower()
            if tag == PipelineRun.FLOWCELL_ID.lower():
                self._flowcell_id = element.text
            #ok the xword.Xword.XWORD pattern for module.class.constant is lame
            elif tag == firecrest.Firecrest.FIRECREST.lower():
                self.firecrest = firecrest.Firecrest(xml=element)
            elif tag == bustard.Bustard.BUSTARD.lower():
                self.bustard = bustard.Bustard(xml=element)
            elif tag == gerald.Gerald.GERALD.lower():
                self.gerald = gerald.Gerald(xml=element)
            else:
                logging.warning('PipelineRun unrecognized tag %s' % (tag,))

    def _get_run_name(self):
        """
        Given a run tuple, find the latest date and use that as our name

        The name has the form run_<flowcell id>_<YYYY-MM-DD>.xml, with the
        date taken (in local time) from the newest component timestamp.
        """
        if self._name is None:
            tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
            self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
        return self._name
    name = property(_get_run_name)

    def save(self):
        """
        Write the run report xml to the file named by self.name.
        """
        logging.info("Saving run report " + self.name)
        xml = self.get_elements()
        # indent() comes from gaworkflow.util.ethelp; presumably it
        # pretty-prints the tree in place before it is written.
        indent(xml)
        ElementTree.ElementTree(xml).write(self.name)

    def load(self, filename):
        """
        Read a run report xml file and initialise this run from it.
        """
        logging.info("Loading run report from " + filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
127
def get_runs(runfolder):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.

    The directories searched are <runfolder>/Data/*Firecrest*, the
    Bustard* directories inside each of those, and the GERALD* directories
    inside each Bustard directory.  Matches are visited in sorted order so
    the returned list does not depend on directory listing order.  A
    GERALD directory that raises IOError while being loaded is logged and
    skipped.
    """
    # imported here rather than at module level to avoid cyclic imports
    from gaworkflow.pipeline import firecrest
    from gaworkflow.pipeline import bustard
    from gaworkflow.pipeline import gerald

    datadir = os.path.join(runfolder, 'Data')

    logging.info('Searching for runs in ' + datadir)
    runs = []
    for firecrest_pathname in sorted(glob(os.path.join(datadir, "*Firecrest*"))):
        f = firecrest.firecrest(firecrest_pathname)
        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
        for bustard_pathname in sorted(glob(bustard_glob)):
            b = bustard.bustard(bustard_pathname)
            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
            for gerald_pathname in sorted(glob(gerald_glob)):
                try:
                    g = gerald.gerald(gerald_pathname)
                    runs.append(PipelineRun(runfolder, f, b, g))
                except IOError:
                    # sys.exc_info() instead of binding the exception in
                    # the except clause keeps this line valid under both
                    # the "except X, e" and "except X as e" grammars.
                    logging.warning("Ignoring %s" % (sys.exc_info()[1],))
    return runs
158                 
159     
def extract_run_parameters(runs):
    """
    Save the run report of every run in runs.

    Each item only needs a save() method; the runs are processed in the
    order given and nothing is returned.
    """
    for pipeline_run in runs:
        pipeline_run.save()
166
def summarize_mapped_reads(mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.

    mapped_reads maps a sequence file name to its read count.  Names with
    a directory part are added up per directory (the genome); names
    without one are kept under their own name.  When no name has a
    directory part the result still carries an 'unknown' genome with a
    count of 0.

    Returns a new dictionary; mapped_reads is not modified.
    """
    summarized_reads = {}
    # One counter per genome directory, so reads mapped against different
    # genomes are not lumped together under a single name.
    genome_reads = {}
    for pathname, count in mapped_reads.items():
        genome, name = os.path.split(pathname)
        if genome:
            genome_reads[genome] = genome_reads.get(genome, 0) + count
        else:
            summarized_reads[name] = summarized_reads.get(name, 0) + count
    if not genome_reads:
        genome_reads['unknown'] = 0
    summarized_reads.update(genome_reads)
    return summarized_reads
184
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    For every run, and within it every eland result in alphanum order,
    the report lists sample name, lane id, cluster statistics, total
    reads, match code counts and the mapped reads per genome.

    Returns the report for all runs as one string joined with
    os.linesep; an empty list of runs gives an empty string.
    """
    report = []
    for run in runs:
        # print a run name?
        report.append('Summary for %s' % (run.name,))
        # sort the report
        eland_keys = run.gerald.eland_results.results.keys()
        eland_keys.sort(alphanum)

        lane_results = run.gerald.summary.lane_results
        for lane_id in eland_keys:
            result = run.gerald.eland_results.results[lane_id]
            report.append("Sample name %s" % (result.sample_name,))
            report.append("Lane id %s" % (result.lane_id,))
            cluster = lane_results[result.lane_id].cluster
            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
            report.append("Total Reads: %d" % (result.reads,))
            mc = result._match_codes
            report.append("No Match: %d" % (mc['NM'],))
            report.append("QC Failed: %d" % (mc['QC'],))
            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
                          (mc['U0'], mc['U1'], mc['U2']))
            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
                          (mc['R0'], mc['R1'], mc['R2']))
            report.append("Mapped Reads")
            mapped_reads = summarize_mapped_reads(result.mapped_reads)
            for name, counts in mapped_reads.items():
                report.append("  %s: %d" % (name, counts))
            report.append('---')
            report.append('')
    # Join only after every run has been processed; returning from inside
    # the loop would drop all runs but the first.
    return os.linesep.join(report)