2 Core information needed to inspect a runfolder.
13 from xml.etree import ElementTree
14 except ImportError, e:
15 from elementtree import ElementTree
# strptime()/strftime() format for day-month-year dates such as 14-05-2008.
EUROPEAN_STRPTIME = "%d-%m-%Y"
# Captures a 1-2 digit, 1-2 digit, 4 digit date separated by dashes.
EUROPEAN_DATE_RE = r"([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
# Captures a run of digits and dots (e.g. "1.27.3").
# Raw string: "\." in a plain literal is an invalid escape sequence and
# triggers a warning on modern Python; the pattern text is unchanged.
VERSION_RE = r"([0-9\.]+)"
# Captures a run of ASCII letters and digits.
USER_RE = r"([a-zA-Z0-9]+)"
LANES_PER_FLOWCELL = 8
23 from gaworkflow.util.alphanum import alphanum
24 from gaworkflow.util.ethelp import indent, flatten
class PipelineRun(object):
    """
    Capture "interesting" information about a pipeline run

    A run groups one firecrest, one bustard and one gerald component
    together with the flowcell id, and can be written to / read back from
    a single XML document.
    """
    # Tag names used when serializing to and parsing from XML.
    PIPELINE_RUN = 'PipelineRun'
    FLOWCELL_ID = 'FlowcellID'

    def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
        """
        pathname -- runfolder path; used to look for Config/FlowcellId.xml
                    and, failing that, to guess the flowcell id.
        firecrest, bustard, gerald -- component objects; get_elements()
                    calls their get_elements() and the run name reads
                    their .time attribute.
        xml -- optional PipelineRun element; when given, the components
               and flowcell id are loaded from it via set_elements().
        """
        self.pathname = pathname
        # Cache for the `name` property (read by _get_run_name).
        self._name = None
        self._flowcell_id = None
        self.firecrest = firecrest
        self.bustard = bustard
        self.gerald = gerald

        # set_elements() dereferences tree.tag, so it must not be called
        # with the default xml=None.
        if xml is not None:
            self.set_elements(xml)

    def _get_flowcell_id(self):
        """
        Return the flowcell id, computing and caching it on first use.

        Reads the 'Text' element of <pathname>/Config/FlowcellId.xml when
        that file exists; otherwise guesses the text after the last '_'
        in pathname and logs a warning.
        """
        if self._flowcell_id is None:
            config_dir = os.path.join(self.pathname, 'Config')
            flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
            if os.path.exists(flowcell_id_path):
                flowcell_id_tree = ElementTree.parse(flowcell_id_path)
                self._flowcell_id = flowcell_id_tree.findtext('Text')
            else:
                # guessing last element of filename.
                # str.split() never returns an empty list, so fall back to
                # 'unknown' when the last field itself is empty.
                path_fields = self.pathname.split('_')
                flowcell_id = path_fields[-1] or 'unknown'

                logging.warning(
                    "Flowcell id was not found, guessing %s" % (
                        flowcell_id,))
                self._flowcell_id = flowcell_id
        return self._flowcell_id
    flowcell_id = property(_get_flowcell_id)

    def get_elements(self):
        """
        make one master xml file from all of our sub-components.

        Returns the root PipelineRun element containing a FlowcellID child
        followed by the firecrest, bustard and gerald elements.
        """
        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
        flowcell.text = self.flowcell_id
        root.append(self.firecrest.get_elements())
        root.append(self.bustard.get_elements())
        root.append(self.gerald.get_elements())
        return root

    def set_elements(self, tree):
        """
        Load the flowcell id and components from a PipelineRun element.

        Raises ValueError when the root tag is not PipelineRun (compared
        case-insensitively); unrecognized child tags are logged and skipped.
        """
        # this file gets imported by all the others,
        # so we need to hide the imports to avoid a cyclic imports
        from gaworkflow.pipeline import firecrest
        from gaworkflow.pipeline import bustard
        from gaworkflow.pipeline import gerald

        tag = tree.tag.lower()
        if tag != PipelineRun.PIPELINE_RUN.lower():
            raise ValueError('Pipeline Run Expecting %s got %s' % (
                PipelineRun.PIPELINE_RUN, tag))
        for element in tree:
            tag = element.tag.lower()
            if tag == PipelineRun.FLOWCELL_ID.lower():
                self._flowcell_id = element.text
            #ok the xword.Xword.XWORD pattern for module.class.constant is lame
            elif tag == firecrest.Firecrest.FIRECREST.lower():
                self.firecrest = firecrest.Firecrest(xml=element)
            elif tag == bustard.Bustard.BUSTARD.lower():
                self.bustard = bustard.Bustard(xml=element)
            elif tag == gerald.Gerald.GERALD.lower():
                self.gerald = gerald.Gerald(xml=element)
            else:
                # logging.warn is a deprecated alias (removed in Python 3.13).
                logging.warning('PipelineRun unrecognized tag %s' % (tag,))

    def _get_run_name(self):
        """
        Given a run tuple, find the latest date and use that as our name

        The name is 'run_<flowcell id>_<YYYY-MM-DD>.xml', where the date is
        the local-time date of the newest component timestamp. The result
        is cached after the first call.
        """
        if self._name is None:
            tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
            self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
        return self._name
    name = property(_get_run_name)

    # NOTE(review): the header of this method was not visible in the
    # reviewed excerpt; the name `save` is inferred from the log message and
    # the sibling load() -- confirm against the full file.
    def save(self):
        """
        Write the run's XML report to a file named self.name.
        """
        logging.info("Saving run report %s", self.name)
        xml = self.get_elements()
        # NOTE(review): presumably pretty-prints the tree in place before
        # writing (indent is imported from gaworkflow.util.ethelp) -- confirm.
        indent(xml)
        ElementTree.ElementTree(xml).write(self.name)

    def load(self, filename):
        """
        Replace this run's state with the report stored in filename.
        """
        logging.info("Loading run report from %s", filename)
        tree = ElementTree.parse(filename).getroot()
        self.set_elements(tree)
def get_runs(runfolder):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.

    Looks for <runfolder>/Data/*Firecrest*/Bustard*/GERALD* directories and
    returns a list of PipelineRun objects, one per GERALD directory that
    could be loaded.
    """
    # Imported here rather than at module level: those modules import this
    # one, so a top-level import would be cyclic.
    from gaworkflow.pipeline import firecrest
    from gaworkflow.pipeline import bustard
    from gaworkflow.pipeline import gerald

    datadir = os.path.join(runfolder, 'Data')

    logging.info('Searching for runs in ' + datadir)
    runs = []
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        f = firecrest.firecrest(firecrest_pathname)
        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
        for bustard_pathname in glob(bustard_glob):
            b = bustard.bustard(bustard_pathname)
            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
            for gerald_pathname in glob(gerald_glob):
                try:
                    g = gerald.gerald(gerald_pathname)
                    runs.append(PipelineRun(runfolder, f, b, g))
                # NOTE(review): the exception type was not visible in the
                # reviewed excerpt; IOError is assumed -- confirm.
                except IOError as e:
                    # A GERALD directory that cannot be read is skipped
                    # rather than aborting the whole search. Single-argument
                    # print() emits the same text on Python 2 and 3.
                    print("Ignoring " + str(e))
    return runs
160 def extract_run_parameters(runs):
162 Search through runfolder_path for various runs and grab their parameters
def summarize_mapped_reads(mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.

    mapped_reads maps a sequence file name to a read count. Returns a new
    dictionary with one entry for the genome total plus one entry per name
    that has no directory component.
    """
    summarized_reads = {}
    # NOTE(review): the initial values and the branch condition below were
    # not visible in the reviewed excerpt; they are reconstructed so that
    # `genome` and `genome_reads` are defined before use -- confirm.
    genome_reads = 0
    genome = 'unknown'
    for k, v in mapped_reads.items():
        path, k = os.path.split(k)
        if len(path) > 0:
            # Names with a directory part are pooled under that directory,
            # which is taken as the genome name.
            genome = path
            genome_reads += v
        else:
            # Names with no directory part are counted individually.
            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
    summarized_reads[genome] = genome_reads
    return summarized_reads
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    runs is an iterable of PipelineRun-like objects. Returns one string,
    joined with os.linesep, holding a block of lines per run and per lane;
    an empty iterable yields an empty string.
    """
    # Local import so this block does not depend on the module's import list.
    from functools import cmp_to_key

    report = []
    for run in runs:
        report.append('Summary for %s' % (run.name,))

        eland_results = run.gerald.eland_results.results
        # alphanum is a two-argument comparison function. Wrapping it with
        # cmp_to_key keeps the same ordering while also working on Python 3,
        # where list.sort() takes no comparison argument and keys() is a view.
        eland_keys = sorted(eland_results.keys(), key=cmp_to_key(alphanum))

        lane_results = run.gerald.summary.lane_results
        for lane_id in eland_keys:
            result = eland_results[lane_id]
            report.append("Sample name %s" % (result.sample_name,))
            report.append("Lane id %s" % (result.lane_id,))
            cluster = lane_results[result.lane_id].cluster
            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
            report.append("Total Reads: %d" % (result.reads,))
            mc = result._match_codes
            report.append("No Match: %d" % (mc['NM'],))
            report.append("QC Failed: %d" % (mc['QC'],))
            report.append('Unique (0,1,2 mismatches) %d %d %d' %
                          (mc['U0'], mc['U1'], mc['U2']))
            report.append('Repeat (0,1,2 mismatches) %d %d %d' %
                          (mc['R0'], mc['R1'], mc['R2']))
            report.append("Mapped Reads")
            mapped_reads = summarize_mapped_reads(result.mapped_reads)
            for name, counts in mapped_reads.items():
                report.append(" %s: %d" % (name, counts))

    # Returned after all runs have been processed so that every run is
    # included in the report.
    return os.linesep.join(report)