7 from StringIO import StringIO
12 from htsworkflow.util.rdfhelp import \
23 from htsworkflow.util.hashfile import make_md5sum
25 logger = logging.getLogger(__name__)
class ModelException(RuntimeError):
    """Error raised when the RDF model does not match what this module expects."""
class MetadataLookupException(RuntimeError):
    """Signals a problem while accessing metadata."""
    pass
def parse_into_model(model, submission_name, filename):
    """Load the DAF stored in a file into an RDF model.

    model is the RDF model receiving the statements, submission_name is
    the short name of the submission and filename is the DAF to read.
    """
    daf_attributes = parse(filename)
    add_to_model(model, daf_attributes, submission_name)
def fromstream_into_model(model, submission_name, daf_stream):
    """Load a DAF read from an open stream into an RDF model.

    Same as parse_into_model, except the DAF text comes from daf_stream
    instead of a named file.
    """
    daf_attributes = parse_stream(daf_stream)
    add_to_model(model, daf_attributes, submission_name)
def fromstring_into_model(model, submission_name, daf_string):
    """Load a DAF held in a string into an RDF model.

    submission_name is the short name of the submission the DAF
    belongs to.
    """
    daf_attributes = fromstring(daf_string)
    add_to_model(model, daf_attributes, submission_name)
# NOTE(review): fragment -- the enclosing `def` header (presumably parse(),
# which parse_into_model calls) and whatever follows the parse_stream() call
# are not visible in this view.
# Opens `filename` for reading and hands the open stream to parse_stream().
# NOTE(review): the file is opened without a `with` block; confirm the stream
# is closed on every path, including when parse_stream() raises.
58 stream = open(filename,'r')
59 attributes = parse_stream(stream)
def fromstring(daf_string):
    """Parse a DAF held in a string and return what parse_stream() returns."""
    return parse_stream(StringIO(daf_string))
# Parse a DAF stream into a dictionary of attributes.
# The dictionary starts as {'views': {}}. Header-level names are stored
# directly on it; names seen in view state are collected in `view_attributes`,
# which is filed under attributes['views'][view_name].
# NOTE(review): several lines of this function are not visible in this view
# (initialisation of state/view_name/view_attributes, the loop header, some
# branch headers and bodies, the return), so the notes below describe only
# the visible statements.
67 def parse_stream(stream):
# matches a '#' comment running to the end of the line
68 comment_re = re.compile("#.*$")
71 attributes = {'views': {}}
# drop any comment, then locate the name and the value on the line:
# the name ends at nstop, the value spans line[sstop:vstop]
76 line = comment_re.sub("", line)
77 nstop = _extract_name_index(line)
79 sstop = _consume_whitespace(line, start=nstop)
80 vstop = _extract_value_index(line, start=sstop)
81 value = line[sstop:vstop]
# 'yes' / 'no' are recognised case-insensitively; the branch bodies are not
# visible -- presumably the value becomes a boolean, TODO confirm
83 if value.lower() in ('yes',):
85 elif value.lower() in ('no',):
# a view that has been collected is stored under its name
89 if view_name is not None:
90 attributes['views'][view_name] = view_attributes
# header 'variables' holds a comma separated list; keep it as a list of
# stripped strings
94 elif state == DAF_HEADER and name == 'variables':
95 attributes[name] = [ x.strip() for x in value.split(',')]
# a 'view' line in header state records the view's own name as an attribute
96 elif state == DAF_HEADER and name == 'view':
98 view_attributes['view'] = value
# any other header line is a plain name/value pair
100 elif state == DAF_HEADER:
101 attributes[name] = value
# in view state, name/value pairs belong to the current view
102 elif state == DAF_VIEW:
103 view_attributes[name] = value
# presumably runs after the loop so the last view is not lost -- confirm
# against the original indentation
106 if view_name is not None:
107 attributes['views'][view_name] = view_attributes
# Scan `line` from index `start`, testing each character for the first one
# that is NOT in string.whitespace.
# NOTE(review): the statement executed on a hit and the fall-through after the
# loop are not visible in this view; parse_stream uses the result as the start
# index of a slice.
# NOTE(review): xrange exists only on Python 2.
111 def _consume_whitespace(line, start=0):
112 for i in xrange(start, len(line)):
113 if line[i] not in string.whitespace:
# Scan `line` from index `start`, testing each character for the first one
# that IS in string.whitespace.
# NOTE(review): the statement executed on a hit and the fall-through after the
# loop are not visible in this view; parse_stream passes the result on as the
# `start` of _consume_whitespace.
# NOTE(review): xrange exists only on Python 2.
118 def _extract_name_index(line, start=0):
119 for i in xrange(start, len(line)):
120 if line[i] in string.whitespace:
125 def _extract_value_index(line, start=0):
126 shortline = line.rstrip()
127 return len(shortline)
# Turn a parsed DAF attribute dictionary into RDF statements about the
# submission called `name`.
#   attributes: dictionary keyed by DAF term name
#   name: submission name, used to look up the subject URI
# NOTE(review): the initialisation of `statements`, the middle argument of the
# _views_to_statements() call, the header of the final branch and the return
# are not visible in this view.
129 def convert_to_rdf_statements(attributes, name):
130 submission_uri = get_submission_uri(name)
131 subject = RDF.Node(submission_uri)
134 for daf_key in attributes:
# the predicate is looked up once per key, before branching on the key
135 predicate = dafTermOntology[daf_key]
# views are expanded into their own group of statements
136 if daf_key == 'views':
137 statements.extend(_views_to_statements(name,
139 attributes[daf_key]))
# one statement per variable, all sharing the same subject and predicate;
# daf_key comes from iterating `attributes`, so the [] default of .get() is
# not reached in this branch
140 elif daf_key == 'variables':
141 #predicate = ddfNS['variables']
142 for var in attributes.get('variables', []):
143 obj = toTypedNode(var)
144 statements.append(RDF.Statement(subject, predicate, obj))
# any other attribute becomes a single typed-literal statement (the branch
# header is not visible -- presumably a plain else)
146 value = attributes[daf_key]
147 obj = toTypedNode(value)
148 statements.append(RDF.Statement(subject,predicate,obj))
# Build the RDF statements describing every view of a submission.
#   name: submission name (gives the subject URI and the view namespace)
#   dafNS: namespace indexed by term name to obtain predicates
#   views: dictionary of view name -> dictionary of view attributes
# NOTE(review): the initialisation of `statements`, the opener of the call
# that receives the dafNS['name'] statement and the return are not visible in
# this view.
152 def _views_to_statements(name, dafNS, views):
153 subject = RDF.Node(get_submission_uri(name))
154 viewNS = get_view_namespace(name)
157 for view_name in views:
158 view_attributes = views[view_name]
159 viewSubject = viewNS[view_name]
# link the submission to the view node
160 statements.append(RDF.Statement(subject, dafNS['views'], viewSubject))
162 RDF.Statement(viewSubject, dafNS['name'], toTypedNode(view_name)))
# one statement per attribute of the view, with the attribute name as the
# predicate term
163 for view_attribute_name in view_attributes:
164 predicate = dafNS[view_attribute_name]
165 obj = toTypedNode(view_attributes[view_attribute_name])
166 statements.append(RDF.Statement(viewSubject, predicate, obj))
168 #statements.extend(convert_to_rdf_statements(view, viewNode))
def add_to_model(model, attributes, name):
    """Add every statement derived from attributes to model.

    The statements come from convert_to_rdf_statements(attributes, name).
    """
    rdf_statements = convert_to_rdf_statements(attributes, name)
    for rdf_statement in rdf_statements:
        model.add_statement(rdf_statement)
def get_submission_uri(name):
    """Return the uri of the submissionLog entry for name."""
    submission_entry = submissionLog[name]
    return submission_entry.uri
# Build the RDF namespace '<submission uri>/view/' for the submission `name`.
# NOTE(review): the line handing the namespace back is not visible in this
# view; callers index the result by view name.
178 def get_view_namespace(name):
179 submission_uri = get_submission_uri(name)
180 viewNS = RDF.NS(str(submission_uri) + '/view/')
# NOTE(review): class header plus constructor. Several constructor lines are
# not visible in this view (among them whatever assigns self.name, which is
# read below, and the branch that stores a caller-supplied model).
183 class DAFMapper(object):
184 """Convert filenames to views in the UCSC Daf
186 def __init__(self, name, daf_file=None, model=None):
187 """Construct a RDF backed model of a UCSC DAF
190 name (str): the name of this submission (used to construct DAF url)
191 daf_file (str, stream, or None):
192 if str, use as filename
193 if stream, parse as stream
194 if none, don't attempt to load the DAF into our model
195 model (RDF.Model or None):
196 if None, construct a memory backed model
197 otherwise specifies model to use
# with neither a DAF nor a model there is nothing to work from; only the log
# call is visible here -- NOTE(review): confirm what follows it
199 if daf_file is None and model is None:
200 logger.error("We need a DAF or Model containing a DAF to work")
# get_model() supplies the model when the caller did not pass one
# (presumably the else side of this test -- confirm)
203 if model is not None:
206 self.model = get_model()
# NOTE(review): `next` is the Python 2 iterator method; Python 3 file objects
# expose __next__ instead, so this test would not see them as streams
208 if hasattr(daf_file, 'next'):
209 # its some kind of stream
210 fromstream_into_model(self.model, name, daf_file)
# otherwise daf_file is handed over as a filename (guarding condition not
# visible)
213 parse_into_model(self.model, name, daf_file)
215 self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
216 self.submissionSet = get_submission_uri(self.name)
217 self.submissionSetNS = RDF.NS(str(self.submissionSet)+'/')
# filled in lazily by find_view()
218 self.__view_map = None
221 def add_pattern(self, view_name, filename_pattern):
222 """Map a filename regular expression to a view name
# The statement's subject is the view's node in this submission's view
# namespace and its predicate is dafTermOntology['filename_re'].
# NOTE(review): the object argument and the end of the call are not visible;
# presumably `obj`, the typed literal built from filename_pattern -- confirm.
224 viewNS = get_view_namespace(self.name)
226 obj = toTypedNode(filename_pattern)
227 self.model.add_statement(
228 RDF.Statement(viewNS[view_name],
229 dafTermOntology['filename_re'],
233 def import_submission_dir(self, submission_dir, library_id):
234 """Import a submission directories and update our model as needed
236 #attributes = get_filename_attribute_map(paired)
# the library node is '<library_id>/' inside self.libraryNS
237 libNode = self.libraryNS[library_id + "/"]
# pull the library's details into the model before looking at files
239 self._add_library_details_to_model(libNode)
# every entry returned by os.listdir is handed to construct_file_attributes
241 submission_files = os.listdir(submission_dir)
242 for f in submission_files:
243 self.construct_file_attributes(submission_dir, libNode, f)
# NOTE(review): a number of lines of this method are not visible in this view
# (branch headers, early exits, the end of the `terms` list and how it is
# used), so the notes below describe only the visible statements.
246 def construct_file_attributes(self, submission_dir, libNode, pathname):
247 """Looking for the best extension
248 The 'best' is the longest match
251 filename (str): the filename whose extention we are about to examine
# only the basename is used for view matching and for the file URI; `path`
# is not read in the visible lines
253 path, filename = os.path.split(pathname)
255 logger.debug("Searching for view")
256 view = self.find_view(filename)
# NOTE(review): logger.warn is a deprecated alias of logger.warning
258 logger.warn("Unrecognized file: %s" % (pathname,))
# views equal to libraryOntology['ignore'] get special handling (body not
# visible; presumably the file is skipped -- confirm)
260 if str(view) == str(libraryOntology['ignore']):
263 submission_name = self.make_submission_name(submission_dir)
264 submissionNode = self.get_submission_node(submission_dir)
265 submission_uri = str(submissionNode.uri)
266 view_name = fromTypedNode(self.model.get_target(view, dafTermOntology['name']))
# NOTE(review): this call goes through the root `logging` module, not the
# module-level `logger` used by the rest of the method
267 if view_name is None:
268 logging.warning('Could not find view name for {0}'.format(str(view)))
# the submission view node is '<submission uri>/<view name>'
271 view_name = str(view_name)
272 submissionView = RDF.Node(RDF.Uri(submission_uri + '/' + view_name))
# attach the submission to the submission set, then describe the submission
# itself: its view, name, rdf type and library
274 self.model.add_statement(
275 RDF.Statement(self.submissionSet, dafTermOntology['has_submission'], submissionNode))
276 logger.debug("Adding statements to {0}".format(str(submissionNode)))
277 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['has_view'], submissionView))
278 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['name'], toTypedNode(submission_name)))
279 self.model.add_statement(RDF.Statement(submissionNode, rdfNS['type'], submissionOntology['submission']))
280 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['library'], libNode))
# describe the submission view: which DAF view it is, whether the library is
# paired end, and which submission it belongs to
282 logger.debug("Adding statements to {0}".format(str(submissionView)))
283 # add trac specific information
284 self.model.add_statement(
285 RDF.Statement(submissionView, dafTermOntology['view'], view))
286 self.model.add_statement(
287 RDF.Statement(submissionView, dafTermOntology['paired'], toTypedNode(self._is_paired(libNode))))
288 self.model.add_statement(
289 RDF.Statement(submissionView, dafTermOntology['submission'], submissionNode))
# terms collected here: 'type', 'filename_re', (hidden entries, if any) and
# one per DAF variable; how `terms` is consumed is not visible
292 terms = [dafTermOntology['type'],
293 dafTermOntology['filename_re'],
295 terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
297 # add file specific information
298 logger.debug("Updating file md5sum")
# the file node is '<submission uri>/<filename>'; the md5sum is computed on
# the file inside submission_dir
299 fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
300 submission_pathname = os.path.join(submission_dir, filename)
301 md5 = make_md5sum(submission_pathname)
302 self.model.add_statement(
303 RDF.Statement(submissionView, dafTermOntology['has_file'], fileNode))
304 self.model.add_statement(
305 RDF.Statement(fileNode, dafTermOntology['filename'], filename))
# NOTE(review): root `logging` module again; the condition guarding this
# warning is not visible (presumably a missing md5 -- confirm)
308 logging.warning("Unable to produce md5sum for %s" % ( submission_pathname))
310 self.model.add_statement(
311 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
313 logger.debug("Done.")
# Parse the RDFa found at libNode.uri and walk the resulting statements.
# A statement passes the check below only when the model has no target yet
# for its (subject, predicate) pair.
# NOTE(review): the body of that check is not visible in this view;
# presumably the statement is added to the model -- confirm.
315 def _add_library_details_to_model(self, libNode):
316 parser = RDF.Parser(name='rdfa')
317 new_statements = parser.parse_as_stream(libNode.uri)
318 for s in new_statements:
319 # don't override things we already have in the model
320 targets = list(self.model.get_targets(s.subject, s.predicate))
321 if len(targets) == 0:
324 def get_daf_variables(self):
325 """Returns simple variables names that to include in the ddf
# Collects str() of every 'variables' target attached to the submission set.
# NOTE(review): the initialisation of `results` and the return are not
# visible in this view.
327 variableTerm = dafTermOntology['variables']
329 for obj in self.model.get_targets(self.submissionSet, variableTerm):
330 value = str(fromTypedNode(obj))
331 results.append(value)
# 'labVersion' is appended in addition to the model's variables; indentation
# is lost in this view -- presumably once, after the loop (confirm)
332 results.append('labVersion')
# Derive the submission name from a directory path: the last path component
# after os.path.normpath() has normalised the path (which also removes any
# trailing separator).
# NOTE(review): the opener of the statement that uses the "too short" message
# is not visible in this view; presumably an exception is raised when the
# component is empty -- confirm its type.
335 def make_submission_name(self, submission_dir):
336 submission_dir = os.path.normpath(submission_dir)
337 submission_dir_name = os.path.split(submission_dir)[1]
338 if len(submission_dir_name) == 0:
340 "Submission dir name too short: %s" %(submission_dir,))
341 return submission_dir_name
def get_submission_node(self, submission_dir):
    """Return the submission node matching a submission directory.

    The directory is turned into a name by make_submission_name and
    that name is looked up in self.submissionSetNS.
    """
    node_name = self.make_submission_name(submission_dir)
    namespace = self.submissionSetNS
    return namespace[node_name]
# Look up `attribute` of the library node in the model.
# A non-node attribute is first resolved through libraryOntology. The model is
# queried; if that does not produce an answer the library's details are
# loaded with _add_library_details_to_model() and the model is queried again.
# NOTE(review): the conditions guarding the two returns and the final
# fall-through are not visible in this view.
349 def _get_library_attribute(self, libNode, attribute):
350 if not isinstance(attribute, RDF.Node):
351 attribute = libraryOntology[attribute]
353 targets = list(self.model.get_targets(libNode, attribute))
355 return self._format_library_attribute(targets)
359 #targets = self._search_same_as(libNode, attribute)
360 #if targets is not None:
361 # return self._format_library_attribute(targets)
363 # we don't know anything about this attribute
364 self._add_library_details_to_model(libNode)
366 targets = list(self.model.get_targets(libNode, attribute))
368 return self._format_library_attribute(targets)
# Convert a list of target nodes into plain values:
#   exactly one target  -> the single converted value
#   more than one       -> a list of converted values
#   no targets          -> branch body not visible in this view
# NOTE(review): because several targets yield a list, callers have to cope
# with both a scalar and a list.
372 def _format_library_attribute(self, targets):
373 if len(targets) == 0:
375 elif len(targets) == 1:
376 return fromTypedNode(targets[0])
377 elif len(targets) > 1:
378 return [fromTypedNode(t) for t in targets]
# For every predicate the model links to `predicate` through owl sameAs,
# fetch the targets of `subject` under that alternate predicate.
# NOTE(review): what is done with `targets` and what the method returns are
# not visible in this view; the only visible call site is commented out.
380 def _search_same_as(self, subject, predicate):
381 # look for alternate names
382 other_predicates = self.model.get_targets(predicate, owlNS['sameAs'])
383 for other in other_predicates:
384 targets = list(self.model.get_targets(subject, other))
389 def find_view(self, filename):
390 """Search through potential DAF filename patterns
# the pattern -> view map is built on first use and then reused
392 if self.__view_map is None:
393 self.__view_map = self._get_filename_view_map()
# re.match anchors only at the start of filename, so a pattern matches when
# it matches a prefix unless the pattern itself ends with '$'
# NOTE(review): the initialisation of `results`, the body of the match test,
# the condition guarding the "multiple views" error and the return values are
# not visible in this view.
396 for pattern, view in self.__view_map.items():
397 if re.match(pattern, filename):
401 msg = "%s matched multiple views %s" % (
403 [str(x) for x in results])
404 raise ModelException(msg)
405 elif len(results) == 1:
# Return the name recorded for `view` under submissionOntology['view_name'];
# a wrong number of names ends in RuntimeError.
# NOTE(review): the condition selecting the return and one line before the
# raise are not visible in this view.
# NOTE(review): construct_file_attributes reads the view name through
# dafTermOntology['name'] instead -- confirm both terms are intended.
410 def get_view_name(self, view):
411 names = list(self.model.get_targets(view, submissionOntology['view_name']))
413 return fromTypedNode(names[0])
415 msg = "Found wrong number of view names for {0} len = {1}"
416 msg = msg.format(str(view), len(names))
418 raise RuntimeError(msg)
# NOTE(review): in the visible lines the map is keyed by the pattern STRING
# (literal_re); the compiled `filename_re` is not stored, which differs from
# the docstring's "compiled regular expressions".
# NOTE(review): the initialisation of `patterns`, the try/except headers
# around re.compile and the return are not visible in this view.
421 def _get_filename_view_map(self):
422 """Query our model for filename patterns
424 return a dictionary of compiled regular expressions to view names
# match every statement whose predicate is 'filename_re', whatever its
# subject and object
426 filename_query = RDF.Statement(
427 None, dafTermOntology['filename_re'], None)
430 for s in self.model.find_statements(filename_query):
431 view_name = s.subject
432 literal_re = s.object.literal_value['string']
433 logger.debug("Found: %s" % (literal_re,))
# compiling checks that the pattern is a valid regular expression
435 filename_re = re.compile(literal_re)
437 logger.error("Unable to compile: %s" % (literal_re,))
438 patterns[literal_re] = view_name
# NOTE(review): the values returned for the single and paired branches are
# not visible in this view.
# NOTE(review): _get_library_attribute can hand back a list when a library
# has several 'library_type' targets; a list is in neither of the two lists
# below, so it would end in MetadataLookupException.
441 def _is_paired(self, libNode):
442 """Determine if a library is paired end"""
443 library_type = self._get_library_attribute(libNode, 'library_type')
444 if library_type is None:
445 raise ModelException("%s doesn't have a library type" % (str(libNode),))
# library type names recognised as single ended and as paired end
448 single = ['Single End', 'Small RNA', 'CSHL (lacking last nt)']
449 paired = ['Paired End', 'Multiplexing', 'Barcoded']
450 if library_type in single:
452 elif library_type in paired:
# anything else is a library type this mapper does not know about
455 raise MetadataLookupException(
456 "Unrecognized library type %s for %s" % \
457 (library_type, str(libNode)))
459 def _get_library_url(self):
460 return str(self.libraryNS[''].uri)
461 def _set_library_url(self, value):
462 self.libraryNS = RDF.NS(str(value))
463 library_url = property(_get_library_url, _set_library_url)