7 from StringIO import StringIO
12 from htsworkflow.util.rdfhelp import \
23 from htsworkflow.util.hashfile import make_md5sum
25 logger = logging.getLogger(__name__)
class ModelException(RuntimeError):
    """Signal that the RDF model holds inconsistent or insufficient data.

    Raised in this module when a filename matches more than one view and
    when a library has no library type recorded.
    """
class MetadataLookupException(RuntimeError):
    """Problem accessing metadata

    Raised in this module when a library's type is not one of the
    recognized single-end or paired-end names.
    """
def parse_into_model(model, submission_name, filename):
    """Parse the DAF stored in a file and add it to an RDF model.

    The file named by ``filename`` is handed to parse() and the
    resulting attributes are passed to add_to_model() together with
    ``model`` and ``submission_name`` (a short submission name).
    """
    add_to_model(model, parse(filename), submission_name)
def fromstream_into_model(model, submission_name, daf_stream):
    """Parse a DAF from an open stream and add it to an RDF model.

    ``daf_stream`` is handed to parse_stream() and the resulting
    attributes are passed to add_to_model() together with ``model``
    and ``submission_name``.
    """
    parsed = parse_stream(daf_stream)
    add_to_model(model, parsed, submission_name)
def fromstring_into_model(model, submission_name, daf_string):
    """Parse a DAF held in a string and add it to an RDF model.

    ``daf_string`` is handed to fromstring() and the resulting
    attributes are passed to add_to_model() together with ``model``
    and ``submission_name`` (a short submission name).
    """
    parsed = fromstring(daf_string)
    add_to_model(model, parsed, submission_name)
58 stream = open(filename,'r')
59 attributes = parse_stream(stream)
def fromstring(daf_string):
    """Parse a DAF held in a string.

    The text is wrapped in a StringIO so it can be handed to
    parse_stream(), whose result is returned unchanged.
    """
    return parse_stream(StringIO(daf_string))
67 def parse_stream(stream):
68 comment_re = re.compile("#.*$")
71 attributes = {'views': {}}
76 line = comment_re.sub("", line)
77 nstop = _extract_name_index(line)
79 sstop = _consume_whitespace(line, start=nstop)
80 vstop = _extract_value_index(line, start=sstop)
81 value = line[sstop:vstop]
83 if value.lower() in ('yes',):
85 elif value.lower() in ('no',):
89 if view_name is not None:
90 attributes['views'][view_name] = view_attributes
94 elif state == DAF_HEADER and name == 'variables':
95 attributes[name] = [ x.strip() for x in value.split(',')]
96 elif state == DAF_HEADER and name == 'view':
98 view_attributes['view'] = value
100 elif state == DAF_HEADER:
101 attributes[name] = value
102 elif state == DAF_VIEW:
103 view_attributes[name] = value
106 if view_name is not None:
107 attributes['views'][view_name] = view_attributes
111 def _consume_whitespace(line, start=0):
112 for i in xrange(start, len(line)):
113 if line[i] not in string.whitespace:
118 def _extract_name_index(line, start=0):
119 for i in xrange(start, len(line)):
120 if line[i] in string.whitespace:
125 def _extract_value_index(line, start=0):
126 shortline = line.rstrip()
127 return len(shortline)
129 def convert_to_rdf_statements(attributes, name):
130 submission_uri = get_submission_uri(name)
131 subject = RDF.Node(submission_uri)
134 for daf_key in attributes:
135 predicate = dafTermOntology[daf_key]
136 if daf_key == 'views':
137 statements.extend(_views_to_statements(name,
139 attributes[daf_key]))
140 elif daf_key == 'variables':
141 #predicate = ddfNS['variables']
142 for var in attributes.get('variables', []):
143 obj = toTypedNode(var)
144 statements.append(RDF.Statement(subject, predicate, obj))
146 value = attributes[daf_key]
147 obj = toTypedNode(value)
148 statements.append(RDF.Statement(subject,predicate,obj))
152 def _views_to_statements(name, dafNS, views):
153 subject = RDF.Node(get_submission_uri(name))
154 viewNS = get_view_namespace(name)
157 for view_name in views:
158 view_attributes = views[view_name]
159 viewSubject = viewNS[view_name]
160 statements.append(RDF.Statement(subject, dafNS['views'], viewSubject))
162 RDF.Statement(viewSubject, dafNS['name'], toTypedNode(view_name)))
163 for view_attribute_name in view_attributes:
164 predicate = dafNS[view_attribute_name]
165 obj = toTypedNode(view_attributes[view_attribute_name])
166 statements.append(RDF.Statement(viewSubject, predicate, obj))
168 #statements.extend(convert_to_rdf_statements(view, viewNode))
def add_to_model(model, attributes, name):
    """Add the RDF form of parsed DAF attributes to a model.

    ``attributes`` and ``name`` are converted with
    convert_to_rdf_statements(); every statement it yields is passed
    to ``model.add_statement()`` in order.
    """
    statements = convert_to_rdf_statements(attributes, name)
    for rdf_statement in statements:
        model.add_statement(rdf_statement)
def get_submission_uri(name):
    """Return the ``uri`` attribute of ``submissionLog[name]``."""
    entry = submissionLog[name]
    return entry.uri
178 def get_view_namespace(name):
179 submission_uri = get_submission_uri(name)
180 viewNS = RDF.NS(str(submission_uri) + '/view/')
183 class DAFMapper(object):
184 """Convert filenames to views in the UCSC Daf
186 def __init__(self, name, daf_file=None, model=None):
187 """Construct a RDF backed model of a UCSC DAF
190 name (str): the name of this submission (used to construct DAF url)
191 daf_file (str, stream, or None):
192 if str, use as filename
193 if stream, parse as stream
194 if none, don't attempt to load the DAF into our model
195 model (RDF.Model or None):
196 if None, construct a memory backed model
197 otherwise specifies model to use
199 if daf_file is None and model is None:
200 logger.error("We need a DAF or Model containing a DAF to work")
203 if model is not None:
206 self.model = get_model()
208 if hasattr(daf_file, 'next'):
209 # its some kind of stream
210 fromstream_into_model(self.model, name, daf_file)
213 parse_into_model(self.model, name, daf_file)
215 self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
216 self.submissionSet = get_submission_uri(self.name)
217 self.submissionSetNS = RDF.NS(str(self.submissionSet)+'/')
218 self.__view_map = None
221 def add_pattern(self, view_name, filename_pattern):
222 """Map a filename regular expression to a view name
224 viewNS = get_view_namespace(self.name)
226 obj = toTypedNode(filename_pattern)
227 self.model.add_statement(
228 RDF.Statement(viewNS[view_name],
229 dafTermOntology['filename_re'],
233 def import_submission_dir(self, submission_dir, library_id):
234 """Import a submission directories and update our model as needed
236 #attributes = get_filename_attribute_map(paired)
237 libNode = self.libraryNS[library_id + "/"]
239 submission_files = os.listdir(submission_dir)
240 for f in submission_files:
241 self.construct_file_attributes(submission_dir, libNode, f)
244 def construct_file_attributes(self, submission_dir, libNode, pathname):
245 """Looking for the best extension
246 The 'best' is the longest match
249 filename (str): the filename whose extention we are about to examine
251 path, filename = os.path.split(pathname)
253 view = self.find_view(filename)
255 logger.warn("Unrecognized file: %s" % (pathname,))
257 if str(view) == str(libraryOntology['ignore']):
260 submission_name = self.make_submission_name(submission_dir)
261 submissionNode = self.get_submission_node(submission_dir)
262 submission_uri = str(submissionNode.uri)
263 view_name = fromTypedNode(self.model.get_target(view, dafTermOntology['name']))
264 submissionView = RDF.Node(RDF.Uri(submission_uri + '/' + view_name))
266 self.model.add_statement(
267 RDF.Statement(self.submissionSet, dafTermOntology['has_submission'], submissionNode))
269 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['has_view'], submissionView))
270 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['name'], toTypedNode(submission_name)))
271 self.model.add_statement(RDF.Statement(submissionNode, rdfNS['type'], submissionOntology['submission']))
272 self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['library'], libNode))
274 # add trac specific information
275 self.model.add_statement(
276 RDF.Statement(submissionView, dafTermOntology['view'], view))
277 self.model.add_statement(
278 RDF.Statement(submissionView, dafTermOntology['paired'], toTypedNode(self._is_paired(libNode))))
279 self.model.add_statement(
280 RDF.Statement(submissionView, dafTermOntology['submission'], submissionNode))
283 terms = [dafTermOntology['type'],
284 dafTermOntology['filename_re'],
286 terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
288 # Add everything I can find
290 value = self._get_library_attribute(libNode, term)
291 if value is not None:
292 self.model.add_statement(RDF.Statement(submissionView, term, value))
294 # add file specific information
295 fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
296 submission_pathname = os.path.join(submission_dir, filename)
297 md5 = make_md5sum(submission_pathname)
298 self.model.add_statement(
299 RDF.Statement(submissionView, dafTermOntology['has_file'], fileNode))
300 self.model.add_statement(
301 RDF.Statement(fileNode, dafTermOntology['filename'], filename))
304 logging.warning("Unable to produce md5sum for %s" % ( submission_pathname))
306 self.model.add_statement(
307 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
310 def _add_library_details_to_model(self, libNode):
311 parser = RDF.Parser(name='rdfa')
312 new_statements = parser.parse_as_stream(libNode.uri)
313 for s in new_statements:
314 # don't override things we already have in the model
315 q = RDF.Statement(s.subject, s.predicate, None)
316 if len(list(self.model.find_statements(q))) == 0:
319 statements = list(self.model.find_statements(q))
320 if len(statements) == 0:
321 logger.warning("Nothing known about %s" % (str(libNode),))
323 def get_daf_variables(self):
324 """Returns simple variables names that to include in the ddf
326 variableTerm = dafTermOntology['variables']
328 for obj in self.model.get_targets(self.submissionSet, variableTerm):
329 value = str(fromTypedNode(obj))
330 results.append(value)
331 results.append('labVersion')
334 def make_submission_name(self, submission_dir):
335 submission_dir = os.path.normpath(submission_dir)
336 submission_dir_name = os.path.split(submission_dir)[1]
337 if len(submission_dir_name) == 0:
339 "Submission dir name too short: %s" %(submission_dir,))
340 return submission_dir_name
def get_submission_node(self, submission_dir):
    """Convert a submission directory name to a submission node

    The directory is reduced to a submission name with
    make_submission_name() and that name is looked up in
    self.submissionSetNS.
    """
    name = self.make_submission_name(submission_dir)
    node = self.submissionSetNS[name]
    return node
348 def _get_library_attribute(self, libNode, attribute):
349 if not isinstance(attribute, RDF.Node):
350 attribute = libraryOntology[attribute]
352 # search through the model twice (adding in data from website)
354 targets = list(self.model.get_targets(libNode, attribute))
356 return self._format_library_attribute(targets)
358 targets = self._search_same_as(libNode, attribute)
359 if targets is not None:
360 return self._format_library_attribute(targets)
362 # we don't know anything about this attribute
363 self._add_library_details_to_model(libNode)
367 def _format_library_attribute(self, targets):
368 if len(targets) == 0:
370 elif len(targets) == 1:
371 return fromTypedNode(targets[0])
372 elif len(targets) > 1:
373 return [fromTypedNode(t) for t in targets]
375 def _search_same_as(self, subject, predicate):
376 # look for alternate names
377 other_predicates = self.model.get_targets(predicate, owlNS['sameAs'])
378 for other in other_predicates:
379 targets = list(self.model.get_targets(subject, other))
384 def find_view(self, filename):
385 """Search through potential DAF filename patterns
387 if self.__view_map is None:
388 self.__view_map = self._get_filename_view_map()
391 for pattern, view in self.__view_map.items():
392 if re.match(pattern, filename):
396 msg = "%s matched multiple views %s" % (
398 [str(x) for x in results])
399 raise ModelException(msg)
400 elif len(results) == 1:
406 def _get_filename_view_map(self):
407 """Query our model for filename patterns
409 return a dictionary of compiled regular expressions to view names
411 filename_query = RDF.Statement(
412 None, dafTermOntology['filename_re'], None)
415 for s in self.model.find_statements(filename_query):
416 view_name = s.subject
417 literal_re = s.object.literal_value['string']
418 logger.debug("Found: %s" % (literal_re,))
420 filename_re = re.compile(literal_re)
422 logger.error("Unable to compile: %s" % (literal_re,))
423 patterns[literal_re] = view_name
426 def _is_paired(self, libNode):
427 """Determine if a library is paired end"""
428 library_type = self._get_library_attribute(libNode, 'library_type')
429 if library_type is None:
430 raise ModelException("%s doesn't have a library type" % (str(libNode),))
433 single = ['Single End', 'Small RNA', 'CSHL (lacking last nt)']
434 paired = ['Paired End', 'Multiplexing', 'Barcoded']
435 if library_type in single:
437 elif library_type in paired:
440 raise MetadataLookupException(
441 "Unrecognized library type %s for %s" % \
442 (library_type, str(libNode)))
def _get_library_url(self):
    """Return, as a string, the URI of the empty-suffix term of self.libraryNS."""
    base_term = self.libraryNS['']
    return str(base_term.uri)

def _set_library_url(self, value):
    """Replace self.libraryNS with an RDF.NS built from ``str(value)``."""
    namespace = RDF.NS(str(value))
    self.libraryNS = namespace

# Expose the two accessors above as a read/write attribute.
library_url = property(_get_library_url, _set_library_url)