htsworkflow/submission/submission.py

   1 """Common submission elements
   2 """
   3 import logging
   4 import os
   5 import re
   6
   7 import RDF
   8
   9 from htsworkflow.util.rdfhelp import \
  10      blankOrUri, \
  11      dump_model, \
  12      fromTypedNode, \
  13      get_model, \
  14      strip_namespace, \
  15      toTypedNode
  16 from htsworkflow.util.rdfns import *
  17 from htsworkflow.util.hashfile import make_md5sum
  18 from htsworkflow.submission.fastqname import FastqName
  19 from htsworkflow.submission.daf import \
  20      MetadataLookupException, \
  21      ModelException, \
  22      get_submission_uri
  23
  24 from django.conf import settings
  25 from django.template import Context, Template, loader
  26
  27 LOGGER = logging.getLogger(__name__)
  28
  29 class Submission(object):
  30     def __init__(self, name, model, host):
  31         self.name = name
  32         self.model = model
  33
  34         self.submissionSet = get_submission_uri(self.name)
  35         self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
  36         self.libraryNS = RDF.NS('{0}/library/'.format(host))
  37
  38         self.__view_map = None
  39
  40     def scan_submission_dirs(self, result_map):
  41         """Examine files in our result directory
  42         """
  43         for lib_id, result_dir in result_map.items():
  44             LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
  45             try:
  46                 self.import_analysis_dir(result_dir, lib_id)
  47             except MetadataLookupException, e:
  48                 LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
  49
  50     def import_analysis_dir(self, analysis_dir, library_id):
  51         """Import a submission directories and update our model as needed
  52         """
  53         #attributes = get_filename_attribute_map(paired)
  54         libNode = self.libraryNS[library_id + "/"]
  55
  56         self._add_library_details_to_model(libNode)
  57
  58         submission_files = os.listdir(analysis_dir)
  59         for filename in submission_files:
  60             pathname = os.path.abspath(os.path.join(analysis_dir, filename))
  61             self.construct_file_attributes(analysis_dir, libNode, pathname)
  62
  63     def analysis_nodes(self, result_map):
  64         """Return an iterable of analysis nodes
  65         """
  66         for result_dir in result_map.values():
  67             an_analysis = self.get_submission_node(result_dir)
  68             yield an_analysis
  69
  70     def construct_file_attributes(self, analysis_dir, libNode, pathname):
  71         """Looking for the best extension
  72         The 'best' is the longest match
  73
  74         :Args:
  75         filename (str): the filename whose extention we are about to examine
  76         """
  77         path, filename = os.path.split(pathname)
  78
  79         LOGGER.debug("Searching for view")
  80         file_type = self.find_best_match(filename)
  81         if file_type is None:
  82             LOGGER.warn("Unrecognized file: {0}".format(pathname))
  83             return None
  84         if str(file_type) == str(libraryOntology['ignore']):
  85             return None
  86
  87         an_analysis_name = self.make_submission_name(analysis_dir)
  88         an_analysis = self.get_submission_node(analysis_dir)
  89         an_analysis_uri = str(an_analysis.uri)
  90         file_classification = self.model.get_target(file_type,
  91                                                     rdfNS['type'])
  92         if file_classification is None:
  93             errmsg = 'Could not find class for {0}'
  94             LOGGER.warning(errmsg.format(str(file_type)))
  95             return
  96
  97         self.model.add_statement(
  98             RDF.Statement(self.submissionSetNS[''],
  99                           submissionOntology['has_submission'],
 100                           an_analysis))
 101         self.model.add_statement(RDF.Statement(an_analysis,
 102                                                submissionOntology['name'],
 103                                                toTypedNode(an_analysis_name)))
 104         self.model.add_statement(
 105             RDF.Statement(an_analysis,
 106                           rdfNS['type'],
 107                           submissionOntology['submission']))
 108         self.model.add_statement(RDF.Statement(an_analysis,
 109                                                submissionOntology['library'],
 110                                                libNode))
 111
 112         LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
 113         # add track specific information
 114         self.model.add_statement(
 115             RDF.Statement(an_analysis,
 116                           dafTermOntology['paired'],
 117                           toTypedNode(self._is_paired(libNode))))
 118         self.model.add_statement(
 119             RDF.Statement(an_analysis,
 120                           dafTermOntology['submission'],
 121                           an_analysis))
 122
 123         # add file specific information
 124         fileNode = self.make_file_node(pathname, an_analysis)
 125         self.add_md5s(filename, fileNode, analysis_dir)
 126         self.add_file_size(filename, fileNode, analysis_dir)
 127         self.add_fastq_metadata(filename, fileNode)
 128         self.add_label(file_type, fileNode, libNode)
 129         self.model.add_statement(
 130             RDF.Statement(fileNode,
 131                           rdfNS['type'],
 132                           file_type))
 133         self.model.add_statement(
 134             RDF.Statement(fileNode,
 135                           libraryOntology['library'],
 136                           libNode))
 137
 138         LOGGER.debug("Done.")
 139
 140     def make_file_node(self, pathname, submissionNode):
 141         """Create file node and attach it to its submission.
 142         """
 143         # add file specific information
 144         path, filename = os.path.split(pathname)
 145         pathname = os.path.abspath(pathname)
 146         fileNode = RDF.Node(RDF.Uri('file://'+ pathname))
 147         self.model.add_statement(
 148             RDF.Statement(submissionNode,
 149                           dafTermOntology['has_file'],
 150                           fileNode))
 151         self.model.add_statement(
 152             RDF.Statement(fileNode,
 153                           dafTermOntology['filename'],
 154                           filename))
 155         self.model.add_statement(
 156             RDF.Statement(fileNode,
 157                           dafTermOntology['relative_path'],
 158                           os.path.relpath(pathname)))
 159         return fileNode
 160
 161     def add_md5s(self, filename, fileNode, analysis_dir):
 162         LOGGER.debug("Updating file md5sum")
 163         submission_pathname = os.path.join(analysis_dir, filename)
 164         md5 = make_md5sum(submission_pathname)
 165         if md5 is None:
 166             errmsg = "Unable to produce md5sum for {0}"
 167             LOGGER.warning(errmsg.format(submission_pathname))
 168         else:
 169             self.model.add_statement(
 170                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 171
 172     def add_file_size(self, filename, fileNode, analysis_dir):
 173         LOGGER.debug("Updating file size")
 174         submission_pathname = os.path.join(analysis_dir, filename)
 175         file_size = os.stat(submission_pathname).st_size
 176         self.model.add_statement(
 177             RDF.Statement(fileNode, dafTermOntology['file_size'], toTypedNode(file_size)))
 178
 179     def add_fastq_metadata(self, filename, fileNode):
 180         # How should I detect if this is actually a fastq file?
 181         try:
 182             fqname = FastqName(filename=filename)
 183         except ValueError:
 184             # currently its just ignore it if the fastq name parser fails
 185             return
 186
 187         terms = [('flowcell', libraryOntology['flowcell_id']),
 188                  ('lib_id', libraryOntology['library_id']),
 189                  ('lane', libraryOntology['lane_number']),
 190                  ('read', libraryOntology['read']),
 191                  ('cycle', libraryOntology['read_length'])]
 192         for file_term, model_term in terms:
 193             value = fqname.get(file_term)
 194             if value is not None:
 195                 s = RDF.Statement(fileNode, model_term, toTypedNode(value))
 196                 self.model.append(s)
 197
 198     def add_label(self, file_type, file_node, lib_node):
 199         """Add rdfs:label to a file node
 200         """
 201         #template_term = libraryOntology['label_template']
 202         template_term = libraryOntology['label_template']
 203         label_template = self.model.get_target(file_type, template_term)
 204         if label_template:
 205             template = loader.get_template('submission_view_rdfs_label_metadata.sparql')
 206             context = Context({
 207                 'library': str(lib_node.uri),
 208                 })
 209             for r in self.execute_query(template, context):
 210                 context = Context(r)
 211                 label = Template(label_template).render(context)
 212                 s = RDF.Statement(file_node, rdfsNS['label'], unicode(label))
 213                 self.model.append(s)
 214
 215     def _add_library_details_to_model(self, libNode):
 216         # attributes that can have multiple values
 217         set_attributes = set((libraryOntology['has_lane'],
 218                               libraryOntology['has_mappings'],
 219                               dafTermOntology['has_file']))
 220         parser = RDF.Parser(name='rdfa')
 221         try:
 222             new_statements = parser.parse_as_stream(libNode.uri)
 223         except RDF.RedlandError as e:
 224             LOGGER.error(e)
 225             return
 226         LOGGER.debug("Scanning %s", str(libNode.uri))
 227         toadd = []
 228         for s in new_statements:
 229             # always add "collections"
 230             if s.predicate in set_attributes:
 231                 toadd.append(s)
 232                 continue
 233             # don't override things we already have in the model
 234             targets = list(self.model.get_targets(s.subject, s.predicate))
 235             if len(targets) == 0:
 236                 toadd.append(s)
 237
 238         for s in toadd:
 239             self.model.append(s)
 240
 241         self._add_lane_details(libNode)
 242
 243     def _add_lane_details(self, libNode):
 244         """Import lane details
 245         """
 246         query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
 247         lanes = []
 248         for lane_stmt in self.model.find_statements(query):
 249             lanes.append(lane_stmt.object)
 250
 251         parser = RDF.Parser(name='rdfa')
 252         for lane in lanes:
 253             LOGGER.debug("Importing %s" % (lane.uri,))
 254             try:
 255                 parser.parse_into_model(self.model, lane.uri)
 256             except RDF.RedlandError, e:
 257                 LOGGER.error("Error accessing %s" % (lane.uri,))
 258                 raise e
 259
 260
 261     def find_best_match(self, filename):
 262         """Search through potential filename matching patterns
 263         """
 264         if self.__view_map is None:
 265             self.__view_map = self._get_filename_view_map()
 266
 267         results = []
 268         for pattern, view in self.__view_map.items():
 269             if re.match(pattern, filename):
 270                 results.append(view)
 271
 272         if len(results) > 1:
 273             msg = "%s matched multiple views %s" % (
 274                 filename,
 275                 [str(x) for x in results])
 276             raise ModelException(msg)
 277         elif len(results) == 1:
 278             return results[0]
 279         else:
 280             return None
 281
 282     def _get_filename_view_map(self):
 283         """Query our model for filename patterns
 284
 285         return a dictionary of compiled regular expressions to view names
 286         """
 287         filename_query = RDF.Statement(
 288             None, dafTermOntology['filename_re'], None)
 289
 290         patterns = {}
 291         for s in self.model.find_statements(filename_query):
 292             view_name = s.subject
 293             literal_re = s.object.literal_value['string']
 294             LOGGER.debug("Found: %s" % (literal_re,))
 295             try:
 296                 filename_re = re.compile(literal_re)
 297             except re.error, e:
 298                 LOGGER.error("Unable to compile: %s" % (literal_re,))
 299             patterns[literal_re] = view_name
 300         return patterns
 301
 302     def make_submission_name(self, analysis_dir):
 303         analysis_dir = os.path.normpath(analysis_dir)
 304         analysis_dir_name = os.path.split(analysis_dir)[1]
 305         if len(analysis_dir_name) == 0:
 306             raise RuntimeError(
 307                 "Submission dir name too short: {0}".format(analysis_dir))
 308         return analysis_dir_name
 309
 310     def get_submission_node(self, analysis_dir):
 311         """Convert a submission directory name to a submission node
 312         """
 313         submission_name = self.make_submission_name(analysis_dir)
 314         return self.submissionSetNS[submission_name]
 315
 316     def _get_library_attribute(self, libNode, attribute):
 317         if not isinstance(attribute, RDF.Node):
 318             attribute = libraryOntology[attribute]
 319
 320         targets = list(self.model.get_targets(libNode, attribute))
 321         if len(targets) > 0:
 322             return self._format_library_attribute(targets)
 323         else:
 324             return None
 325
 326         #targets = self._search_same_as(libNode, attribute)
 327         #if targets is not None:
 328         #    return self._format_library_attribute(targets)
 329
 330         # we don't know anything about this attribute
 331         self._add_library_details_to_model(libNode)
 332
 333         targets = list(self.model.get_targets(libNode, attribute))
 334         if len(targets) > 0:
 335             return self._format_library_attribute(targets)
 336
 337         return None
 338
 339     def _format_library_attribute(self, targets):
 340         if len(targets) == 0:
 341             return None
 342         elif len(targets) == 1:
 343             return fromTypedNode(targets[0])
 344         elif len(targets) > 1:
 345             return [fromTypedNode(t) for t in targets]
 346
 347     def _is_paired(self, libNode):
 348         """Determine if a library is paired end"""
 349         library_type = self._get_library_attribute(libNode, 'library_type')
 350         if library_type is None:
 351             errmsg = "%s doesn't have a library type"
 352             raise ModelException(errmsg % (str(libNode),))
 353
 354         single = ['CSHL (lacking last nt)',
 355                   'Single End (non-multiplexed)',
 356                   'Small RNA (non-multiplexed)',]
 357         paired = ['Barcoded Illumina',
 358                   'Multiplexing',
 359                   'NEBNext Multiplexed',
 360                   'NEBNext Small RNA',
 361                   'Nextera',
 362                   'Paired End (non-multiplexed)',
 363                   'Dual Index Illumina',]
 364         if library_type in single:
 365             return False
 366         elif library_type in paired:
 367             return True
 368         else:
 369             raise MetadataLookupException(
 370                 "Unrecognized library type %s for %s" % \
 371                 (library_type, str(libNode)))
 372
 373     def execute_query(self, template, context):
 374         """Execute the query, returning the results
 375         """
 376         formatted_query = template.render(context)
 377         LOGGER.debug(formatted_query)
 378         query = RDF.SPARQLQuery(str(formatted_query))
 379         rdfstream = query.execute(self.model)
 380         results = []
 381         for record in rdfstream:
 382             d = {}
 383             for key, value in record.items():
 384                 d[key] = fromTypedNode(value)
 385             results.append(d)
 386         return results
 387
 388
 389 def list_submissions(model):
 390     """Return generator of submissions in this model.
 391     """
 392     query_body = """
 393       PREFIX subns: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 394
 395       select distinct ?submission
 396       where { ?submission subns:has_submission ?library_dir }
 397     """
 398     query = RDF.SPARQLQuery(query_body)
 399     rdfstream = query.execute(model)
 400     for row in rdfstream:
 401         s = strip_namespace(submissionLog, row['submission'])
 402         if s[-1] in ['#', '/', '?']:
 403             s = s[:-1]
 404         yield s