htsworkflow/submission/submission.py

   1 """Common submission elements
   2 """
   3 import logging
   4 import os
   5 import re
   6
   7 import RDF
   8
   9 from htsworkflow.util.rdfhelp import \
  10      blankOrUri, \
  11      dump_model, \
  12      fromTypedNode, \
  13      get_model, \
  14      strip_namespace, \
  15      toTypedNode
  16 from htsworkflow.util.rdfns import (
  17     dafTermOntology,
  18     libraryOntology,
  19     rdfNS,
  20     rdfsNS,
  21     submissionLog,
  22     submissionOntology,
  23 )
  24 from htsworkflow.util.hashfile import make_md5sum
  25 from htsworkflow.submission.fastqname import FastqName
  26 from htsworkflow.submission.daf import \
  27      MetadataLookupException, \
  28      ModelException, \
  29      get_submission_uri
  30 from htsworkflow.util import opener
  31
  32 from django.template import Context, Template, loader
  33
  34 LOGGER = logging.getLogger(__name__)
  35
  36 class Submission(object):
  37     def __init__(self, name, model, host):
  38         self.name = name
  39         self.model = model
  40
  41         self.submissionSet = get_submission_uri(self.name)
  42         self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
  43         self.libraryNS = RDF.NS('{0}/library/'.format(host))
  44         self.flowcellNS = RDF.NS('{0}/flowcell/'.format(host))
  45
  46         self.__view_map = None
  47
  48     def scan_submission_dirs(self, result_map):
  49         """Examine files in our result directory
  50         """
  51         for lib_id, result_dir in result_map.items():
  52             LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
  53             try:
  54                 self.import_analysis_dir(result_dir, lib_id)
  55             except MetadataLookupException as e:
  56                 LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
  57
  58     def import_analysis_dir(self, analysis_dir, library_id):
  59         """Import a submission directories and update our model as needed
  60         """
  61         #attributes = get_filename_attribute_map(paired)
  62         libNode = self.libraryNS[library_id + "/"]
  63
  64         self._add_library_details_to_model(libNode)
  65
  66         submission_files = os.listdir(analysis_dir)
  67         for filename in submission_files:
  68             pathname = os.path.abspath(os.path.join(analysis_dir, filename))
  69             self.construct_file_attributes(analysis_dir, libNode, pathname)
  70
  71     def analysis_nodes(self, result_map):
  72         """Return an iterable of analysis nodes
  73         """
  74         for result_dir in result_map.values():
  75             an_analysis = self.get_submission_node(result_dir)
  76             yield an_analysis
  77
  78     def construct_file_attributes(self, analysis_dir, libNode, pathname):
  79         """Looking for the best extension
  80         The 'best' is the longest match
  81
  82         :Args:
  83         filename (str): the filename whose extention we are about to examine
  84         """
  85         path, filename = os.path.split(pathname)
  86
  87         LOGGER.debug("Searching for view")
  88         file_type = self.find_best_match(filename)
  89         if file_type is None:
  90             LOGGER.warn("Unrecognized file: {0}".format(pathname))
  91             return None
  92         if str(file_type) == str(libraryOntology['ignore']):
  93             return None
  94
  95         an_analysis_name = self.make_submission_name(analysis_dir)
  96         an_analysis = self.get_submission_node(analysis_dir)
  97         file_classification = self.model.get_target(file_type,
  98                                                     rdfNS['type'])
  99         if file_classification is None:
 100             errmsg = 'Could not find class for {0}'
 101             LOGGER.warning(errmsg.format(str(file_type)))
 102             return
 103
 104         self.model.add_statement(
 105             RDF.Statement(self.submissionSetNS[''],
 106                           submissionOntology['has_submission'],
 107                           an_analysis))
 108         self.model.add_statement(RDF.Statement(an_analysis,
 109                                                submissionOntology['name'],
 110                                                toTypedNode(an_analysis_name)))
 111         self.model.add_statement(
 112             RDF.Statement(an_analysis,
 113                           rdfNS['type'],
 114                           submissionOntology['submission']))
 115         self.model.add_statement(RDF.Statement(an_analysis,
 116                                                submissionOntology['library'],
 117                                                libNode))
 118
 119         LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
 120         # add track specific information
 121         self.model.add_statement(
 122             RDF.Statement(an_analysis,
 123                           dafTermOntology['paired'],
 124                           toTypedNode(self._is_paired(libNode))))
 125         self.model.add_statement(
 126             RDF.Statement(an_analysis,
 127                           dafTermOntology['submission'],
 128                           an_analysis))
 129
 130         # add file specific information
 131         fileNode = self.make_file_node(pathname, an_analysis)
 132         self.add_md5s(filename, fileNode, analysis_dir)
 133         self.add_file_size(filename, fileNode, analysis_dir)
 134         self.add_read_length(filename, fileNode, analysis_dir)
 135         self.add_fastq_metadata(filename, fileNode)
 136         self.add_label(file_type, fileNode, libNode)
 137         self.model.add_statement(
 138             RDF.Statement(fileNode,
 139                           rdfNS['type'],
 140                           file_type))
 141         self.model.add_statement(
 142             RDF.Statement(fileNode,
 143                           libraryOntology['library'],
 144                           libNode))
 145
 146         LOGGER.debug("Done.")
 147
 148     def make_file_node(self, pathname, submissionNode):
 149         """Create file node and attach it to its submission.
 150         """
 151         # add file specific information
 152         path, filename = os.path.split(pathname)
 153         pathname = os.path.abspath(pathname)
 154         fileNode = RDF.Node(RDF.Uri('file://'+ pathname))
 155         self.model.add_statement(
 156             RDF.Statement(submissionNode,
 157                           dafTermOntology['has_file'],
 158                           fileNode))
 159         self.model.add_statement(
 160             RDF.Statement(fileNode,
 161                           dafTermOntology['filename'],
 162                           filename))
 163         self.model.add_statement(
 164             RDF.Statement(fileNode,
 165                           dafTermOntology['relative_path'],
 166                           os.path.relpath(pathname)))
 167         return fileNode
 168
 169     def add_md5s(self, filename, fileNode, analysis_dir):
 170         LOGGER.debug("Updating file md5sum")
 171         submission_pathname = os.path.join(analysis_dir, filename)
 172         md5 = make_md5sum(submission_pathname)
 173         if md5 is None:
 174             errmsg = "Unable to produce md5sum for {0}"
 175             LOGGER.warning(errmsg.format(submission_pathname))
 176         else:
 177             self.model.add_statement(
 178                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 179
 180     def add_file_size(self, filename, fileNode, analysis_dir):
 181         submission_pathname = os.path.join(analysis_dir, filename)
 182         file_size = os.stat(submission_pathname).st_size
 183         self.model.add_statement(
 184             RDF.Statement(fileNode, dafTermOntology['file_size'], toTypedNode(file_size)))
 185         LOGGER.debug("Updating file size: %d", file_size)
 186
 187     def add_read_length(self, filename, fileNode, analysis_dir):
 188         submission_pathname = os.path.join(analysis_dir, filename)
 189         stream = opener.autoopen(submission_pathname, 'rt')
 190         header = stream.readline().strip()
 191         sequence = stream.readline().strip()
 192         read_length = len(sequence)
 193         self.model.add_statement(
 194             RDF.Statement(fileNode,
 195                           libraryOntology['read_length'],
 196                           toTypedNode(read_length))
 197         )
 198         LOGGER.debug("Updating read length: %d", read_length)
 199
 200     def add_fastq_metadata(self, filename, fileNode):
 201         # How should I detect if this is actually a fastq file?
 202         try:
 203             fqname = FastqName(filename=filename)
 204         except ValueError:
 205             # currently its just ignore it if the fastq name parser fails
 206             return
 207
 208         terms = [('flowcell', libraryOntology['flowcell_id']),
 209                  ('lib_id', libraryOntology['library_id']),
 210                  ('lane', libraryOntology['lane_number']),
 211                  ('read', libraryOntology['read']),
 212         ]
 213         for file_term, model_term in terms:
 214             value = fqname.get(file_term)
 215             if value is not None:
 216                 s = RDF.Statement(fileNode, model_term, toTypedNode(value))
 217                 self.model.append(s)
 218
 219         if 'flowcell' in fqname:
 220             value = self.flowcellNS[fqname['flowcell'] + '/']
 221             s = RDF.Statement(fileNode, libraryOntology['flowcell'], value)
 222             self.model.append(s)
 223
 224     def add_label(self, file_type, file_node, lib_node):
 225         """Add rdfs:label to a file node
 226         """
 227         #template_term = libraryOntology['label_template']
 228         template_term = libraryOntology['label_template']
 229         label_template = self.model.get_target(file_type, template_term)
 230         if label_template:
 231             template = loader.get_template('submission_view_rdfs_label_metadata.sparql')
 232             context = Context({
 233                 'library': str(lib_node.uri),
 234                 })
 235             for r in self.execute_query(template, context):
 236                 context = Context(r)
 237                 label = Template(label_template).render(context)
 238                 s = RDF.Statement(file_node, rdfsNS['label'], unicode(label))
 239                 self.model.append(s)
 240
 241     def _add_library_details_to_model(self, libNode):
 242         # attributes that can have multiple values
 243         set_attributes = set((libraryOntology['has_lane'],
 244                               libraryOntology['has_mappings'],
 245                               dafTermOntology['has_file']))
 246         parser = RDF.Parser(name='rdfa')
 247         try:
 248             new_statements = parser.parse_as_stream(libNode.uri)
 249         except RDF.RedlandError as e:
 250             LOGGER.error(e)
 251             return
 252         LOGGER.debug("Scanning %s", str(libNode.uri))
 253         toadd = []
 254         for s in new_statements:
 255             # always add "collections"
 256             if s.predicate in set_attributes:
 257                 toadd.append(s)
 258                 continue
 259             # don't override things we already have in the model
 260             targets = list(self.model.get_targets(s.subject, s.predicate))
 261             if len(targets) == 0:
 262                 toadd.append(s)
 263
 264         for s in toadd:
 265             self.model.append(s)
 266
 267         self._add_lane_details(libNode)
 268         self._add_flowcell_details()
 269
 270     def _add_lane_details(self, libNode):
 271         """Import lane details
 272         """
 273         query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
 274         lanes = []
 275         for lane_stmt in self.model.find_statements(query):
 276             lanes.append(lane_stmt.object)
 277
 278         parser = RDF.Parser(name='rdfa')
 279         for lane in lanes:
 280             LOGGER.debug("Importing %s" % (lane.uri,))
 281             try:
 282                 parser.parse_into_model(self.model, lane.uri)
 283             except RDF.RedlandError as e:
 284                 LOGGER.error("Error accessing %s" % (lane.uri,))
 285                 raise e
 286
 287
 288     def _add_flowcell_details(self):
 289         template = loader.get_template('aws_flowcell.sparql')
 290         results = self.execute_query(template, Context())
 291
 292         parser = RDF.Parser(name='rdfa')
 293         for r in self.execute_query(template, Context()):
 294             flowcell = r['flowcell']
 295             try:
 296                 parser.parse_into_model(self.model, flowcell.uri)
 297             except RDF.RedlandError as e:
 298                 LOGGER.error("Error accessing %s" % (str(flowcell)))
 299                 raise e
 300
 301
 302     def find_best_match(self, filename):
 303         """Search through potential filename matching patterns
 304         """
 305         if self.__view_map is None:
 306             self.__view_map = self._get_filename_view_map()
 307
 308         results = []
 309         for pattern, view in self.__view_map.items():
 310             if re.match(pattern, filename):
 311                 results.append(view)
 312
 313         if len(results) > 1:
 314             msg = "%s matched multiple views %s" % (
 315                 filename,
 316                 [str(x) for x in results])
 317             raise ModelException(msg)
 318         elif len(results) == 1:
 319             return results[0]
 320         else:
 321             return None
 322
 323     def _get_filename_view_map(self):
 324         """Query our model for filename patterns
 325
 326         return a dictionary of compiled regular expressions to view names
 327         """
 328         filename_query = RDF.Statement(
 329             None, dafTermOntology['filename_re'], None)
 330
 331         patterns = {}
 332         for s in self.model.find_statements(filename_query):
 333             view_name = s.subject
 334             literal_re = s.object.literal_value['string']
 335             LOGGER.debug("Found: %s" % (literal_re,))
 336             try:
 337                 filename_re = re.compile(literal_re)
 338             except re.error as e:
 339                 LOGGER.error("Unable to compile: %s" % (literal_re,))
 340             patterns[literal_re] = view_name
 341         return patterns
 342
 343     def make_submission_name(self, analysis_dir):
 344         analysis_dir = os.path.normpath(analysis_dir)
 345         analysis_dir_name = os.path.split(analysis_dir)[1]
 346         if len(analysis_dir_name) == 0:
 347             raise RuntimeError(
 348                 "Submission dir name too short: {0}".format(analysis_dir))
 349         return analysis_dir_name
 350
 351     def get_submission_node(self, analysis_dir):
 352         """Convert a submission directory name to a submission node
 353         """
 354         submission_name = self.make_submission_name(analysis_dir)
 355         return self.submissionSetNS[submission_name]
 356
 357     def _get_library_attribute(self, libNode, attribute):
 358         if not isinstance(attribute, RDF.Node):
 359             attribute = libraryOntology[attribute]
 360
 361         targets = list(self.model.get_targets(libNode, attribute))
 362         if len(targets) > 0:
 363             return self._format_library_attribute(targets)
 364         else:
 365             return None
 366
 367         #targets = self._search_same_as(libNode, attribute)
 368         #if targets is not None:
 369         #    return self._format_library_attribute(targets)
 370
 371         # we don't know anything about this attribute
 372         self._add_library_details_to_model(libNode)
 373
 374         targets = list(self.model.get_targets(libNode, attribute))
 375         if len(targets) > 0:
 376             return self._format_library_attribute(targets)
 377
 378         return None
 379
 380     def _format_library_attribute(self, targets):
 381         if len(targets) == 0:
 382             return None
 383         elif len(targets) == 1:
 384             return fromTypedNode(targets[0])
 385         elif len(targets) > 1:
 386             return [fromTypedNode(t) for t in targets]
 387
 388     def _is_paired(self, libNode):
 389         """Determine if a library is paired end"""
 390         library_type = self._get_library_attribute(libNode, 'library_type')
 391         if library_type is None:
 392             errmsg = "%s doesn't have a library type"
 393             raise ModelException(errmsg % (str(libNode),))
 394
 395         single = ['CSHL (lacking last nt)',
 396                   'Single End (non-multiplexed)',
 397                   'Small RNA (non-multiplexed)',]
 398         paired = ['Barcoded Illumina',
 399                   'Multiplexing',
 400                   'NEBNext Multiplexed',
 401                   'NEBNext Small RNA',
 402                   'Nextera',
 403                   'Paired End (non-multiplexed)',
 404                   'Dual Index Illumina',]
 405         if library_type in single:
 406             return False
 407         elif library_type in paired:
 408             return True
 409         else:
 410             raise MetadataLookupException(
 411                 "Unrecognized library type %s for %s" % \
 412                 (library_type, str(libNode)))
 413
 414     def execute_query(self, template, context):
 415         """Execute the query, returning the results
 416         """
 417         formatted_query = template.render(context)
 418         LOGGER.debug(formatted_query)
 419         query = RDF.SPARQLQuery(str(formatted_query))
 420         rdfstream = query.execute(self.model)
 421         results = []
 422         for record in rdfstream:
 423             d = {}
 424             for key, value in record.items():
 425                 d[key] = fromTypedNode(value)
 426             results.append(d)
 427         return results
 428
 429
 430 def list_submissions(model):
 431     """Return generator of submissions in this model.
 432     """
 433     query_body = """
 434       PREFIX subns: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 435
 436       select distinct ?submission
 437       where { ?submission subns:has_submission ?library_dir }
 438     """
 439     query = RDF.SPARQLQuery(query_body)
 440     rdfstream = query.execute(model)
 441     for row in rdfstream:
 442         s = strip_namespace(submissionLog, row['submission'])
 443         if s[-1] in ['#', '/', '?']:
 444             s = s[:-1]
 445         yield s