1 """Common submission elements
2 """
3 import logging
4 import os
5 import re
6
7 import RDF
8
9 from htsworkflow.util.rdfhelp import \
10      blankOrUri, \
11      dump_model, \
12      fromTypedNode, \
13      get_model, \
14      strip_namespace, \
15      toTypedNode
16 from htsworkflow.util.rdfns import *
17 from htsworkflow.util.hashfile import make_md5sum
18 from htsworkflow.submission.fastqname import FastqName
19 from htsworkflow.submission.daf import \
20      MetadataLookupException, \
21      ModelException, \
22      get_submission_uri
23 from htsworkflow.util import opener
24
25 from django.template import Context, Template, loader
26
27 LOGGER = logging.getLogger(__name__)

class Submission(object):
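    """Collect metadata about submission files into an RDF model.

    Scans analysis result directories, matches each file against the
    filename patterns declared in the DAF model, and records file,
    library, lane, and flowcell details as RDF statements.

    A minimal usage sketch (the host URL and library id below are
    illustrative assumptions, not values taken from this module):

        model = get_model()  # from htsworkflow.util.rdfhelp
        submission = Submission('my-submission', model,
                                'http://jumpgate.caltech.edu')
        submission.scan_submission_dirs({'12345': '/path/to/results'})
    """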
    def __init__(self, name, model, host):
        self.name = name
        self.model = model

        self.submissionSet = get_submission_uri(self.name)
        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
        self.libraryNS = RDF.NS('{0}/library/'.format(host))
        self.flowcellNS = RDF.NS('{0}/flowcell/'.format(host))

        self.__view_map = None

    def scan_submission_dirs(self, result_map):
        """Examine files in our result directory
        """
        for lib_id, result_dir in result_map.items():
            LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
            try:
                self.import_analysis_dir(result_dir, lib_id)
            except MetadataLookupException as e:
                LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))

    def import_analysis_dir(self, analysis_dir, library_id):
        """Import a submission directory and update our model as needed
        """
        libNode = self.libraryNS[library_id + "/"]

        self._add_library_details_to_model(libNode)

        submission_files = os.listdir(analysis_dir)
        for filename in submission_files:
            pathname = os.path.abspath(os.path.join(analysis_dir, filename))
            self.construct_file_attributes(analysis_dir, libNode, pathname)

    def analysis_nodes(self, result_map):
        """Return an iterable of analysis nodes
        """
        for result_dir in result_map.values():
            an_analysis = self.get_submission_node(result_dir)
            yield an_analysis

    def construct_file_attributes(self, analysis_dir, libNode, pathname):
        """Determine the view for a file and add its attributes to the model

        :Args:
        analysis_dir (str): directory containing the file
        libNode (RDF.Node): library node the file belongs to
        pathname (str): path of the file whose view we are about to examine
        """
        path, filename = os.path.split(pathname)

        LOGGER.debug("Searching for view")
        file_type = self.find_best_match(filename)
        if file_type is None:
            LOGGER.warning("Unrecognized file: {0}".format(pathname))
            return None
        if str(file_type) == str(libraryOntology['ignore']):
            return None

        an_analysis_name = self.make_submission_name(analysis_dir)
        an_analysis = self.get_submission_node(analysis_dir)
        file_classification = self.model.get_target(file_type,
                                                    rdfNS['type'])
        if file_classification is None:
            errmsg = 'Could not find class for {0}'
            LOGGER.warning(errmsg.format(str(file_type)))
            return

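        # link this analysis into the submission set and record its
        # name, type, and library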
        self.model.add_statement(
            RDF.Statement(self.submissionSetNS[''],
                          submissionOntology['has_submission'],
                          an_analysis))
        self.model.add_statement(RDF.Statement(an_analysis,
                                               submissionOntology['name'],
                                               toTypedNode(an_analysis_name)))
        self.model.add_statement(
            RDF.Statement(an_analysis,
                          rdfNS['type'],
                          submissionOntology['submission']))
        self.model.add_statement(RDF.Statement(an_analysis,
                                               submissionOntology['library'],
                                               libNode))

        LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
        # add track specific information
        self.model.add_statement(
            RDF.Statement(an_analysis,
                          dafTermOntology['paired'],
                          toTypedNode(self._is_paired(libNode))))
        self.model.add_statement(
            RDF.Statement(an_analysis,
                          dafTermOntology['submission'],
                          an_analysis))

        # add file specific information
        fileNode = self.make_file_node(pathname, an_analysis)
        self.add_md5s(filename, fileNode, analysis_dir)
        self.add_file_size(filename, fileNode, analysis_dir)
        self.add_read_length(filename, fileNode, analysis_dir)
        self.add_fastq_metadata(filename, fileNode)
        self.add_label(file_type, fileNode, libNode)
        self.model.add_statement(
            RDF.Statement(fileNode,
                          rdfNS['type'],
                          file_type))
        self.model.add_statement(
            RDF.Statement(fileNode,
                          libraryOntology['library'],
                          libNode))

        LOGGER.debug("Done.")

    def make_file_node(self, pathname, submissionNode):
        """Create file node and attach it to its submission.
        """
        # add file specific information
        path, filename = os.path.split(pathname)
        pathname = os.path.abspath(pathname)
        fileNode = RDF.Node(RDF.Uri('file://' + pathname))
        self.model.add_statement(
            RDF.Statement(submissionNode,
                          dafTermOntology['has_file'],
                          fileNode))
        self.model.add_statement(
            RDF.Statement(fileNode,
                          dafTermOntology['filename'],
                          filename))
        self.model.add_statement(
            RDF.Statement(fileNode,
                          dafTermOntology['relative_path'],
                          os.path.relpath(pathname)))
        return fileNode

    def add_md5s(self, filename, fileNode, analysis_dir):
        LOGGER.debug("Updating file md5sum")
        submission_pathname = os.path.join(analysis_dir, filename)
        md5 = make_md5sum(submission_pathname)
        if md5 is None:
            errmsg = "Unable to produce md5sum for {0}"
            LOGGER.warning(errmsg.format(submission_pathname))
        else:
            self.model.add_statement(
                RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))

    def add_file_size(self, filename, fileNode, analysis_dir):
        submission_pathname = os.path.join(analysis_dir, filename)
        file_size = os.stat(submission_pathname).st_size
        self.model.add_statement(
            RDF.Statement(fileNode, dafTermOntology['file_size'], toTypedNode(file_size)))
        LOGGER.debug("Updating file size: %d", file_size)

    def add_read_length(self, filename, fileNode, analysis_dir):
        submission_pathname = os.path.join(analysis_dir, filename)
        stream = opener.autoopen(submission_pathname, 'rt')
        stream.readline()  # skip the fastq header line
        sequence = stream.readline().strip()
        stream.close()
        read_length = len(sequence)
        self.model.add_statement(
            RDF.Statement(fileNode,
                          libraryOntology['read_length'],
                          toTypedNode(read_length))
        )
        LOGGER.debug("Updating read length: %d", read_length)

    def add_fastq_metadata(self, filename, fileNode):
        # How should I detect if this is actually a fastq file?
        try:
            fqname = FastqName(filename=filename)
        except ValueError:
            # currently we just ignore files the fastq name parser rejects
            return

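        # map parsed fastq filename fields onto their library ontology terms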
        terms = [('flowcell', libraryOntology['flowcell_id']),
                 ('lib_id', libraryOntology['library_id']),
                 ('lane', libraryOntology['lane_number']),
                 ('read', libraryOntology['read']),
        ]
        for file_term, model_term in terms:
            value = fqname.get(file_term)
            if value is not None:
                s = RDF.Statement(fileNode, model_term, toTypedNode(value))
                self.model.append(s)

        if 'flowcell' in fqname:
            value = self.flowcellNS[fqname['flowcell'] + '/']
            s = RDF.Statement(fileNode, libraryOntology['flowcell'], value)
            self.model.append(s)

    def add_label(self, file_type, file_node, lib_node):
        """Add rdfs:label to a file node
        """
        template_term = libraryOntology['label_template']
        label_template = self.model.get_target(file_type, template_term)
        if label_template:
            template = loader.get_template('submission_view_rdfs_label_metadata.sparql')
            context = Context({
                'library': str(lib_node.uri),
                })
            for r in self.execute_query(template, context):
                context = Context(r)
                label = Template(label_template).render(context)
                s = RDF.Statement(file_node, rdfsNS['label'], unicode(label))
                self.model.append(s)

    def _add_library_details_to_model(self, libNode):
        # attributes that can have multiple values
        set_attributes = set((libraryOntology['has_lane'],
                              libraryOntology['has_mappings'],
                              dafTermOntology['has_file']))
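        # fetch the library page and parse the RDFa metadata embedded in it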
        parser = RDF.Parser(name='rdfa')
        try:
            new_statements = parser.parse_as_stream(libNode.uri)
        except RDF.RedlandError as e:
            LOGGER.error(e)
            return
        LOGGER.debug("Scanning %s", str(libNode.uri))
        toadd = []
        for s in new_statements:
            # always add "collections"
            if s.predicate in set_attributes:
                toadd.append(s)
                continue
            # don't override things we already have in the model
            targets = list(self.model.get_targets(s.subject, s.predicate))
            if len(targets) == 0:
                toadd.append(s)

        for s in toadd:
            self.model.append(s)

        self._add_lane_details(libNode)
        self._add_flowcell_details()

    def _add_lane_details(self, libNode):
        """Import lane details
        """
        query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
        lanes = []
        for lane_stmt in self.model.find_statements(query):
            lanes.append(lane_stmt.object)

        parser = RDF.Parser(name='rdfa')
        for lane in lanes:
            LOGGER.debug("Importing %s" % (lane.uri,))
            try:
                parser.parse_into_model(self.model, lane.uri)
            except RDF.RedlandError:
                LOGGER.error("Error accessing %s" % (lane.uri,))
                raise

    def _add_flowcell_details(self):
        template = loader.get_template('aws_flowcell.sparql')

        parser = RDF.Parser(name='rdfa')
        for r in self.execute_query(template, Context()):
            flowcell = r['flowcell']
            try:
                parser.parse_into_model(self.model, flowcell.uri)
            except RDF.RedlandError:
                LOGGER.error("Error accessing %s" % (str(flowcell),))
                raise

    def find_best_match(self, filename):
        """Search through potential filename matching patterns
        """
        if self.__view_map is None:
            self.__view_map = self._get_filename_view_map()

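        # collect every view whose filename pattern matches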
        results = []
        for pattern, view in self.__view_map.items():
            if re.match(pattern, filename):
                results.append(view)

        if len(results) > 1:
            msg = "%s matched multiple views %s" % (
                filename,
                [str(x) for x in results])
            raise ModelException(msg)
        elif len(results) == 1:
            return results[0]
        else:
            return None

    def _get_filename_view_map(self):
        """Query our model for filename patterns

        return a dictionary mapping filename regular expressions to view names
        """
        filename_query = RDF.Statement(
            None, dafTermOntology['filename_re'], None)

        patterns = {}
        for s in self.model.find_statements(filename_query):
            view_name = s.subject
            literal_re = s.object.literal_value['string']
            LOGGER.debug("Found: %s" % (literal_re,))
            try:
                # compile only to validate the pattern
                re.compile(literal_re)
            except re.error:
                LOGGER.error("Unable to compile: %s" % (literal_re,))
                continue
            patterns[literal_re] = view_name
        return patterns

    def make_submission_name(self, analysis_dir):
        analysis_dir = os.path.normpath(analysis_dir)
        analysis_dir_name = os.path.split(analysis_dir)[1]
        if len(analysis_dir_name) == 0:
            raise RuntimeError(
                "Submission dir name too short: {0}".format(analysis_dir))
        return analysis_dir_name

    def get_submission_node(self, analysis_dir):
        """Convert a submission directory name to a submission node
        """
        submission_name = self.make_submission_name(analysis_dir)
        return self.submissionSetNS[submission_name]

    def _get_library_attribute(self, libNode, attribute):
        if not isinstance(attribute, RDF.Node):
            attribute = libraryOntology[attribute]

        targets = list(self.model.get_targets(libNode, attribute))
        if len(targets) > 0:
            return self._format_library_attribute(targets)
        else:
            return None

    def _format_library_attribute(self, targets):
        if len(targets) == 0:
            return None
        elif len(targets) == 1:
            return fromTypedNode(targets[0])
        elif len(targets) > 1:
            return [fromTypedNode(t) for t in targets]

    def _is_paired(self, libNode):
        """Determine if a library is paired end"""
        library_type = self._get_library_attribute(libNode, 'library_type')
        if library_type is None:
            errmsg = "%s doesn't have a library type"
            raise ModelException(errmsg % (str(libNode),))

        single = ['CSHL (lacking last nt)',
                  'Single End (non-multiplexed)',
                  'Small RNA (non-multiplexed)',]
        paired = ['Barcoded Illumina',
                  'Multiplexing',
                  'NEBNext Multiplexed',
                  'NEBNext Small RNA',
                  'Nextera',
                  'Paired End (non-multiplexed)',
                  'Dual Index Illumina',]
        if library_type in single:
            return False
        elif library_type in paired:
            return True
        else:
            raise MetadataLookupException(
                "Unrecognized library type %s for %s" % \
                (library_type, str(libNode)))

    def execute_query(self, template, context):
        """Execute the query, returning the results
        """
        formatted_query = template.render(context)
        LOGGER.debug(formatted_query)
        query = RDF.SPARQLQuery(str(formatted_query))
        rdfstream = query.execute(self.model)
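        # convert each result binding from an RDF node to a plain python value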
        results = []
        for record in rdfstream:
            d = {}
            for key, value in record.items():
                d[key] = fromTypedNode(value)
            results.append(d)
        return results


def list_submissions(model):
    """Return a generator of submission names in this model.
    """
    query_body = """
      PREFIX subns: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>

      select distinct ?submission
      where { ?submission subns:has_submission ?library_dir }
    """
    query = RDF.SPARQLQuery(query_body)
    rdfstream = query.execute(model)
    for row in rdfstream:
        s = strip_namespace(submissionLog, row['submission'])
        if s[-1] in ['#', '/', '?']:
            s = s[:-1]
        yield s