htsworkflow/submission/submission.py

   1 """Common submission elements
   2 """
   3 import logging
   4 import os
   5 import re
   6
   7 import RDF
   8
   9 from htsworkflow.util.rdfhelp import \
  10      blankOrUri, \
  11      dump_model, \
  12      fromTypedNode, \
  13      get_model, \
  14      strip_namespace, \
  15      toTypedNode
  16 from htsworkflow.util.rdfns import *
  17 from htsworkflow.util.hashfile import make_md5sum
  18 from htsworkflow.submission.fastqname import FastqName
  19 from htsworkflow.submission.daf import \
  20      MetadataLookupException, \
  21      ModelException, \
  22      get_submission_uri
  23
  24 from django.conf import settings
  25 from django.template import Context, Template, loader
  26
  27 LOGGER = logging.getLogger(__name__)
  28
  29 class Submission(object):
  30     def __init__(self, name, model, host):
  31         self.name = name
  32         self.model = model
  33
  34         self.submissionSet = get_submission_uri(self.name)
  35         self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
  36         self.libraryNS = RDF.NS('{0}/library/'.format(host))
  37         self.flowcellNS = RDF.NS('{0}/flowcell/'.format(host))
  38
  39         self.__view_map = None
  40
  41     def scan_submission_dirs(self, result_map):
  42         """Examine files in our result directory
  43         """
  44         for lib_id, result_dir in result_map.items():
  45             LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
  46             try:
  47                 self.import_analysis_dir(result_dir, lib_id)
  48             except MetadataLookupException, e:
  49                 LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
  50
  51     def import_analysis_dir(self, analysis_dir, library_id):
  52         """Import a submission directories and update our model as needed
  53         """
  54         #attributes = get_filename_attribute_map(paired)
  55         libNode = self.libraryNS[library_id + "/"]
  56
  57         self._add_library_details_to_model(libNode)
  58
  59         submission_files = os.listdir(analysis_dir)
  60         for filename in submission_files:
  61             pathname = os.path.abspath(os.path.join(analysis_dir, filename))
  62             self.construct_file_attributes(analysis_dir, libNode, pathname)
  63
  64     def analysis_nodes(self, result_map):
  65         """Return an iterable of analysis nodes
  66         """
  67         for result_dir in result_map.values():
  68             an_analysis = self.get_submission_node(result_dir)
  69             yield an_analysis
  70
  71     def construct_file_attributes(self, analysis_dir, libNode, pathname):
  72         """Looking for the best extension
  73         The 'best' is the longest match
  74
  75         :Args:
  76         filename (str): the filename whose extention we are about to examine
  77         """
  78         path, filename = os.path.split(pathname)
  79
  80         LOGGER.debug("Searching for view")
  81         file_type = self.find_best_match(filename)
  82         if file_type is None:
  83             LOGGER.warn("Unrecognized file: {0}".format(pathname))
  84             return None
  85         if str(file_type) == str(libraryOntology['ignore']):
  86             return None
  87
  88         an_analysis_name = self.make_submission_name(analysis_dir)
  89         an_analysis = self.get_submission_node(analysis_dir)
  90         an_analysis_uri = str(an_analysis.uri)
  91         file_classification = self.model.get_target(file_type,
  92                                                     rdfNS['type'])
  93         if file_classification is None:
  94             errmsg = 'Could not find class for {0}'
  95             LOGGER.warning(errmsg.format(str(file_type)))
  96             return
  97
  98         self.model.add_statement(
  99             RDF.Statement(self.submissionSetNS[''],
 100                           submissionOntology['has_submission'],
 101                           an_analysis))
 102         self.model.add_statement(RDF.Statement(an_analysis,
 103                                                submissionOntology['name'],
 104                                                toTypedNode(an_analysis_name)))
 105         self.model.add_statement(
 106             RDF.Statement(an_analysis,
 107                           rdfNS['type'],
 108                           submissionOntology['submission']))
 109         self.model.add_statement(RDF.Statement(an_analysis,
 110                                                submissionOntology['library'],
 111                                                libNode))
 112
 113         LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
 114         # add track specific information
 115         self.model.add_statement(
 116             RDF.Statement(an_analysis,
 117                           dafTermOntology['paired'],
 118                           toTypedNode(self._is_paired(libNode))))
 119         self.model.add_statement(
 120             RDF.Statement(an_analysis,
 121                           dafTermOntology['submission'],
 122                           an_analysis))
 123
 124         # add file specific information
 125         fileNode = self.make_file_node(pathname, an_analysis)
 126         self.add_md5s(filename, fileNode, analysis_dir)
 127         self.add_file_size(filename, fileNode, analysis_dir)
 128         self.add_fastq_metadata(filename, fileNode)
 129         self.add_label(file_type, fileNode, libNode)
 130         self.model.add_statement(
 131             RDF.Statement(fileNode,
 132                           rdfNS['type'],
 133                           file_type))
 134         self.model.add_statement(
 135             RDF.Statement(fileNode,
 136                           libraryOntology['library'],
 137                           libNode))
 138
 139         LOGGER.debug("Done.")
 140
 141     def make_file_node(self, pathname, submissionNode):
 142         """Create file node and attach it to its submission.
 143         """
 144         # add file specific information
 145         path, filename = os.path.split(pathname)
 146         pathname = os.path.abspath(pathname)
 147         fileNode = RDF.Node(RDF.Uri('file://'+ pathname))
 148         self.model.add_statement(
 149             RDF.Statement(submissionNode,
 150                           dafTermOntology['has_file'],
 151                           fileNode))
 152         self.model.add_statement(
 153             RDF.Statement(fileNode,
 154                           dafTermOntology['filename'],
 155                           filename))
 156         self.model.add_statement(
 157             RDF.Statement(fileNode,
 158                           dafTermOntology['relative_path'],
 159                           os.path.relpath(pathname)))
 160         return fileNode
 161
 162     def add_md5s(self, filename, fileNode, analysis_dir):
 163         LOGGER.debug("Updating file md5sum")
 164         submission_pathname = os.path.join(analysis_dir, filename)
 165         md5 = make_md5sum(submission_pathname)
 166         if md5 is None:
 167             errmsg = "Unable to produce md5sum for {0}"
 168             LOGGER.warning(errmsg.format(submission_pathname))
 169         else:
 170             self.model.add_statement(
 171                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 172
 173     def add_file_size(self, filename, fileNode, analysis_dir):
 174         LOGGER.debug("Updating file size")
 175         submission_pathname = os.path.join(analysis_dir, filename)
 176         file_size = os.stat(submission_pathname).st_size
 177         self.model.add_statement(
 178             RDF.Statement(fileNode, dafTermOntology['file_size'], toTypedNode(file_size)))
 179
 180     def add_fastq_metadata(self, filename, fileNode):
 181         # How should I detect if this is actually a fastq file?
 182         try:
 183             fqname = FastqName(filename=filename)
 184         except ValueError:
 185             # currently its just ignore it if the fastq name parser fails
 186             return
 187
 188         terms = [('flowcell', libraryOntology['flowcell_id']),
 189                  ('lib_id', libraryOntology['library_id']),
 190                  ('lane', libraryOntology['lane_number']),
 191                  ('read', libraryOntology['read']),
 192                  ('cycle', libraryOntology['read_length'])]
 193         for file_term, model_term in terms:
 194             value = fqname.get(file_term)
 195             if value is not None:
 196                 s = RDF.Statement(fileNode, model_term, toTypedNode(value))
 197                 self.model.append(s)
 198
 199         if 'flowcell' in fqname:
 200             value = self.flowcellNS[fqname['flowcell'] + '/']
 201             s = RDF.Statement(fileNode, libraryOntology['flowcell'], value)
 202             self.model.append(s)
 203
 204     def add_label(self, file_type, file_node, lib_node):
 205         """Add rdfs:label to a file node
 206         """
 207         #template_term = libraryOntology['label_template']
 208         template_term = libraryOntology['label_template']
 209         label_template = self.model.get_target(file_type, template_term)
 210         if label_template:
 211             template = loader.get_template('submission_view_rdfs_label_metadata.sparql')
 212             context = Context({
 213                 'library': str(lib_node.uri),
 214                 })
 215             for r in self.execute_query(template, context):
 216                 context = Context(r)
 217                 label = Template(label_template).render(context)
 218                 s = RDF.Statement(file_node, rdfsNS['label'], unicode(label))
 219                 self.model.append(s)
 220
 221     def _add_library_details_to_model(self, libNode):
 222         # attributes that can have multiple values
 223         set_attributes = set((libraryOntology['has_lane'],
 224                               libraryOntology['has_mappings'],
 225                               dafTermOntology['has_file']))
 226         parser = RDF.Parser(name='rdfa')
 227         try:
 228             new_statements = parser.parse_as_stream(libNode.uri)
 229         except RDF.RedlandError as e:
 230             LOGGER.error(e)
 231             return
 232         LOGGER.debug("Scanning %s", str(libNode.uri))
 233         toadd = []
 234         for s in new_statements:
 235             # always add "collections"
 236             if s.predicate in set_attributes:
 237                 toadd.append(s)
 238                 continue
 239             # don't override things we already have in the model
 240             targets = list(self.model.get_targets(s.subject, s.predicate))
 241             if len(targets) == 0:
 242                 toadd.append(s)
 243
 244         for s in toadd:
 245             self.model.append(s)
 246
 247         self._add_lane_details(libNode)
 248
 249     def _add_lane_details(self, libNode):
 250         """Import lane details
 251         """
 252         query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
 253         lanes = []
 254         for lane_stmt in self.model.find_statements(query):
 255             lanes.append(lane_stmt.object)
 256
 257         parser = RDF.Parser(name='rdfa')
 258         for lane in lanes:
 259             LOGGER.debug("Importing %s" % (lane.uri,))
 260             try:
 261                 parser.parse_into_model(self.model, lane.uri)
 262             except RDF.RedlandError, e:
 263                 LOGGER.error("Error accessing %s" % (lane.uri,))
 264                 raise e
 265
 266
 267     def find_best_match(self, filename):
 268         """Search through potential filename matching patterns
 269         """
 270         if self.__view_map is None:
 271             self.__view_map = self._get_filename_view_map()
 272
 273         results = []
 274         for pattern, view in self.__view_map.items():
 275             if re.match(pattern, filename):
 276                 results.append(view)
 277
 278         if len(results) > 1:
 279             msg = "%s matched multiple views %s" % (
 280                 filename,
 281                 [str(x) for x in results])
 282             raise ModelException(msg)
 283         elif len(results) == 1:
 284             return results[0]
 285         else:
 286             return None
 287
 288     def _get_filename_view_map(self):
 289         """Query our model for filename patterns
 290
 291         return a dictionary of compiled regular expressions to view names
 292         """
 293         filename_query = RDF.Statement(
 294             None, dafTermOntology['filename_re'], None)
 295
 296         patterns = {}
 297         for s in self.model.find_statements(filename_query):
 298             view_name = s.subject
 299             literal_re = s.object.literal_value['string']
 300             LOGGER.debug("Found: %s" % (literal_re,))
 301             try:
 302                 filename_re = re.compile(literal_re)
 303             except re.error, e:
 304                 LOGGER.error("Unable to compile: %s" % (literal_re,))
 305             patterns[literal_re] = view_name
 306         return patterns
 307
 308     def make_submission_name(self, analysis_dir):
 309         analysis_dir = os.path.normpath(analysis_dir)
 310         analysis_dir_name = os.path.split(analysis_dir)[1]
 311         if len(analysis_dir_name) == 0:
 312             raise RuntimeError(
 313                 "Submission dir name too short: {0}".format(analysis_dir))
 314         return analysis_dir_name
 315
 316     def get_submission_node(self, analysis_dir):
 317         """Convert a submission directory name to a submission node
 318         """
 319         submission_name = self.make_submission_name(analysis_dir)
 320         return self.submissionSetNS[submission_name]
 321
 322     def _get_library_attribute(self, libNode, attribute):
 323         if not isinstance(attribute, RDF.Node):
 324             attribute = libraryOntology[attribute]
 325
 326         targets = list(self.model.get_targets(libNode, attribute))
 327         if len(targets) > 0:
 328             return self._format_library_attribute(targets)
 329         else:
 330             return None
 331
 332         #targets = self._search_same_as(libNode, attribute)
 333         #if targets is not None:
 334         #    return self._format_library_attribute(targets)
 335
 336         # we don't know anything about this attribute
 337         self._add_library_details_to_model(libNode)
 338
 339         targets = list(self.model.get_targets(libNode, attribute))
 340         if len(targets) > 0:
 341             return self._format_library_attribute(targets)
 342
 343         return None
 344
 345     def _format_library_attribute(self, targets):
 346         if len(targets) == 0:
 347             return None
 348         elif len(targets) == 1:
 349             return fromTypedNode(targets[0])
 350         elif len(targets) > 1:
 351             return [fromTypedNode(t) for t in targets]
 352
 353     def _is_paired(self, libNode):
 354         """Determine if a library is paired end"""
 355         library_type = self._get_library_attribute(libNode, 'library_type')
 356         if library_type is None:
 357             errmsg = "%s doesn't have a library type"
 358             raise ModelException(errmsg % (str(libNode),))
 359
 360         single = ['CSHL (lacking last nt)',
 361                   'Single End (non-multiplexed)',
 362                   'Small RNA (non-multiplexed)',]
 363         paired = ['Barcoded Illumina',
 364                   'Multiplexing',
 365                   'NEBNext Multiplexed',
 366                   'NEBNext Small RNA',
 367                   'Nextera',
 368                   'Paired End (non-multiplexed)',
 369                   'Dual Index Illumina',]
 370         if library_type in single:
 371             return False
 372         elif library_type in paired:
 373             return True
 374         else:
 375             raise MetadataLookupException(
 376                 "Unrecognized library type %s for %s" % \
 377                 (library_type, str(libNode)))
 378
 379     def execute_query(self, template, context):
 380         """Execute the query, returning the results
 381         """
 382         formatted_query = template.render(context)
 383         LOGGER.debug(formatted_query)
 384         query = RDF.SPARQLQuery(str(formatted_query))
 385         rdfstream = query.execute(self.model)
 386         results = []
 387         for record in rdfstream:
 388             d = {}
 389             for key, value in record.items():
 390                 d[key] = fromTypedNode(value)
 391             results.append(d)
 392         return results
 393
 394
 395 def list_submissions(model):
 396     """Return generator of submissions in this model.
 397     """
 398     query_body = """
 399       PREFIX subns: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 400
 401       select distinct ?submission
 402       where { ?submission subns:has_submission ?library_dir }
 403     """
 404     query = RDF.SPARQLQuery(query_body)
 405     rdfstream = query.execute(model)
 406     for row in rdfstream:
 407         s = strip_namespace(submissionLog, row['submission'])
 408         if s[-1] in ['#', '/', '?']:
 409             s = s[:-1]
 410         yield s