"""
import logging
import os
+from pprint import pformat
import re
import string
-from StringIO import StringIO
+from io import StringIO
import types
-import urlparse
+import urllib.parse
import RDF
from htsworkflow.util.rdfhelp import \
blankOrUri, \
dafTermOntology, \
+ dump_model, \
get_model, \
libraryOntology, \
owlNS, \
fromTypedNode
from htsworkflow.util.hashfile import make_md5sum
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
+
+DAF_VARIABLE_NAMES = ("variables", "extraVariables")
+VARIABLES_TERM_NAME = 'variables'
+DAF_PRE_VARIABLES = ['files', 'view']
+DAF_POST_VARIABLES = ['labExpId', 'md5sum']
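+
+# Column ordering for generated ddf files (see get_daf_variables below):
+# DAF_PRE_VARIABLES come first, then any variables declared in the DAF
+# itself, then DAF_POST_VARIABLES.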
class ModelException(RuntimeError):
view_name = None
view_attributes = {}
state = DAF_HEADER
- elif state == DAF_HEADER and name == 'variables':
+ elif state == DAF_HEADER and name in DAF_VARIABLE_NAMES:
attributes[name] = [x.strip() for x in value.split(',')]
elif state == DAF_HEADER and name == 'view':
view_name = value
if view_name is not None:
attributes['views'][view_name] = view_attributes
+ LOGGER.debug("DAF Attributes" + pformat(attributes))
return attributes
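+
+# Sketch of the parsed result: for the example DAF at the bottom of this
+# module the returned dictionary looks roughly like
+#   {'grant': 'Hardison',
+#    'variables': ['cell', 'antibody', 'sex', ...],
+#    'views': {'FastqRd1': {...}, 'Signal': {...}}}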
returns length of string if it can't find anything
"""
- for i in xrange(start, len(line)):
+ for i in range(start, len(line)):
if line[i] not in string.whitespace:
return i
returns length of string if nothing matches
"""
- for i in xrange(start, len(line)):
+ for i in range(start, len(line)):
if line[i] in string.whitespace:
return i
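+
+# Together these two scanners form a minimal tokenizer: find where the
+# next token starts (first non-whitespace character), then where it ends
+# (next whitespace character), so e.g. 'view Signal' yields the slices
+# 'view' and 'Signal'.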
The statements are attached to the provided subject node
"""
+ variables_term = dafTermOntology[VARIABLES_TERM_NAME]
statements = []
for daf_key in attributes:
predicate = dafTermOntology[daf_key]
statements.extend(_views_to_statements(subject,
dafTermOntology,
attributes[daf_key]))
- elif daf_key == 'variables':
- #predicate = ddfNS['variables']
- for var in attributes.get('variables', []):
+ elif daf_key in DAF_VARIABLE_NAMES:
+ for var in attributes.get(daf_key, []):
obj = toTypedNode(var)
- statements.append(RDF.Statement(subject, predicate, obj))
+ statements.append(RDF.Statement(subject, variables_term, obj))
else:
value = attributes[daf_key]
obj = toTypedNode(value)
def get_view_namespace(submission_uri):
submission_uri = submission_uri_to_string(submission_uri)
- view_uri = urlparse.urljoin(submission_uri, 'view/')
+ view_uri = urllib.parse.urljoin(submission_uri, 'view/')
viewNS = RDF.NS(view_uri)
return viewNS
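+
+# e.g., assuming submission_uri_to_string returns a URI ending in '/',
+# get_view_namespace('http://example.org/submission/test_rep/') yields an
+# RDF.NS rooted at '.../submission/test_rep/view/', so viewNS['Signal']
+# names the Signal view node.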
-class DAFMapper(object):
- """Convert filenames to views in the UCSC Daf
+class UCSCSubmission(object):
+ """Build a submission by examining the DAF for what we need to submit
"""
def __init__(self, name, daf_file=None, model=None):
"""Construct a RDF backed model of a UCSC DAF
otherwise specifies model to use
"""
if daf_file is None and model is None:
- logger.error("We need a DAF or Model containing a DAF to work")
+ LOGGER.error("We need a DAF or Model containing a DAF to work")
self.name = name
self.submissionSet = get_submission_uri(self.name)
-        if hasattr(daf_file, 'next'):
+        if hasattr(daf_file, 'read'):
            # it's some kind of stream
- fromstream_into_model(self.model, self.submissionSet, daf_file)
+ self.daf = daf_file.read()
else:
# file
- parse_into_model(self.model, self.submissionSet, daf_file)
+            with open(daf_file, 'r') as stream:
+                self.daf = stream.read()
+
+ fromstring_into_model(self.model, self.submissionSet, self.daf)
self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
self.__view_map = None
+ def _get_daf_name(self):
+ return self.name + '.daf'
+    daf_name = property(_get_daf_name, doc="construct name for DAF file")
+
def add_pattern(self, view_name, filename_pattern):
"""Map a filename regular expression to a view name
"""
dafTermOntology['filename_re'],
obj))
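+
+    # Usage sketch: add_pattern('Signal', r'.*\.bigWig$') records a
+    # dafTermOntology['filename_re'] statement for the Signal view;
+    # find_view() later compiles these patterns and matches them against
+    # result filenames.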
+ def scan_submission_dirs(self, result_map):
+ """Examine files in our result directory
+ """
+        for lib_id, result_dir in result_map.items():
+ LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
+ try:
+ self.import_submission_dir(result_dir, lib_id)
+ except MetadataLookupException as e:
+ LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
+
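+    # Usage sketch (hypothetical ids/paths):
+    #   mapper.scan_submission_dirs({'11011': 'results/11011'})
+    # Each result directory is imported in turn; a MetadataLookupException
+    # for one library is logged and skipped rather than aborting the scan.
+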
def import_submission_dir(self, submission_dir, library_id):
"""Import a submission directories and update our model as needed
"""
"""
path, filename = os.path.split(pathname)
- logger.debug("Searching for view")
+ LOGGER.debug("Searching for view")
view = self.find_view(filename)
if view is None:
- logger.warn("Unrecognized file: {0}".format(pathname))
+ LOGGER.warn("Unrecognized file: {0}".format(pathname))
return None
if str(view) == str(libraryOntology['ignore']):
return None
dafTermOntology['name']))
if view_name is None:
errmsg = 'Could not find view name for {0}'
- logging.warning(errmsg.format(str(view)))
+ LOGGER.warning(errmsg.format(str(view)))
return
view_name = str(view_name)
RDF.Statement(self.submissionSet,
dafTermOntology['has_submission'],
submissionNode))
- logger.debug("Adding statements to {0}".format(str(submissionNode)))
+ LOGGER.debug("Adding statements to {0}".format(str(submissionNode)))
self.model.add_statement(RDF.Statement(submissionNode,
submissionOntology['has_view'],
submissionView))
rdfNS['type'],
submissionOntology['submission']))
self.model.add_statement(RDF.Statement(submissionNode,
- submissionOntology['library'],
+ libraryOntology['library'],
libNode))
- logger.debug("Adding statements to {0}".format(str(submissionView)))
+ LOGGER.debug("Adding statements to {0}".format(str(submissionView)))
# add track specific information
self.model.add_statement(
RDF.Statement(submissionView, dafTermOntology['view'], view))
dafTermOntology['submission'],
submissionNode))
- # extra information
- terms = [dafTermOntology['type'],
- dafTermOntology['filename_re'],
- ]
- terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
-
# add file specific information
self.create_file_attributes(filename, submissionView, submission_uri, submission_dir)
- logger.debug("Done.")
+ LOGGER.debug("Done.")
def create_file_attributes(self, filename, submissionView, submission_uri, submission_dir):
# add file specific information
- logger.debug("Updating file md5sum")
- fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
+ LOGGER.debug("Updating file md5sum")
submission_pathname = os.path.join(submission_dir, filename)
+ fileNode = RDF.Node(RDF.Uri("file://" + submission_pathname))
self.model.add_statement(
RDF.Statement(submissionView,
dafTermOntology['has_file'],
md5 = make_md5sum(submission_pathname)
if md5 is None:
errmsg = "Unable to produce md5sum for {0}"
- logging.warning(errmsg.format(submission_pathname))
+ LOGGER.warning(errmsg.format(submission_pathname))
else:
self.model.add_statement(
RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
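+
+        # Net effect (sketch): the view now points at the file and the
+        # file carries its checksum:
+        #   submissionView --has_file--> fileNode --md5sum--> "<hex digest>"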
def get_daf_variables(self):
"""Returns simple variables names that to include in the ddf
"""
- variableTerm = dafTermOntology['variables']
- results = ['view']
- if self.need_replicate():
+ variables_term = dafTermOntology[VARIABLES_TERM_NAME]
+        results = DAF_PRE_VARIABLES[:]
+ if self.need_replicate() and 'replicate' not in results:
results.append('replicate')
- for obj in self.model.get_targets(self.submissionSet, variableTerm):
+ for obj in self.model.get_targets(self.submissionSet, variables_term):
value = str(fromTypedNode(obj))
- results.append(value)
- results.append('labVersion')
+ if value not in results:
+ results.append(value)
+ results.extend([v for v in DAF_POST_VARIABLES if v not in results])
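+        # e.g. a DAF declaring "variables cell,antibody" for a replicated
+        # submission yields:
+        #   ['files', 'view', 'replicate', 'cell', 'antibody',
+        #    'labExpId', 'md5sum']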
return results
def make_submission_name(self, submission_dir):
self.__view_map = self._get_filename_view_map()
results = []
- for pattern, view in self.__view_map.items():
+        for pattern, view in self.__view_map.items():
if re.match(pattern, filename):
results.append(view)
else:
msg = "Found wrong number of view names for {0} len = {1}"
msg = msg.format(str(view), len(names))
- logger.error(msg)
+ LOGGER.error(msg)
raise RuntimeError(msg)
def _get_filename_view_map(self):
for s in self.model.find_statements(filename_query):
view_name = s.subject
literal_re = s.object.literal_value['string']
- logger.debug("Found: %s" % (literal_re,))
+ LOGGER.debug("Found: %s" % (literal_re,))
try:
filename_re = re.compile(literal_re)
- except re.error, e:
- logger.error("Unable to compile: %s" % (literal_re,))
+ except re.error as e:
+ LOGGER.error("Unable to compile: %s" % (literal_re,))
patterns[literal_re] = view_name
return patterns
errmsg = "%s doesn't have a library type"
raise ModelException(errmsg % (str(libNode),))
- #single = (1,3,6)
- single = ['Single End', 'Small RNA', 'CSHL (lacking last nt)']
- paired = ['Paired End', 'Multiplexing', 'Barcoded']
+ single = ['CSHL (lacking last nt)',
+ 'Single End (non-multiplexed)',
+ 'Small RNA (non-multiplexed)',]
+ paired = ['Barcoded Illumina',
+ 'Multiplexing',
+ 'Nextera',
+ 'Paired End (non-multiplexed)',]
if library_type in single:
return False
elif library_type in paired:
return True
return False
+
+
+ def link_daf(self, result_map):
+ if self.daf is None or len(self.daf) == 0:
+ raise RuntimeError(
+ "DAF data does not exist, how can I link to it?")
+
+ base_daf = self.daf_name
+
+        for result_dir in result_map.values():
+ if not os.path.exists(result_dir):
+ raise RuntimeError(
+ "Couldn't find target directory %s" %(result_dir,))
+ submission_daf = os.path.join(result_dir, base_daf)
+ if os.path.exists(submission_daf):
+                with open(submission_daf, 'r') as stream:
+                    previous_daf = stream.read()
+                if self.daf != previous_daf:
+                    LOGGER.info("Old daf is different, overwriting it.")
+            with open(submission_daf, 'w') as stream:
+                stream.write(self.daf)
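+
+    # Usage sketch (hypothetical paths): link_daf({'11011': 'results/11011'})
+    # writes or refreshes results/11011/<name>.daf so every submission
+    # directory carries the DAF text it was built from.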
+
+
+if __name__ == "__main__":
+ example_daf = """# Lab and general info
+grant Hardison
+lab Caltech-m
+dataType ChipSeq
+variables cell, antibody,sex,age,strain,control
+compositeSuffix CaltechHistone
+assembly mm9
+dafVersion 2.0
+validationSettings validateFiles.bam:mismatches=2,bamPercent=99.9;validateFiles.fastq:quick=1000
+
+# Track/view definition
+view FastqRd1
+longLabelPrefix Caltech Fastq Read 1
+type fastq
+hasReplicates yes
+required no
+
+view Signal
+longLabelPrefix Caltech Histone Signal
+type bigWig
+hasReplicates yes
+required no
+"""
+ model = get_model()
+ example_daf_stream = StringIO(example_daf)
+ name = "test_rep"
+    mapper = UCSCSubmission(name, daf_file=example_daf_stream, model=model)
+ dump_model(model)
+
+