Initial port to python3

[htsworkflow.git] / htsworkflow / pipelines / sequences.py
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index 479ce3ddcd330fa5fe4285215ed08edb81e1c710..0a3c2fd61e856f04c9872600bc8f4025cc4ba6f4 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -7,7 +7,7 @@ import os
  import types
  import re
  import sys
-from urlparse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse
  
  import RDF
  from htsworkflow.util.rdfhelp import libraryOntology as libNS
@@ -84,7 +84,7 @@ class SequenceFile(object):
          return (self.flowcell, self.lane, self.read, self.project, self.split)
  
      def __unicode__(self):
-        return unicode(self.path)
+        return str(self.path)
  
      def __eq__(self, other):
          """
@@ -109,7 +109,7 @@ class SequenceFile(object):
          return not self == other
  
      def __repr__(self):
-        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
+        return "<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
  
      def make_target_name(self, root):
          """
@@ -164,7 +164,7 @@ class SequenceFile(object):
          # a bit unreliable... assumes filesystem is encoded in utf-8
          path = os.path.abspath(self.path.encode('utf-8'))
          fileNode = RDF.Node(RDF.Uri('file://' + path))
-        add(model, fileNode, rdfNS['type'], libNS['illumina_result'])
+        add(model, fileNode, rdfNS['type'], libNS['IlluminaResult'])
          add_lit(model, fileNode, libNS['flowcell_id'], self.flowcell)
          add_lit(model, fileNode, libNS['lane_number'], self.lane)
          if self.read is not None:
@@ -206,8 +206,8 @@ class SequenceFile(object):
          def get_one(s, p):
              values = get(s, p)
              if len(values) > 1:
-                errmsg = u"To many values for %s %s"
-                raise ValueError(errmsg % (unicode(s), unicode(p)))
+                errmsg = "To many values for %s %s"
+                raise ValueError(errmsg % (str(s), str(p)))
              elif len(values) == 1:
                  return values[0]
              else:
@@ -217,9 +217,9 @@ class SequenceFile(object):
              seq_id = RDF.Node(RDF.Uri(seq_id))
          result_statement = RDF.Statement(seq_id,
                                           rdfNS['type'],
-                                         libNS['illumina_result'])
+                                         libNS['IlluminaResult'])
          if not model.contains_statement(result_statement):
-            raise KeyError(u"%s not found" % (unicode(seq_id),))
+            raise KeyError("%s not found" % (str(seq_id),))
  
          seq_type_node = model.get_target(seq_id, libNS['file_type'])
          seq_type = stripNamespace(libNS, seq_type_node)
@@ -275,7 +275,7 @@ def parse_srf(path, filename):
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      flowcell = records[4]
-    lane = int(records[5][0])
+    lane = records[5][0]
      fullpath = os.path.join(path, filename)
  
      if flowcell_dir != flowcell:
@@ -290,7 +290,7 @@ def parse_qseq(path, filename):
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
      flowcell = records[4]
-    lane = int(records[5][1])
+    lane = records[5][1]
      read = int(records[6][1])
  
      if flowcell_dir != flowcell:
@@ -309,7 +309,7 @@ def parse_fastq(path, filename):
      if project is not None:
          # demultiplexed sample!
          flowcell = flowcell_dir
-        lane = int(records[2][-1])
+        lane = records[2][-1]
          read = int(records[3][-1])
          pf = True # as I understand it hiseq runs toss the ones that fail filter
          index = records[1]
@@ -318,7 +318,7 @@ def parse_fastq(path, filename):
          sequence_type = 'split_fastq'
      else:
          flowcell = records[4]
-        lane = int(records[5][1])
+        lane = records[5][1]
          read = int(records[6][1])
          pf = parse_fastq_pf_flag(records)
          index = None
@@ -362,7 +362,7 @@ def parse_eland(path, filename, eland_match=None):
      fullpath = os.path.join(path, filename)
      flowcell, start, stop, project = get_flowcell_cycle(path)
      if eland_match.group('lane'):
-        lane = int(eland_match.group('lane'))
+        lane = eland_match.group('lane')
      else:
          lane = None
      if eland_match.group('read'):
@@ -376,7 +376,7 @@ def scan_for_sequences(dirs):
      Scan through a list of directories for sequence like files
      """
      sequences = []
-    if type(dirs) in types.StringTypes:
+    if isinstance(dirs, str):
          raise ValueError("You probably want a list or set, not a string")
  
      for d in dirs:
@@ -415,15 +415,15 @@ def update_model_sequence_library(model, base_url):
      """Find sequence objects and add library information if its missing
      """
      file_body = """
-    prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+    prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
      select ?filenode ?flowcell_id ?lane_id ?library_id ?flowcell ?library
      where {
-       ?filenode a libNS:illumina_result ;
-                 libNS:flowcell_id ?flowcell_id ;
-                 libNS:lane_number ?lane_id .
-       OPTIONAL { ?filenode libNS:flowcell ?flowcell . }
-       OPTIONAL { ?filenode libNS:library ?library .}
-       OPTIONAL { ?filenode libNS:library_id ?library_id .}
+       ?filenode a libns:IlluminaResult ;
+                 libns:flowcell_id ?flowcell_id ;
+                 libns:lane_number ?lane_id .
+       OPTIONAL { ?filenode libns:flowcell ?flowcell . }
+       OPTIONAL { ?filenode libns:library ?library .}
+       OPTIONAL { ?filenode libns:library_id ?library_id .}
      }
      """
      LOGGER.debug("update_model_sequence_library query %s", file_body)
@@ -453,6 +453,10 @@ def update_model_sequence_library(model, base_url):
                  library = guess_library_from_model(model, base_url,
                                                     flowcell,
                                                     lane_id)
+                if library is None:
+                    LOGGER.error("Unable to decypher: %s %s",
+                                 str(flowcell), str(lane_id))
+                    continue
                  library_id = toTypedNode(simplify_uri(library))
                  LOGGER.debug("Adding file (%s) to library (%s) link",
                               str(filenode),
@@ -470,19 +474,21 @@ def guess_library_from_model(model, base_url, flowcell, lane_id):
      flowcellNode = RDF.Node(flowcell)
      flowcell = str(flowcell.uri)
      lane_body = """
-    prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+    prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
      prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      prefix xsd: <http://www.w3.org/2001/XMLSchema#>
  
      select ?library ?lane
      where {{
-      <{flowcell}> libNS:has_lane ?lane ;
-                   a libNS:IlluminaFlowcell .
-      ?lane libNS:lane_number {lane_id} ;
-            libNS:library ?library .
+      <{flowcell}> libns:has_lane ?lane ;
+                   a libns:IlluminaFlowcell .
+      ?lane libns:lane_number ?lane_id ;
+            libns:library ?library .
+      FILTER(str(?lane_id) = "{lane_id}")
      }}
      """
      lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id)
+    LOGGER.debug("guess_library_from_model: %s", lane_body)
      lanes = []
      tries = 3
      while len(lanes) == 0 and tries > 0:
@@ -503,5 +509,3 @@ def guess_library_from_model(model, base_url, flowcell, lane_id):
          else:
              # try grabbing data
              model.load(flowcellNode.uri, name="rdfa")
-
-