Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
author Diane Trout <diane@caltech.edu>
Tue, 18 Sep 2012 18:36:16 +0000 (11:36 -0700)
committer Diane Trout <diane@caltech.edu>
Tue, 18 Sep 2012 18:36:16 +0000 (11:36 -0700)
13 files changed:
encode_submission/geo_gather.py
htsworkflow/pipelines/sequences.py
htsworkflow/pipelines/test/test_sequences.py
htsworkflow/submission/condorfastq.py
htsworkflow/submission/daf.py
htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/submission/test/test_condorfastq.py
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_samples.sparql
htsworkflow/templates/geo_submission.soft
htsworkflow/util/rdfhelp.py
htsworkflow/util/test/test_rdfhelp.py

diff --git a/encode_submission/geo_gather.py b/encode_submission/geo_gather.py
index 5d6bf1142f193c3c88a566270ed6579b4f58273c..c67edefc7d45d0e95e7a798dd015a31befbbbac2 100644 (file)
--- a/encode_submission/geo_gather.py
+++ b/encode_submission/geo_gather.py
@@ -84,6 +84,8 @@ def main(cmdline=None):
         extractor.create_scripts(results)
 
     if opts.scan_submission:
+        if opts.name is None:
+            parser.error("Please define a submission name")
         mapper.scan_submission_dirs(results)
 
     if opts.make_soft:
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py
index 5baf1b5022a948b1022925737bc79cd3e425aafd..23e7fe8cfa401a52a52e936e91e3182311ed851f 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -164,7 +164,7 @@ class SequenceFile(object):
         # a bit unreliable... assumes filesystem is encoded in utf-8
         path = os.path.abspath(self.path.encode('utf-8'))
         fileNode = RDF.Node(RDF.Uri('file://' + path))
-        add(model, fileNode, rdfNS['type'], libNS['raw_file'])
+        add(model, fileNode, rdfNS['type'], libNS['illumina_result'])
         add_lit(model, fileNode, libNS['flowcell_id'], self.flowcell)
         add_lit(model, fileNode, libNS['lane_number'], self.lane)
         if self.read is not None:
@@ -177,7 +177,7 @@ class SequenceFile(object):
         add_lit(model, fileNode, libNS['split_id'], self.split)
         add_lit(model, fileNode, libNS['cycle'], self.cycle)
         add_lit(model, fileNode, libNS['passed_filter'], self.pf)
-        add(model, fileNode, rdfNS['type'], libNS[self.filetype])
+        add(model, fileNode, libNS['file_type'], libNS[self.filetype])
 
         if base_url is not None:
             flowcell = RDF.Node(RDF.Uri("{base}/flowcell/{flowcell}/".format(
@@ -215,18 +215,15 @@ class SequenceFile(object):
 
         if not isinstance(seq_id, RDF.Node):
             seq_id = RDF.Node(RDF.Uri(seq_id))
-        seqTypesStmt = RDF.Statement(seq_id, rdfNS['type'], None)
-        seqTypes = model.find_statements(seqTypesStmt)
-        isSequenceFile = False
-        for s in seqTypes:
-            if s.object == libNS['raw_file']:
-                isSequenceFile = True
-            else:
-                seq_type = stripNamespace(libNS, s.object)
-
-        if not isSequenceFile:
+        result_statement = RDF.Statement(seq_id,
+                                         rdfNS['type'],
+                                         libNS['illumina_result'])
+        if not model.contains_statement(result_statement):
             raise KeyError(u"%s not found" % (unicode(seq_id),))
 
+        seq_type_node = model.get_target(seq_id, libNS['file_type'])
+        seq_type = stripNamespace(libNS, seq_type_node)
+
         path = urlparse(str(seq_id.uri)).path
         flowcellNode = get_one(seq_id, libNS['flowcell'])
         flowcell = get_one(seq_id, libNS['flowcell_id'])
@@ -421,7 +418,7 @@ def update_model_sequence_library(model, base_url):
     prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
     select ?filenode ?flowcell_id ?lane_id ?library_id ?flowcell ?library
     where {
-       ?filenode a libNS:raw_file ;
+       ?filenode a libNS:illumina_result ;
                  libNS:flowcell_id ?flowcell_id ;
                  libNS:lane_number ?lane_id .
        OPTIONAL { ?filenode libNS:flowcell ?flowcell . }
@@ -429,6 +426,7 @@ def update_model_sequence_library(model, base_url):
        OPTIONAL { ?filenode libNS:library_id ?library_id .}
     }
     """
+    LOGGER.debug("update_model_sequence_library query %s", file_body)
     file_query = RDF.SPARQLQuery(file_body)
     files = file_query.execute(model)
 
@@ -436,9 +434,13 @@ def update_model_sequence_library(model, base_url):
     flowcellNS = RDF.NS(urljoin(base_url, 'flowcell/'))
     for f in files:
         filenode = f['filenode']
+        LOGGER.debug("Updating file node %s", str(filenode))
         lane_id = fromTypedNode(f['lane_id'])
         if f['flowcell'] is None:
             flowcell = flowcellNS[str(f['flowcell_id'])+'/']
+            LOGGER.debug("Adding file (%s) to flowcell (%s) link",
+                         str(filenode),
+                         str(flowcell))
             model.add_statement(
                 RDF.Statement(filenode, libNS['flowcell'], flowcell))
         else:
@@ -452,6 +454,9 @@ def update_model_sequence_library(model, base_url):
                                                    flowcell,
                                                    lane_id)
                 library_id = toTypedNode(simplify_uri(library))
+                LOGGER.debug("Adding file (%s) to library (%s) link",
+                             str(filenode),
+                             str(library))
                 model.add_statement(
                     RDF.Statement(filenode, libNS['library_id'], library_id))
             if library is not None:
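
Net effect of the sequences.py changes above: a sequence file node used to carry its concrete format (qseq, split_fastq, ...) as a second rdf:type; it is now typed once as libNS:illumina_result, and the format hangs off the new libNS:file_type predicate instead. A minimal sketch of reading a node back under the new layout, using the same Redland calls as the load_from_model hunk (model and seq_id are assumptions here: an already-populated RDF.Model and a file node):

    import RDF

    # Only nodes typed illumina_result count as sequence files now.
    result_statement = RDF.Statement(seq_id, rdfNS['type'],
                                     libNS['illumina_result'])
    if model.contains_statement(result_statement):
        # The concrete format moved from rdf:type to libNS:file_type.
        seq_type_node = model.get_target(seq_id, libNS['file_type'])
        seq_type = stripNamespace(libNS, seq_type_node)
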
diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py
index d051c036ca2eddda6ffbbc1cc3f1c1f31bb5fcea..34ddeab593da5636386a7f0a7254a5e2af1b53e4 100644 (file)
--- a/htsworkflow/pipelines/test/test_sequences.py
+++ b/htsworkflow/pipelines/test/test_sequences.py
@@ -344,13 +344,19 @@ class SequenceFileTests(unittest.TestCase):
             seq.save_to_model(model)
 
         files = list(model.find_statements(
-            RDF.Statement(None, rdfNS['type'], libraryOntology['raw_file'])))
+            RDF.Statement(None,
+                          rdfNS['type'],
+                          libraryOntology['illumina_result'])))
         self.assertEqual(len(files), 5)
         files = list(model.find_statements(
-            RDF.Statement(None, rdfNS['type'], libraryOntology['qseq'])))
+            RDF.Statement(None,
+                          libraryOntology['file_type'],
+                          libraryOntology['qseq'])))
         self.assertEqual(len(files), 4)
         files = list(model.find_statements(
-            RDF.Statement(None, rdfNS['type'], libraryOntology['split_fastq'])))
+            RDF.Statement(None,
+                          libraryOntology['file_type'],
+                          libraryOntology['split_fastq'])))
         self.assertEqual(len(files), 1)
 
         files = list(model.find_statements(
diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py
index e48afd5869b48a9bbc69059aa5ebcad90608312f..9aab790ef6fb6262e998b7ef1b4c2689144d5e88 100644 (file)
--- a/htsworkflow/submission/condorfastq.py
+++ b/htsworkflow/submission/condorfastq.py
@@ -15,9 +15,8 @@ from htsworkflow.pipelines import srf2fastq
 from htsworkflow.pipelines import desplit_fastq
 from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
      fromTypedNode, \
-     libraryOntology, \
-     stripNamespace, \
-     rdfNS
+     stripNamespace
+from htsworkflow.util.rdfns import *
 from htsworkflow.util.conversion import parse_flowcell_id
 
 from django.conf import settings
@@ -61,7 +60,6 @@ class CondorFastqExtract(object):
         template_map = {'srf': 'srf.condor',
                         'qseq': 'qseq.condor',
                         'split_fastq': 'split_fastq.condor',
-                        'by_sample': 'lane_to_fastq.turtle',
                         }
 
         env = None
@@ -91,9 +89,8 @@ class CondorFastqExtract(object):
                             'qseq': self.condor_qseq_to_fastq,
                             'split_fastq': self.condor_desplit_fastq
                             }
-        by_sample = {}
         sequences = self.find_archive_sequence_files(result_map)
-        needed_targets = self.find_missing_targets(result_map, sequences)
+        needed_targets = self.update_fastq_targets(result_map, sequences)
 
         for target_pathname, available_sources in needed_targets.items():
             LOGGER.debug(' target : %s' % (target_pathname,))
@@ -110,13 +107,9 @@ class CondorFastqExtract(object):
                 if sources is not None:
                     condor_entries.setdefault(condor_type, []).append(
                         conversion(sources, target_pathname))
-                    for s in sources:
-                        by_sample.setdefault(s.lane_number,[]).append(
-                            target_pathname)
             else:
                 print " need file", target_pathname
 
-        condor_entries['by_sample'] = by_sample
         return condor_entries
 
     def find_archive_sequence_files(self,  result_map):
@@ -127,8 +120,7 @@ class CondorFastqExtract(object):
         flowcell_ids = self.find_relavant_flowcell_ids()
         self.import_sequences(flowcell_ids)
 
-
-        query = RDF.SPARQLQuery("""
+        query_text = """
         prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
         prefix xsd: <http://www.w3.org/2001/XMLSchema#>
@@ -145,19 +137,26 @@ class CondorFastqExtract(object):
                       libns:flowcell_id ?flowcell_id ;
                       libns:library ?library ;
                       libns:library_id ?library_id ;
-                      rdf:type ?filetype ;
-                      a libns:raw_file .
+                      libns:file_type ?filetype ;
+                      a libns:illumina_result .
             ?flowcell libns:read_length ?read_length ;
                       libns:flowcell_type ?flowcell_type .
             OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
-            FILTER(?filetype != libns:raw_file)
+            FILTER(?filetype != libns:sequencer_result)
         }
-        """)
+        """
+        LOGGER.debug("find_archive_sequence_files query: %s",
+                     query_text)
+        query = RDF.SPARQLQuery(query_text)
         results = []
         for r in query.execute(self.model):
             library_id = fromTypedNode(r['library_id'])
             if library_id in result_map:
-                results.append(SequenceResult(r))
+                seq = SequenceResult(r)
+                LOGGER.debug("Creating sequence result for library %s: %s",
+                             library_id,
+                             repr(seq))
+                results.append(seq)
         return results
 
     def import_libraries(self, result_map):
@@ -171,8 +170,11 @@ class CondorFastqExtract(object):
         """Import library data into our model if we don't have it already
         """
         q = RDF.Statement(library, rdfNS['type'], libraryOntology['library'])
+        loaded = False
         if not self.model.contains_statement(q):
+            loaded = True
             load_into_model(self.model, 'rdfa', library)
+        LOGGER.debug("Did we import %s: %s", library, loaded)
 
     def find_relavant_flowcell_ids(self):
         """Generate set of flowcell ids that had samples of interest on them
@@ -200,7 +202,6 @@ WHERE {
         return flowcell_ids
 
     def import_sequences(self, flowcell_ids):
-
         seq_dirs = []
         for f in flowcell_ids:
             seq_dirs.append(os.path.join(self.sequences_path, str(f)))
@@ -209,13 +210,11 @@ WHERE {
             seq.save_to_model(self.model, self.host)
         update_model_sequence_library(self.model, self.host)
 
-    def find_missing_targets(self, result_map, raw_files):
-        """
-        Check if the sequence file exists.
-        This requires computing what the sequence name is and checking
-        to see if it can be found in the sequence location.
+    def update_fastq_targets(self, result_map, raw_files):
+        """Return list of fastq files that need to be built.
 
-        Adds seq.paired flag to sequences listed in lib_db[*]['lanes']
+        Also update model with link between illumina result files
+        and our target fastq file.
         """
         fastq_paired_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s_r%(read)s.fastq'
         fastq_single_template = '%(lib_id)s_%(flowcell)s_c%(cycle)s_l%(lane)s.fastq'
@@ -244,9 +243,18 @@ WHERE {
             if self.force or not os.path.exists(target_pathname):
                 t = needed_targets.setdefault(target_pathname, {})
                 t.setdefault(seq.filetype, []).append(seq)
-
+            self.add_target_source_links(target_pathname, seq)
         return needed_targets
 
+    def add_target_source_links(self, target, seq):
+        """Add link between target pathname and the 'lane' that produced it
+        (note lane objects are now post demultiplexing.)
+        """
+        target_uri = 'file://' + target
+        target_node = RDF.Node(RDF.Uri(target_uri))
+        source_stmt = RDF.Statement(target_node, dcNS['source'], seq.filenode)
+        self.model.add_statement(source_stmt)
+
     def condor_srf_to_fastq(self, sources, target_pathname):
         if len(sources) > 1:
             raise ValueError("srf to fastq can only handle one file")
@@ -350,3 +358,9 @@ class SequenceResult(object):
             errmsg = u"Unsupported scheme {0} for {1}"
             raise ValueError(errmsg.format(url.scheme, unicode(url)))
     path = property(_get_path)
+
+    def __repr__(self):
+        return "SequenceResult({0},{1},{2})".format(
+            str(self.filenode),
+            str(self.library_id),
+            str(self.flowcell_id))
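
The condorfastq.py rework above renames find_missing_targets to update_fastq_targets and, via the new add_target_source_links, records provenance for every planned fastq: a dc:source triple from the target file node back to the illumina result that feeds it (dcNS is presumably supplied by the htsworkflow.util.rdfns wildcard import). A rough sketch of reading those links back, where extract is a hypothetical CondorFastqExtract whose model has been populated and the target path is invented:

    import RDF

    # Invented target name following the fastq_single_template pattern.
    target = RDF.Node(RDF.Uri('file:///tmp/11034_FC12150_c38_l3.fastq'))
    # get_targets() walks target --dc:source--> illumina result nodes.
    for source in extract.model.get_targets(target, dcNS['source']):
        print " built from", str(source)
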
diff --git a/htsworkflow/submission/daf.py b/htsworkflow/submission/daf.py
index a74d71a667d1cb788cbdaab24cdf8661738c9413..f04ac8fe5328e738a012738a5d256b9307e11504 100644 (file)
--- a/htsworkflow/submission/daf.py
+++ b/htsworkflow/submission/daf.py
@@ -361,7 +361,7 @@ class UCSCSubmission(object):
                           rdfNS['type'],
                           submissionOntology['submission']))
         self.model.add_statement(RDF.Statement(submissionNode,
-                                               submissionOntology['library'],
+                                               libraryOntology['library'],
                                                libNode))
 
         LOGGER.debug("Adding statements to {0}".format(str(submissionView)))
@@ -385,8 +385,8 @@ class UCSCSubmission(object):
     def create_file_attributes(self, filename, submissionView, submission_uri, submission_dir):
         # add file specific information
         LOGGER.debug("Updating file md5sum")
-        fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
         submission_pathname = os.path.join(submission_dir, filename)
+        fileNode = RDF.Node(RDF.Uri("file://" + submission_pathname))
         self.model.add_statement(
             RDF.Statement(submissionView,
                           dafTermOntology['has_file'],
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
index 6137875b1ad86048b0c08080f6fb947d090e9471..413d2c3ae5e99daa7025a6be3101edb50d96da85 100644 (file)
--- a/htsworkflow/submission/geo.py
+++ b/htsworkflow/submission/geo.py
@@ -31,9 +31,13 @@ class GEOSubmission(Submission):
         for lib_id, result_dir in result_map.items():
             an_analysis = self.get_submission_node(result_dir)
             metadata = self.get_sample_metadata(an_analysis)
-            if len(metadata) > 1:
+            if len(metadata) == 0:
+                errmsg = 'No metadata found for {0}'
+                LOGGER.error(errmsg.format(str(an_analysis),))
+                continue
+            elif len(metadata) > 1:
                 errmsg = 'Confused there are more than one samples for %s'
-                LOGGER.debug(errmsg % (str(an_analysis,)))
+                LOGGER.debug(errmsg % (str(an_analysis),))
             metadata = metadata[0]
             metadata['raw'] = self.get_raw_files(an_analysis)
             metadata['supplimental'] = self.get_sample_files(an_analysis)
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py
index 6dd630aeda90fe5ad08a96da8be610f628642b04..18fa3b2bc9f487b63915ca02a3c060adbfeedf3b 100644 (file)
--- a/htsworkflow/submission/submission.py
+++ b/htsworkflow/submission/submission.py
@@ -83,7 +83,7 @@ class Submission(object):
                                                     rdfNS['type'])
         if file_classification is None:
             errmsg = 'Could not find class for {0}'
-            logger.warning(errmsg.format(str(file_type)))
+            LOGGER.warning(errmsg.format(str(file_type)))
             return
 
         self.model.add_statement(
diff --git a/htsworkflow/submission/test/test_condorfastq.py b/htsworkflow/submission/test/test_condorfastq.py
index 24d6e4982018dd23ac8e8746d7ade1856ae3d4e7..94df7b60b2f1e7dbca6d5e2fc2ecbacded55f30a 100644 (file)
--- a/htsworkflow/submission/test/test_condorfastq.py
+++ b/htsworkflow/submission/test/test_condorfastq.py
@@ -441,7 +441,7 @@ class TestCondorFastq(unittest.TestCase):
     def test_find_needed_targets(self):
         lib_db = self.extract.find_archive_sequence_files(self.result_map)
 
-        needed_targets = self.extract.find_missing_targets(self.result_map,
+        needed_targets = self.extract.update_fastq_targets(self.result_map,
                                                            lib_db)
         self.assertEqual(len(needed_targets), 9)
         srf_30221 = needed_targets[
diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql
index e7fcbc12d0f8337e099a8bbe85409ad2e8c18432..de9097ba9962849dde661bbd9ebd38521573f781 100644 (file)
--- a/htsworkflow/templates/geo_fastqs.sparql
+++ b/htsworkflow/templates/geo_fastqs.sparql
@@ -11,7 +11,7 @@ WHERE {
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        libraryOntology:has_lane ?lane ;
+        libraryOntology:library ?library ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql
index 850d99a944ee51009675a381aa5c8a023d521a9d..b4d4b0bc8102db05b84f16472b4508a82db48afa 100644 (file)
--- a/htsworkflow/templates/geo_samples.sparql
+++ b/htsworkflow/templates/geo_samples.sparql
@@ -7,18 +7,22 @@ PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
 
 select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
 WHERE {
-  <{{submission}}> a submissionOntology:submission .
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:library ?library ;
+                   submissionOntology:name ?name .
 
   OPTIONAL { <{{submission}}> ucscDaf:control ?control }
   OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
   OPTIONAL { ?library libraryOntology:antibody ?antibody }
   OPTIONAL { ?library libraryOntology:cell_line ?cell .
-             ?cell_line cells:cell ?cell ;
-                        cells:documents ?growthProtocol . }
+             OPTIONAL { ?cell_line cells:cell ?cell ;
+                                   cells:documents ?growthProtocol . }}
   OPTIONAL { ?library ucscDaf:sex ?sex }
   OPTIONAL { ?library libraryOntology:library_id ?library_id }
   OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name }
+  OPTIONAL { ?library libraryOntology:species ?species_name .
+             ?species libraryOntology:species ?species_name ;
+                      libraryOntology:taxon_id ?taxon_id . }
   OPTIONAL { ?library libraryOntology:condition_term ?treatment }
   OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
   OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
@@ -32,8 +36,6 @@ WHERE {
   OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
   OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
 
-  <{{submission}}> submissionOntology:library ?library ;
-                   submissionOntology:name ?name .
   ?species libraryOntology:species ?species_name ;
            libraryOntology:taxon_id ?taxon_id .
 
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
index 969ff53836bd37b3d4f82747cf3d94c806852ca9..8ae3ed9ef78e0d737f1d7cada296a740e2ff553d 100644 (file)
--- a/htsworkflow/templates/geo_submission.soft
+++ b/htsworkflow/templates/geo_submission.soft
@@ -1,5 +1,6 @@
-{% for name, value in series %}{{name}}={{value}}
-{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
+{% for name, value in series %}
+{{name}}={{value}}{% endfor %}{% for row in samples %}
+^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
 !Sample_series_id={{ series_id }}
 !Sample_extract_protocol={{ row.extractProtocol|safe }}
 !Sample_data_processing={{ row.dataProtocol|safe }}
 !Sample_molecule_ch1={{ row.extractMolecule }}
-!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1=replicate: {{ row.replicate }}
-{% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1=cell: {{ row.cell }}
-{% endspaceless %}{% endif %}
-{% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1=readType: {{ row.readType }}
-{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1=cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for run in row.run %}
-!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
-!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.image_software }}-{{ run.image_version }}{% endfor %}{% for raw in row.raw %}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}{% if row.replicate %}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}{% endif %}{% if row.cell %}
+!Sample_characteristics_ch1=cell: {{ row.cell }}{% endif %}{% if row.readType %}
+!Sample_characteristics_ch1=readType: {{ row.readType }}{% endif %}{% if row.antibody %}
+!Sample_characteristics_ch1=antibody: {{ row.antibody }}{% endif %}{% for run in row.run %}{% if run.image_software %}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}{% endif %}{% if run.basecall_software %}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.basecall_software }}-{{ run.basecall_version }}{% endif %}{% endfor %}{% for raw in row.raw %}
 !Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
 !Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
 !Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
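
The geo_submission.soft template rewrite above pulls each {% if %} onto the same line as the preceding value, so optional fields that are absent no longer leave stray blank lines in the SOFT output, and the base-calling line now reports basecall_software/basecall_version instead of repeating the image pipeline fields. A quick standalone way to eyeball the reflowed output (a sketch only; every value below is invented, and the real context is built by GEOSubmission in geo.py):

    from django.conf import settings
    from django.template import Context, Template

    settings.configure()  # bare settings so Template works outside the app
    source = open('htsworkflow/templates/geo_submission.soft').read()
    print Template(source).render(Context({
        'series': [('!Series_title', 'example series')],  # (name, value) pairs
        'series_id': 'GSE00000',                          # invented accession
        'samples': [],                                    # header lines only
    }))
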
diff --git a/htsworkflow/util/rdfhelp.py b/htsworkflow/util/rdfhelp.py
index cb445b6b0cb4f5f1d9abab3d13ae7edad483b931..89d9e5b4d16290e940878d7cc6844d8b64d80c79 100644 (file)
--- a/htsworkflow/util/rdfhelp.py
+++ b/htsworkflow/util/rdfhelp.py
@@ -258,13 +258,14 @@ def load_into_model(model, parser_name, path, ns=None):
     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
         url_parts[0] = 'file'
         url_parts[2] = os.path.abspath(url_parts[2])
-        if parser_name is None or parser_name == 'guess':
-            parser_name = guess_parser_by_extension(path)
+    if parser_name is None or parser_name == 'guess':
+        parser_name = guess_parser_by_extension(path)
     url = urlunparse(url_parts)
     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 
     rdf_parser = RDF.Parser(name=parser_name)
 
+    statements = []
     retries = 3
     while retries > 0:
         try:
@@ -273,7 +274,7 @@ def load_into_model(model, parser_name, path, ns=None):
             retries = 0
         except RDF.RedlandError, e:
             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
-            logger.error(errmsg.format(str(e), tries))
+            logger.error(errmsg.format(str(e), retries))
 
     for s in statements:
         conditionally_add_statement(model, s, ns)
@@ -384,16 +385,16 @@ def guess_parser(content_type, pathname):
         return 'turtle'
     elif content_type in ('text/html',):
         return 'rdfa'
-    elif content_type is None:
+    elif content_type is None or content_type in ('text/plain',):
         return guess_parser_by_extension(pathname)
 
 def guess_parser_by_extension(pathname):
     _, ext = os.path.splitext(pathname)
     if ext in ('.xml', '.rdf'):
         return 'rdfxml'
-    elif ext in ('.html'):
+    elif ext in ('.html',):
         return 'rdfa'
-    elif ext in ('.turtle'):
+    elif ext in ('.turtle',):
         return 'turtle'
     return 'guess'
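
With the rdfhelp.py changes above, guess_parser treats text/plain like a missing content type and falls back to the file extension, and guess_parser_by_extension compares against real tuples (('.html') is just a string, so the old in tests did substring matching). Expected dispatch after the change, mirroring the tests that follow (a sketch, not part of the commit):

    guess_parser('application/x-turtle', 'http://a.org/b/c')  # 'turtle'
    guess_parser('text/html', 'http://a.org/b/c')             # 'rdfa'
    guess_parser('text/plain', '/a/b/c.turtle')               # 'turtle', via extension
    guess_parser(None, '/a/b/c.rdf')                          # 'rdfxml'
    guess_parser('text/plain', '/a/b/c')                      # 'guess', no usable extension
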
 
diff --git a/htsworkflow/util/test/test_rdfhelp.py b/htsworkflow/util/test/test_rdfhelp.py
index 9a31ca90e382370b0b5a838cf8fd61fd8b96926a..948bcf407cf976eea74c44b0ff095b475f02770e 100644 (file)
--- a/htsworkflow/util/test/test_rdfhelp.py
+++ b/htsworkflow/util/test/test_rdfhelp.py
@@ -207,7 +207,8 @@ _:a owl:imports "{loc}extra.turtle" .
                 ('/a/b/c.rdf', 'rdfxml'),
                 ('/a/b/c.xml', 'rdfxml'),
                 ('/a/b/c.html', 'rdfa'),
-                ('/a/b/c.turtle', 'turtle')]
+                ('/a/b/c.turtle', 'turtle'),
+                ('http://foo.bar/bleem.turtle', 'turtle')]
             for path, parser in DATA:
                 self.assertEqual(guess_parser_by_extension(path), parser)
                 self.assertEqual(guess_parser(None, path), parser)
@@ -215,7 +216,10 @@ _:a owl:imports "{loc}extra.turtle" .
             DATA = [
                 ('application/rdf+xml', 'http://a.org/b/c', 'rdfxml'),
                 ('application/x-turtle', 'http://a.org/b/c', 'turtle'),
-                ('text/html', 'http://a.org/b/c', 'rdfa')
+                ('text/html', 'http://a.org/b/c', 'rdfa'),
+                ('text/html', 'http://a.org/b/c.html', 'rdfa'),
+                ('text/plain', 'http://a.org/b/c.turtle', 'turtle'),
+                ('text/plain', 'http://a.org/b/c', 'guess')
             ]
             for contenttype, url, parser in DATA:
                 self.assertEqual(guess_parser(contenttype, url), parser)