Generate manifest files for ENCODE3

author Diane Trout <diane@caltech.edu>

Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)

committer Diane Trout <diane@caltech.edu>

Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
author Diane Trout <diane@caltech.edu>
Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
committer Diane Trout <diane@caltech.edu>
Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
diff --git a/encode_submission/trackhub.py b/encode_submission/trackhub.py

index fb0cea8771dcbc25b11dc95654e1e12ed62d2eb9..197d7f34092ef87e9b2a617b0a8474d3896ae3f9 100644 (file)
--- a/encode_submission/trackhub.py
+++ b/encode_submission/trackhub.py
@@ -96,8 +96,11 @@ def main(cmdline=None):
          mapper.scan_submission_dirs(results)
  
      if opts.make_hub:
-        make_hub(results)
+        make_hub(mapper, results, opts.make_hub)
  
+    if opts.make_manifest:
+        make_manifest(mapper, results, opts.make_manifest)
+        
      if opts.sparql:
          sparql_query(model, opts.sparql)
  
@@ -106,17 +109,23 @@ def main(cmdline=None):
          print writer.serialize_model_to_string(model)
  
  
-def make_hub(results):
+def make_hub(mapper, results, filename=None):
      trackdb = mapper.make_hub(results)
-    manifest = mapper.make_manifest(results)
  
-    trackstream = sys.stdout
-    #with open('trackDb.txt', 'w') as trackstream:
-    trackstream.write(trackdb)
+    if filename is None or filename == '-':
+        sys.stdout.write(trackdb)
+    else:
+        with open(filename, 'w') as trackstream:  # caller-supplied path, not a fixed 'trackDb.txt'
+            trackstream.write(trackdb)
  
-    #with open('manifest.txt', 'w') as mainifeststream:
-    manifeststream = sys.stdout
-    mainifeststream.write(mainifest)
+def make_manifest(mapper, results, filename=None):
+    """Write mapper.make_manifest(results) to filename; None or '-' means stdout."""
+    text = mapper.make_manifest(results)
+    if filename in (None, '-'):
+        sys.stdout.write(text)
+        return
+    with open(filename, 'w') as manifest_stream:
+        manifest_stream.write(text)
          
  def make_parser():
      parser = OptionParser()
@@ -145,8 +154,12 @@ def make_parser():
                          help="generate scripts for making fastq files")
      commands.add_option('--scan-submission', default=False, action="store_true",
                        help="Import metadata for submission into our model")
-    commands.add_option('--make-hub', help='make the hub file', default=False,
-                      action="store_true")
+    commands.add_option('--make-hub', default=None, 
+                        help='name the hub file or - for stdout to create it')
+    commands.add_option('--make-manifest', 
+                        help='name the manifest file name or - for stdout to create it', 
+                        default=None)
+
  
      parser.add_option_group(commands)
  
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py

index 77a1b68e4db763ab83eb50135f9613e8264ce973..3320f1caf3174e6d5636c4dd20841188771050d9 100644 (file)
--- a/htsworkflow/submission/submission.py
+++ b/htsworkflow/submission/submission.py
@@ -122,6 +122,11 @@ class Submission(object):
              RDF.Statement(fileNode,
                            rdfNS['type'],
                            file_type))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          libraryOntology['library'],
+                          libNode))
+                          
          LOGGER.debug("Done.")
  
      def make_file_node(self, pathname, submissionNode):
@@ -129,7 +134,8 @@ class Submission(object):
          """
          # add file specific information
          path, filename = os.path.split(pathname)
-        fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
+        pathname = os.path.abspath(pathname)
+        fileNode = RDF.Node(RDF.Uri('file://'+ pathname))
          self.model.add_statement(
              RDF.Statement(submissionNode,
                            dafTermOntology['has_file'],
@@ -138,6 +144,10 @@ class Submission(object):
              RDF.Statement(fileNode,
                            dafTermOntology['filename'],
                            filename))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          dafTermOntology['relative_path'],
+                          os.path.relpath(pathname)))
          return fileNode
  
      def add_md5s(self, filename, fileNode, analysis_dir):
@@ -176,7 +186,12 @@ class Submission(object):
                                libraryOntology['has_mappings'],
                                dafTermOntology['has_file']))
          parser = RDF.Parser(name='rdfa')
-        new_statements = parser.parse_as_stream(libNode.uri)
+        try:
+            new_statements = parser.parse_as_stream(libNode.uri)
+        except RDF.RedlandError as e:
+            LOGGER.error(e)
+            return
+        LOGGER.debug("Scanning %s", str(libNode.uri))
          toadd = []
          for s in new_statements:
              # always add "collections"
@@ -310,7 +325,8 @@ class Submission(object):
          paired = ['Barcoded Illumina',
                    'Multiplexing',
                    'Nextera',
-                  'Paired End (non-multiplexed)',]
+                  'Paired End (non-multiplexed)',
+                  'Dual Index Illumina',]
          if library_type in single:
              return False
          elif library_type in paired:
diff --git a/htsworkflow/submission/trackhub.py b/htsworkflow/submission/trackhub.py

index 87a52582eb5317afb7c1c3be487a354f05f6ee77..8b7b424094c0be0d1dcf4ee0e4f9c598cdc295ab 100644 (file)
--- a/htsworkflow/submission/trackhub.py
+++ b/htsworkflow/submission/trackhub.py
@@ -35,14 +35,24 @@ class TrackHubSubmission(Submission):
              metadata = metadata[0]
              samples.append(metadata)
  
-        soft_template = loader.get_template('trackDb.txt')
+        template = loader.get_template('trackDb.txt')
          context = Context({
              'samples': samples,
          })
-        return str(soft_template.render(context))
+        return str(template.render(context))
  
-    def make_mainifest(self, result_map):
-        pass
+    def make_manifest(self, result_map):
+        """Render the 'manifest.txt' template with rows from every result dir.
+
+        Returns the rendered template converted with str().
+        """
+        rows = []
+        # Only the directory is used; the library id key is ignored.
+        for _lib_id, analysis_dir in result_map.items():
+            submission = self.get_submission_node(analysis_dir)
+            rows.extend(self.get_manifest_metadata(submission))
+        manifest_template = loader.get_template('manifest.txt')
+        return str(manifest_template.render(Context({'files': rows})))
          
      def get_sample_metadata(self, analysis_node):
          """Gather information for filling out sample section of a SOFT file
@@ -56,3 +66,13 @@ class TrackHubSubmission(Submission):
  
          results = self.execute_query(query_template, context)
          return results
+
+    def get_manifest_metadata(self, analysis_node):
+        """Run the 'trackhub_manifest.sparql' query for one submission node."""
+        sparql_template = loader.get_template('trackhub_manifest.sparql')
+        query_args = Context({
+            'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
+        })
+        # The query result is handed back to the caller unchanged.
+        return self.execute_query(sparql_template, query_args)
diff --git a/htsworkflow/templates/manifest.txt b/htsworkflow/templates/manifest.txt

new file mode 100644 (file)

index 0000000..adf0554
--- /dev/null
+++ b/htsworkflow/templates/manifest.txt
@@ -0,0 +1,2 @@
+#file_name     format  output_type     experiment      replicate       enriched_in     ucsc_db{% for r in files %}
+{{ r.relative_path }}  {{ r.file_format }}     {{ r.output_type }}     {{ r.dataset_id }}      {{ r.replicate }}       {{ r.enriched_in }}     {{ r.ucsc_db }}{% endfor %}
diff --git a/htsworkflow/templates/trackhub_manifest.sparql b/htsworkflow/templates/trackhub_manifest.sparql

new file mode 100644 (file)

index 0000000..295caa8
--- /dev/null
+++ b/htsworkflow/templates/trackhub_manifest.sparql
@@ -0,0 +1,32 @@
+PREFIX htswlib: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX encode3: <http://jumpgate.caltech.edu/wiki/Encode3#> 
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?filename ?relative_path ?file_format ?output_type ?dataset_id ?replicate ?enriched_in ?ucsc_db
+WHERE {
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:name ?name ;
+                   ucscDaf:has_file ?file .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:relative_path ?relative_path ;
+        htswlib:library ?library ;
+        a ?fileClass .
+
+  ?fileClass geoSoft:fileTypeLabel ?file_format ;
+             ucscDaf:output_type ?output_type .
+  
+  ?library htswlib:replicate ?replicate ;
+           ucscDaf:enriched_in ?enriched_in;
+           ucscDaf:genome_build ?ucsc_db .
+
+  ?library encode3:dataset_id ?dataset_id .
+  # This is lame! why!!!!
+  # ?library2 encode3:dcc_id ?dcc_library_id .
+  # FILTER (?library = ?library2) 
+
+}
author	Diane Trout <diane@caltech.edu>
	Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
committer	Diane Trout <diane@caltech.edu>
	Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
encode_submission/trackhub.py		patch \| blob \| history
htsworkflow/submission/submission.py		patch \| blob \| history
htsworkflow/submission/trackhub.py		patch \| blob \| history
htsworkflow/templates/manifest.txt	[new file with mode: 0644]	patch \| blob
htsworkflow/templates/trackhub_manifest.sparql	[new file with mode: 0644]	patch \| blob