Generate manifest files for ENCODE3
author Diane Trout <diane@caltech.edu>
Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
committer Diane Trout <diane@caltech.edu>
Wed, 3 Jul 2013 17:57:10 +0000 (10:57 -0700)
I added a new option, --make-manifest, to the trackhub generation
script. The model generation also changed: it now captures each
file's relative path name and adds the library URI to the file
nodes, which makes some queries faster.

encode_submission/trackhub.py
htsworkflow/submission/submission.py
htsworkflow/submission/trackhub.py
htsworkflow/templates/manifest.txt [new file with mode: 0644]
htsworkflow/templates/trackhub_manifest.sparql [new file with mode: 0644]

index fb0cea8771dcbc25b11dc95654e1e12ed62d2eb9..197d7f34092ef87e9b2a617b0a8474d3896ae3f9 100644 (file)
@@ -96,8 +96,11 @@ def main(cmdline=None):
         mapper.scan_submission_dirs(results)
 
     if opts.make_hub:
-        make_hub(results)
+        make_hub(mapper, results, opts.make_hub)
 
+    if opts.make_manifest:
+        make_manifest(mapper, results, opts.make_manifest)
+        
     if opts.sparql:
         sparql_query(model, opts.sparql)
 
@@ -106,17 +109,23 @@ def main(cmdline=None):
         print writer.serialize_model_to_string(model)
 
 
-def make_hub(results):
+def make_hub(mapper, results, filename=None):
     trackdb = mapper.make_hub(results)
-    manifest = mapper.make_manifest(results)
 
-    trackstream = sys.stdout
-    #with open('trackDb.txt', 'w') as trackstream:
-    trackstream.write(trackdb)
+    if filename is None or filename == '-':
+        sys.stdout.write(trackdb)
+    else:
+        with open('trackDb.txt', 'w') as trackstream:
+            trackstream.write(trackdb)
 
-    #with open('manifest.txt', 'w') as mainifeststream:
-    manifeststream = sys.stdout
-    mainifeststream.write(mainifest)
+def make_manifest(mapper, results, filename=None):
+    manifest = mapper.make_manifest(results)
+
+    if filename is None or filename == '-':
+        sys.stdout.write(manifest)
+    else:
+        with open(filename, 'w') as mainifeststream:
+            mainifeststream.write(manifest)
         
 def make_parser():
     parser = OptionParser()
@@ -145,8 +154,12 @@ def make_parser():
                         help="generate scripts for making fastq files")
     commands.add_option('--scan-submission', default=False, action="store_true",
                       help="Import metadata for submission into our model")
-    commands.add_option('--make-hub', help='make the hub file', default=False,
-                      action="store_true")
+    commands.add_option('--make-hub', default=None, 
+                        help='name the hub file or - for stdout to create it')
+    commands.add_option('--make-manifest', 
+                        help='name the manifest file name or - for stdout to create it', 
+                        default=None)
+
 
     parser.add_option_group(commands)
 
index 77a1b68e4db763ab83eb50135f9613e8264ce973..3320f1caf3174e6d5636c4dd20841188771050d9 100644 (file)
@@ -122,6 +122,11 @@ class Submission(object):
             RDF.Statement(fileNode,
                           rdfNS['type'],
                           file_type))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          libraryOntology['library'],
+                          libNode))
+                          
         LOGGER.debug("Done.")
 
     def make_file_node(self, pathname, submissionNode):
@@ -129,7 +134,8 @@ class Submission(object):
         """
         # add file specific information
         path, filename = os.path.split(pathname)
-        fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
+        pathname = os.path.abspath(pathname)
+        fileNode = RDF.Node(RDF.Uri('file://'+ pathname))
         self.model.add_statement(
             RDF.Statement(submissionNode,
                           dafTermOntology['has_file'],
@@ -138,6 +144,10 @@ class Submission(object):
             RDF.Statement(fileNode,
                           dafTermOntology['filename'],
                           filename))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          dafTermOntology['relative_path'],
+                          os.path.relpath(pathname)))
         return fileNode
 
     def add_md5s(self, filename, fileNode, analysis_dir):
@@ -176,7 +186,12 @@ class Submission(object):
                               libraryOntology['has_mappings'],
                               dafTermOntology['has_file']))
         parser = RDF.Parser(name='rdfa')
-        new_statements = parser.parse_as_stream(libNode.uri)
+        try:
+            new_statements = parser.parse_as_stream(libNode.uri)
+        except RDF.RedlandError as e:
+            LOGGER.error(e)
+            return
+        LOGGER.debug("Scanning %s", str(libNode.uri))
         toadd = []
         for s in new_statements:
             # always add "collections"
@@ -310,7 +325,8 @@ class Submission(object):
         paired = ['Barcoded Illumina',
                   'Multiplexing',
                   'Nextera',
-                  'Paired End (non-multiplexed)',]
+                  'Paired End (non-multiplexed)',
+                  'Dual Index Illumina',]
         if library_type in single:
             return False
         elif library_type in paired:
index 87a52582eb5317afb7c1c3be487a354f05f6ee77..8b7b424094c0be0d1dcf4ee0e4f9c598cdc295ab 100644 (file)
@@ -35,14 +35,24 @@ class TrackHubSubmission(Submission):
             metadata = metadata[0]
             samples.append(metadata)
 
-        soft_template = loader.get_template('trackDb.txt')
+        template = loader.get_template('trackDb.txt')
         context = Context({
             'samples': samples,
         })
-        return str(soft_template.render(context))
+        return str(template.render(context))
 
-    def make_mainifest(self, result_map):
-        pass
+    def make_manifest(self, result_map):
+        files = []
+        for lib_id, result_dir in result_map.items():
+            an_analysis = self.get_submission_node(result_dir)
+            metadata = self.get_manifest_metadata(an_analysis)
+            files.extend(metadata)
+
+        template = loader.get_template('manifest.txt')
+        context = Context({
+            'files': files
+        })
+        return str(template.render(context))
         
     def get_sample_metadata(self, analysis_node):
         """Gather information for filling out sample section of a SOFT file
@@ -56,3 +66,13 @@ class TrackHubSubmission(Submission):
 
         results = self.execute_query(query_template, context)
         return results
+
+    def get_manifest_metadata(self, analysis_node):
+        query_template = loader.get_template('trackhub_manifest.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
+            })
+        results = self.execute_query(query_template, context)
+        return results
diff --git a/htsworkflow/templates/manifest.txt b/htsworkflow/templates/manifest.txt
new file mode 100644 (file)
index 0000000..adf0554
--- /dev/null
@@ -0,0 +1,2 @@
+#file_name     format  output_type     experiment      replicate       enriched_in     ucsc_db{% for r in files %}
+{{ r.relative_path }}  {{ r.file_format }}     {{ r.output_type }}     {{ r.dataset_id }}      {{ r.replicate }}       {{ r.enriched_in }}     {{ r.ucsc_db }}{% endfor %}
diff --git a/htsworkflow/templates/trackhub_manifest.sparql b/htsworkflow/templates/trackhub_manifest.sparql
new file mode 100644 (file)
index 0000000..295caa8
--- /dev/null
@@ -0,0 +1,32 @@
+PREFIX htswlib: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX encode3: <http://jumpgate.caltech.edu/wiki/Encode3#> 
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?filename ?relative_path ?file_format ?output_type ?dataset_id ?replicate ?enriched_in ?ucsc_db
+WHERE {
+  <{{submission}}> a submissionOntology:submission ;
+                   submissionOntology:name ?name ;
+                   ucscDaf:has_file ?file .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:relative_path ?relative_path ;
+        htswlib:library ?library ;
+        a ?fileClass .
+
+  ?fileClass geoSoft:fileTypeLabel ?file_format ;
+             ucscDaf:output_type ?output_type .
+  
+  ?library htswlib:replicate ?replicate ;
+           ucscDaf:enriched_in ?enriched_in;
+           ucscDaf:genome_build ?ucsc_db .
+
+  ?library encode3:dataset_id ?dataset_id .
+  # This is lame! why!!!!
+  # ?library2 encode3:dcc_id ?dcc_library_id .
+  # FILTER (?library = ?library2) 
+
+}