From: Diane Trout Date: Wed, 3 Jul 2013 17:57:10 +0000 (-0700) Subject: Generate manifest files for ENCODE3 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=ad6adfee371e67084e55b94a1e5335263d6fb26b Generate manifest files for ENCODE3 I added a new option to the trackhub generation script. There were some changes to the model generation to capture relative path names and add the library URI to files to make some queries faster. --- diff --git a/encode_submission/trackhub.py b/encode_submission/trackhub.py index fb0cea8..197d7f3 100644 --- a/encode_submission/trackhub.py +++ b/encode_submission/trackhub.py @@ -96,8 +96,11 @@ def main(cmdline=None): mapper.scan_submission_dirs(results) if opts.make_hub: - make_hub(results) + make_hub(mapper, results, opts.make_hub) + if opts.make_manifest: + make_manifest(mapper, results, opts.make_manifest) + if opts.sparql: sparql_query(model, opts.sparql) @@ -106,17 +109,23 @@ def main(cmdline=None): print writer.serialize_model_to_string(model) -def make_hub(results): +def make_hub(mapper, results, filename=None): trackdb = mapper.make_hub(results) - manifest = mapper.make_manifest(results) - trackstream = sys.stdout - #with open('trackDb.txt', 'w') as trackstream: - trackstream.write(trackdb) + if filename is None or filename == '-': + sys.stdout.write(trackdb) + else: + with open('trackDb.txt', 'w') as trackstream: + trackstream.write(trackdb) - #with open('manifest.txt', 'w') as mainifeststream: - manifeststream = sys.stdout - mainifeststream.write(mainifest) +def make_manifest(mapper, results, filename=None): + manifest = mapper.make_manifest(results) + + if filename is None or filename == '-': + sys.stdout.write(manifest) + else: + with open(filename, 'w') as mainifeststream: + mainifeststream.write(manifest) def make_parser(): parser = OptionParser() @@ -145,8 +154,12 @@ def make_parser(): help="generate scripts for making fastq files") commands.add_option('--scan-submission', default=False, action="store_true", help="Import metadata for submission into our model") - commands.add_option('--make-hub', help='make the hub file', default=False, - action="store_true") + commands.add_option('--make-hub', default=None, + help='name the hub file or - for stdout to create it') + commands.add_option('--make-manifest', + help='name the manifest file name or - for stdout to create it', + default=None) + parser.add_option_group(commands) diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py index 77a1b68..3320f1c 100644 --- a/htsworkflow/submission/submission.py +++ b/htsworkflow/submission/submission.py @@ -122,6 +122,11 @@ class Submission(object): RDF.Statement(fileNode, rdfNS['type'], file_type)) + self.model.add_statement( + RDF.Statement(fileNode, + libraryOntology['library'], + libNode)) + LOGGER.debug("Done.") def make_file_node(self, pathname, submissionNode): @@ -129,7 +134,8 @@ class Submission(object): """ # add file specific information path, filename = os.path.split(pathname) - fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname))) + pathname = os.path.abspath(pathname) + fileNode = RDF.Node(RDF.Uri('file://'+ pathname)) self.model.add_statement( RDF.Statement(submissionNode, dafTermOntology['has_file'], @@ -138,6 +144,10 @@ class Submission(object): RDF.Statement(fileNode, dafTermOntology['filename'], filename)) + self.model.add_statement( + RDF.Statement(fileNode, + dafTermOntology['relative_path'], + os.path.relpath(pathname))) return fileNode def add_md5s(self, filename, fileNode, analysis_dir): @@ -176,7 +186,12 @@ class Submission(object): libraryOntology['has_mappings'], dafTermOntology['has_file'])) parser = RDF.Parser(name='rdfa') - new_statements = parser.parse_as_stream(libNode.uri) + try: + new_statements = parser.parse_as_stream(libNode.uri) + except RDF.RedlandError as e: + LOGGER.error(e) + return + LOGGER.debug("Scanning %s", str(libNode.uri)) toadd = [] for s in new_statements: # always add "collections" @@ -310,7 +325,8 @@ class Submission(object): paired = ['Barcoded Illumina', 'Multiplexing', 'Nextera', - 'Paired End (non-multiplexed)',] + 'Paired End (non-multiplexed)', + 'Dual Index Illumina',] if library_type in single: return False elif library_type in paired: diff --git a/htsworkflow/submission/trackhub.py b/htsworkflow/submission/trackhub.py index 87a5258..8b7b424 100644 --- a/htsworkflow/submission/trackhub.py +++ b/htsworkflow/submission/trackhub.py @@ -35,14 +35,24 @@ class TrackHubSubmission(Submission): metadata = metadata[0] samples.append(metadata) - soft_template = loader.get_template('trackDb.txt') + template = loader.get_template('trackDb.txt') context = Context({ 'samples': samples, }) - return str(soft_template.render(context)) + return str(template.render(context)) - def make_mainifest(self, result_map): - pass + def make_manifest(self, result_map): + files = [] + for lib_id, result_dir in result_map.items(): + an_analysis = self.get_submission_node(result_dir) + metadata = self.get_manifest_metadata(an_analysis) + files.extend(metadata) + + template = loader.get_template('manifest.txt') + context = Context({ + 'files': files + }) + return str(template.render(context)) def get_sample_metadata(self, analysis_node): """Gather information for filling out sample section of a SOFT file @@ -56,3 +66,13 @@ class TrackHubSubmission(Submission): results = self.execute_query(query_template, context) return results + + def get_manifest_metadata(self, analysis_node): + query_template = loader.get_template('trackhub_manifest.sparql') + + context = Context({ + 'submission': str(analysis_node.uri), + 'submissionSet': str(self.submissionSetNS[''].uri), + }) + results = self.execute_query(query_template, context) + return results diff --git a/htsworkflow/templates/manifest.txt b/htsworkflow/templates/manifest.txt new file mode 100644 index 0000000..adf0554 --- /dev/null +++ b/htsworkflow/templates/manifest.txt @@ -0,0 +1,2 @@ +#file_name format output_type experiment replicate enriched_in ucsc_db{% for r in files %} +{{ r.relative_path }} {{ r.file_format }} {{ r.output_type }} {{ r.dataset_id }} {{ r.replicate }} {{ r.enriched_in }} {{ r.ucsc_db }}{% endfor %} diff --git a/htsworkflow/templates/trackhub_manifest.sparql b/htsworkflow/templates/trackhub_manifest.sparql new file mode 100644 index 0000000..295caa8 --- /dev/null +++ b/htsworkflow/templates/trackhub_manifest.sparql @@ -0,0 +1,32 @@ +PREFIX htswlib: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX encode3: +PREFIX ncbiTaxon: +PREFIX geoSoft: +PREFIX cells: + +select distinct ?name ?filename ?relative_path ?file_format ?output_type ?dataset_id ?replicate ?enriched_in ?ucsc_db +WHERE { + <{{submission}}> a submissionOntology:submission ; + submissionOntology:name ?name ; + ucscDaf:has_file ?file . + + ?file ucscDaf:filename ?filename ; + ucscDaf:relative_path ?relative_path ; + htswlib:library ?library ; + a ?fileClass . + + ?fileClass geoSoft:fileTypeLabel ?file_format ; + ucscDaf:output_type ?output_type . + + ?library htswlib:replicate ?replicate ; + ucscDaf:enriched_in ?enriched_in; + ucscDaf:genome_build ?ucsc_db . + + ?library encode3:dataset_id ?dataset_id . + # This is lame! why!!!! + # ?library2 encode3:dcc_id ?dcc_library_id . + # FILTER (?library = ?library2) + +}