From: Diane Trout Date: Mon, 22 Jul 2013 19:48:53 +0000 (-0700) Subject: Generate manifest.txt files for submitting to ENCODE3. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=fbebf34fa2f5842a2ab1eb8319176c82587fe3bb Generate manifest.txt files for submitting to ENCODE3. Change trackhub generation from my previous template version to use the Daler trackhub code. This includes a feature to complain if you offer a submission set name that doesn't exist. Also the samples query returns all the submission components in a single query instead of one at a time. (Which is a much faster way of doing things). --- diff --git a/encode_submission/encode3.py b/encode_submission/encode3.py index 197d7f3..875d3bd 100644 --- a/encode_submission/encode3.py +++ b/encode_submission/encode3.py @@ -37,12 +37,15 @@ from htsworkflow.util.rdfhelp import \ sparql_query, \ submissionOntology from htsworkflow.submission.daf import get_submission_uri +from htsworkflow.submission.submission import list_submissions from htsworkflow.submission.results import ResultMap -from htsworkflow.submission.trackhub import TrackHubSubmission +from htsworkflow.submission.trackhub_submission import TrackHubSubmission from htsworkflow.submission.condorfastq import CondorFastqExtract logger = logging.getLogger(__name__) +INDENTED = " " + os.linesep + def main(cmdline=None): parser = make_parser() opts, args = parser.parse_args(cmdline) @@ -58,10 +61,31 @@ def main(cmdline=None): apidata = api.make_auth_from_opts(opts, parser) model = get_model(opts.model, opts.db_path) + + submission_names = list(list_submissions(model)) + name = opts.name + if len(submission_names) == 0 and opts.name is None: + parser.error("Please name this submission") + elif opts.name and submission_names and opts.name not in submission_names: + parser.error("{} is not in this model. 
Choose from: {}{}".format( + opts.name, + os.linesep, + INDENTED.join(submission_names))) + elif opts.name is None and len(submission_names) > 1: + parser.error("Please choose submission name from: {}{}".format( + os.linesep, + INDENTED.join(submission_names))) + elif len(submission_names) == 1: + name = submission_names[0] + mapper = None - if opts.name: - mapper = TrackHubSubmission(opts.name, model, host=opts.host) - submission_uri = get_submission_uri(opts.name) + if opts.make_track_hub: + mapper = TrackHubSubmission(name, + model, + baseurl=opts.make_track_hub, + baseupload=opts.track_hub_upload, + host=opts.host) + submission_uri = get_submission_uri(name) if opts.load_rdf is not None: @@ -91,12 +115,12 @@ def main(cmdline=None): extractor.create_scripts(results) if opts.scan_submission: - if opts.name is None: + if name is None: parser.error("Please define a submission name") mapper.scan_submission_dirs(results) - if opts.make_hub: - make_hub(mapper, results, opts.make_hub) + if opts.make_track_hub: + trackdb = mapper.make_hub(results) if opts.make_manifest: make_manifest(mapper, results, opts.make_manifest) @@ -109,15 +133,6 @@ def main(cmdline=None): print writer.serialize_model_to_string(model) -def make_hub(mapper, results, filename=None): - trackdb = mapper.make_hub(results) - - if filename is None or filename == '-': - sys.stdout.write(trackdb) - else: - with open('trackDb.txt', 'w') as trackstream: - trackstream.write(trackdb) - def make_manifest(mapper, results, filename=None): manifest = mapper.make_manifest(results) @@ -154,8 +169,10 @@ def make_parser(): help="generate scripts for making fastq files") commands.add_option('--scan-submission', default=False, action="store_true", help="Import metadata for submission into our model") - commands.add_option('--make-hub', default=None, - help='name the hub file or - for stdout to create it') + commands.add_option('--make-track-hub', default=None, + help='web root that will host the trackhub.') + 
commands.add_option('--track-hub-upload', default=None, + help='where to upload track hub :') commands.add_option('--make-manifest', help='name the manifest file name or - for stdout to create it', default=None) diff --git a/htsworkflow/submission/trackhub_submission.py b/htsworkflow/submission/trackhub_submission.py index 8b7b424..7bca809 100644 --- a/htsworkflow/submission/trackhub_submission.py +++ b/htsworkflow/submission/trackhub_submission.py @@ -1,5 +1,6 @@ import logging import os +import re import RDF @@ -10,20 +11,44 @@ from htsworkflow.util.rdfhelp import \ geoSoftNS, \ stripNamespace, \ submissionOntology +from htsworkflow.util.url import parse_ssh_url from django.conf import settings from django.template import Context, loader +from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack +from trackhub.track import TRACKTYPES, SubGroupDefinition +from trackhub.helpers import show_rendered_files +from trackhub.upload import upload_track, upload_hub LOGGER = logging.getLogger(__name__) class TrackHubSubmission(Submission): - def __init__(self, name, model, host): + def __init__(self, name, model, baseurl, baseupload, host): + """Create a trackhub based submission + + :Parameters: + - `name`: Name of submission + - `model`: librdf model reference + - `baseurl`: web root where trackhub will be hosted + - `baseupload`: filesystem root where trackhub will be hosted + - `host`: hostname for library pages. 
+ """ super(TrackHubSubmission, self).__init__(name, model, host) + if baseurl is None: + raise ValueError("Need a web root to make a track hub") + self.baseurl = os.path.join(baseurl, self.name) + if baseupload: + sshurl = parse_ssh_url(baseupload) + print sshurl + self.user = sshurl.user + self.host = sshurl.host + self.uploadpath = sshurl.path + else: + self.uploadpath = None - def make_hub(self, result_map): + def make_hub_template(self, result_map): samples = [] - for lib_id, result_dir in result_map.items(): - an_analysis = self.get_submission_node(result_dir) + for an_analysis in self.analysis_nodes(result_map): metadata = self.get_sample_metadata(an_analysis) if len(metadata) == 0: errmsg = 'No metadata found for {0}' @@ -41,10 +66,84 @@ class TrackHubSubmission(Submission): }) return str(template.render(context)) + def make_hub(self, result_map): + genome_db = 'hg19' + hub_url = self.baseurl + '/' + hub, genomes_file, genome, trackdb = default_hub( + hub_name=self.name, + short_label=self.name, + long_label=self.name, + email='email', + genome=genome_db) + + hub.remote_dir = self.uploadpath + + # build higher order track types + composite = CompositeTrack( + name=self.sanitize_name(self.name), + short_label = self.sanitize_name(self.name), + long_label = str(self.name), + tracktype="bigWig", + dragAndDrop='subtracks', + visibility='full', + ) + trackdb.add_tracks(composite) + + subgroups = self.add_subgroups(composite) + + view_type = None + view = None + + for track in self.get_tracks(): + if track['file_type'] not in TRACKTYPES: + LOGGER.info('Unrecognized file type %s', track['file_type']) + continue + + view = self.add_new_view_if_needed(composite, view, track) + track_name = self.make_track_name(track) + + track_subgroup = self.make_track_subgroups(subgroups, track) + + newtrack = Track( + name=track_name, + tracktype = str(track['file_type']), + url= hub_url + str(track['relative_path']), + short_label=str(track['library_id']), + 
long_label=track_name, + subgroups=track_subgroup, + ) + view.add_tracks([newtrack]) + + results = hub.render() + if hub.remote_dir: + LOGGER.info("Uploading to %s @ %s : %s", + self.user, self.host, hub.remote_dir) + upload_hub(hub=hub, host=self.host, user='diane') + + def add_new_view_if_needed(self, composite, view, track): + """Add new trakkhub view if we've hit a new type of track. + + :Parameters: + - `composite`: composite track to attach to + - `view_type`: name of view type + - `track`: current track record + """ + current_view_type = str(track['output_type']) + if not view or current_view_type != view.name: + view = ViewTrack( + name=current_view_type, + view=current_view_type, + visibility='squish', + short_label=current_view_type, + tracktype=str(track['file_type']), + ) + composite.add_view(view) + view_type = current_view_type + return view + def make_manifest(self, result_map): files = [] - for lib_id, result_dir in result_map.items(): - an_analysis = self.get_submission_node(result_dir) + for an_analysis in self.analysis_nodes(result_map): metadata = self.get_manifest_metadata(an_analysis) files.extend(metadata) @@ -53,21 +152,94 @@ class TrackHubSubmission(Submission): 'files': files }) return str(template.render(context)) - - def get_sample_metadata(self, analysis_node): - """Gather information for filling out sample section of a SOFT file + + def make_track_name(self, track): + name = '{}_{}_{}'.format( + track['library_id'], + track['replicate'], + track['output_type'], + ) + return name + + def make_track_subgroups(self, subgroups, track): + track_subgroups = {} + for k in subgroups: + if k in track and track[k]: + value = self.sanitize_name(track[k]) + track_subgroups[k] = value + return track_subgroups + + def add_subgroups(self, composite): + """Add subgroups to composite track""" + search = [ ('htswlib:cell_line', 'cell'), + ('htswlib:replicate', 'replicate'), + ('encode3:library_id', 'library_id'), + ('encode3:assay', 'assay'), + 
('encode3:rna_type', 'rna_type'), + ('encode3:protocol', 'protocol'), + ] + subgroups = [] + names = [] + for term, name in search: + subgroups.append(self.make_subgroupdefinition(term, name)) + names.append(name) + composite.add_subgroups(subgroups) + return names + + + def make_subgroupdefinition(self, term, name): + """Subgroup attributes need to be an attribute of the library. + """ + template = loader.get_template('trackhub_term_values.sparql') + context = Context({'term': term}) + results = self.execute_query(template, context) + values = {} + for row in results: + value = str(row['name']) + values[self.sanitize_name(value)] = value + + return SubGroupDefinition( + name=name, + label=name, + mapping=values, + ) + + def get_tracks(self): + """Collect information needed to describe trackhub tracks. """ query_template = loader.get_template('trackhub_samples.sparql') - context = Context({ - 'submission': str(analysis_node.uri), - 'submissionSet': str(self.submissionSetNS[''].uri), - }) + context = Context({ }) results = self.execute_query(query_template, context) return results + def sanitize_name(self, name): + replacements = [('poly-?a\+', 'PolyAplus'), + ('poly-?a-', 'PolyAminus'), + ('RNA-Seq', 'RNASeq'), + ('rna-seq', 'rnaseq'), + ('-', '_'), + (' ', '_'), + ('^0', 'Zero'), + ('^1', 'One'), + ('^2', 'Two'), + ('^3', 'Three'), + ('^4', 'Four'), + ('^5', 'Five'), + ('^6', 'Six'), + ('^7', 'Seven'), + ('^8', 'Eight'), + ('^9', 'Nine'), + ] + + for regex, substitution in replacements: + name = re.sub(regex, substitution, name, flags=re.IGNORECASE) + + return name + def get_manifest_metadata(self, analysis_node): + query_template = loader.get_template('trackhub_manifest.sparql') context = Context({ @@ -75,4 +247,6 @@ class TrackHubSubmission(Submission): 'submissionSet': str(self.submissionSetNS[''].uri), }) results = self.execute_query(query_template, context) + LOGGER.info("scanned %s for results found %s", + str(analysis_node), len(results)) return results 
diff --git a/htsworkflow/templates/trackhub_samples.sparql b/htsworkflow/templates/trackhub_samples.sparql index 5930d9f..19ce7e1 100644 --- a/htsworkflow/templates/trackhub_samples.sparql +++ b/htsworkflow/templates/trackhub_samples.sparql @@ -1,42 +1,27 @@ -PREFIX libraryOntology: +PREFIX htswlib: PREFIX submissionOntology: PREFIX ucscDaf: PREFIX ncbiTaxon: PREFIX geoSoft: PREFIX cells: +PREFIX encode3: -select distinct ?name ?bam ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?tier ?experiment_type ?library_selection ?library_source ?input_quantity -WHERE { - <{{submission}}> a submissionOntology:submission ; - submissionOntology:library ?library ; - submissionOntology:name ?name ; - ucscDaf:has_file ?file . - ?file ucscDaf:filename ?bam . - OPTIONAL { <{{submission}}> ucscDaf:control ?control } - OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId } - OPTIONAL { ?library libraryOntology:antibody ?antibody } - OPTIONAL { ?library libraryOntology:cell_line ?cell . - OPTIONAL { ?cell_line cells:cell ?cell . - OPTIONAL { ?cell_line cells:documents ?growthProtocol . } - OPTIONAL { ?cell_line cells:tier ?tier . } } } - OPTIONAL { ?library ucscDaf:sex ?sex } - OPTIONAL { ?library libraryOntology:library_id ?library_id } - OPTIONAL { ?library libraryOntology:replicate ?replicate } - OPTIONAL { ?library libraryOntology:species ?species_name . - ?species libraryOntology:species ?species_name ; - libraryOntology:taxon_id ?taxon_id . 
} - OPTIONAL { ?library libraryOntology:condition_term ?treatment } - OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type } - OPTIONAL { ?library libraryOntology:librarySelection ?library_selection } - OPTIONAL { ?library libraryOntology:librarySource ?library_source } - OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol } - OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule } - OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol } - OPTIONAL { ?library ucscDaf:protocol ?protocol } - OPTIONAL { ?library ucscDaf:readType ?readType } - OPTIONAL { ?library ucscDaf:strain ?strain } - OPTIONAL { ?library libraryOntology:insert_size ?insertLength } - OPTIONAL { ?library libraryOntology:inputQuantity ?input_quantity } - OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm } +select distinct ?lab_library_id ?library_id ?filename ?relative_path ?output_type ?file_type ?cell ?replicate ?assay ?rna_type ?protocol -} \ No newline at end of file +WHERE { + ?trackType geoSoft:fileTypeLabel ?file_type ; + ucscDaf:output_type ?output_type . + ?file ucscDaf:filename ?filename ; + ucscDaf:relative_path ?relative_path ; + htswlib:library ?library ; + a ?trackType . + OPTIONAL { ?library htswlib:library_id ?lab_library_id } + OPTIONAL { ?library encode3:library_id ?library_id } + OPTIONAL { ?library htswlib:cell_line ?cell . } + OPTIONAL { ?library htswlib:replicate ?replicate } + OPTIONAL { ?library encode3:assay ?assay . } + OPTIONAL { ?library encode3:rna_type ?rna_type. } + OPTIONAL { ?library encode3:protocol ?protocol. 
} + #OPTIONAL { ?library ucscDaf:readType ?read_type } +} +order by ?trackType diff --git a/htsworkflow/templates/trackhub_term_values.sparql b/htsworkflow/templates/trackhub_term_values.sparql new file mode 100644 index 0000000..6cff5d1 --- /dev/null +++ b/htsworkflow/templates/trackhub_term_values.sparql @@ -0,0 +1,14 @@ +PREFIX htswlib: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: +PREFIX cells: +PREFIX encode3: + +select distinct ?name +where +{ + ?library a htswlib:Library ; + {{term}} ?name. +}