Generate manifest.txt files for submitting to ENCODE3.
author Diane Trout <diane@ghic.org>
Mon, 22 Jul 2013 19:48:53 +0000 (12:48 -0700)
committer Diane Trout <diane@ghic.org>
Mon, 22 Jul 2013 19:48:53 +0000 (12:48 -0700)
Change trackhub generation from my previous template version to use
the Daler trackhub code.

This includes a feature to complain if you offer a submission set
name that doesn't exist.

Also, the samples query now returns all the submission components in
a single query instead of one at a time (which is a much faster
way of doing things).

encode_submission/encode3.py
htsworkflow/submission/trackhub_submission.py
htsworkflow/templates/trackhub_samples.sparql
htsworkflow/templates/trackhub_term_values.sparql [new file with mode: 0644]

index 197d7f34092ef87e9b2a617b0a8474d3896ae3f9..875d3bdd9d8205df2eaff65b2b48b5462366645a 100644 (file)
@@ -37,12 +37,15 @@ from htsworkflow.util.rdfhelp import \
      sparql_query, \
      submissionOntology
 from htsworkflow.submission.daf import get_submission_uri
+from htsworkflow.submission.submission import list_submissions
 from htsworkflow.submission.results import ResultMap
-from htsworkflow.submission.trackhub import TrackHubSubmission
+from htsworkflow.submission.trackhub_submission import TrackHubSubmission
 from htsworkflow.submission.condorfastq import CondorFastqExtract
 
 logger = logging.getLogger(__name__)
 
+INDENTED = "  " + os.linesep
+
 def main(cmdline=None):
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
@@ -58,10 +61,31 @@ def main(cmdline=None):
     apidata = api.make_auth_from_opts(opts, parser)
 
     model = get_model(opts.model, opts.db_path)
+
+    submission_names = list(list_submissions(model))
+    name = opts.name
+    if len(submission_names) == 0 and opts.name is None:
+        parser.error("Please name this submission")
+    elif opts.name and submission_names and opts.name not in submission_names:
+        parser.error("{} is not in this model. Choose from: {}{}".format(
+            opts.name,
+            os.linesep,
+            INDENTED.join(submission_names)))
+    elif opts.name is None and len(submission_names) > 1:
+        parser.error("Please choose submission name from: {}{}".format(
+            os.linesep,
+            INDENTED.join(submission_names)))
+    elif len(submission_names) == 1:
+        name = submission_names[0]
+
     mapper = None
-    if opts.name:
-        mapper = TrackHubSubmission(opts.name,  model, host=opts.host)
-        submission_uri = get_submission_uri(opts.name)
+    if opts.make_track_hub:
+        mapper = TrackHubSubmission(name,
+                                    model,
+                                    baseurl=opts.make_track_hub,
+                                    baseupload=opts.track_hub_upload,
+                                    host=opts.host)
+        submission_uri = get_submission_uri(name)
 
 
     if opts.load_rdf is not None:
@@ -91,12 +115,12 @@ def main(cmdline=None):
         extractor.create_scripts(results)
 
     if opts.scan_submission:
-        if opts.name is None:
+        if name is None:
             parser.error("Please define a submission name")
         mapper.scan_submission_dirs(results)
 
-    if opts.make_hub:
-        make_hub(mapper, results, opts.make_hub)
+    if opts.make_track_hub:
+        trackdb = mapper.make_hub(results)
 
     if opts.make_manifest:
         make_manifest(mapper, results, opts.make_manifest)
@@ -109,15 +133,6 @@ def main(cmdline=None):
         print writer.serialize_model_to_string(model)
 
 
-def make_hub(mapper, results, filename=None):
-    trackdb = mapper.make_hub(results)
-
-    if filename is None or filename == '-':
-        sys.stdout.write(trackdb)
-    else:
-        with open('trackDb.txt', 'w') as trackstream:
-            trackstream.write(trackdb)
-
 def make_manifest(mapper, results, filename=None):
     manifest = mapper.make_manifest(results)
 
@@ -154,8 +169,10 @@ def make_parser():
                         help="generate scripts for making fastq files")
     commands.add_option('--scan-submission', default=False, action="store_true",
                       help="Import metadata for submission into our model")
-    commands.add_option('--make-hub', default=None, 
-                        help='name the hub file or - for stdout to create it')
+    commands.add_option('--make-track-hub', default=None,
+                        help='web root that will host the trackhub.')
+    commands.add_option('--track-hub-upload', default=None,
+                        help='where to upload track hub <host>:<path>')
     commands.add_option('--make-manifest', 
                         help='name the manifest file name or - for stdout to create it', 
                         default=None)
index 8b7b424094c0be0d1dcf4ee0e4f9c598cdc295ab..7bca809a926798d87cae8f6b23486b1de3c7b23a 100644 (file)
@@ -1,5 +1,6 @@
 import logging
 import os
+import re
 
 import RDF
 
@@ -10,20 +11,44 @@ from htsworkflow.util.rdfhelp import \
      geoSoftNS, \
      stripNamespace, \
      submissionOntology
+from htsworkflow.util.url import parse_ssh_url
 
 from django.conf import settings
 from django.template import Context, loader
+from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
+from trackhub.track import TRACKTYPES, SubGroupDefinition
+from trackhub.helpers import show_rendered_files
+from trackhub.upload import upload_track, upload_hub
 
 LOGGER = logging.getLogger(__name__)
 
 class TrackHubSubmission(Submission):
-    def __init__(self, name, model, host):
+    def __init__(self, name, model, baseurl, baseupload, host):
+        """Create a trackhub based submission
+
+        :Parameters:
+          - `name`: Name of submission
+          - `model`: librdf model reference
+          - `baseurl`: web root where trackhub will be hosted
+          - `baseupload`: ssh url (<host>:<path>) to upload the hub to, or None
+          - `host`: hostname for library pages.
+        """
         super(TrackHubSubmission, self).__init__(name, model, host)
+        if baseurl is None:
+            raise ValueError("Need a web root to make a track hub")
+        self.baseurl = os.path.join(baseurl, self.name)
+        if baseupload:
+            sshurl = parse_ssh_url(baseupload)
+            LOGGER.debug("Parsed track hub upload url: %s", sshurl)
+            self.user = sshurl.user
+            self.host = sshurl.host  # NOTE(review): may clobber `host` above
+            self.uploadpath = sshurl.path
+        else:
+            self.uploadpath = None
 
-    def make_hub(self, result_map):
+    def make_hub_template(self, result_map):
         samples = []
-        for lib_id, result_dir in result_map.items():
-            an_analysis = self.get_submission_node(result_dir)
+        for an_analysis in self.analysis_nodes(result_map):
             metadata = self.get_sample_metadata(an_analysis)
             if len(metadata) == 0:
                 errmsg = 'No metadata found for {0}'
@@ -41,10 +66,84 @@ class TrackHubSubmission(Submission):
         })
         return str(template.render(context))
 
+    def make_hub(self, result_map):
+        """Render a track hub for this submission and optionally upload it.
+
+        `result_map` is accepted but not read; tracks come from get_tracks().
+        The hub is uploaded only when self.uploadpath is set.
+        """
+        genome_db = 'hg19'
+        hub_url = self.baseurl + '/'
+        hub, genomes_file, genome, trackdb = default_hub(
+            hub_name=self.name,
+            short_label=self.name,
+            long_label=self.name,
+            email='email',
+            genome=genome_db)
+        hub.remote_dir = self.uploadpath
+
+        # build higher order track types
+        composite = CompositeTrack(
+            name=self.sanitize_name(self.name),
+            short_label=self.sanitize_name(self.name),
+            long_label=str(self.name),
+            tracktype="bigWig",
+            dragAndDrop='subtracks',
+            visibility='full',
+        )
+        trackdb.add_tracks(composite)
+        subgroups = self.add_subgroups(composite)
+
+        view = None  # current ViewTrack; replaced when output_type changes
+        for track in self.get_tracks():
+            if track['file_type'] not in TRACKTYPES:
+                LOGGER.info('Unrecognized file type %s', track['file_type'])
+                continue
+
+            view = self.add_new_view_if_needed(composite, view, track)
+            track_name = self.make_track_name(track)
+            track_subgroup = self.make_track_subgroups(subgroups, track)
+            newtrack = Track(
+                name=track_name,
+                tracktype=str(track['file_type']),
+                url=hub_url + str(track['relative_path']),
+                short_label=str(track['library_id']),
+                long_label=track_name,
+                subgroups=track_subgroup,
+                )
+            view.add_tracks([newtrack])
+
+        hub.render()
+        if hub.remote_dir:
+            LOGGER.info("Uploading to %s @ %s : %s",
+                        self.user, self.host, hub.remote_dir)
+            # upload as the user parsed from the upload url, matching the log
+            upload_hub(hub=hub, host=self.host, user=self.user)
+
+    def add_new_view_if_needed(self, composite, view, track):
+        """Return the view for `track`, adding a new one on a new output type.
+
+        :Parameters:
+          - `composite`: composite track to attach to
+          - `view`: current ViewTrack, or a false value if there is none yet
+          - `track`: current track record
+        """
+        current_view_type = str(track['output_type'])
+        # a view is named after its output_type; a different name means a new view
+        if not view or current_view_type != view.name:
+            view = ViewTrack(
+                name=current_view_type,
+                view=current_view_type,
+                visibility='squish',
+                short_label=current_view_type,
+                tracktype=str(track['file_type']),
+            )
+            composite.add_view(view)
+        return view
+
     def make_manifest(self, result_map):
         files = []
-        for lib_id, result_dir in result_map.items():
-            an_analysis = self.get_submission_node(result_dir)
+        for an_analysis in self.analysis_nodes(result_map):
             metadata = self.get_manifest_metadata(an_analysis)
             files.extend(metadata)
 
@@ -53,21 +152,94 @@ class TrackHubSubmission(Submission):
             'files': files
         })
         return str(template.render(context))
-        
-    def get_sample_metadata(self, analysis_node):
-        """Gather information for filling out sample section of a SOFT file
+
+    def make_track_name(self, track):
+        """Return '<library_id>_<replicate>_<output_type>' for a track row."""
+        parts = (
+            track['library_id'],
+            track['replicate'],
+            track['output_type'])
+        return '{}_{}_{}'.format(*parts)
+
+    def make_track_subgroups(self, subgroups, track):
+        """Return {subgroup: sanitized value} for subgroups set on track."""
+        return {
+            group: self.sanitize_name(track[group])
+            for group in subgroups
+            if group in track and track[group]
+        }
+
+    def add_subgroups(self, composite):
+        """Add subgroup definitions to composite and return their names."""
+        # each pair is (term passed to make_subgroupdefinition, subgroup name)
+        term_names = (
+            ('htswlib:cell_line', 'cell'),
+            ('htswlib:replicate', 'replicate'),
+            ('encode3:library_id', 'library_id'),
+            ('encode3:assay', 'assay'),
+            ('encode3:rna_type', 'rna_type'),
+            ('encode3:protocol', 'protocol'),
+        )
+
+        definitions = [self.make_subgroupdefinition(term, label)
+                       for term, label in term_names]
+        composite.add_subgroups(definitions)
+        return [label for _, label in term_names]
+
+
+    def make_subgroupdefinition(self, term, name):
+        """Build a SubGroupDefinition from the values queried for term.
+
+        Subgroup attributes need to be an attribute of the library.
+        """
+        query = loader.get_template('trackhub_term_values.sparql')
+        rows = self.execute_query(query, Context({'term': term}))
+        labels = [str(row['name']) for row in rows]
+        # key: sanitized identifier, value: the label as returned by the query
+        mapping = {self.sanitize_name(label): label for label in labels}
+
+        return SubGroupDefinition(
+                name=name,
+                label=name,
+                mapping=mapping,
+        )
+
+    def get_tracks(self):
+        """Collect information needed to describe trackhub tracks.
         """
         query_template = loader.get_template('trackhub_samples.sparql')
 
-        context = Context({
-            'submission': str(analysis_node.uri),
-            'submissionSet': str(self.submissionSetNS[''].uri),
-            })
+        context = Context({ })
 
         results = self.execute_query(query_template, context)
         return results
 
+    def sanitize_name(self, name):
+        """Return name after applying every substitution rule in order.
+
+        Each rule is a regular expression matched case-insensitively.
+        """
+        rules = (
+            (r'poly-?a\+', 'PolyAplus'),
+            (r'poly-?a-', 'PolyAminus'),
+            ('RNA-Seq', 'RNASeq'),
+            # NOTE(review): looks unreachable, the rule above ignores case
+            ('rna-seq', 'rnaseq'),
+            ('-', '_'), (' ', '_'),
+            # spell out a leading digit
+            ('^0', 'Zero'), ('^1', 'One'), ('^2', 'Two'), ('^3', 'Three'),
+            ('^4', 'Four'), ('^5', 'Five'), ('^6', 'Six'), ('^7', 'Seven'),
+            ('^8', 'Eight'), ('^9', 'Nine'),
+        )
+
+        cleaned = name
+        for pattern, replacement in rules:
+            cleaned = re.sub(pattern, replacement, cleaned,
+                             flags=re.IGNORECASE)
+        return cleaned
+
     def get_manifest_metadata(self, analysis_node):
+
         query_template = loader.get_template('trackhub_manifest.sparql')
 
         context = Context({
@@ -75,4 +247,6 @@ class TrackHubSubmission(Submission):
             'submissionSet': str(self.submissionSetNS[''].uri),
             })
         results = self.execute_query(query_template, context)
+        LOGGER.info("scanned %s for results found %s",
+                    str(analysis_node), len(results))
         return results
index 5930d9f89884aeeaf4c56e6c4e5711f906a50589..19ce7e1d5d7f32d35b88e961c867095104dbde88 100644 (file)
@@ -1,42 +1,27 @@
-PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX htswlib: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
 PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
 PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
 PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
 PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+PREFIX encode3: <http://jumpgate.caltech.edu/wiki/Encode3#>
 
-select distinct ?name ?bam ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?tier ?experiment_type ?library_selection ?library_source ?input_quantity
-WHERE {
-  <{{submission}}> a submissionOntology:submission ;
-                   submissionOntology:library ?library ;
-                   submissionOntology:name ?name ;
-                   ucscDaf:has_file ?file .
-  ?file ucscDaf:filename ?bam .
-  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
-  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
-  OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell .
-             OPTIONAL { ?cell_line cells:cell ?cell .
-                        OPTIONAL { ?cell_line cells:documents ?growthProtocol . }
-                        OPTIONAL { ?cell_line cells:tier ?tier . } } }
-  OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?library_id }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name .
-             ?species libraryOntology:species ?species_name ;
-                      libraryOntology:taxon_id ?taxon_id . }
-  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
-  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
-  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
-  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
-  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
-  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
-  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
-  OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  OPTIONAL { ?library ucscDaf:readType ?readType }
-  OPTIONAL { ?library ucscDaf:strain ?strain }
-  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  OPTIONAL { ?library libraryOntology:inputQuantity ?input_quantity }
-  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+select distinct ?lab_library_id ?library_id ?filename ?relative_path ?output_type ?file_type ?cell ?replicate ?assay ?rna_type ?protocol 
 
-}
\ No newline at end of file
+WHERE {
+  ?trackType geoSoft:fileTypeLabel ?file_type ;
+             ucscDaf:output_type ?output_type .
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:relative_path ?relative_path ;
+        htswlib:library ?library ;
+        a ?trackType .
+  OPTIONAL { ?library htswlib:library_id ?lab_library_id }
+  OPTIONAL { ?library encode3:library_id ?library_id }
+  OPTIONAL { ?library htswlib:cell_line ?cell . }
+  OPTIONAL { ?library htswlib:replicate ?replicate }
+  OPTIONAL { ?library encode3:assay ?assay . }
+  OPTIONAL { ?library encode3:rna_type ?rna_type. }
+  OPTIONAL { ?library encode3:protocol ?protocol. }
+  #OPTIONAL { ?library ucscDaf:readType ?read_type }
+}
+order by ?trackType
diff --git a/htsworkflow/templates/trackhub_term_values.sparql b/htsworkflow/templates/trackhub_term_values.sparql
new file mode 100644 (file)
index 0000000..6cff5d1
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX htswlib: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+PREFIX encode3: <http://jumpgate.caltech.edu/wiki/Encode3#>
+
+select distinct ?name
+where
+{
+  ?library a htswlib:Library ;   # only nodes typed as htswlib:Library
+           {{term}} ?name.       # predicate is filled in from the template context
+}