remove some trailing whitespace
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
index 8b7b424094c0be0d1dcf4ee0e4f9c598cdc295ab..3aa4a96ae26fa96c44e0c0f58c67503890fe38dc 100644 (file)
@@ -1,5 +1,8 @@
 import logging
 import os
+from pprint import pformat
+import string
+import re
 
 import RDF
 
@@ -8,22 +11,46 @@ from htsworkflow.submission.submission import Submission
 from htsworkflow.util.rdfhelp import \
      fromTypedNode, \
      geoSoftNS, \
-     stripNamespace, \
      submissionOntology
+from htsworkflow.util.url import parse_ssh_url
+from htsworkflow.util.ucsc import bigWigInfo
 
 from django.conf import settings
 from django.template import Context, loader
+from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
+from trackhub.track import TRACKTYPES, SubGroupDefinition
+from trackhub.helpers import show_rendered_files
+from trackhub.upload import upload_track, upload_hub
 
 LOGGER = logging.getLogger(__name__)
 
 class TrackHubSubmission(Submission):
-    def __init__(self, name, model, host):
+    def __init__(self, name, model, baseurl, baseupload, host):
+        """Create a trackhub based submission
+
+        :Parameters:
+          - `name`: Name of submission
+          - `model`: librdf model reference
+          - `baseurl`: web root where trackhub will be hosted
+          - `baseupload`: filesystem root where trackhub will be hosted
+          - `host`: hostname for library pages.
+        """
         super(TrackHubSubmission, self).__init__(name, model, host)
+        if baseurl is None:
+            raise ValueError("Need a web root to make a track hub")
+        self.baseurl = os.path.join(baseurl, self.name)
+        if baseupload:
+            sshurl = parse_ssh_url(baseupload)
+            print sshurl
+            self.user = sshurl.user
+            self.host = sshurl.host
+            self.uploadpath =  sshurl.path
+        else:
+            self.uploadpath = None
 
-    def make_hub(self, result_map):
+    def make_hub_template(self, result_map):
         samples = []
-        for lib_id, result_dir in result_map.items():
-            an_analysis = self.get_submission_node(result_dir)
+        for an_analysis in self.analysis_nodes(result_map):
             metadata = self.get_sample_metadata(an_analysis)
             if len(metadata) == 0:
                 errmsg = 'No metadata found for {0}'
@@ -41,10 +68,100 @@ class TrackHubSubmission(Submission):
         })
         return str(template.render(context))
 
+    def make_hub(self, result_map):
+        genome_db = 'hg19'
+        hub_url = self.baseurl + '/'
+        hub, genomes_file, genome, trackdb = default_hub(
+            hub_name=self.name,
+            short_label=self.name,
+            long_label=self.name,
+            email='email',
+            genome=genome_db)
+
+        hub.remote_dir = self.uploadpath
+
+        # build higher order track types
+        composite = CompositeTrack(
+            name=self.sanitize_name(self.name),
+            short_label = self.sanitize_name(self.name),
+            long_label = str(self.name),
+            tracktype="bed 3",
+            dragAndDrop='subtracks',
+            visibility='full',
+        )
+        trackdb.add_tracks(composite)
+
+        subgroups = self.add_subgroups(composite)
+
+        view_type = None
+        view = None
+
+        for track in self.get_tracks():
+            if track['file_type'] not in TRACKTYPES:
+                LOGGER.info('Unrecognized file type %s', track['file_type'])
+                continue
+
+            view = self.add_new_view_if_needed(composite, view, track)
+            track_name = self.make_track_name(track)
+
+            track_subgroup = self.make_track_subgroups(subgroups, track)
+            track_type = self.make_track_type(track)
+
+            if 'file_label' in track:
+                track_label = self.sanitize_name(track['file_label'])
+            else:
+                track_label = track_name
+
+            attributes = {
+                'name': track_name,
+                'tracktype': track_type,
+                'url': hub_url + str(track['relative_path']),
+                'short_label': str(track['library_id']),
+                'long_label': str(track_label),
+                'subgroups': track_subgroup,
+            }
+
+            LOGGER.debug('track attributes: %s', pformat(attributes))
+            newtrack = Track(**attributes)
+            view.add_tracks([newtrack])
+
+        results = hub.render()
+        if hub.remote_dir:
+            LOGGER.info("Uploading to %s @ %s : %s",
+                        self.user, self.host, hub.remote_dir)
+            upload_hub(hub=hub, host=self.host, user='diane')
+
+    def add_new_view_if_needed(self, composite, view, track):
+        """Add new trakkhub view if we've hit a new type of track.
+
+        :Parameters:
+          - `composite`: composite track to attach to
+          - `view_type`: name of view type
+          - `track`: current track record
+        """
+        current_view_type = str(track['output_type'])
+        if not view or current_view_type != view.name:
+            attributes = {
+                'name': current_view_type,
+                'view': current_view_type,
+                'visibility': str(track.get('visibility', 'squish')),
+                'short_label': current_view_type,
+                'tracktype': str(track['file_type'])
+            }
+            maxHeightPixels = track.get('maxHeightPixels')
+            if maxHeightPixels:
+                attributes['maxHeightPixels'] = str(maxHeightPixels)
+            autoScale = track.get('autoScale')
+            if autoScale:
+                attributes['autoScale'] = str(autoScale)
+            view = ViewTrack(**attributes)
+            composite.add_view(view)
+            view_type = current_view_type
+        return view
+
     def make_manifest(self, result_map):
         files = []
-        for lib_id, result_dir in result_map.items():
-            an_analysis = self.get_submission_node(result_dir)
+        for an_analysis in self.analysis_nodes(result_map):
             metadata = self.get_manifest_metadata(an_analysis)
             files.extend(metadata)
 
@@ -53,20 +170,123 @@ class TrackHubSubmission(Submission):
             'files': files
         })
         return str(template.render(context))
-        
-    def get_sample_metadata(self, analysis_node):
-        """Gather information for filling out sample section of a SOFT file
+
+    def make_track_name(self, track):
+        return '{}_{}_{}'.format(
+            track['library_id'],
+            track['replicate'],
+            track['output_type'],
+        )
+
+    def make_track_subgroups(self, subgroups, track):
+        track_subgroups = {}
+        for k in subgroups:
+            if k in track and track[k]:
+                value = self.sanitize_name(track[k])
+                track_subgroups[k] = value
+        return track_subgroups
+
+    def make_track_type(self, track):
+        """Further annotate tracktype.
+
+        bigWig files can have additional information. Add it if we can
+        """
+        track_type = track['file_type']
+        if track_type.lower() == 'bigwig':
+            # something we can enhance
+            info = bigWigInfo(track['relative_path'])
+            if info.min is not None and info.max is not None:
+                track_type = '{} {} {}'.format(track_type, int(info.min), int(info.max))
+
+        LOGGER.debug("track_type: %s", track_type)
+        return str(track_type)
+
+    def add_subgroups(self, composite):
+        """Add subgroups to composite track"""
+        search = [ ('htswlib:cell_line', 'cell'),
+                   ('encode3:rna_type', 'rna_type'),
+                   ('encode3:protocol', 'protocol'),
+                   ('htswlib:replicate', 'replicate'),
+                   ('encode3:library_id', 'library_id'),
+                   ('encode3:assay', 'assay'),
+                 ]
+        subgroups = []
+        names = []
+        sortorder = []
+        dimnames = ('dim{}'.format(x) for x in string.ascii_uppercase)
+        dimensions = []
+        filtercomposite = []
+        for term, name in search:
+            definitions = self.make_subgroupdefinition(term, name)
+            if definitions:
+                subgroups.append(definitions)
+                names.append(name)
+                sortorder.append("{}=+".format(name))
+                d = dimnames.next()
+                dimensions.append("{}={}".format(d, name))
+                filtercomposite.append("{}=multi".format(d))
+
+        composite.add_subgroups(subgroups)
+        composite.add_params(sortOrder=' '.join(sortorder))
+        composite.add_params(dimensions=' '.join(dimensions))
+        composite.add_params(filterComposite=' '.join(filtercomposite))
+        return names
+
+
+    def make_subgroupdefinition(self, term, name):
+        """Subgroup attributes need to be an attribute of the library.
+        """
+        template = loader.get_template('trackhub_term_values.sparql')
+        context = Context({'term': term})
+        results = self.execute_query(template, context)
+        values = {}
+        for row in results:
+            value = str(row['name'])
+            values[self.sanitize_name(value)] = value
+
+        if values:
+            return SubGroupDefinition(
+                    name=name,
+                    label=name,
+                    mapping=values,
+            )
+        else:
+            return None
+
+    def get_tracks(self):
+        """Collect information needed to describe trackhub tracks.
         """
         query_template = loader.get_template('trackhub_samples.sparql')
 
-        context = Context({
-            'submission': str(analysis_node.uri),
-            'submissionSet': str(self.submissionSetNS[''].uri),
-            })
+        context = Context({ })
 
         results = self.execute_query(query_template, context)
         return results
 
+    def sanitize_name(self, name):
+        replacements = [('poly-?a\+', 'PolyAplus'),
+                        ('poly-?a-', 'PolyAminus'),
+                        ('RNA-Seq', 'RNASeq'),
+                        ('rna-seq', 'rnaseq'),
+                        ('-', '_'),
+                        (' ', '_'),
+                        ('^0', 'Zero'),
+                        ('^1', 'One'),
+                        ('^2', 'Two'),
+                        ('^3', 'Three'),
+                        ('^4', 'Four'),
+                        ('^5', 'Five'),
+                        ('^6', 'Six'),
+                        ('^7', 'Seven'),
+                        ('^8', 'Eight'),
+                        ('^9', 'Nine'),
+                        ]
+
+        for regex, substitution in replacements:
+            name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
+
+        return name
+
     def get_manifest_metadata(self, analysis_node):
         query_template = loader.get_template('trackhub_manifest.sparql')
 
@@ -75,4 +295,6 @@ class TrackHubSubmission(Submission):
             'submissionSet': str(self.submissionSetNS[''].uri),
             })
         results = self.execute_query(query_template, context)
+        LOGGER.info("scanned %s for results found %s",
+                    str(analysis_node), len(results))
         return results