e383175a16884c3d1d24a69042bf916a44aa71c9
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
1 import logging
2 import os
3 from pprint import pformat
4 import string
5 import re
6
7 import RDF
8
9 from htsworkflow.submission.submission import Submission
10
11 from htsworkflow.util.rdfhelp import \
12      fromTypedNode, \
13      geoSoftNS, \
14      stripNamespace, \
15      submissionOntology
16 from htsworkflow.util.url import parse_ssh_url
17 from htsworkflow.util.ucsc import bigWigInfo
18
19 from django.conf import settings
20 from django.template import Context, loader
21 from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
22 from trackhub.track import TRACKTYPES, SubGroupDefinition
23 from trackhub.helpers import show_rendered_files
24 from trackhub.upload import upload_track, upload_hub
25
# Module-level logger, named after this module, used by TrackHubSubmission.
LOGGER = logging.getLogger(__name__)
27
28 class TrackHubSubmission(Submission):
29     def __init__(self, name, model, baseurl, baseupload, host):
30         """Create a trackhub based submission
31
32         :Parameters:
33           - `name`: Name of submission
34           - `model`: librdf model reference
35           - `baseurl`: web root where trackhub will be hosted
36           - `baseupload`: filesystem root where trackhub will be hosted
37           - `host`: hostname for library pages.
38         """
39         super(TrackHubSubmission, self).__init__(name, model, host)
40         if baseurl is None:
41             raise ValueError("Need a web root to make a track hub")
42         self.baseurl = os.path.join(baseurl, self.name)
43         if baseupload:
44             sshurl = parse_ssh_url(baseupload)
45             print sshurl
46             self.user = sshurl.user
47             self.host = sshurl.host
48             self.uploadpath =  sshurl.path
49         else:
50             self.uploadpath = None
51
52     def make_hub_template(self, result_map):
53         samples = []
54         for an_analysis in self.analysis_nodes(result_map):
55             metadata = self.get_sample_metadata(an_analysis)
56             if len(metadata) == 0:
57                 errmsg = 'No metadata found for {0}'
58                 LOGGER.error(errmsg.format(str(an_analysis),))
59                 continue
60             elif len(metadata) > 1:
61                 errmsg = 'Confused there are more than one sample for %s'
62                 LOGGER.debug(errmsg % (str(an_analysis),))
63             metadata = metadata[0]
64             samples.append(metadata)
65
66         template = loader.get_template('trackDb.txt')
67         context = Context({
68             'samples': samples,
69         })
70         return str(template.render(context))
71
72     def make_hub(self, result_map):
73         genome_db = 'hg19'
74         hub_url = self.baseurl + '/'
75         hub, genomes_file, genome, trackdb = default_hub(
76             hub_name=self.name,
77             short_label=self.name,
78             long_label=self.name,
79             email='email',
80             genome=genome_db)
81
82         hub.remote_dir = self.uploadpath
83
84         # build higher order track types
85         composite = CompositeTrack(
86             name=self.sanitize_name(self.name),
87             short_label = self.sanitize_name(self.name),
88             long_label = str(self.name),
89             tracktype="bed 3",
90             dragAndDrop='subtracks',
91             visibility='full',
92         )
93         trackdb.add_tracks(composite)
94
95         subgroups = self.add_subgroups(composite)
96
97         view_type = None
98         view = None
99
100         for track in self.get_tracks():
101             if track['file_type'] not in TRACKTYPES:
102                 LOGGER.info('Unrecognized file type %s', track['file_type'])
103                 continue
104
105             view = self.add_new_view_if_needed(composite, view, track)
106             track_name = self.make_track_name(track)
107
108             track_subgroup = self.make_track_subgroups(subgroups, track)
109             track_type = self.make_track_type(track)
110
111             if 'file_label' in track:
112                 track_label = self.sanitize_name(track['file_label'])
113             else:
114                 track_label = track_name
115
116             attributes = {
117                 'name': track_name,
118                 'tracktype': track_type,
119                 'url': hub_url + str(track['relative_path']),
120                 'short_label': str(track['library_id']),
121                 'long_label': str(track_label),
122                 'subgroups': track_subgroup,
123             }
124             
125             LOGGER.debug('track attributes: %s', pformat(attributes))       
126             newtrack = Track(**attributes)                    
127             view.add_tracks([newtrack])
128
129         results = hub.render()
130         if hub.remote_dir:
131             LOGGER.info("Uploading to %s @ %s : %s",
132                         self.user, self.host, hub.remote_dir)
133             upload_hub(hub=hub, host=self.host, user='diane')
134
135     def add_new_view_if_needed(self, composite, view, track):
136         """Add new trakkhub view if we've hit a new type of track.
137
138         :Parameters:
139           - `composite`: composite track to attach to
140           - `view_type`: name of view type
141           - `track`: current track record
142         """
143         current_view_type = str(track['output_type'])
144         if not view or current_view_type != view.name:
145             attributes = {
146                 'name': current_view_type,
147                 'view': current_view_type,
148                 'visibility': str(track.get('visibility', 'squish')),
149                 'short_label': current_view_type,
150                 'tracktype': str(track['file_type'])
151             }
152             maxHeightPixels = track.get('maxHeightPixels')
153             if maxHeightPixels:
154                 attributes['maxHeightPixels'] = str(maxHeightPixels)
155             autoScale = track.get('autoScale')
156             if autoScale:
157                 attributes['autoScale'] = str(autoScale)
158             view = ViewTrack(**attributes)
159             composite.add_view(view)
160             view_type = current_view_type
161         return view
162
163     def make_manifest(self, result_map):
164         files = []
165         for an_analysis in self.analysis_nodes(result_map):
166             metadata = self.get_manifest_metadata(an_analysis)
167             files.extend(metadata)
168
169         template = loader.get_template('manifest.txt')
170         context = Context({
171             'files': files
172         })
173         return str(template.render(context))
174
175     def make_track_name(self, track):
176         return '{}_{}_{}'.format(
177             track['library_id'],
178             track['replicate'],
179             track['output_type'],
180         )
181
182     def make_track_subgroups(self, subgroups, track):
183         track_subgroups = {}
184         for k in subgroups:
185             if k in track and track[k]:
186                 value = self.sanitize_name(track[k])
187                 track_subgroups[k] = value
188         return track_subgroups
189     
190     def make_track_type(self, track):
191         """Further annotate tracktype.
192         
193         bigWig files can have additional information. Add it if we can
194         """
195         track_type = track['file_type']
196         if track_type.lower() == 'bigwig':
197             # something we can enhance
198             info = bigWigInfo(track['relative_path'])
199             if info.min is not None and info.max is not None:
200                 track_type = '{} {} {}'.format(track_type, int(info.min), int(info.max))
201
202         LOGGER.debug("track_type: %s", track_type)
203         return str(track_type)
204
205     def add_subgroups(self, composite):
206         """Add subgroups to composite track"""
207         search = [ ('htswlib:cell_line', 'cell'),
208                    ('encode3:rna_type', 'rna_type'),
209                    ('encode3:protocol', 'protocol'),
210                    ('htswlib:replicate', 'replicate'),
211                    ('encode3:library_id', 'library_id'),
212                    ('encode3:assay', 'assay'),
213                  ]
214         subgroups = []
215         names = []
216         sortorder = []
217         dimnames = ('dim{}'.format(x) for x in string.ascii_uppercase)
218         dimensions = []
219         filtercomposite = []
220         for term, name in search:
221             definitions = self.make_subgroupdefinition(term, name)
222             if definitions:
223                 subgroups.append(definitions)
224                 names.append(name)
225                 sortorder.append("{}=+".format(name))
226                 d = dimnames.next()
227                 dimensions.append("{}={}".format(d, name))
228                 filtercomposite.append("{}=multi".format(d))
229
230         composite.add_subgroups(subgroups)
231         composite.add_params(sortOrder=' '.join(sortorder))
232         composite.add_params(dimensions=' '.join(dimensions))
233         composite.add_params(filterComposite=' '.join(filtercomposite))
234         return names
235
236
237     def make_subgroupdefinition(self, term, name):
238         """Subgroup attributes need to be an attribute of the library.
239         """
240         template = loader.get_template('trackhub_term_values.sparql')
241         context = Context({'term': term})
242         results = self.execute_query(template, context)
243         values = {}
244         for row in results:
245             value = str(row['name'])
246             values[self.sanitize_name(value)] = value
247
248         if values:
249             return SubGroupDefinition(
250                     name=name,
251                     label=name,
252                     mapping=values,
253             )
254         else:
255             return None
256
257     def get_tracks(self):
258         """Collect information needed to describe trackhub tracks.
259         """
260         query_template = loader.get_template('trackhub_samples.sparql')
261
262         context = Context({ })
263
264         results = self.execute_query(query_template, context)
265         return results
266
267     def sanitize_name(self, name):
268         replacements = [('poly-?a\+', 'PolyAplus'),
269                         ('poly-?a-', 'PolyAminus'),
270                         ('RNA-Seq', 'RNASeq'),
271                         ('rna-seq', 'rnaseq'),
272                         ('-', '_'),
273                         (' ', '_'),
274                         ('^0', 'Zero'),
275                         ('^1', 'One'),
276                         ('^2', 'Two'),
277                         ('^3', 'Three'),
278                         ('^4', 'Four'),
279                         ('^5', 'Five'),
280                         ('^6', 'Six'),
281                         ('^7', 'Seven'),
282                         ('^8', 'Eight'),
283                         ('^9', 'Nine'),
284                         ]
285
286         for regex, substitution in replacements:
287             name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
288
289         return name
290
291     def get_manifest_metadata(self, analysis_node):
292         query_template = loader.get_template('trackhub_manifest.sparql')
293
294         context = Context({
295             'submission': str(analysis_node.uri),
296             'submissionSet': str(self.submissionSetNS[''].uri),
297             })
298         results = self.execute_query(query_template, context)
299         LOGGER.info("scanned %s for results found %s",
300                     str(analysis_node), len(results))
301         return results