Rename stripNamespace to strip_namespace
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
1 import logging
2 import os
3 from pprint import pformat
4 import string
5 import re
6
7 import RDF
8
9 from htsworkflow.submission.submission import Submission
10
11 from htsworkflow.util.rdfhelp import \
12      fromTypedNode, \
13      geoSoftNS, \
14      submissionOntology
15 from htsworkflow.util.url import parse_ssh_url
16 from htsworkflow.util.ucsc import bigWigInfo
17
18 from django.conf import settings
19 from django.template import Context, loader
20 from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
21 from trackhub.track import TRACKTYPES, SubGroupDefinition
22 from trackhub.helpers import show_rendered_files
23 from trackhub.upload import upload_track, upload_hub
24
25 LOGGER = logging.getLogger(__name__)
26
27 class TrackHubSubmission(Submission):
28     def __init__(self, name, model, baseurl, baseupload, host):
29         """Create a trackhub based submission
30
31         :Parameters:
32           - `name`: Name of submission
33           - `model`: librdf model reference
34           - `baseurl`: web root where trackhub will be hosted
35           - `baseupload`: filesystem root where trackhub will be hosted
36           - `host`: hostname for library pages.
37         """
38         super(TrackHubSubmission, self).__init__(name, model, host)
39         if baseurl is None:
40             raise ValueError("Need a web root to make a track hub")
41         self.baseurl = os.path.join(baseurl, self.name)
42         if baseupload:
43             sshurl = parse_ssh_url(baseupload)
44             print sshurl
45             self.user = sshurl.user
46             self.host = sshurl.host
47             self.uploadpath =  sshurl.path
48         else:
49             self.uploadpath = None
50
51     def make_hub_template(self, result_map):
52         samples = []
53         for an_analysis in self.analysis_nodes(result_map):
54             metadata = self.get_sample_metadata(an_analysis)
55             if len(metadata) == 0:
56                 errmsg = 'No metadata found for {0}'
57                 LOGGER.error(errmsg.format(str(an_analysis),))
58                 continue
59             elif len(metadata) > 1:
60                 errmsg = 'Confused there are more than one sample for %s'
61                 LOGGER.debug(errmsg % (str(an_analysis),))
62             metadata = metadata[0]
63             samples.append(metadata)
64
65         template = loader.get_template('trackDb.txt')
66         context = Context({
67             'samples': samples,
68         })
69         return str(template.render(context))
70
71     def make_hub(self, result_map):
72         genome_db = 'hg19'
73         hub_url = self.baseurl + '/'
74         hub, genomes_file, genome, trackdb = default_hub(
75             hub_name=self.name,
76             short_label=self.name,
77             long_label=self.name,
78             email='email',
79             genome=genome_db)
80
81         hub.remote_dir = self.uploadpath
82
83         # build higher order track types
84         composite = CompositeTrack(
85             name=self.sanitize_name(self.name),
86             short_label = self.sanitize_name(self.name),
87             long_label = str(self.name),
88             tracktype="bed 3",
89             dragAndDrop='subtracks',
90             visibility='full',
91         )
92         trackdb.add_tracks(composite)
93
94         subgroups = self.add_subgroups(composite)
95
96         view_type = None
97         view = None
98
99         for track in self.get_tracks():
100             if track['file_type'] not in TRACKTYPES:
101                 LOGGER.info('Unrecognized file type %s', track['file_type'])
102                 continue
103
104             view = self.add_new_view_if_needed(composite, view, track)
105             track_name = self.make_track_name(track)
106
107             track_subgroup = self.make_track_subgroups(subgroups, track)
108             track_type = self.make_track_type(track)
109
110             if 'file_label' in track:
111                 track_label = self.sanitize_name(track['file_label'])
112             else:
113                 track_label = track_name
114
115             attributes = {
116                 'name': track_name,
117                 'tracktype': track_type,
118                 'url': hub_url + str(track['relative_path']),
119                 'short_label': str(track['library_id']),
120                 'long_label': str(track_label),
121                 'subgroups': track_subgroup,
122             }
123             
124             LOGGER.debug('track attributes: %s', pformat(attributes))       
125             newtrack = Track(**attributes)                    
126             view.add_tracks([newtrack])
127
128         results = hub.render()
129         if hub.remote_dir:
130             LOGGER.info("Uploading to %s @ %s : %s",
131                         self.user, self.host, hub.remote_dir)
132             upload_hub(hub=hub, host=self.host, user='diane')
133
134     def add_new_view_if_needed(self, composite, view, track):
135         """Add new trakkhub view if we've hit a new type of track.
136
137         :Parameters:
138           - `composite`: composite track to attach to
139           - `view_type`: name of view type
140           - `track`: current track record
141         """
142         current_view_type = str(track['output_type'])
143         if not view or current_view_type != view.name:
144             attributes = {
145                 'name': current_view_type,
146                 'view': current_view_type,
147                 'visibility': str(track.get('visibility', 'squish')),
148                 'short_label': current_view_type,
149                 'tracktype': str(track['file_type'])
150             }
151             maxHeightPixels = track.get('maxHeightPixels')
152             if maxHeightPixels:
153                 attributes['maxHeightPixels'] = str(maxHeightPixels)
154             autoScale = track.get('autoScale')
155             if autoScale:
156                 attributes['autoScale'] = str(autoScale)
157             view = ViewTrack(**attributes)
158             composite.add_view(view)
159             view_type = current_view_type
160         return view
161
162     def make_manifest(self, result_map):
163         files = []
164         for an_analysis in self.analysis_nodes(result_map):
165             metadata = self.get_manifest_metadata(an_analysis)
166             files.extend(metadata)
167
168         template = loader.get_template('manifest.txt')
169         context = Context({
170             'files': files
171         })
172         return str(template.render(context))
173
174     def make_track_name(self, track):
175         return '{}_{}_{}'.format(
176             track['library_id'],
177             track['replicate'],
178             track['output_type'],
179         )
180
181     def make_track_subgroups(self, subgroups, track):
182         track_subgroups = {}
183         for k in subgroups:
184             if k in track and track[k]:
185                 value = self.sanitize_name(track[k])
186                 track_subgroups[k] = value
187         return track_subgroups
188     
189     def make_track_type(self, track):
190         """Further annotate tracktype.
191         
192         bigWig files can have additional information. Add it if we can
193         """
194         track_type = track['file_type']
195         if track_type.lower() == 'bigwig':
196             # something we can enhance
197             info = bigWigInfo(track['relative_path'])
198             if info.min is not None and info.max is not None:
199                 track_type = '{} {} {}'.format(track_type, int(info.min), int(info.max))
200
201         LOGGER.debug("track_type: %s", track_type)
202         return str(track_type)
203
204     def add_subgroups(self, composite):
205         """Add subgroups to composite track"""
206         search = [ ('htswlib:cell_line', 'cell'),
207                    ('encode3:rna_type', 'rna_type'),
208                    ('encode3:protocol', 'protocol'),
209                    ('htswlib:replicate', 'replicate'),
210                    ('encode3:library_id', 'library_id'),
211                    ('encode3:assay', 'assay'),
212                  ]
213         subgroups = []
214         names = []
215         sortorder = []
216         dimnames = ('dim{}'.format(x) for x in string.ascii_uppercase)
217         dimensions = []
218         filtercomposite = []
219         for term, name in search:
220             definitions = self.make_subgroupdefinition(term, name)
221             if definitions:
222                 subgroups.append(definitions)
223                 names.append(name)
224                 sortorder.append("{}=+".format(name))
225                 d = dimnames.next()
226                 dimensions.append("{}={}".format(d, name))
227                 filtercomposite.append("{}=multi".format(d))
228
229         composite.add_subgroups(subgroups)
230         composite.add_params(sortOrder=' '.join(sortorder))
231         composite.add_params(dimensions=' '.join(dimensions))
232         composite.add_params(filterComposite=' '.join(filtercomposite))
233         return names
234
235
236     def make_subgroupdefinition(self, term, name):
237         """Subgroup attributes need to be an attribute of the library.
238         """
239         template = loader.get_template('trackhub_term_values.sparql')
240         context = Context({'term': term})
241         results = self.execute_query(template, context)
242         values = {}
243         for row in results:
244             value = str(row['name'])
245             values[self.sanitize_name(value)] = value
246
247         if values:
248             return SubGroupDefinition(
249                     name=name,
250                     label=name,
251                     mapping=values,
252             )
253         else:
254             return None
255
256     def get_tracks(self):
257         """Collect information needed to describe trackhub tracks.
258         """
259         query_template = loader.get_template('trackhub_samples.sparql')
260
261         context = Context({ })
262
263         results = self.execute_query(query_template, context)
264         return results
265
266     def sanitize_name(self, name):
267         replacements = [('poly-?a\+', 'PolyAplus'),
268                         ('poly-?a-', 'PolyAminus'),
269                         ('RNA-Seq', 'RNASeq'),
270                         ('rna-seq', 'rnaseq'),
271                         ('-', '_'),
272                         (' ', '_'),
273                         ('^0', 'Zero'),
274                         ('^1', 'One'),
275                         ('^2', 'Two'),
276                         ('^3', 'Three'),
277                         ('^4', 'Four'),
278                         ('^5', 'Five'),
279                         ('^6', 'Six'),
280                         ('^7', 'Seven'),
281                         ('^8', 'Eight'),
282                         ('^9', 'Nine'),
283                         ]
284
285         for regex, substitution in replacements:
286             name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
287
288         return name
289
290     def get_manifest_metadata(self, analysis_node):
291         query_template = loader.get_template('trackhub_manifest.sparql')
292
293         context = Context({
294             'submission': str(analysis_node.uri),
295             'submissionSet': str(self.submissionSetNS[''].uri),
296             })
297         results = self.execute_query(query_template, context)
298         LOGGER.info("scanned %s for results found %s",
299                     str(analysis_node), len(results))
300         return results