7bca809a926798d87cae8f6b23486b1de3c7b23a
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
1 import logging
2 import os
3 import re
4
5 import RDF
6
7 from htsworkflow.submission.submission import Submission
8
9 from htsworkflow.util.rdfhelp import \
10      fromTypedNode, \
11      geoSoftNS, \
12      stripNamespace, \
13      submissionOntology
14 from htsworkflow.util.url import parse_ssh_url
15
16 from django.conf import settings
17 from django.template import Context, loader
18 from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
19 from trackhub.track import TRACKTYPES, SubGroupDefinition
20 from trackhub.helpers import show_rendered_files
21 from trackhub.upload import upload_track, upload_hub
22
23 LOGGER = logging.getLogger(__name__)
24
25 class TrackHubSubmission(Submission):
26     def __init__(self, name, model, baseurl, baseupload, host):
27         """Create a trackhub based submission
28
29         :Parameters:
30           - `name`: Name of submission
31           - `model`: librdf model reference
32           - `baseurl`: web root where trackhub will be hosted
33           - `baseupload`: filesystem root where trackhub will be hosted
34           - `host`: hostname for library pages.
35         """
36         super(TrackHubSubmission, self).__init__(name, model, host)
37         if baseurl is None:
38             raise ValueError("Need a web root to make a track hub")
39         self.baseurl = os.path.join(baseurl, self.name)
40         if baseupload:
41             sshurl = parse_ssh_url(baseupload)
42             print sshurl
43             self.user = sshurl.user
44             self.host = sshurl.host
45             self.uploadpath =  sshurl.path
46         else:
47             self.uploadpath = None
48
49     def make_hub_template(self, result_map):
50         samples = []
51         for an_analysis in self.analysis_nodes(result_map):
52             metadata = self.get_sample_metadata(an_analysis)
53             if len(metadata) == 0:
54                 errmsg = 'No metadata found for {0}'
55                 LOGGER.error(errmsg.format(str(an_analysis),))
56                 continue
57             elif len(metadata) > 1:
58                 errmsg = 'Confused there are more than one sample for %s'
59                 LOGGER.debug(errmsg % (str(an_analysis),))
60             metadata = metadata[0]
61             samples.append(metadata)
62
63         template = loader.get_template('trackDb.txt')
64         context = Context({
65             'samples': samples,
66         })
67         return str(template.render(context))
68
69     def make_hub(self, result_map):
70         genome_db = 'hg19'
71         hub_url = self.baseurl + '/'
72         hub, genomes_file, genome, trackdb = default_hub(
73             hub_name=self.name,
74             short_label=self.name,
75             long_label=self.name,
76             email='email',
77             genome=genome_db)
78
79         hub.remote_dir = self.uploadpath
80
81         # build higher order track types
82         composite = CompositeTrack(
83             name=self.sanitize_name(self.name),
84             short_label = self.sanitize_name(self.name),
85             long_label = str(self.name),
86             tracktype="bigWig",
87             dragAndDrop='subtracks',
88             visibility='full',
89         )
90         trackdb.add_tracks(composite)
91
92         subgroups = self.add_subgroups(composite)
93
94         view_type = None
95         view = None
96
97         for track in self.get_tracks():
98             if track['file_type'] not in TRACKTYPES:
99                 LOGGER.info('Unrecognized file type %s', track['file_type'])
100                 continue
101
102             view = self.add_new_view_if_needed(composite, view, track)
103             track_name = self.make_track_name(track)
104
105             track_subgroup = self.make_track_subgroups(subgroups, track)
106
107             newtrack = Track(
108                 name=track_name,
109                 tracktype = str(track['file_type']),
110                 url= hub_url + str(track['relative_path']),
111                 short_label=str(track['library_id']),
112                 long_label=track_name,
113                 subgroups=track_subgroup,
114                 )
115             view.add_tracks([newtrack])
116
117         results = hub.render()
118         if hub.remote_dir:
119             LOGGER.info("Uploading to %s @ %s : %s",
120                         self.user, self.host, hub.remote_dir)
121             upload_hub(hub=hub, host=self.host, user='diane')
122
123     def add_new_view_if_needed(self, composite, view, track):
124         """Add new trakkhub view if we've hit a new type of track.
125
126         :Parameters:
127           - `composite`: composite track to attach to
128           - `view_type`: name of view type
129           - `track`: current track record
130         """
131         current_view_type = str(track['output_type'])
132         if not view or current_view_type != view.name:
133             view = ViewTrack(
134                 name=current_view_type,
135                 view=current_view_type,
136                 visibility='squish',
137                 short_label=current_view_type,
138                 tracktype=str(track['file_type']),
139             )
140             composite.add_view(view)
141             view_type = current_view_type
142         return view
143
144     def make_manifest(self, result_map):
145         files = []
146         for an_analysis in self.analysis_nodes(result_map):
147             metadata = self.get_manifest_metadata(an_analysis)
148             files.extend(metadata)
149
150         template = loader.get_template('manifest.txt')
151         context = Context({
152             'files': files
153         })
154         return str(template.render(context))
155
156     def make_track_name(self, track):
157         name = '{}_{}_{}'.format(
158             track['library_id'],
159             track['replicate'],
160             track['output_type'],
161         )
162         return name
163
164     def make_track_subgroups(self, subgroups, track):
165         track_subgroups = {}
166         for k in subgroups:
167             if k in track and track[k]:
168                 value = self.sanitize_name(track[k])
169                 track_subgroups[k] = value
170         return track_subgroups
171
172     def add_subgroups(self, composite):
173         """Add subgroups to composite track"""
174         search = [ ('htswlib:cell_line', 'cell'),
175                    ('htswlib:replicate', 'replicate'),
176                    ('encode3:library_id', 'library_id'),
177                    ('encode3:assay', 'assay'),
178                    ('encode3:rna_type', 'rna_type'),
179                    ('encode3:protocol', 'protocol'),
180                  ]
181         subgroups = []
182         names = []
183         for term, name in search:
184             subgroups.append(self.make_subgroupdefinition(term, name))
185             names.append(name)
186         composite.add_subgroups(subgroups)
187         return names
188
189
190     def make_subgroupdefinition(self, term, name):
191         """Subgroup attributes need to be an attribute of the library.
192         """
193         template = loader.get_template('trackhub_term_values.sparql')
194         context = Context({'term': term})
195         results = self.execute_query(template, context)
196         values = {}
197         for row in results:
198             value = str(row['name'])
199             values[self.sanitize_name(value)] = value
200
201         return SubGroupDefinition(
202                 name=name,
203                 label=name,
204                 mapping=values,
205         )
206
207     def get_tracks(self):
208         """Collect information needed to describe trackhub tracks.
209         """
210         query_template = loader.get_template('trackhub_samples.sparql')
211
212         context = Context({ })
213
214         results = self.execute_query(query_template, context)
215         return results
216
217     def sanitize_name(self, name):
218         replacements = [('poly-?a\+', 'PolyAplus'),
219                         ('poly-?a-', 'PolyAminus'),
220                         ('RNA-Seq', 'RNASeq'),
221                         ('rna-seq', 'rnaseq'),
222                         ('-', '_'),
223                         (' ', '_'),
224                         ('^0', 'Zero'),
225                         ('^1', 'One'),
226                         ('^2', 'Two'),
227                         ('^3', 'Three'),
228                         ('^4', 'Four'),
229                         ('^5', 'Five'),
230                         ('^6', 'Six'),
231                         ('^7', 'Seven'),
232                         ('^8', 'Eight'),
233                         ('^9', 'Nine'),
234                         ]
235
236         for regex, substitution in replacements:
237             name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
238
239         return name
240
241     def get_manifest_metadata(self, analysis_node):
242
243         query_template = loader.get_template('trackhub_manifest.sparql')
244
245         context = Context({
246             'submission': str(analysis_node.uri),
247             'submissionSet': str(self.submissionSetNS[''].uri),
248             })
249         results = self.execute_query(query_template, context)
250         LOGGER.info("scanned %s for results found %s",
251                     str(analysis_node), len(results))
252         return results