7a83e8770ee39515ffe90fa223bce3ea19767e47
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
1 import logging
2 import os
3 import string
4 import re
5
6 import RDF
7
8 from htsworkflow.submission.submission import Submission
9
10 from htsworkflow.util.rdfhelp import \
11      fromTypedNode, \
12      geoSoftNS, \
13      stripNamespace, \
14      submissionOntology
15 from htsworkflow.util.url import parse_ssh_url
16
17 from django.conf import settings
18 from django.template import Context, loader
19 from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
20 from trackhub.track import TRACKTYPES, SubGroupDefinition
21 from trackhub.helpers import show_rendered_files
22 from trackhub.upload import upload_track, upload_hub
23
24 LOGGER = logging.getLogger(__name__)
25
26 class TrackHubSubmission(Submission):
27     def __init__(self, name, model, baseurl, baseupload, host):
28         """Create a trackhub based submission
29
30         :Parameters:
31           - `name`: Name of submission
32           - `model`: librdf model reference
33           - `baseurl`: web root where trackhub will be hosted
34           - `baseupload`: filesystem root where trackhub will be hosted
35           - `host`: hostname for library pages.
36         """
37         super(TrackHubSubmission, self).__init__(name, model, host)
38         if baseurl is None:
39             raise ValueError("Need a web root to make a track hub")
40         self.baseurl = os.path.join(baseurl, self.name)
41         if baseupload:
42             sshurl = parse_ssh_url(baseupload)
43             print sshurl
44             self.user = sshurl.user
45             self.host = sshurl.host
46             self.uploadpath =  sshurl.path
47         else:
48             self.uploadpath = None
49
50     def make_hub_template(self, result_map):
51         samples = []
52         for an_analysis in self.analysis_nodes(result_map):
53             metadata = self.get_sample_metadata(an_analysis)
54             if len(metadata) == 0:
55                 errmsg = 'No metadata found for {0}'
56                 LOGGER.error(errmsg.format(str(an_analysis),))
57                 continue
58             elif len(metadata) > 1:
59                 errmsg = 'Confused there are more than one sample for %s'
60                 LOGGER.debug(errmsg % (str(an_analysis),))
61             metadata = metadata[0]
62             samples.append(metadata)
63
64         template = loader.get_template('trackDb.txt')
65         context = Context({
66             'samples': samples,
67         })
68         return str(template.render(context))
69
70     def make_hub(self, result_map):
71         genome_db = 'hg19'
72         hub_url = self.baseurl + '/'
73         hub, genomes_file, genome, trackdb = default_hub(
74             hub_name=self.name,
75             short_label=self.name,
76             long_label=self.name,
77             email='email',
78             genome=genome_db)
79
80         hub.remote_dir = self.uploadpath
81
82         # build higher order track types
83         composite = CompositeTrack(
84             name=self.sanitize_name(self.name),
85             short_label = self.sanitize_name(self.name),
86             long_label = str(self.name),
87             tracktype="bed 3",
88             dragAndDrop='subtracks',
89             visibility='full',
90         )
91         trackdb.add_tracks(composite)
92
93         subgroups = self.add_subgroups(composite)
94
95         view_type = None
96         view = None
97
98         for track in self.get_tracks():
99             if track['file_type'] not in TRACKTYPES:
100                 LOGGER.info('Unrecognized file type %s', track['file_type'])
101                 continue
102
103             view = self.add_new_view_if_needed(composite, view, track)
104             track_name = self.make_track_name(track)
105
106             track_subgroup = self.make_track_subgroups(subgroups, track)
107
108             if 'file_label' in track:
109                 track_label = self.sanitize_name(track['file_label'])
110             else:
111                 track_label = track_name
112
113             newtrack = Track(
114                 name=track_name,
115                 tracktype = str(track['file_type']),
116                 url= hub_url + str(track['relative_path']),
117                 short_label=str(track['library_id']),
118                 long_label=str(track_label),
119                 subgroups=track_subgroup,
120                 )
121             view.add_tracks([newtrack])
122
123         results = hub.render()
124         if hub.remote_dir:
125             LOGGER.info("Uploading to %s @ %s : %s",
126                         self.user, self.host, hub.remote_dir)
127             upload_hub(hub=hub, host=self.host, user='diane')
128
129     def add_new_view_if_needed(self, composite, view, track):
130         """Add new trakkhub view if we've hit a new type of track.
131
132         :Parameters:
133           - `composite`: composite track to attach to
134           - `view_type`: name of view type
135           - `track`: current track record
136         """
137         current_view_type = str(track['output_type'])
138         if not view or current_view_type != view.name:
139             view = ViewTrack(
140                 name=current_view_type,
141                 view=current_view_type,
142                 visibility='squish',
143                 short_label=current_view_type,
144                 tracktype=str(track['file_type']),
145             )
146             composite.add_view(view)
147             view_type = current_view_type
148         return view
149
150     def make_manifest(self, result_map):
151         files = []
152         for an_analysis in self.analysis_nodes(result_map):
153             metadata = self.get_manifest_metadata(an_analysis)
154             files.extend(metadata)
155
156         template = loader.get_template('manifest.txt')
157         context = Context({
158             'files': files
159         })
160         return str(template.render(context))
161
162     def make_track_name(self, track):
163         return '{}_{}_{}'.format(
164             track['library_id'],
165             track['replicate'],
166             track['output_type'],
167         )
168
169     def make_track_subgroups(self, subgroups, track):
170         track_subgroups = {}
171         for k in subgroups:
172             if k in track and track[k]:
173                 value = self.sanitize_name(track[k])
174                 track_subgroups[k] = value
175         return track_subgroups
176
177     def add_subgroups(self, composite):
178         """Add subgroups to composite track"""
179         search = [ ('htswlib:cell_line', 'cell'),
180                    ('encode3:rna_type', 'rna_type'),
181                    ('encode3:protocol', 'protocol'),
182                    ('htswlib:replicate', 'replicate'),
183                    ('encode3:library_id', 'library_id'),
184                    ('encode3:assay', 'assay'),
185                  ]
186         subgroups = []
187         names = []
188         sortorder = []
189         dimnames = ('dim{}'.format(x) for x in string.ascii_uppercase)
190         dimensions = []
191         filtercomposite = []
192         for term, name in search:
193             definitions = self.make_subgroupdefinition(term, name)
194             if definitions:
195                 subgroups.append(definitions)
196                 names.append(name)
197                 sortorder.append("{}=+".format(name))
198                 d = dimnames.next()
199                 dimensions.append("{}={}".format(d, name))
200                 filtercomposite.append("{}=multi".format(d))
201
202         composite.add_subgroups(subgroups)
203         composite.add_params(sortOrder=' '.join(sortorder))
204         composite.add_params(dimensions=' '.join(dimensions))
205         composite.add_params(filterComposite=' '.join(filtercomposite))
206         return names
207
208
209     def make_subgroupdefinition(self, term, name):
210         """Subgroup attributes need to be an attribute of the library.
211         """
212         template = loader.get_template('trackhub_term_values.sparql')
213         context = Context({'term': term})
214         results = self.execute_query(template, context)
215         values = {}
216         for row in results:
217             value = str(row['name'])
218             values[self.sanitize_name(value)] = value
219
220         if values:
221             return SubGroupDefinition(
222                     name=name,
223                     label=name,
224                     mapping=values,
225             )
226         else:
227             return None
228
229     def get_tracks(self):
230         """Collect information needed to describe trackhub tracks.
231         """
232         query_template = loader.get_template('trackhub_samples.sparql')
233
234         context = Context({ })
235
236         results = self.execute_query(query_template, context)
237         return results
238
239     def sanitize_name(self, name):
240         replacements = [('poly-?a\+', 'PolyAplus'),
241                         ('poly-?a-', 'PolyAminus'),
242                         ('RNA-Seq', 'RNASeq'),
243                         ('rna-seq', 'rnaseq'),
244                         ('-', '_'),
245                         (' ', '_'),
246                         ('^0', 'Zero'),
247                         ('^1', 'One'),
248                         ('^2', 'Two'),
249                         ('^3', 'Three'),
250                         ('^4', 'Four'),
251                         ('^5', 'Five'),
252                         ('^6', 'Six'),
253                         ('^7', 'Seven'),
254                         ('^8', 'Eight'),
255                         ('^9', 'Nine'),
256                         ]
257
258         for regex, substitution in replacements:
259             name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
260
261         return name
262
263     def get_manifest_metadata(self, analysis_node):
264         query_template = loader.get_template('trackhub_manifest.sparql')
265
266         context = Context({
267             'submission': str(analysis_node.uri),
268             'submissionSet': str(self.submissionSetNS[''].uri),
269             })
270         results = self.execute_query(query_template, context)
271         LOGGER.info("scanned %s for results found %s",
272                     str(analysis_node), len(results))
273         return results