Initial port to python3
[htsworkflow.git] / htsworkflow / submission / trackhub_submission.py
1 import logging
2 import os
3 import string
4 import re
5
6 import RDF
7
8 from htsworkflow.submission.submission import Submission
9
10 from htsworkflow.util.rdfhelp import \
11      fromTypedNode, \
12      geoSoftNS, \
13      stripNamespace, \
14      submissionOntology
15 from htsworkflow.util.url import parse_ssh_url
16
17 from django.conf import settings
18 from django.template import Context, loader
19 from trackhub import default_hub, CompositeTrack, Track, SuperTrack, ViewTrack
20 from trackhub.track import TRACKTYPES, SubGroupDefinition
21 from trackhub.helpers import show_rendered_files
22 from trackhub.upload import upload_track, upload_hub
23
# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)
25
26 class TrackHubSubmission(Submission):
27     def __init__(self, name, model, baseurl, baseupload, host):
28         """Create a trackhub based submission
29
30         :Parameters:
31           - `name`: Name of submission
32           - `model`: librdf model reference
33           - `baseurl`: web root where trackhub will be hosted
34           - `baseupload`: filesystem root where trackhub will be hosted
35           - `host`: hostname for library pages.
36         """
37         super(TrackHubSubmission, self).__init__(name, model, host)
38         if baseurl is None:
39             raise ValueError("Need a web root to make a track hub")
40         self.baseurl = os.path.join(baseurl, self.name)
41         if baseupload:
42             sshurl = parse_ssh_url(baseupload)
43             print(sshurl)
44             self.user = sshurl.user
45             self.host = sshurl.host
46             self.uploadpath =  sshurl.path
47         else:
48             self.uploadpath = None
49
50     def make_hub_template(self, result_map):
51         samples = []
52         for an_analysis in self.analysis_nodes(result_map):
53             metadata = self.get_sample_metadata(an_analysis)
54             if len(metadata) == 0:
55                 errmsg = 'No metadata found for {0}'
56                 LOGGER.error(errmsg.format(str(an_analysis),))
57                 continue
58             elif len(metadata) > 1:
59                 errmsg = 'Confused there are more than one sample for %s'
60                 LOGGER.debug(errmsg % (str(an_analysis),))
61             metadata = metadata[0]
62             samples.append(metadata)
63
64         template = loader.get_template('trackDb.txt')
65         context = Context({
66             'samples': samples,
67         })
68         return str(template.render(context))
69
70     def make_hub(self, result_map):
71         genome_db = 'hg19'
72         hub_url = self.baseurl + '/'
73         hub, genomes_file, genome, trackdb = default_hub(
74             hub_name=self.name,
75             short_label=self.name,
76             long_label=self.name,
77             email='email',
78             genome=genome_db)
79
80         hub.remote_dir = self.uploadpath
81
82         # build higher order track types
83         composite = CompositeTrack(
84             name=self.sanitize_name(self.name),
85             short_label = self.sanitize_name(self.name),
86             long_label = str(self.name),
87             tracktype="bed 3",
88             dragAndDrop='subtracks',
89             visibility='full',
90         )
91         trackdb.add_tracks(composite)
92
93         subgroups = self.add_subgroups(composite)
94
95         view_type = None
96         view = None
97
98         for track in self.get_tracks():
99             if track['file_type'] not in TRACKTYPES:
100                 LOGGER.info('Unrecognized file type %s', track['file_type'])
101                 continue
102
103             view = self.add_new_view_if_needed(composite, view, track)
104             track_name = self.make_track_name(track)
105
106             track_subgroup = self.make_track_subgroups(subgroups, track)
107
108             newtrack = Track(
109                 name=track_name,
110                 tracktype = str(track['file_type']),
111                 url= hub_url + str(track['relative_path']),
112                 short_label=str(track['library_id']),
113                 long_label=track_name,
114                 subgroups=track_subgroup,
115                 )
116             view.add_tracks([newtrack])
117
118         results = hub.render()
119         if hub.remote_dir:
120             LOGGER.info("Uploading to %s @ %s : %s",
121                         self.user, self.host, hub.remote_dir)
122             upload_hub(hub=hub, host=self.host, user='diane')
123
124     def add_new_view_if_needed(self, composite, view, track):
125         """Add new trakkhub view if we've hit a new type of track.
126
127         :Parameters:
128           - `composite`: composite track to attach to
129           - `view_type`: name of view type
130           - `track`: current track record
131         """
132         current_view_type = str(track['output_type'])
133         if not view or current_view_type != view.name:
134             view = ViewTrack(
135                 name=current_view_type,
136                 view=current_view_type,
137                 visibility='squish',
138                 short_label=current_view_type,
139                 tracktype=str(track['file_type']),
140             )
141             composite.add_view(view)
142             view_type = current_view_type
143         return view
144
145     def make_manifest(self, result_map):
146         files = []
147         for an_analysis in self.analysis_nodes(result_map):
148             metadata = self.get_manifest_metadata(an_analysis)
149             files.extend(metadata)
150
151         template = loader.get_template('manifest.txt')
152         context = Context({
153             'files': files
154         })
155         return str(template.render(context))
156
157     def make_track_name(self, track):
158         name = '{}_{}_{}'.format(
159             track['library_id'],
160             track['replicate'],
161             track['output_type'],
162         )
163         return name
164
165     def make_track_subgroups(self, subgroups, track):
166         track_subgroups = {}
167         for k in subgroups:
168             if k in track and track[k]:
169                 value = self.sanitize_name(track[k])
170                 track_subgroups[k] = value
171         return track_subgroups
172
173     def add_subgroups(self, composite):
174         """Add subgroups to composite track"""
175         search = [ ('htswlib:cell_line', 'cell'),
176                    ('encode3:rna_type', 'rna_type'),
177                    ('encode3:protocol', 'protocol'),
178                    ('htswlib:replicate', 'replicate'),
179                    ('encode3:library_id', 'library_id'),
180                    ('encode3:assay', 'assay'),
181                  ]
182         subgroups = []
183         names = []
184         sortorder = []
185         dimnames = ('dim{}'.format(x) for x in string.ascii_uppercase)
186         dimensions = []
187         filtercomposite = []
188         for term, name in search:
189             definitions = self.make_subgroupdefinition(term, name)
190             if definitions:
191                 subgroups.append(definitions)
192                 names.append(name)
193                 sortorder.append("{}=+".format(name))
194                 d = next(dimnames)
195                 dimensions.append("{}={}".format(d, name))
196                 filtercomposite.append("{}=multi".format(d))
197
198         composite.add_subgroups(subgroups)
199         composite.add_params(sortOrder=' '.join(sortorder))
200         composite.add_params(dimensions=' '.join(dimensions))
201         composite.add_params(filterComposite=' '.join(filtercomposite))
202         return names
203
204
205     def make_subgroupdefinition(self, term, name):
206         """Subgroup attributes need to be an attribute of the library.
207         """
208         template = loader.get_template('trackhub_term_values.sparql')
209         context = Context({'term': term})
210         results = self.execute_query(template, context)
211         values = {}
212         for row in results:
213             value = str(row['name'])
214             values[self.sanitize_name(value)] = value
215
216         if values:
217             return SubGroupDefinition(
218                     name=name,
219                     label=name,
220                     mapping=values,
221             )
222         else:
223             return None
224
225     def get_tracks(self):
226         """Collect information needed to describe trackhub tracks.
227         """
228         query_template = loader.get_template('trackhub_samples.sparql')
229
230         context = Context({ })
231
232         results = self.execute_query(query_template, context)
233         return results
234
235     def sanitize_name(self, name):
236         replacements = [('poly-?a\+', 'PolyAplus'),
237                         ('poly-?a-', 'PolyAminus'),
238                         ('RNA-Seq', 'RNASeq'),
239                         ('rna-seq', 'rnaseq'),
240                         ('-', '_'),
241                         (' ', '_'),
242                         ('^0', 'Zero'),
243                         ('^1', 'One'),
244                         ('^2', 'Two'),
245                         ('^3', 'Three'),
246                         ('^4', 'Four'),
247                         ('^5', 'Five'),
248                         ('^6', 'Six'),
249                         ('^7', 'Seven'),
250                         ('^8', 'Eight'),
251                         ('^9', 'Nine'),
252                         ]
253
254         for regex, substitution in replacements:
255             name = re.sub(regex, substitution, name, flags=re.IGNORECASE)
256
257         return name
258
259     def get_manifest_metadata(self, analysis_node):
260
261         query_template = loader.get_template('trackhub_manifest.sparql')
262
263         context = Context({
264             'submission': str(analysis_node.uri),
265             'submissionSet': str(self.submissionSetNS[''].uri),
266             })
267         results = self.execute_query(query_template, context)
268         LOGGER.info("scanned %s for results found %s",
269                     str(analysis_node), len(results))
270         return results