Rework ucsc gather to use RDF models for gathering and storing track metadata.
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2
3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
5 import httplib2
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
8 # python keyring
9 import keyring
10 import logging
11 import os
12 import re
13 # redland rdf lib
14 import RDF 
15 import sys
16 import urllib
17 import urlparse
18
19 from htsworkflow.util import api
20 from htsworkflow.util.rdfhelp import \
21      dublinCoreNS, \
22      get_model, \
23      get_serializer, \
24      submitOntology, \
25      libraryOntology, \
26      load_into_model, \
27      rdfNS, \
28      rdfsNS, \
29      xsdNS
30
# URL mappings
# Namespace used to build woldlab library URIs, e.g. libraryNS[library_id].
libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")


from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
# Namespace for DDF terms: the UCSC pipeline base URL joined with "download_ddf#".
download_ddf = urlparse.urljoin(UCSCEncodePipeline, "download_ddf#", allow_fragments=True)
ddfNS = RDF.NS(download_ddf)

# Directory handed to get_model() by main().
# NOTE(review): hard-coded to a single user's home directory.
DBDIR = os.path.expanduser("~diane/proj/submission")

# Module-level logger; some functions below still call the root logging module.
logger = logging.getLogger("encode_find")

# LOGIN_URL receives the credentials POSTed by login(); USER_URL is the page
# load_my_submissions() scrapes for the "projects" table.
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

# Account name used both as the keyring lookup key and as the login form value.
USERNAME = 'detrout'
# Encoding applied to text pulled out of the scraped HTML.
CHARSET = 'utf-8'
48
def main(cmdline=None):
    """Parse the command line and run every requested command in order.

    The order is fixed: load RDF from a file, update from the remote sites,
    run a SPARQL query file, report submissions lacking a library, and
    finally print the serialized model.

    :param cmdline: list of arguments for the option parser; None is passed
        straight through to OptionParser.parse_args.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.verbose:
        logging.basicConfig(level=logging.INFO)

    auth = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, auth)

    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        base_uri = submitOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, base_uri)

    if opts.update:
        # No session exists yet, so login() always authenticates here.
        session_cookie = login(cookie=None)
        load_my_submissions(model, cookie=session_cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        writer = get_serializer(name=opts.rdf_parser_name)
        print(writer.serialize_model_to_string(model))
80
81
def make_parser():
    """Build the optparse parser for this script.

    Options are registered in three groups ("Commands", "Queries" and
    "Options"); api.add_auth_options then adds the htsworkflow
    authentication options to the same parser.

    :returns: the configured OptionParser.
    """
    parser = OptionParser()

    command_group = OptionGroup(parser, "Commands")
    command_group.add_option(
        '--load-model', default=None,
        help="Load model database")
    command_group.add_option(
        '--load-rdf', default=None,
        help="load rdf statements into model")
    command_group.add_option(
        '--print-rdf', action="store_true", default=False,
        help="print ending model state")
    command_group.add_option(
        '--update', action="store_true", default=False,
        help="Query remote data sources and update our database")
    # Finer-grained update commands that are currently disabled:
    #   --update-ucsc-status  download status from ucsc, requires filename
    #                         for extra rules
    #   --update-ddfs         download ddf information for known submission
    #   --update-library      download library info from htsw, requires
    #                         filename for extra rules
    parser.add_option_group(command_group)

    query_group = OptionGroup(parser, "Queries")
    query_group.add_option(
        '--sparql', default=None,
        help="execute arbitrary sparql query")
    query_group.add_option(
        '--find-submission-with-no-library',
        action="store_true", default=False,
        help="find submissions with no library ID")
    parser.add_option_group(query_group)

    misc_group = OptionGroup(parser, "Options")
    misc_group.add_option(
        "--rdf-parser-name", default="turtle",
        help="set rdf file parser type")
    misc_group.add_option(
        "-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(misc_group)

    api.add_auth_options(parser)

    return parser
118
def load_my_submissions(model, cookie=None):
    """Scrape the user's UCSC submission list into the model.

    Fetches USER_URL, locates the table with id "projects" and walks the
    rows that follow it.  For every row carrying the expected cells a
    Submission resource is recorded together with its name, species,
    library link (when an ID can be guessed from the name), creation date
    and current status.

    :param model: RDF model the statements are added to.
    :param cookie: session cookie; login() is called when it is None.
    :raises RuntimeError: if the page could not be parsed or contains no
        "projects" table.
    """
    if cookie is None:
        cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    projects = None
    if soup is not None:
        projects = soup.find('table', attrs={'id': 'projects'})
    if projects is None:
        raise RuntimeError(
            "Unable to find the projects table at {0}".format(USER_URL))

    # NOTE(review): rdf:type normally lives in the RDF namespace (rdfNS),
    # not RDFS.  Left unchanged so new statements keep matching whatever is
    # already stored and queried elsewhere in this file -- confirm.
    TypeN = rdfsNS['type']
    NameN = submitOntology['name']
    SpeciesN = submitOntology['species']
    LibraryURN = submitOntology['library_urn']

    # Cells read below: 0 submission id, 2 species, 4 name, 6 status,
    # 8 last modified.  Rows with fewer cells cannot be a submission record.
    required_cells = 9

    tr = projects.findNext('tr')
    # first record is header
    if tr is not None:
        tr = tr.findNext()

    while tr is not None:
        td = tr.findAll('td')
        if len(td) >= required_cells:
            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
            subUrn = RDF.Uri(submission_view_url(subUrnText))

            add_stmt(model, subUrn, TypeN, submitOntology['Submission'])

            name = get_contents(td[4])
            if name is not None:
                add_stmt(model, subUrn, NameN, name)

                # The library ID can only be guessed from a name.
                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  LibraryURN,
                                                  library_id)

            species = get_contents(td[2])
            if species is not None:
                add_stmt(model, subUrn, SpeciesN, species)

            add_submission_creation_date(model, subUrn, cookie)

            # grab changing attributes
            status = get_contents(td[6])
            last_mod_datetime = get_date_contents(td[8])
            if status is None or last_mod_datetime is None:
                # An empty cell would otherwise crash on .strip() or
                # .isoformat(); skip only the status bookkeeping.
                logger.warning(
                    "Missing status or last modified date for {0}".format(
                        subUrn))
            else:
                update_submission_detail(model,
                                         subUrn,
                                         status.strip(),
                                         last_mod_datetime.isoformat(),
                                         cookie=cookie)

            logger.info("Processed {0}".format(subUrn))

        tr = tr.findNext('tr')
167
168
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Link a UCSC submission to its woldlab library unless already linked.

    The library node is libraryNS[library_id]; nothing is added when the
    model already contains the (submission, predicate, library) statement.
    """
    wanted = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
    if model.contains_statement(wanted):
        logger.debug("Found: {0}".format(str(wanted)))
        return

    link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
    logger.info("Adding Sub -> Lib link: {0}".format(link))
    model.add_statement(link)
180
181     
def find_submissions_with_no_library(model):
    """Print a turtle stub for each named submission lacking a library_urn.

    For every match the submission name is printed as a comment, followed by
    the submission URI and a library_urn line pointing at the bare library
    namespace, so the output can be edited by hand.
    """
    query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT 
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submitOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(query_text)

    for match in missing_lib_query.execute(model):
        submission = match['subid']
        title = match['name']
        print("# {0}".format(title))
        print("<{0}>".format(submission.uri))
        print("  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .")
        print("")
203     
204
def add_submission_creation_date(model, subUrn, cookie):
    """Record a submission's creation date, fetching its page when unknown.

    The submission page is downloaded only if the model holds no
    libraryOntology 'date' statement for subUrn.  The date is read from the
    element following the "Created: " label and stored as an xsd:dateTime
    literal.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']

    pattern = RDF.Statement(subUrn, creationDateN, None)
    known_dates = list(model.find_statements(pattern))
    if len(known_dates) != 0:
        logger.debug("Found creation date for: {0}".format(str(subUrn)))
        return

    logger.info("Getting creation date for: {0}".format(str(subUrn)))
    page = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
    label = page.find(text="Created: ")
    if not label:
        # Page has no "Created: " label; nothing to record.
        return

    created = get_date_contents(label.next)
    date_literal = RDF.Node(literal=created.isoformat(),
                            datatype=dateTimeType.uri)
    add_stmt(model, subUrn, creationDateN, date_literal)
222
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Attach status information (and the DDF) to a submission.

    With no has_status node yet, a blank status node is created holding the
    status text and last-modified date, and its DDF is loaded.  Otherwise
    each existing status node whose last-modified date equals recent_update
    gets update_ddf() called on it.

    NOTE(review): when status nodes exist but none matches recent_update,
    nothing is recorded -- confirm whether a new status node was intended.
    """
    HasStatusN = submitOntology['has_status']
    StatusN = submitOntology['status']
    LastModifyN = submitOntology['last_modify_date']

    existing = list(model.find_statements(
        RDF.Statement(subUrn, HasStatusN, None)))

    if not existing:
        # has no status node, add one
        logging.info("Adding status node to {0}".format(subUrn))
        status_blank = RDF.Node()
        add_stmt(model, subUrn, HasStatusN, status_blank)
        add_stmt(model, status_blank, rdfsNS['type'], StatusN)
        add_stmt(model, status_blank, StatusN, status)
        add_stmt(model, status_blank, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_blank, cookie=cookie)
        return

    logging.info("Found {0} status blanks".format(len(existing)))
    for has_status in existing:
        status_blank = has_status.object
        dated = RDF.Statement(status_blank, LastModifyN, None)
        for modified in model.find_statements(dated):
            if recent_update == str(modified.object):
                update_ddf(model, subUrn, status_blank, cookie=cookie)
                # One matching date is enough for this status node.
                break
251
252
253     
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and attach it to a status node.

    The status node is tagged with type ddfNS['ddf'] once its DDF has been
    loaded; nodes already tagged are left alone.

    :param model: RDF model to update.
    :param subUrn: submission URI; its string form with 'show' replaced by
        'download_ddf' is the download address.
    :param statusNode: status node the file statements are attached to.
    :param cookie: session cookie sent with the download request.
    """
    TypeN = rdfsNS['type']

    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
    if model.contains_statement(status_is_ddf):
        return

    logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
    ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
    if ddf_text is None:
        # Leave the node untagged so a later run retries the download.
        logger.warning("No ddf retrieved from {0}".format(download_ddf_url))
        return

    add_ddf_statements(model, statusNode, ddf_text)
    model.add_statement(status_is_ddf)
266
267
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert the text of a ddf file into RDF statements.

    The first line names the columns (split on whitespace); each is turned
    into a ddfNS predicate.  Every other non-blank, non-comment line is a
    tab-separated record whose first field is a comma-separated list of
    file names.  Each file becomes a blank node linked from statusNode and
    receives the remaining fields under the matching column predicates.
    """
    lines = ddf_string.split('\n')
    # first line is header
    column_predicates = [ddfNS[title] for title in lines[0].split()]

    for raw_line in lines[1:]:
        record_text = raw_line.strip()
        if len(record_text) == 0 or record_text.startswith("#"):
            continue

        fields = record_text.split('\t')
        filenames = fields[0].split(',')
        values = fields[1:]

        for filename in filenames:
            fileNode = RDF.Node()
            add_stmt(model, statusNode, submitOntology['has_file'], fileNode)
            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
            add_stmt(model, fileNode, ddfNS['filename'], filename)

            # The first column was the file list, so pair from the second on.
            for predicate, value in zip(column_predicates[1:], values):
                add_stmt(model, fileNode, predicate, value)
296
297
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.

    Parses the RDFa of the htsworkflow library listing filtered to
    affiliation 44 into the model, then loads the detail page of every
    resource that has a library_id.

    :param model: RDF model to update.
    :param htswapi: API object; only its root_url attribute is used.
    """
    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
    # Logged rather than printed: stdout also carries --print-rdf output.
    logger.info("Loading libraries from {0}".format(encodeUrl))

    rdfaParser = RDF.Parser(name='rdfa')
    rdfaParser.parse_into_model(model, encodeUrl)

    query = RDF.Statement(None, libraryOntology['library_id'], None)
    # Collect the matches first: load_library_detail() adds statements to
    # the model, which must not happen while the result stream is open.
    libraries = list(model.find_statements(query))
    for statement in libraries:
        load_library_detail(model, statement.subject)
310
311
def load_library_detail(model, libraryUrn):
    """Grab detail information from a library page.

    The page at libraryUrn is parsed as RDFa into the model only when the
    model holds no 'date' statement for the library.  More than one date is
    reported as a warning.
    """
    rdfaParser = RDF.Parser(name='rdfa')
    date_pattern = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    date_count = len(list(model.find_statements(date_pattern)))

    if date_count == 0:
        logger.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif date_count > 1:
        logging.warning("Many dates for {0}".format(libraryUrn))
    # Exactly one date: assume the library was already loaded.
325                         
def get_library_id(name):
    """Guess the library ID embedded in a submission name.

    An ID is either five digits or "SL" followed by four digits, and must
    directly follow a space or a hyphen.  Returns the ID text, or None when
    the name contains no such token.
    """
    found = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if found is None:
        return None
    return found.group('id')
334
335
def get_contents(element):
    """Return the first piece of an element's content, or None if empty.

    When the element contains an <a> tag, the first content item of that
    link is used instead of the element's own.  The result is encoded with
    CHARSET.
    """
    if not element.contents:
        return None

    link = element.find('a')
    holder = element if link is None else link
    return holder.contents[0].encode(CHARSET)
347     
348     
def get_date_contents(element):
    """Parse an element's content as a "YYYY-MM-DD HH:MM" timestamp.

    Returns a datetime, or None when get_contents() yields nothing.
    """
    text = get_contents(element)
    if not text:
        return None
    return datetime.strptime(text, "%Y-%m-%d %H:%M")
355
def sparql_query(model, query_filename):
    """Execute the SPARQL query stored in a file and print the results.

    Each result row is printed as "name: value" lines, in the reverse of
    the order returned by row.items(), followed by a blank line.

    :param model: RDF model to query.
    :param query_filename: path of a file containing the query text.
    """
    # Context manager so the query file is closed even if reading fails.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()

    query = RDF.SPARQLQuery(query_body)
    for row in query.execute(model):
        for k, v in reversed(list(row.items())):
            print("{0}: {1}".format(k, v))
        print("")
367
368         
def load_into_model(model, parser_name, filename, ns_uri=None):
    """Parse an RDF file from disk into a model.

    This definition replaces the load_into_model imported from
    htsworkflow.util.rdfhelp at the top of the file, so it has to accept
    the four arguments main() passes.

    :param model: RDF model that receives the parsed statements.
    :param parser_name: name of the Redland parser to use (e.g. "turtle").
    :param filename: path of the file to read.
    :param ns_uri: base URI handed to the parser as-is.
        NOTE(review): Redland may require a base URI when parsing from a
        string -- confirm before relying on the None default.
    :raises IOError: if filename does not exist.
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    # Context manager so the file handle is not left open.
    with open(filename, 'r') as rdf_file:
        data = rdf_file.read()

    rdf_parser = RDF.Parser(name=parser_name)
    rdf_parser.parse_string_into_model(model, data, ns_uri)
376
def add_stmt(model, subject, predicate, object):
    """Convenience wrapper: build an RDF Statement and add it to a model.

    Returns whatever model.add_statement() returns.
    """
    statement = RDF.Statement(subject, predicate, object)
    return model.add_statement(statement)
383
def login(cookie=None):
    """Return a session cookie, logging in only when none is supplied.

    The password for USERNAME is read from the system keyring under
    LOGIN_URL and POSTed as a url-encoded form.

    :param cookie: existing cookie; returned unchanged when not None.
    :raises RuntimeError: if the response carries no set-cookie header.
    """
    if cookie is not None:
        return cookie

    password = keyring.get_keyring().get_password(LOGIN_URL, USERNAME)
    form_fields = {'login': USERNAME,
                   'password': password}
    request_headers = {'Content-type': 'application/x-www-form-urlencoded'}

    client = httplib2.Http()
    response, content = client.request(LOGIN_URL,
                                       'POST',
                                       headers=request_headers,
                                       body=urllib.urlencode(form_fields))
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    session_cookie = response.get('set-cookie', None)
    if session_cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return session_cookie
407
408                 
def get_url_as_soup(url, method, cookie=None):
    """Fetch a URL and return the response body parsed by BeautifulSoup.

    :param url: address to request.
    :param method: HTTP method name, e.g. 'GET'.
    :param cookie: optional value for the Cookie request header.
    :returns: BeautifulSoup tree of the response body.
    :raises httplib2.HttpLib2ErrorWithResponse: if the status is not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)

    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # Raise the error so a failed fetch cannot come back as None.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return BeautifulSoup(content,
                         fromEncoding="utf-8", # should read from header
                         convertEntities=BeautifulSoup.HTML_ENTITIES
                         )
425
def get_url_as_text(url, method, cookie=None):
    """Fetch a URL and return the raw response body.

    :param url: address to request.
    :param method: HTTP method name, e.g. 'GET'.
    :param cookie: optional value for the Cookie request header.
    :returns: the response content as returned by httplib2.
    :raises httplib2.HttpLib2ErrorWithResponse: if the status is not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)

    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # Raise the error so a failed fetch cannot come back as None.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return content
438     
################
#  old stuff
# Pairs of (submission name, five-digit ID string).  Nothing in the visible
# part of this file reads the table.
# NOTE(review): presumably hand-assigned library IDs for submissions whose
# names contain none that get_library_id() can extract -- confirm before
# deleting.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
456
457
458
def select_by_library_id(submission_list):
    """Group submissions by library ID, newest first within each group.

    Submissions whose library_id is falsy are ignored.

    :param submission_list: iterable of objects with library_id and date
        attributes.
    :returns: dict mapping library_id to a list of submissions sorted by
        date in descending order.
    """
    by_library = {}
    for entry in submission_list:
        if entry.library_id:
            by_library.setdefault(entry.library_id, []).append(entry)

    newest_first = attrgetter('date')
    for group in by_library.values():
        group.sort(key=newest_first, reverse=True)

    return by_library
469
def library_to_freeze(selected_libraries):
    """Render an HTML table of each library's submissions per data freeze.

    One row is produced per library ID (sorted).  It holds a link to the
    library, the name of its first submission, and one cell per freeze
    listing "submission link:status" for the submissions that
    date_to_freeze() assigns to that freeze.

    :param selected_libraries: dict mapping library ID to a non-empty list
        of submission objects with name, date, subid and status attributes
        (the shape returned by select_by_library_id).
    :returns: the HTML document as a single string.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in sorted(selected_libraries.keys()):
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))

        # Bucket by freeze; submissions past the last freeze land under None
        # and are therefore not shown.
        batched = {}
        for sub in submissions:
            batched.setdefault(date_to_freeze(sub.date), []).append(sub)
        logger.debug("%s %s", lib_id, batched)

        # Exactly one cell per freeze, so rows line up with the header.
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)
514
515             
def date_to_freeze(d):
    """Name the data freeze a datetime falls into.

    A date belongs to the first freeze whose cut-off (the 30th of the
    freeze month) is strictly later than it.  Returns None for dates on or
    after the last cut-off.
    """
    cutoffs = (
        (datetime(2010, 1, 30), '2010-Jan'),
        (datetime(2010, 7, 30), '2010-Jul'),
        (datetime(2011, 1, 30), '2011-Jan'),
    )
    return next((label for deadline, label in cutoffs if d < deadline), None)
526
# Run main() only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
529