Start to move all the RDF namespace definitions into a single module.
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2
3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
5 import httplib2
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
8 # python keyring
9 import keyring
10 import logging
11 import os
12 import re
13 # redland rdf lib
14 import RDF 
15 import sys
16 import urllib
17
18 from htsworkflow.util import api
19 from htsworkflow.util.rdfhelp import \
20      dublinCoreNS, \
21      submitOntology, \
22      libraryOntology, \
23      rdfNS, \
24      rdfsNS, \
25      xsdNS
26
# URL mappings
# Namespace used to turn a woldlab library ID into a library URI
# (see add_submission_to_library_urn and library_to_freeze).
libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")


from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
# Namespace for the classes and predicates created from downloaded DDF files
# (see update_ddf and add_ddf_statements).
ddfNS = RDF.NS(RDF.Uri(UCSCEncodePipeline + "/download_ddf#"))

# Directory handed to the 'bdb' hash storage when get_model() is given a name.
DBDIR = os.path.expanduser("~diane/proj/submission")

# Module logger; note that several functions below log through the root
# 'logging' module instead.
logger = logging.getLogger("encode_find")

# UCSC ENCODE submission site: login form and the per-user submission list.
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

# Account name sent to LOGIN_URL; its password is read from the keyring.
USERNAME = 'detrout'
# Encoding applied to text pulled out of scraped pages (see get_contents).
CHARSET = 'utf-8'
43
def main(cmdline=None):
    """Parse the command line and run the requested commands and queries.

    cmdline is the argument list handed to the option parser; when it is
    None optparse reads sys.argv[1:].
    """
    option_parser = make_parser()
    opts, args = option_parser.parse_args(cmdline)

    if opts.verbose:
        logging.basicConfig(level=logging.INFO)

    authdata = api.make_auth_from_opts(opts, option_parser)
    htswapi = api.HtswApi(opts.host, authdata)

    model = get_model(opts.load_model)

    if opts.load_rdf is not None:
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf)

    if opts.update:
        # There is no earlier session at this point, so this always logs in.
        session_cookie = login(cookie=None)
        load_my_submissions(model, cookie=session_cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        rdf_writer = RDF.Serializer(name=opts.rdf_parser_name)
        print(rdf_writer.serialize_model_to_string(model))
74
75
def make_parser():
    """Build the command line parser.

    Returns an OptionParser holding the "Commands", "Queries" and "Options"
    groups, plus whatever authentication options api.add_auth_options()
    registers.
    """
    parser = OptionParser()

    command_group = OptionGroup(parser, "Commands")
    command_group.add_option(
        '--load-model',
        default=None,
        help="Load model database")
    command_group.add_option(
        '--load-rdf',
        default=None,
        help="load rdf statements into model")
    command_group.add_option(
        '--print-rdf',
        action="store_true",
        default=False,
        help="print ending model state")
    command_group.add_option(
        '--update',
        action="store_true",
        default=False,
        help="Query remote data sources and update our database")
    # Disabled commands, kept for reference:
    #command_group.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #command_group.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #command_group.add_option('--update-library', default=None,
    #  help="download library info from htsw, requires filename for extra rules")
    parser.add_option_group(command_group)

    query_group = OptionGroup(parser, "Queries")
    query_group.add_option(
        '--sparql',
        default=None,
        help="execute arbitrary sparql query")
    query_group.add_option(
        '--find-submission-with-no-library',
        default=False,
        action="store_true",
        help="find submissions with no library ID")
    parser.add_option_group(query_group)

    misc_group = OptionGroup(parser, "Options")
    misc_group.add_option(
        "--rdf-parser-name",
        default="turtle",
        help="set rdf file parser type")
    misc_group.add_option(
        "-v", "--verbose",
        action="store_true",
        default=False)
    parser.add_option_group(misc_group)

    api.add_auth_options(parser)

    return parser
112
def get_model(model_name=None):
    """Return an RDF model, held in memory unless a model name is given.

    With a model_name the model is backed by a 'bdb' hash storage of that
    name located in DBDIR.
    """
    if model_name is not None:
        storage_options = "hash-type='bdb',dir='{0}'".format(DBDIR)
        backend = RDF.HashStorage(model_name, options=storage_options)
    else:
        backend = RDF.MemoryStorage()
    return RDF.Model(backend)
121         
def load_my_submissions(model, cookie=None):
    """Scrape the user's UCSC submission table into the model.

    Fetches USER_URL, walks the table rows that follow the header of the
    'projects' table and, for every row that has all the cells read below,
    records the submission's type, name, species, library link, creation
    date and current status.

    model is the RDF model statements are added to.  cookie is the session
    cookie for the UCSC site; a login is performed when it is None.

    Raises RuntimeError when the page contains no 'projects' table.
    """
    if cookie is None:
        cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    projects = soup.find('table', attrs={'id':'projects'})
    if projects is None:
        raise RuntimeError(
            "No projects table found at: {0}".format(USER_URL))
    tr = projects.findNext('tr')
    # first record is header: step to the element after the header <tr>,
    # the loop below then moves on to the first data row.
    if tr is not None:
        tr = tr.findNext()
    # NOTE(review): rdf:type is normally taken from the RDF namespace
    # (rdfNS) rather than RDFS; kept as-is so statements already stored
    # with this predicate keep matching -- confirm before changing.
    TypeN = rdfsNS['type']
    NameN = submitOntology['name']
    SpeciesN = submitOntology['species']
    LibraryURN = submitOntology['library_urn']

    # td[8] (last modified) is the highest cell index read from a row.
    min_cells = 9

    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) >= min_cells:
            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
            subUrn = RDF.Uri(submission_view_url(subUrnText))

            add_stmt(model, subUrn, TypeN, submitOntology['Submission'])

            # get_contents() returns None for an empty cell; never build a
            # statement or run the library ID guess on a missing name.
            name = get_contents(td[4])
            if name is not None:
                add_stmt(model, subUrn, NameN, name)

            species = get_contents(td[2])
            if species is not None:
                add_stmt(model, subUrn, SpeciesN, species)

            library_id = None
            if name is not None:
                library_id = get_library_id(name)
            if library_id is not None:
                add_submission_to_library_urn(model,
                                              subUrn,
                                              LibraryURN,
                                              library_id)

            add_submission_creation_date(model, subUrn, cookie)

            # grab changing attributes
            status = get_contents(td[6])
            last_mod_datetime = get_date_contents(td[8])
            if status is None or last_mod_datetime is None:
                logger.warning(
                    "Missing status or modification date for {0}".format(
                        subUrn))
            else:
                update_submission_detail(model,
                                         subUrn,
                                         status.strip(),
                                         last_mod_datetime.isoformat(),
                                         cookie=cookie)

            logger.info("Processed {0}".format(subUrn))

        tr = tr.findNext('tr')
170
171
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Link a UCSC submission to its woldlab library unless already linked.

    The library URI is library_id resolved in the libraryNS namespace.
    """
    link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
    if model.contains_statement(link):
        logger.debug("Found: {0}".format(str(link)))
        return

    logger.info("Adding Sub -> Lib link: {0}".format(link))
    model.add_statement(link)
183
184     
def find_submissions_with_no_library(model):
    """Print every named submission that has no library_urn statement.

    For each match the name is printed as a comment, followed by the
    submission URI and a library_urn line pointing at the bare library
    namespace (presumably to be completed by hand -- confirm).
    """
    query_template = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT 
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}"""
    ontology_uri = submitOntology[''].uri
    unlinked_query = RDF.SPARQLQuery(
        query_template.format(submissionOntology=ontology_uri))

    for match in unlinked_query.execute(model):
        submission = match['subid']
        title = match['name']
        print("# {0}".format(title))
        print("<{0}>".format(submission.uri))
        print("  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .")
        print("")
206     
207
def add_submission_creation_date(model, subUrn, cookie):
    """Record a submission's creation date if the model has none yet.

    When no libraryOntology 'date' statement exists for subUrn, the
    submission page is fetched and the element following the "Created: "
    label is parsed as the creation time, then stored as an xsd:dateTime
    literal.  Nothing is added when the label or its date text is missing.

    cookie is the UCSC session cookie used for the page request.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    creation_dates = list(model.find_statements(query))
    if creation_dates:
        logger.debug("Found creation date for: {0}".format(str(subUrn)))
        return

    logger.info("Getting creation date for: {0}".format(str(subUrn)))
    soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
    created_label = soup.find(text="Created: ")
    if not created_label:
        return

    created_date = get_date_contents(created_label.next)
    if created_date is None:
        # get_date_contents() returns None for an element without text;
        # calling isoformat() on that would raise AttributeError.
        logger.warning("No creation date text for: {0}".format(str(subUrn)))
        return

    created_date_node = RDF.Node(literal=created_date.isoformat(),
                                 datatype=dateTimeType.uri)
    add_stmt(model, subUrn, creationDateN, created_date_node)
225
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Attach status information (and the DDF) to a submission.

    A submission without any has_status node gets a new blank status node
    carrying status and recent_update, followed by a DDF update.  For a
    submission that already has status nodes, the DDF update is run for
    each node whose last_modify_date equals recent_update; other nodes are
    left untouched.
    """
    HasStatusN = submitOntology['has_status']
    StatusN = submitOntology['status']
    LastModifyN = submitOntology['last_modify_date']

    status_links = list(model.find_statements(
        RDF.Statement(subUrn, HasStatusN, None)))

    if not status_links:
        # has no status node, add one
        logging.info("Adding status node to {0}".format(subUrn))
        new_blank = RDF.Node()
        add_stmt(model, subUrn, HasStatusN, new_blank)
        # NOTE(review): rdf:type normally comes from the RDF namespace, not
        # RDFS -- kept unchanged here, confirm before altering.
        add_stmt(model, new_blank, rdfsNS['type'], StatusN)
        add_stmt(model, new_blank, StatusN, status)
        add_stmt(model, new_blank, LastModifyN, recent_update)
        update_ddf(model, subUrn, new_blank, cookie=cookie)
        return

    logging.info("Found {0} status blanks".format(len(status_links)))
    for link in status_links:
        existing_blank = link.object
        stamp_query = RDF.Statement(existing_blank, LastModifyN, None)
        for stamp in model.find_statements(stamp_query):
            if str(stamp.object) == recent_update:
                update_ddf(model, subUrn, existing_blank, cookie=cookie)
                # one matching timestamp is enough for this status node
                break
254
255
256     
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and attach it to statusNode, once.

    The download URL is the submission URL with 'show' replaced by
    'download_ddf'.  A type statement marking statusNode as a ddf is added
    after loading, and its presence makes later calls return immediately.
    """
    ddf_url = str(subUrn).replace('show', 'download_ddf')
    # NOTE(review): this Uri object is never read afterwards.
    ddfUrn = RDF.Uri(ddf_url)

    # NOTE(review): rdf:type normally comes from the RDF namespace, not
    # RDFS; changing it would stop stored markers from matching.
    ddf_marker = RDF.Statement(statusNode, rdfsNS['type'], ddfNS['ddf'])
    if model.contains_statement(ddf_marker):
        return

    logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
    ddf_body = get_url_as_text(ddf_url, 'GET', cookie)
    add_ddf_statements(model, statusNode, ddf_body)
    model.add_statement(ddf_marker)
269
270
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements

    The first line supplies the column names (split on whitespace), each
    of which becomes a predicate in ddfNS.  Every later non-blank,
    non-comment line is split on tabs; its first field is a comma
    separated file list and each file gets its own blank node carrying
    the remaining fields.
    """
    rows = ddf_string.split('\n')
    # first line is header
    predicates = [ddfNS[column] for column in rows[0].split()]

    has_file = submitOntology['has_file']
    # NOTE(review): rdf:type normally comes from the RDF namespace, not RDFS.
    type_predicate = rdfsNS['type']
    file_class = ddfNS['file']
    filename_predicate = ddfNS['filename']

    for raw_row in rows[1:]:
        record_text = raw_row.strip()
        if not record_text or record_text.startswith("#"):
            continue

        fields = record_text.split('\t')
        file_values = fields[1:]

        for filename in fields[0].split(','):
            fileNode = RDF.Node()
            add_stmt(model, statusNode, has_file, fileNode)
            add_stmt(model, fileNode, type_predicate, file_class)
            add_stmt(model, fileNode, filename_predicate, filename)

            # predicates[0] is the file list column handled just above
            for predicate, value in zip(predicates[1:], file_values):
                add_stmt(model, fileNode, predicate, value)
299
300
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.

    Parses the RDFa of the library list page (affiliation 44) under
    htswapi.root_url into model, then loads the detail page of every
    subject that has a library_id statement.
    """
    # Plain concatenation: this is a URL, not a filesystem path.
    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
    rdfaParser = RDF.Parser(name='rdfa')
    print(encodeUrl)
    rdfaParser.parse_into_model(model, encodeUrl)
    query = RDF.Statement(None, libraryOntology['library_id'], None)
    # Snapshot the matches first, as the other lookups in this module do:
    # load_library_detail() adds statements to the same model, and that
    # should not happen while a find_statements() stream is being read.
    libraries = list(model.find_statements(query))
    for statement in libraries:
        load_library_detail(model, statement.subject)
313
314
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page

    The page is parsed as RDFa only when the model holds no date statement
    for libraryUrn; more than one date is reported as a warning.
    """
    rdfaParser = RDF.Parser(name='rdfa')
    date_query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    date_count = len(list(model.find_statements(date_query)))

    if date_count > 1:
        logging.warning("Many dates for {0}".format(libraryUrn))
    elif date_count == 0:
        logger.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    # exactly one date: assuming that a loaded dataset has one record
328                         
def get_library_id(name):
    """Guess library ID from library name

    An ID is five digits, or 'SL' followed by four digits, and must be
    preceded by a space or a dash.  Returns the ID string, or None when
    name is missing/empty or contains no such ID.
    """
    # Callers pass cell text that may be None for an empty cell; re.search
    # would raise TypeError on that.
    if not name:
        return None
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if match is None:
        return None
    return match.group('id')
337
338
def get_contents(element):
    """Return the element's first piece of content, encoded, or None.

    When the element contains an <a> tag the link's first content item is
    used instead of the element's own.  None is returned for an element
    with no contents.
    """
    if not element.contents:
        return None

    anchor = element.find('a')
    source = element if anchor is None else anchor
    return source.contents[0].encode(CHARSET)
350     
351     
def get_date_contents(element):
    """Return the element's text parsed as a datetime, or None.

    The text must look like "YYYY-MM-DD HH:MM"; an element without text
    yields None.
    """
    text = get_contents(element)
    if not text:
        return None
    return datetime.strptime(text, "%Y-%m-%d %H:%M")
358
def sparql_query(model, query_filename):
    """Execute sparql query from file

    Reads the query text from query_filename, runs it against model and
    prints each result row as "name: value" lines (in reverse of the
    order row.items() reports them), followed by a blank line.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    for row in results:
        for k, v in reversed(list(row.items())):
            print("{0}: {1}".format(k, v))
        print("")
370
371         
def load_into_model(model, parser_name, filename):
    """Parse an RDF file into the model.

    parser_name selects the RDF parser (for example "turtle"); the
    submission ontology namespace is used as the base URI.

    Raises IOError when filename does not exist.
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    # Close the file deterministically instead of leaking the handle.
    with open(filename, 'r') as rdf_file:
        data = rdf_file.read()
    rdf_parser = RDF.Parser(name=parser_name)
    ns_uri = submitOntology[''].uri
    rdf_parser.parse_string_into_model(model, data, ns_uri)
380
def add_stmt(model, subject, predicate, object):
    """Convenience: create an RDF Statement and add it to a model

    Returns whatever model.add_statement() returns.
    """
    statement = RDF.Statement(subject, predicate, object)
    return model.add_statement(statement)
387
def login(cookie=None):
    """Login if we don't have a cookie

    An existing cookie is returned unchanged.  Otherwise USERNAME and the
    password stored in the keyring for LOGIN_URL are posted to LOGIN_URL
    and the response's 'set-cookie' header is returned.

    Raises RuntimeError when the response carries no cookie.
    """
    if cookie is not None:
        return cookie

    password = keyring.get_keyring().get_password(LOGIN_URL, USERNAME)
    form_body = urllib.urlencode({'login': USERNAME,
                                  'password': password})
    response, content = httplib2.Http().request(
        LOGIN_URL,
        'POST',
        headers={'Content-type': 'application/x-www-form-urlencoded'},
        body=form_body)
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    session_cookie = response.get('set-cookie', None)
    if session_cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return session_cookie
411
412                 
def get_url_as_soup(url, method, cookie=None):
    """Fetch url and return the response body parsed by BeautifulSoup.

    method is the HTTP method; cookie, when given, is sent as the Cookie
    header.

    Raises httplib2.HttpLib2ErrorWithResponse when the status is not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # The error used to be built and dropped, leaving callers with a
        # silent None; raise it so the failure is reported where it occurs.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    soup = BeautifulSoup(content,
                         fromEncoding="utf-8", # should read from header
                         convertEntities=BeautifulSoup.HTML_ENTITIES
                         )
    return soup
429
def get_url_as_text(url, method, cookie=None):
    """Fetch url and return the raw response body.

    method is the HTTP method; cookie, when given, is sent as the Cookie
    header.

    Raises httplib2.HttpLib2ErrorWithResponse when the status is not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # The error used to be built and dropped, leaving callers with a
        # silent None; raise it so the failure is reported where it occurs.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
    return content
442     
################
#  old stuff
# (submission name, library ID) pairs.  None of these names contains an ID
# in the form get_library_id() looks for (space/dash followed by the ID).
# NOTE(review): nothing in the visible code reads this list -- presumably a
# hand-maintained mapping kept for reference; confirm before removing.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
460
461
462
def select_by_library_id(submission_list):
    """Group submissions by library ID, newest first within each group.

    Submissions whose library_id is false (None, empty) are left out.
    Returns a dict mapping library ID to a list of submissions sorted by
    their date attribute in descending order.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            grouped = libraries.setdefault(submission.library_id, [])
            grouped.append(submission)

    newest_first = attrgetter('date')
    for grouped in libraries.values():
        grouped.sort(key=newest_first, reverse=True)

    return libraries
473
def library_to_freeze(selected_libraries):
    """Render an HTML table of submissions per library and freeze.

    selected_libraries maps a library ID to a non-empty list of submission
    objects; each submission is read for its name, date, subid and status
    attributes.  The table has one row per library (sorted by ID) with the
    library link, the first submission's name and one cell per freeze
    listing "subid:status" entries.  Submissions that date_to_freeze()
    places in no freeze are not listed.

    Returns the HTML document as one string.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        logger.debug("{0} {1}".format(lib_id, batched))
        # Exactly one cell per freeze, so rows line up with the header.
        # (A for/else here used to add an extra empty cell to every row.)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table>")
    report.append("</body>")
    report.append("</html>")
    return "\n".join(report)
518
519             
def date_to_freeze(d):
    """Return the name of the first freeze whose cutoff lies after d.

    The cutoffs are 2010-01-30, 2010-07-30 and 2011-01-30; a date on or
    after the last cutoff yields None.
    """
    cutoffs = (
        (datetime(2010, 1, 30), '2010-Jan'),
        (datetime(2010, 7, 30), '2010-Jul'),
        (datetime(2011, 1, 30), '2011-Jan'),
    )
    return next((label for cutoff, label in cutoffs if d < cutoff), None)
530
531 if __name__ == "__main__":
532     main()
533