df51119566a0162a11a25af5512169e3cc1be10b
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2
3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
5 import httplib2
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
8 # python keyring
9 import keyring
10 import logging
11 import os
12 import re
13 # redland rdf lib
14 import RDF 
15 import sys
16 import urllib
17
18 from htsworkflow.util import api
19
20 logger = logging.getLogger("encode_find")
21
22 libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
23 submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
24 submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
25 ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
26 libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
27
28 dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
29 rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
30 rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
31 xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
32
33 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
34 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
35
36 USERNAME = 'detrout'
37 CHARSET = 'utf-8'
38
39 def main(cmdline=None):
40     parser = make_parser()
41     opts, args = parser.parse_args(cmdline)
42
43     if opts.verbose:
44         logging.basicConfig(level=logging.INFO)
45
46     htsw_authdata = api.make_auth_from_opts(opts, parser)
47     htswapi = api.HtswApi(opts.host, htsw_authdata)
48     
49     cookie = None
50     model = get_model(opts.load_model)
51     
52     if opts.load_rdf is not None:
53         load_into_model(model, opts.rdf_parser_name, opts.load_rdf)
54         
55     if opts.update:
56         cookie = login(cookie=cookie)
57         load_my_submissions(model, cookie=cookie)
58         load_encode_libraries(model, htswapi)
59
60     if opts.sparql is not None:
61         sparql_query(model, opts.sparql)
62
63     if opts.find_submission_with_no_library:
64         missing = find_submissions_with_no_library(model)
65                 
66     if opts.print_rdf:
67         serializer = RDF.Serializer(name=opts.rdf_parser_name)
68         print serializer.serialize_model_to_string(model)
69
70
def make_parser():
    """Assemble the OptionParser with Commands, Queries and Options groups."""
    parser = OptionParser()

    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Query remote data sources and update our database")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library',
                       action="store_true", default=False,
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(options)

    # htsworkflow authentication options (host, apiid, apikey, ...)
    api.add_auth_options(parser)

    return parser
107
def get_model(model_name=None):
    """Return an RDF model: in-memory when unnamed, else a BDB hash store in /tmp."""
    if model_name is None:
        return RDF.Model(RDF.MemoryStorage())
    storage = RDF.HashStorage(model_name, options="hash-type='bdb',dir='/tmp'")
    return RDF.Model(storage)
115         
def load_my_submissions(model, cookie=None):
    """Scrape the UCSC 'show_user' submissions table into the RDF model.

    For each table row records: submission URN and type, name, species,
    a link to the woldlab library (when a library ID can be parsed from
    the name), creation date, status and last-modified date.

    :param model: RDF model to add statements to
    :param cookie: UCSC session cookie; a login is performed when None
    """
    if cookie is None:
        cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id':'projects'})
    tr = p.findNext('tr')
    # first record is header
    tr = tr.findNext()
    TypeN = rdfsNS['type']
    NameN = submitOntologyNS['name']
    SpeciesN = submitOntologyNS['species']
    LibraryURN = submitOntologyNS['library_urn']

    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            # submission id is the text inside the first cell's anchor
            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
            subUrn = submissionNS[subUrnText]

            add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])

            name = get_contents(td[4])
            add_stmt(model, subUrn, NameN, name)

            species = get_contents(td[2])
            if species is not None:
                add_stmt(model, subUrn, SpeciesN, species)

            library_id = get_library_id(name)
            if library_id is not None:
                add_submission_to_library_urn(model,
                                              subUrn,
                                              LibraryURN,
                                              library_id)

            add_submission_creation_date(model, subUrn, cookie)

            # grab changing attributes
            status = get_contents(td[6]).strip()
            last_mod_datetime = get_date_contents(td[8])
            last_mod = last_mod_datetime.isoformat()

            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)

            # FIX: use the module logger, not the root logger, for consistency
            logger.info("Processed {0}".format(subUrn))

        tr = tr.findNext('tr')
164
165
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    stmt = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
    if model.contains_statement(stmt):
        logger.debug("Found: {0}".format(str(stmt)))
    else:
        logger.info("Adding Sub -> Lib link: {0}".format(stmt))
        model.add_statement(stmt)
177
178     
179 def find_submissions_with_no_library(model):
180     missing_lib_query = RDF.SPARQLQuery("""
181 PREFIX submissionOntology:<{submissionOntology}>
182
183 SELECT 
184  ?subid ?name
185 WHERE {{
186   ?subid submissionOntology:name ?name
187   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
188   FILTER  (!bound(?libid))
189 }}""".format(submissionOntology=submitOntologyNS[''].uri)
190 )    
191
192     results = missing_lib_query.execute(model)
193     for row in results:
194         subid = row['subid']
195         name = row['name']
196         print "# {0}".format(name)
197         print "<{0}>".format(subid.uri)
198         print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
199         print ""
200     
201
def add_submission_creation_date(model, subUrn, cookie):
    """Record the submission's creation date, fetching it only when missing."""
    # in theory the submission page might have more information on it.
    creationDateN = libOntNS['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    if list(model.find_statements(query)):
        logger.debug("Found creation date for: {0}".format(str(subUrn)))
        return
    logger.info("Getting creation date for: {0}".format(str(subUrn)))
    soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
    created_label = soup.find(text="Created: ")
    if created_label:
        created_date = get_date_contents(created_label.next)
        date_node = RDF.Node(literal=created_date.isoformat(),
                             datatype=dateTimeType.uri)
        add_stmt(model, subUrn, creationDateN, date_node)
219
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Create or refresh the blank status node hanging off a submission.

    When no status node exists one is created and the DDF is downloaded.
    When status nodes exist, the DDF is re-fetched only for nodes whose
    stored last-modify date matches recent_update.

    :param status: current status string scraped from the UCSC table
    :param recent_update: isoformat last-modified date from the UCSC table
    """
    HasStatusN = submitOntologyNS['has_status']
    StatusN = submitOntologyNS['status']
    LastModifyN = submitOntologyNS['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        logger.info("Adding status node to {0}".format(subUrn))
        status_blank = RDF.Node()
        add_stmt(model, subUrn, HasStatusN, status_blank)
        # FIX: was rdfs['type'] and StatusT, both undefined names that
        # raised NameError whenever a new status node was added
        add_stmt(model, status_blank, rdfsNS['type'], submitOntologyNS['Status'])
        add_stmt(model, status_blank, StatusN, status)
        add_stmt(model, status_blank, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_blank, cookie=cookie)
    else:
        logger.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_blank = status_statement.object
            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_blank, cookie=cookie)
                    break
248
249
250     
def update_ddf(model, subUrn, statusNode, cookie):
    """Attach DDF file statements to statusNode unless already present.

    The DDF download URL is derived by rewriting the submission's 'show'
    URL into 'download_ddf'; the status node is then tagged as ddf-typed
    so the download is not repeated.
    """
    TypeN = rdfsNS['type']

    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    # FIX: removed unused local ddfUrn (RDF.Uri was built and never used)

    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
    if not model.contains_statement(status_is_ddf):
        # FIX: use the module logger, not the root logger, for consistency
        logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)
263
264
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements

    The first line is a whitespace-separated header naming the columns;
    each later tab-separated line lists a comma-separated file group in
    column 0 followed by attribute values for the remaining columns.
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [ddfNS[x] for x in header]
    # FIX: removed dead local 'statements = []' which was never used

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        # skip blank lines and comment lines
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
            add_stmt(model, fileNode, ddfNS['filename'], f)

            # FIX: renamed loop variable 'object' which shadowed the builtin
            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)
293
294
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.

    Parses the htsworkflow library index page (RDFa) into the model,
    then loads detail pages for each library found.
    """
    # affiliations__id__exact=44 selects ENCODE-affiliated libraries;
    # FIX: dropped the no-op single-argument os.path.join wrapper
    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
    rdfaParser = RDF.Parser(name='rdfa')
    # FIX: leftover debug 'print encodeUrl' replaced by a logger call
    logger.info("Loading encode library index {0}".format(encodeUrl))
    rdfaParser.parse_into_model(model, encodeUrl)
    query = RDF.Statement(None, libOntNS['library_id'], None)
    libraries = model.find_statements(query)
    for statement in libraries:
        libraryUrn = statement.subject
        load_library_detail(model, libraryUrn)
307
308
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page

    The library page is only fetched (as RDFa) when the model has no
    date statement for it yet; multiple dates indicate duplicate loads.
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libOntNS['date'], None)
    results = list(model.find_statements(query))
    if len(results) == 0:
        logger.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass # Assuming that a loaded dataset has one record
    else:
        # FIX: use the module logger, not the root logger, for consistency
        logger.warning("Many dates for {0}".format(libraryUrn))
322                         
def get_library_id(name):
    """Guess library ID from library name

    Accepts a 5-digit number or an SL#### token preceded by space or dash.
    Returns None when no ID-like token is found.
    """
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if match is None:
        return None
    return match.group('id')
331
332
def get_contents(element):
    """Return the element's first content string encoded, or None if empty.

    Prefers the text of a nested anchor tag when one exists.
    """
    if not element.contents:
        return None

    anchor = element.find('a')
    if anchor is not None:
        return anchor.contents[0].encode(CHARSET)
    return element.contents[0].encode(CHARSET)
344     
345     
def get_date_contents(element):
    """Parse the element's contents as 'YYYY-MM-DD HH:MM', or return None."""
    data = get_contents(element)
    if not data:
        return None
    return datetime.strptime(data, "%Y-%m-%d %H:%M")
352
353 def sparql_query(model, query_filename):
354     """Execute sparql query from file
355     """
356     query_body = open(query_filename,'r').read()
357     query = RDF.SPARQLQuery(query_body)
358     results = query.execute(model)
359     for row in results:
360         output = []
361         for k,v in row.items()[::-1]:
362             print "{0}: {1}".format(k,v)
363         print 
364
365         
def load_into_model(model, parser_name, filename):
    """Parse an RDF file from disk into the model.

    :param parser_name: Redland parser name, e.g. 'turtle'
    :raises IOError: when filename does not exist
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    # FIX: close the file promptly instead of leaking the handle
    with open(filename, 'r') as stream:
        data = stream.read()
    rdf_parser = RDF.Parser(name=parser_name)
    # use the submission ontology namespace as the base URI
    ns_uri = submitOntologyNS[''].uri
    rdf_parser.parse_string_into_model(model, data, ns_uri)
374
def add_stmt(model, subject, predicate, object):
    """Convenience helper: build an RDF Statement and add it to the model."""
    statement = RDF.Statement(subject, predicate, object)
    return model.add_statement(statement)
381
def login(cookie=None):
    """Return the given cookie, or log into UCSC and return a fresh one.

    The password comes from the system keyring entry for LOGIN_URL/USERNAME.
    Raises RuntimeError when no session cookie is returned.
    """
    if cookie is not None:
        return cookie

    password = keyring.get_keyring().get_password(LOGIN_URL, USERNAME)
    body = urllib.urlencode({'login': USERNAME,
                             'password': password})
    http = httplib2.Http()
    response, content = http.request(
        LOGIN_URL,
        'POST',
        headers={'Content-type': 'application/x-www-form-urlencoded'},
        body=body)
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
405
406                 
def get_url_as_soup(url, method, cookie=None):
    """Fetch url and return the body parsed as a BeautifulSoup tree.

    :param cookie: optional session cookie sent in the request headers
    :raises httplib2.HttpLib2ErrorWithResponse: on a non-200 status
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8", # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES
                             )
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # FIX: the exception was constructed but never raised, so HTTP
        # errors silently returned None to callers
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
423
def get_url_as_text(url, method, cookie=None):
    """Fetch url and return the raw response body.

    :param cookie: optional session cookie sent in the request headers
    :raises httplib2.HttpLib2ErrorWithResponse: on a non-200 status
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # FIX: the exception was constructed but never raised, so HTTP
        # errors silently returned None to callers
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
436     
437 ################
438 #  old stuff
# (submission name, library id) pairs for early submissions whose names
# carry no parseable library ID; kept for manual reconciliation.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
454
455
456
def select_by_library_id(submission_list):
    """Group submissions by library_id, most recent (by date) first.

    Submissions without a library_id are dropped. Returns a dict mapping
    library_id -> list of submissions sorted newest-first.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    for grouped in libraries.values():
        grouped.sort(key=attrgetter('date'), reverse=True)

    return libraries
467
def library_to_freeze(selected_libraries):
    """Render an HTML table of submissions grouped by library and freeze.

    :param selected_libraries: dict of library_id -> submissions sorted
        newest-first, as produced by select_by_library_id
    :returns: the HTML report as a single string
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    # FIX: removed dead initial assignment that was immediately overwritten
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket this library's submissions by the freeze they fall in
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # FIX: removed leftover debug 'print lib_id, batched'
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submissionNS[s.subid].uri
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # FIX: removed stray for/else that always appended an extra empty
        # <td>, making body rows one column wider than the header
        report.append("</tr>")
    report.append('</tbody>')
    # FIX: close <body> opened in the document header
    report.append("</table></body></html>")
    return "\n".join(report)
512
513             
def date_to_freeze(d):
    """Return the ENCODE freeze label whose deadline d falls before, or None."""
    deadlines = [(datetime(2010, 1, 30), '2010-Jan'),
                 (datetime(2010, 7, 30), '2010-Jul'),
                 (datetime(2011, 1, 30), '2011-Jan'),
                 ]
    for end, name in deadlines:
        if d < end:
            return name
    return None
524
# Script entry point when run directly.
if __name__ == "__main__":
    main()
527