#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""

from datetime import datetime
import hashlib
import httplib2
# python keyring
import keyring
import logging
from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

from htsworkflow.submission import daf
from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     submission_view_url, \
     UCSCEncodePipeline

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     dublinCoreNS, \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     rdfsNS, \
     xsdNS

TYPE_N = rdfNS['type']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
DDF_NS = RDF.NS(DOWNLOAD_DDF)

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

def main(cmdline=None):
    """
    Parse command line arguments and run the requested commands

    Takes a list of arguments (assuming arg[0] is the program name),
    or None, in which case sys.argv is used.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.update:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Query remote data sources and update our database")
    #commands.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #commands.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #commands.add_option('--update-library', default=None,
    #  help="download library info from htsw, "\
    #       "requires filename for extra rules")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of our submissions from the UCSC pipeline into model

    It looks at the global USER_URL to decide whose submissions to scrape.
    cookie holds the session cookie; if None, we attempt to log in.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # the first row is the table header, so skip it
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))


def find_submissions_with_no_library(model):
    """Print a Turtle stub for each submission that has no library link.
    """
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    results = missing_lib_query.execute(model)
    for row in results:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""

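# The printed stubs are meant to be completed by hand with the right library
# URL. A hypothetical submission 1234 named "FakeLib" would come out roughly
# as:
#
#   # FakeLib
#   <http://encodesubmit.ucsc.edu/pipeline/show/1234>
#     encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .
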
def add_submission_creation_date(model, subUrn, cookie):
    """Record when a submission was created, if we don't already know.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
        cells = tree.findall('.//td')
        created_label = [x for x in cells
                         if x.text_content().startswith('Created')]
        if len(created_label) == 1:
            created_date = get_date_contents(created_label[0].getnext())
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            add_stmt(model, subUrn, creationDateN, created_date_node)
        else:
            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
            LOGGER.warn(msg)
            raise Warning(msg)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Add or refresh the status node for a submission.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status nodes".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break

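# A sketch of the subgraph update_submission_detail builds; "subm:" stands
# for the submissionOntology namespace and the node names are hypothetical:
#
#   <submission> subm:has_status <submission-status> .
#   <submission-status> a subm:status ;
#       subm:status "reviewing" ;
#       subm:last_modify_date "2011-01-05T10:30:00" .
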
def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is the header
    header = ddf_lines[0].split()
    attributes = [DDF_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
            add_stmt(model, fileNode, DDF_NS['filename'], f)

            # skip attributes[0], the files column itself
            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)

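# A minimal, hypothetical DDF of the shape add_ddf_statements expects:
# a whitespace-split header line, then tab-separated records whose first
# column holds a comma-separated list of file names (tabs shown as spaces
# for readability):
#
#   files               view        cell
#   rep1.bam,rep2.bam   Alignments  GM12878
#
# Each file name becomes a fresh blank node carrying a DDF_NS filename
# triple, plus one triple per remaining header column (here: view, cell).
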
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
            load_library_detail(model, libraryUrn)


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id


def get_contents(element):
    """Return the text of an lxml element (or of its first link), or None.
    """
    text = element.text_content()
    if not text:
        return None

    a = element.find('.//a')
    if a is not None:
        return a.text_content().encode(CHARSET)

    return text.encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    """Build a URI node identifying a submission's status at a timestamp
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    status_uri = urlparse.urljoin(submission_uri, timestamp)
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    """Parse a cell like "2011-01-05 10:30" into a datetime, or None
    """
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie

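# login() reads the password from the system keyring. A one-time sketch of
# seeding it from a Python shell (the password value is a placeholder):
#
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'not-the-real-password')
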
def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    """Group submissions by library id, most recently modified first
    """
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # one cell per freeze, listing that freeze's submissions
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
    """Return the name of the first ENCODE freeze ending after d, or None
    """
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None

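# e.g. date_to_freeze(datetime(2010, 6, 1)) returns '2010-Jul', while any
# date on or after 2011-01-30 falls outside every freeze and returns None.
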
if __name__ == "__main__":
    main()