Use a logger initialized to the module name much more consistently.
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""

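# Example invocations (a sketch: 'encode.db' is a hypothetical model name,
# and the htsworkflow auth options added by api.add_auth_options may also
# be required):
#
#   encode_find.py --load-model encode.db --update -v
#   encode_find.py --load-model encode.db --sparql report.sparql
#   encode_find.py --load-model encode.db --find-submission-with-no-library
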
from lxml.html import fromstring
from datetime import datetime
import httplib2
from operator import attrgetter
from optparse import OptionParser, OptionGroup
# python keyring
import keyring
import logging
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

from htsworkflow.submission import daf
from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     submission_view_url, \
     UCSCEncodePipeline

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     dublinCoreNS, \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     rdfsNS, \
     xsdNS

TYPE_N = rdfNS['type']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
DDF_NS = RDF.NS(DOWNLOAD_DDF)

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'


def main(cmdline=None):
    """Parse command line arguments and run the requested commands.

    Takes a list of arguments (assuming arg[0] is the program name) or
    None; if None, sys.argv is used.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.update:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Query remote data sources and update our database")
    #commands.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #commands.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #commands.add_option('--update-library', default=None,
    #  help="download library info from htsw, "\
    #       "requires filename for extra rules")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of the submissions from UCSC into model.

    Scrapes the project table at the global USER_URL; cookie holds the
    session cookie, and if it is None we attempt to log in first.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # skip header
    for row in table_rows[1:]:
        cell = row.xpath('td')
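        # column layout inferred from the indices used below:
        # 0=submission id, 2=species, 4=name, 6=status, 8=last modified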
        if len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species:  # skip empty species cells
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))


def find_submissions_with_no_library(model):
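    """Report submissions that have no library_urn.

    For each match this prints a Turtle-style stub so the missing
    encodeSubmit:library_urn triple can be filled in by hand.
    """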
    missing_lib_query = RDF.SPARQLQuery("""
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri))

    results = missing_lib_query.execute(model)
    for row in results:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
        cells = tree.xpath('//div[@id="content"]/table/tr/td')
        created_label = [x for x in cells
                         if x.text_content().startswith('Created')]
        if len(created_label) == 1:
            created_date = get_date_contents(created_label[0].getnext())
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            add_stmt(model, subUrn, creationDateN, created_date_node)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def update_submission_detail(model, subUrn, status, recent_update, cookie):
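    """Add or refresh the status node attached to a submission.

    A submission gets one status node per last-modified timestamp; the
    DDF and DAF are (re)fetched when a status node's timestamp matches
    the most recent update seen on the overview page.
    """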
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfsNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break


def update_daf(model, submission_url, status_node, cookie):
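    """Fetch and store a submission's DAF unless it was already recorded."""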
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url, status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
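    """Fetch and parse a submission's DDF unless it was already recorded."""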
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
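    # A DDF is tab-separated with a header line; the first column may
    # hold several comma-separated filenames. Hypothetical example:
    #
    #   files\tcell\tantibody
    #   a.bam,b.bam\tK562\tInput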
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DDF_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfsNS['type'], DDF_NS['file'])
            add_stmt(model, fileNode, DDF_NS['filename'], f)

            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)


def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
            load_library_detail(model, libraryUrn)


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id


def get_contents(element):
    """Return contents or None.
    """
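    # Note: this expects a BeautifulSoup-style element; lxml elements have
    # no .contents attribute, and nothing in this module calls it anymore.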
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
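    # e.g. (assuming the submission URI ends with a trailing slash):
    #   urljoin('http://example.org/show/1234/', '2011-02-17T14:30:00')
    #   -> 'http://example.org/show/1234/2011-02-17T14:30:00'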
    submission_uri = daf.submission_uri_to_string(submission_uri)
    status_uri = urlparse.urljoin(submission_uri, timestamp)
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
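    """Parse a cell like '2011-02-17 14:30' into a datetime, or None.

    >>> get_date_contents(fromstring('<div>2011-02-17 14:30</div>'))
    datetime.datetime(2011, 2, 17, 14, 30)
    """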
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function: create an RDF Statement and add it to a model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

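    # The password must already be in the system keyring under
    # (LOGIN_URL, USERNAME); it can be stored once with e.g.
    #   import keyring
    #   keyring.set_password(LOGIN_URL, USERNAME, password)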
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(
        LOGIN_URL, response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)


################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
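    """Group submissions by library id.

    Returns a dict mapping library_id -> [submissions...], each list
    sorted by date, most recent first.
    """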
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
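    """Format the selected_libraries mapping as an HTML table.

    Rows are libraries; columns list each library's submissions
    bucketed by ENCODE freeze (see date_to_freeze below).
    """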
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
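    """Map a date to the name of the freeze it belongs to, or None.

    >>> date_to_freeze(datetime(2010, 3, 1))
    '2010-Jul'
    """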
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None


if __name__ == "__main__":
    main()