Attempt to download DAF data for an encodesubmit submission
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2 """
3 Gather information about our submissions into a single RDF store
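
Example usage (options are defined in make_parser below; host and
authentication options come from htsworkflow.util.api):

    encode_find.py --update             # scrape encodesubmit and htsw into the model
    encode_find.py --sparql QUERY       # run a SPARQL query against the model
    encode_find.py --print-rdf         # serialize the current model state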
4 """
5
6 from BeautifulSoup import BeautifulSoup
7 from datetime import datetime
8 import httplib2
9 from operator import attrgetter
10 from optparse import OptionParser, OptionGroup
11 # python keyring
12 import keyring
13 import logging
14 import os
15 import re
16 # redland rdf lib
17 import RDF
18 import sys
19 import urllib
20 import urlparse
21
22 from htsworkflow.submission import daf
23
24 from htsworkflow.util import api
25 from htsworkflow.util.rdfhelp import \
26      dafTermOntology, \
27      dublinCoreNS, \
28      get_model, \
29      get_serializer, \
30      sparql_query, \
31      submissionOntology, \
32      libraryOntology, \
33      load_into_model, \
34      rdfNS, \
35      rdfsNS, \
36      xsdNS
37 TYPE_N = rdfNS['type']
38
39 # URL mappings
40 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
41
42 from htsworkflow.submission.ucsc import \
43      daf_download_url, \
44      ddf_download_url, \
45      submission_view_url, \
46      UCSCEncodePipeline
47
48 DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
49 DDF_NS = RDF.NS(DOWNLOAD_DDF)
50
51 DBDIR = os.path.expanduser("~diane/proj/submission")
52
53 LOGGER = logging.getLogger("encode_find")
54
55 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
56 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
57
58 USERNAME = 'detrout'
59 CHARSET = 'utf-8'
60
61
62 def main(cmdline=None):
63     """
64     Parse command line arguments
65
66     Takes a list of command line arguments (without the program name) or None.
67     If None, the arguments are taken from sys.argv.
68     """
69     parser = make_parser()
70     opts, args = parser.parse_args(cmdline)
71
72     if opts.debug:
73         logging.basicConfig(level=logging.DEBUG)
74     elif opts.verbose:
75         logging.basicConfig(level=logging.INFO)
76
77     htsw_authdata = api.make_auth_from_opts(opts, parser)
78     htswapi = api.HtswApi(opts.host, htsw_authdata)
79
80     cookie = None
81     model = get_model(opts.load_model, DBDIR)
82
83     if opts.load_rdf is not None:
84         ns_uri = submissionOntology[''].uri
85         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
86
87     if len(args) == 0:
88         limit = None
89     else:
90         limit = args
91
92     if opts.update:
93         cookie = login(cookie=cookie)
94         load_my_submissions(model, limit=limit, cookie=cookie)
95         load_encode_libraries(model, htswapi)
96
97     if opts.sparql is not None:
98         sparql_query(model, opts.sparql)
99
100     if opts.find_submission_with_no_library:
101         find_submissions_with_no_library(model)
102
103     if opts.print_rdf:
104         serializer = get_serializer(name=opts.rdf_parser_name)
105         print serializer.serialize_model_to_string(model)
106
107
108 def make_parser():
109     """Construct option parser
110     """
111     parser = OptionParser()
112     commands = OptionGroup(parser, "Commands")
113     commands.add_option('--load-model', default=None,
114       help="Load model database")
115     commands.add_option('--load-rdf', default=None,
116       help="load rdf statements into model")
117     commands.add_option('--print-rdf', action="store_true", default=False,
118       help="print ending model state")
119     commands.add_option('--update', action="store_true", default=False,
120       help="Query remote data sources and update our database")
121     #commands.add_option('--update-ucsc-status', default=None,
122     #  help="download status from ucsc, requires filename for extra rules")
123     #commands.add_option('--update-ddfs', action="store_true", default=False,
124     #  help="download ddf information for known submission")
125     #commands.add_option('--update-library', default=None,
126     #  help="download library info from htsw, "\
127     #       "requires filename for extra rules")
128     parser.add_option_group(commands)
129
130     queries = OptionGroup(parser, "Queries")
131     queries.add_option('--sparql', default=None,
132       help="execute arbitrary sparql query")
133     queries.add_option('--find-submission-with-no-library', default=False,
134       action="store_true",
135       help="find submissions with no library ID")
136     parser.add_option_group(queries)
137
138     options = OptionGroup(parser, "Options")
139     options.add_option("--rdf-parser-name", default="turtle",
140       help="set rdf file parser type")
141     options.add_option("-v", "--verbose", action="store_true", default=False)
142     options.add_option("--debug", action="store_true", default=False)
143     parser.add_option_group(options)
144
145     api.add_auth_options(parser)
146
147     return parser
148
149
150 def load_my_submissions(model, limit=None, cookie=None):
151     """Parse all of our submissions from UCSC into the model.
152     It uses the global USER_URL to determine whose submissions to scrape.
153     cookie holds the session cookie; if None, we attempt to log in.
154     """
155     if cookie is None:
156         cookie = login()
157
158     soup = get_url_as_soup(USER_URL, 'GET', cookie)
159     projects = soup.find('table', attrs={'id': 'projects'})
160     table_row = projects.findNext('tr')
161     # first record is header
162     table_row = table_row.findNext()
163     name_n = submissionOntology['name']
164     species_n = submissionOntology['species']
165     library_urn = submissionOntology['library_urn']
166
167     while table_row is not None:
168         cell = table_row.findAll('td')
169         if cell is not None and len(cell) > 1:
170             submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
171             if limit is None or submission_id in limit:
172                 subUrn = RDF.Uri(submission_view_url(submission_id))
173
174                 add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
175
176                 name = get_contents(cell[4])
177                 add_stmt(model, subUrn, name_n, name)
178
179                 species = get_contents(cell[2])
180                 if species is not None:
181                     add_stmt(model, subUrn, species_n, species)
182
183                 library_id = get_library_id(name)
184                 if library_id is not None:
185                     add_submission_to_library_urn(model,
186                                                   subUrn,
187                                                   library_urn,
188                                                   library_id)
189
190                 add_submission_creation_date(model, subUrn, cookie)
191
192                 # grab changing attributes
193                 status = get_contents(cell[6]).strip()
194                 last_mod_datetime = get_date_contents(cell[8])
195                 last_mod = last_mod_datetime.isoformat()
196
197                 update_submission_detail(model, subUrn, status, last_mod,
198                                          cookie=cookie)
199
200                 LOGGER.info("Processed {0}".format(subUrn))
201
202         table_row = table_row.findNext('tr')
203
204
205 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
206     """Add a link from a UCSC submission to woldlab library if needed
207     """
208     libraryUrn = LIBRARY_NS[library_id + '/']
209     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
210     if not model.contains_statement(query):
211         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
212         LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
213         model.add_statement(link)
214     else:
215         LOGGER.debug("Found: {0}".format(str(query)))
216
217
218 def find_submissions_with_no_library(model):
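    """Print a turtle-style stub for each submission lacking a library_urn,
    leaving the library URL to be completed by hand.
    """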
219     missing_lib_query = RDF.SPARQLQuery("""
220 PREFIX submissionOntology:<{submissionOntology}>
221
222 SELECT
223  ?subid ?name
224 WHERE {{
225   ?subid submissionOntology:name ?name
226   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
227   FILTER  (!bound(?libid))
228 }}""".format(submissionOntology=submissionOntology[''].uri))
229
230     results = missing_lib_query.execute(model)
231     for row in results:
232         subid = row['subid']
233         name = row['name']
234         print "# {0}".format(name)
235         print "<{0}>".format(subid.uri)
236         print "  encodeSubmit:library_urn "\
237               "<http://jumpgate.caltech.edu/library/> ."
238         print ""
239
240
241 def add_submission_creation_date(model, subUrn, cookie):
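    """Record a submission's creation date, scraping the submission page
    only if the model does not already contain a date for it.
    """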
242     # in theory the submission page might have more information on it.
243     creationDateN = libraryOntology['date']
244     dateTimeType = xsdNS['dateTime']
245     query = RDF.Statement(subUrn, creationDateN, None)
246     creation_dates = list(model.find_statements(query))
247     if len(creation_dates) == 0:
248         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
249         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
250         created_label = soup.find(text="Created: ")
251         if created_label:
252             created_date = get_date_contents(created_label.next)
253             created_date_node = RDF.Node(literal=created_date.isoformat(),
254                                          datatype=dateTimeType.uri)
255             add_stmt(model, subUrn, creationDateN, created_date_node)
256     else:
257         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
258
259
260 def update_submission_detail(model, subUrn, status, recent_update, cookie):
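    """Create or refresh the status node for a submission.

    A new status node triggers a DAF/DDF download; an existing one is only
    refreshed when its last_modify_date matches recent_update.
    """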
261     HasStatusN = submissionOntology['has_status']
262     StatusN = submissionOntology['status']
263     LastModifyN = submissionOntology['last_modify_date']
264
265     status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
266     status_nodes = list(model.find_statements(status_nodes_query))
267
268     if len(status_nodes) == 0:
269         # has no status node, add one
270         LOGGER.info("Adding status node to {0}".format(subUrn))
271         status_node = create_status_node(subUrn, recent_update)
272         add_stmt(model, subUrn, HasStatusN, status_node)
273         add_stmt(model, status_node, TYPE_N, StatusN)
274         add_stmt(model, status_node, StatusN, status)
275         add_stmt(model, status_node, LastModifyN, recent_update)
276         update_ddf(model, subUrn, status_node, cookie=cookie)
277         update_daf(model, subUrn, status_node, cookie=cookie)
278     else:
279         LOGGER.info("Found {0} status nodes".format(len(status_nodes)))
280         for status_statement in status_nodes:
281             status_node = status_statement.object
282             last_modified_query = RDF.Statement(status_node,
283                                                 LastModifyN,
284                                                 None)
285             last_mod_nodes = model.find_statements(last_modified_query)
286             for last_mod_statement in last_mod_nodes:
287                 last_mod_date = str(last_mod_statement.object)
288                 if recent_update == str(last_mod_date):
289                     update_ddf(model, subUrn, status_node, cookie=cookie)
290                     update_daf(model, subUrn, status_node, cookie=cookie)
291                     break
292
293
294 def update_daf(model, submission_url, status_node, cookie):
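    """Download a submission's DAF and load it into the model, unless the
    status node is already typed as a DAF.
    """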
295     download_daf_uri = str(submission_url).replace('show', 'download_daf')
296     daf_uri = RDF.Uri(download_daf_uri)
297
298     status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
299     if not model.contains_statement(status_is_daf):
300         LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
301                                                      status_node))
302         daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
303         daf.fromstring_into_model(model, status_node, daf_text)
304
305
306 def update_ddf(model, subUrn, statusNode, cookie):
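    """Download a submission's DDF and add its records to the model, unless
    the status node is already marked as having a DDF.
    """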
307     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
308     ddfUrn = RDF.Uri(download_ddf_url)
309
310     status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
311     if not model.contains_statement(status_is_ddf):
312         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
313         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
314         add_ddf_statements(model, statusNode, ddf_text)
315         model.add_statement(status_is_ddf)
316
317
318 def add_ddf_statements(model, statusNode, ddf_string):
319     """Convert a ddf text file into RDF Statements
320     """
321     ddf_lines = ddf_string.split('\n')
322     # first line is header
323     header = ddf_lines[0].split()
324     attributes = [DDF_NS[x] for x in header]
325
326     for ddf_line in ddf_lines[1:]:
327         ddf_line = ddf_line.strip()
328         if len(ddf_line) == 0:
329             continue
330         if ddf_line.startswith("#"):
331             continue
332
333         ddf_record = ddf_line.split('\t')
334         files = ddf_record[0].split(',')
335         file_attributes = ddf_record[1:]
336
337         for f in files:
338             fileNode = RDF.Node()
339             add_stmt(model,
340                      statusNode,
341                      submissionOntology['has_file'],
342                      fileNode)
343             add_stmt(model, fileNode, TYPE_N, DDF_NS['file'])
344             add_stmt(model, fileNode, DDF_NS['filename'], f)
345
346             for predicate, value in zip(attributes[1:], file_attributes):
347                 add_stmt(model, fileNode, predicate, value)
348
349
350 def load_encode_libraries(model, htswapi):
351     """Get libraries associated with encode.
352     """
353     encodeFilters = ["/library/?affiliations__id__exact=44",
354                      "/library/?affiliations__id__exact=80",
355                     ]
356
357     encodeUrls = [htswapi.root_url + u for u in encodeFilters]
358     rdfaParser = RDF.Parser(name='rdfa')
359     for encodeUrl in encodeUrls:
360         LOGGER.info("Scanning library url {0}".format(encodeUrl))
361         rdfaParser.parse_into_model(model, encodeUrl)
362         query = RDF.Statement(None, libraryOntology['library_id'], None)
363         libraries = model.find_statements(query)
364         for statement in libraries:
365             libraryUrn = statement.subject
366             LOGGER.info("Scanning {0}".format(str(libraryUrn)))
367             load_library_detail(model, libraryUrn)
368
369
370 def load_library_detail(model, libraryUrn):
371     """Grab detail information from library page
372     """
373     rdfaParser = RDF.Parser(name='rdfa')
374     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
375     results = list(model.find_statements(query))
376     log_message = "Found {0} statements for {1}"
377     LOGGER.debug(log_message.format(len(results), libraryUrn))
378     if len(results) == 0:
379         LOGGER.info("Loading {0}".format(str(libraryUrn)))
380         rdfaParser.parse_into_model(model, libraryUrn.uri)
381     elif len(results) == 1:
382         pass  # Assuming that a loaded dataset has one record
383     else:
384         LOGGER.warning("Multiple dates found for {0}".format(libraryUrn))
385
386
387 def get_library_id(name):
388     """Guess library ID from library name
389
390     >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
391     '11039'
392     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
393     '10150'
394     """
395     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
396     library_id = None
397     if match is not None:
398         library_id = match.group('id')
399     return library_id
400
401
402 def get_contents(element):
403     """Return an element's text (preferring the text of its first link), or None.
404     """
405     if len(element.contents) == 0:
406         return None
407
408     a = element.find('a')
409     if a is not None:
410         return a.contents[0].encode(CHARSET)
411
412     return element.contents[0].encode(CHARSET)
413
414
415 def create_status_node(submission_uri, timestamp):
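    """Build a URI node identifying a submission's status at a given timestamp."""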
416     submission_uri = daf.submission_uri_to_string(submission_uri)
417     status_uri = urlparse.urljoin(submission_uri, timestamp)
418     return RDF.Node(RDF.Uri(status_uri))
419
420 def get_date_contents(element):
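    """Parse an element's text as a 'YYYY-MM-DD HH:MM' datetime, or return None."""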
421     data = get_contents(element)
422     if data:
423         return datetime.strptime(data, "%Y-%m-%d %H:%M")
424     else:
425         return None
426
427
428 def add_stmt(model, subject, predicate, rdf_object):
429     """Convenience helper: build an RDF Statement and add it to the model
430     """
431     return model.add_statement(
432         RDF.Statement(subject, predicate, rdf_object))
433
434
435 def login(cookie=None):
436     """Login if we don't have a cookie
437     """
438     if cookie is not None:
439         return cookie
440
441     keys = keyring.get_keyring()
442     password = keys.get_password(LOGIN_URL, USERNAME)
443     credentials = {'login': USERNAME,
444                    'password': password}
445     headers = {'Content-type': 'application/x-www-form-urlencoded'}
446     http = httplib2.Http()
447     response, content = http.request(LOGIN_URL,
448                                      'POST',
449                                      headers=headers,
450                                      body=urllib.urlencode(credentials))
451     LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
452                                                     response['status']))
453
454     cookie = response.get('set-cookie', None)
455     if cookie is None:
456         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
457     return cookie
458
459
460 def get_url_as_soup(url, method, cookie=None):
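    """Fetch a URL (optionally sending a session cookie) and return it as BeautifulSoup."""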
461     http = httplib2.Http()
462     headers = {}
463     if cookie is not None:
464         headers['Cookie'] = cookie
465     response, content = http.request(url, method, headers=headers)
466     if response['status'] == '200':
467         soup = BeautifulSoup(content,
468                              fromEncoding="utf-8",  # should read from header
469                              convertEntities=BeautifulSoup.HTML_ENTITIES)
470         return soup
471     else:
472         msg = "error accessing {0}, status {1}"
473         msg = msg.format(url, response['status'])
474         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
475
476
477 def get_url_as_text(url, method, cookie=None):
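    """Fetch a URL (optionally sending a session cookie) and return the raw body."""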
478     http = httplib2.Http()
479     headers = {}
480     if cookie is not None:
481         headers['Cookie'] = cookie
482     response, content = http.request(url, method, headers=headers)
483     if response['status'] == '200':
484         return content
485     else:
486         msg = "error accessing {0}, status {1}"
487         msg = msg.format(url, response['status'])
488         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
489
490 ################
491 #  old stuff
492 SUBMISSIONS_LACKING_LIBID = [
493     ('1x75-Directional-HeLa-Rep1',    '11208'),
494     ('1x75-Directional-HeLa-Rep2',    '11207'),
495     ('1x75-Directional-HepG2-Rep1',   '11210'),
496     ('1x75-Directional-HepG2-Rep2',   '11209'),
497     ('1x75-Directional-H1-hESC-Rep1', '10947'),
498     ('1x75-Directional-H1-hESC-Rep2', '11009'),
499     ('1x75-Directional-HUVEC-Rep1',   '11206'),
500     ('1x75-Directional-HUVEC-Rep2',   '11205'),
501     ('1x75-Directional-K562-Rep1',    '11008'),
502     ('1x75-Directional-K562-Rep2',    '11007'),
503     ('1x75-Directional-NHEK-Rep1',    '11204'),
504     ('1x75-Directional-GM12878-Rep1', '11011'),
505     ('1x75-Directional-GM12878-Rep2', '11010'),
506     ]
507
508
509 def select_by_library_id(submission_list):
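    """Group submissions by library ID, newest first within each library."""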
510     subl = [(x.library_id, x) for x in submission_list if x.library_id]
511     libraries = {}
512     for lib_id, subobj in subl:
513         libraries.setdefault(lib_id, []).append(subobj)
514
515     for submissions in libraries.values():
516         submissions.sort(key=attrgetter('date'), reverse=True)
517
518     return libraries
519
520
521 def library_to_freeze(selected_libraries):
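    """Render an HTML table of libraries against the ENCODE freeze dates."""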
522     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
523     lib_ids = sorted(selected_libraries.keys())
525     report = ["""<html>
526 <head>
527 <style type="text/css">
528  td {border-width:0 0 1px 1px; border-style:solid;}
529 </style>
530 </head>
531 <body>
532 <table>
533 """]
534     report.append('<thead>')
535     report.append('<tr><td>Library ID</td><td>Name</td>')
536     for f in freezes:
537         report.append('<td>{0}</td>'.format(f))
538     report.append('</tr>')
539     report.append('</thead>')
540     report.append('<tbody>')
541     for lib_id in lib_ids:
542         report.append('<tr>')
543         lib_url = LIBRARY_NS[lib_id].uri
544         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
545         submissions = selected_libraries[lib_id]
546         report.append('<td>{0}</td>'.format(submissions[0].name))
547         batched = {}
548         for sub in submissions:
549             date = date_to_freeze(sub.date)
550             batched.setdefault(date, []).append(sub)
551         for d in freezes:
552             report.append('<td>')
553             for s in batched.get(d, []):
554                 show_url = submission_view_url(s.subid)
555                 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
556                 report.append("{0}:{1}".format(subid, s.status))
557             report.append('</td>')
560         report.append("</tr>")
561     report.append('</tbody>')
562     report.append("</table></body></html>")
563     return "\n".join(report)
564
565
566 def date_to_freeze(d):
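    """Map a datetime to the name of the first freeze ending after it, or None."""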
567     freezes = [(datetime(2010, 1, 30), '2010-Jan'),
568                (datetime(2010, 7, 30), '2010-Jul'),
569                (datetime(2011, 1, 30), '2011-Jan'),
570                ]
571     for end, name in freezes:
572         if d < end:
573             return name
574     return None
576
577 if __name__ == "__main__":
578     main()