#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
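
Example invocation (a sketch; see make_parser() for the full option
list):

    encode_find.py --update --print-rdf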
4 """
5
6 from datetime import datetime
7 import hashlib
8 import httplib2
9 import keyring
10 import logging
11 from lxml.html import fromstring
12 from operator import attrgetter
13 from optparse import OptionParser, OptionGroup
14 # python keyring
15 import os
16 import re
17 # redland rdf lib
18 import RDF
19 import sys
20 import urllib
21 import urlparse
22
from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     dublinCoreNS, \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     rdfsNS, \
     xsdNS

TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     get_ucsc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
                   "{genome}/encodeDCC/{composite}/"


def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of command line arguments or None; if None,
    sys.argv[1:] is used.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.update:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
        load_encode_libraries(model, htswapi)
        our_tracks = [
            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHist'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHistone'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechTfbs'},
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Query remote data sources and update our database")
    #commands.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #commands.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #commands.add_option('--update-library', default=None,
    #  help="download library info from htsw, "\
    #       "requires filename for extra rules")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of our submissions from the UCSC pipeline into the model.

    Scrapes the page at the global USER_URL; cookie is the session
    cookie, and if it is None we attempt to log in.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # first row is the header, so skip it
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))


def find_submissions_with_no_library(model):
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    results = missing_lib_query.execute(model)
    for row in results:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
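    """Add or refresh the status node for a submission.

    If the submission has no status node yet, one is created and the
    submission's ddf and daf are downloaded; otherwise they are only
    re-fetched for the status node whose last_modify_date matches
    recent_update.
    """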
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status nodes".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break


def update_daf(model, submission_url, status_node, cookie):
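    """Download the daf for a submission and load it into the model.

    Does nothing if status_node is already typed as a daf; otherwise
    the daf text and its md5sum are recorded under status_node.
    """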
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF statements.
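
    The parsing below expects roughly this layout (a sketch; the header
    is whitespace separated, records are tab separated, and the first
    field may name several comma-separated files):

        filename        cell     antibody
        a.bam,b.bam     GM12878  exampleAntibody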
    """
    ddf_lines = ddf_string.split('\n')
    # first line is the header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)


def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
            load_library_detail(model, libraryUrn)


def load_encodedcc_files(model, base_url):
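    """Load the file index of an encodeDCC download area into the model.

    Each file under base_url becomes a subject with one DCC_NS
    statement per attribute reported by ucsc.get_ucsc_file_index.
    """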
    if base_url[-1] != '/':
        base_url += '/'

    file_index = ucsc.get_ucsc_file_index(base_url)
    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(base_url + filename))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id


def get_contents(element):
    """Return the text contents of an element, or None.

    Note: this expects a BeautifulSoup-style element (it uses
    .contents), not the lxml trees used elsewhere in this module.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to a model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
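    """Group a list of submissions by library id.

    Returns a dictionary mapping each library_id to its submissions,
    sorted most recent first by their date attribute.
    """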
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # one cell per freeze, matching the header row
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)


def date_to_freeze(d):
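    """Return the name of the ENCODE freeze a date falls into, or None.

    A couple of examples based on the table below:

    >>> date_to_freeze(datetime(2010, 6, 1))
    '2010-Jul'
    >>> date_to_freeze(datetime(2012, 1, 1)) is None
    True
    """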
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    else:
        return None


if __name__ == "__main__":
    main()