extra/ucsc_encode_submission/encode_find.py

   1 #!/usr/bin/env python
   2
   3 from BeautifulSoup import BeautifulSoup
   4 from datetime import datetime
   5 import httplib2
   6 from operator import attrgetter
   7 from optparse import OptionParser, OptionGroup
   8 # python keyring
   9 import keyring
  10 import logging
  11 import os
  12 import re
  13 # redland rdf lib
  14 import RDF
  15 import sys
  16 import urllib
  17
  18 from htsworkflow.util import api
  19
# Directory passed to the BDB hash storage when get_model() opens a named model.
DBDIR = os.path.expanduser("~diane/proj/submission")

# Module-level logger; several functions also log through the root logger.
logger = logging.getLogger("encode_find")

# Namespaces for project resources (libraries, UCSC submissions, DDF files)
# and for the ontologies used to describe them.
libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")

# Standard vocabularies.
# NOTE(review): code in this file uses rdfsNS['type'] as the type predicate;
# the usual rdf:type term lives under rdfNS -- confirm which is intended.
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# UCSC submission site pages used by login() and load_my_submissions().
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

# Account name used for the keyring lookup and the login form.
USERNAME = 'detrout'
# Encoding applied to text scraped from HTML pages.
CHARSET = 'utf-8'
  40
  41 def main(cmdline=None):
  42     parser = make_parser()
  43     opts, args = parser.parse_args(cmdline)
  44
  45     if opts.verbose:
  46         logging.basicConfig(level=logging.INFO)
  47
  48     htsw_authdata = api.make_auth_from_opts(opts, parser)
  49     htswapi = api.HtswApi(opts.host, htsw_authdata)
  50
  51     cookie = None
  52     model = get_model(opts.load_model)
  53
  54     if opts.load_rdf is not None:
  55         load_into_model(model, opts.rdf_parser_name, opts.load_rdf)
  56
  57     if opts.update:
  58         cookie = login(cookie=cookie)
  59         load_my_submissions(model, cookie=cookie)
  60         load_encode_libraries(model, htswapi)
  61
  62     if opts.sparql is not None:
  63         sparql_query(model, opts.sparql)
  64
  65     if opts.find_submission_with_no_library:
  66         missing = find_submissions_with_no_library(model)
  67
  68     if opts.print_rdf:
  69         serializer = RDF.Serializer(name=opts.rdf_parser_name)
  70         print serializer.serialize_model_to_string(model)
  71
  72
  73 def make_parser():
  74     parser = OptionParser()
  75     commands = OptionGroup(parser, "Commands")
  76     commands.add_option('--load-model', default=None,
  77       help="Load model database")
  78     commands.add_option('--load-rdf', default=None,
  79       help="load rdf statements into model")
  80     commands.add_option('--print-rdf', action="store_true", default=False,
  81       help="print ending model state")
  82     commands.add_option('--update', action="store_true", default=False,
  83       help="Query remote data sources and update our database")
  84     #commands.add_option('--update-ucsc-status', default=None,
  85     #  help="download status from ucsc, requires filename for extra rules")
  86     #commands.add_option('--update-ddfs', action="store_true", default=False,
  87     #  help="download ddf information for known submission")
  88     #commands.add_option('--update-library', default=None,
  89     #  help="download library info from htsw, requires filename for extra rules")
  90     parser.add_option_group(commands)
  91
  92     queries = OptionGroup(parser, "Queries")
  93     queries.add_option('--sparql', default=None,
  94       help="execute arbitrary sparql query")
  95     queries.add_option('--find-submission-with-no-library', default=False,
  96       action="store_true",
  97       help="find submissions with no library ID")
  98     parser.add_option_group(queries)
  99
 100     options = OptionGroup(parser, "Options")
 101     options.add_option("--rdf-parser-name", default="turtle",
 102       help="set rdf file parser type")
 103     options.add_option("-v", "--verbose", action="store_true", default=False)
 104     parser.add_option_group(options)
 105
 106     api.add_auth_options(parser)
 107
 108     return parser
 109
 110 def get_model(model_name=None):
 111     if model_name is None:
 112         storage = RDF.MemoryStorage()
 113     else:
 114         storage = RDF.HashStorage(model_name,
 115                       options="hash-type='bdb',dir='{0}'".format(DBDIR))
 116     model = RDF.Model(storage)
 117     return model
 118
 119 def load_my_submissions(model, cookie=None):
 120     if cookie is None:
 121         cookie = login()
 122
 123     soup = get_url_as_soup(USER_URL, 'GET', cookie)
 124     p = soup.find('table', attrs={'id':'projects'})
 125     tr = p.findNext('tr')
 126     # first record is header
 127     tr = tr.findNext()
 128     TypeN = rdfsNS['type']
 129     NameN = submitOntologyNS['name']
 130     SpeciesN = submitOntologyNS['species']
 131     LibraryURN = submitOntologyNS['library_urn']
 132
 133     while tr is not None:
 134         td = tr.findAll('td')
 135         if td is not None and len(td) > 1:
 136             subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
 137             subUrn = submissionNS[subUrnText]
 138
 139             add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])
 140
 141             name = get_contents(td[4])
 142             add_stmt(model, subUrn, NameN, name)
 143
 144             species = get_contents(td[2])
 145             if species is not None:
 146                 add_stmt(model, subUrn, SpeciesN, species)
 147
 148             library_id = get_library_id(name)
 149             if library_id is not None:
 150                 add_submission_to_library_urn(model,
 151                                               subUrn,
 152                                               LibraryURN,
 153                                               library_id)
 154
 155             add_submission_creation_date(model, subUrn, cookie)
 156
 157             # grab changing atttributes
 158             status = get_contents(td[6]).strip()
 159             last_mod_datetime = get_date_contents(td[8])
 160             last_mod = last_mod_datetime.isoformat()
 161
 162             update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
 163
 164             logging.info("Processed {0}".format( subUrn))
 165
 166         tr = tr.findNext('tr')
 167
 168
 169 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
 170     """Add a link from a UCSC submission to woldlab library if needed
 171     """
 172     libraryUrn = libraryNS[library_id]
 173     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
 174     if not model.contains_statement(query):
 175         link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
 176         logger.info("Adding Sub -> Lib link: {0}".format(link))
 177         model.add_statement(link)
 178     else:
 179         logger.debug("Found: {0}".format(str(query)))
 180
 181
 182 def find_submissions_with_no_library(model):
 183     missing_lib_query = RDF.SPARQLQuery("""
 184 PREFIX submissionOntology:<{submissionOntology}>
 185
 186 SELECT
 187  ?subid ?name
 188 WHERE {{
 189   ?subid submissionOntology:name ?name
 190   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
 191   FILTER  (!bound(?libid))
 192 }}""".format(submissionOntology=submitOntologyNS[''].uri)
 193 )
 194
 195     results = missing_lib_query.execute(model)
 196     for row in results:
 197         subid = row['subid']
 198         name = row['name']
 199         print "# {0}".format(name)
 200         print "<{0}>".format(subid.uri)
 201         print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
 202         print ""
 203
 204
 205 def add_submission_creation_date(model, subUrn, cookie):
 206     # in theory the submission page might have more information on it.
 207     creationDateN = libOntNS['date']
 208     dateTimeType = xsdNS['dateTime']
 209     query = RDF.Statement(subUrn, creationDateN, None)
 210     creation_dates = list(model.find_statements(query))
 211     if len(creation_dates) == 0:
 212         logger.info("Getting creation date for: {0}".format(str(subUrn)))
 213         soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
 214         created_label = soup.find(text="Created: ")
 215         if created_label:
 216             created_date = get_date_contents(created_label.next)
 217             created_date_node = RDF.Node(literal=created_date.isoformat(),
 218                                          datatype=dateTimeType.uri)
 219             add_stmt(model, subUrn, creationDateN, created_date_node)
 220     else:
 221         logger.debug("Found creation date for: {0}".format(str(subUrn)))
 222
 223 def update_submission_detail(model, subUrn, status, recent_update, cookie):
 224     HasStatusN = submitOntologyNS['has_status']
 225     StatusN = submitOntologyNS['status']
 226     LastModifyN = submitOntologyNS['last_modify_date']
 227
 228     status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
 229     status_nodes = list(model.find_statements(status_nodes_query))
 230
 231     if len(status_nodes) == 0:
 232         # has no status node, add one
 233         logging.info("Adding status node to {0}".format(subUrn))
 234         status_blank = RDF.Node()
 235         add_stmt(model, subUrn, HasStatusN, status_blank)
 236         add_stmt(model, status_blank, rdfsNS['type'], StatusN)
 237         add_stmt(model, status_blank, StatusN, status)
 238         add_stmt(model, status_blank, LastModifyN, recent_update)
 239         update_ddf(model, subUrn, status_blank, cookie=cookie)
 240     else:
 241         logging.info("Found {0} status blanks".format(len(status_nodes)))
 242         for status_statement in status_nodes:
 243             status_blank = status_statement.object
 244             last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
 245             last_mod_nodes = model.find_statements(last_modified_query)
 246             for last_mod_statement in last_mod_nodes:
 247                 last_mod_date = str(last_mod_statement.object)
 248                 if recent_update == str(last_mod_date):
 249                     update_ddf(model, subUrn, status_blank, cookie=cookie)
 250                     break
 251
 252
 253
 254 def update_ddf(model, subUrn, statusNode, cookie):
 255     TypeN = rdfsNS['type']
 256
 257     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
 258     ddfUrn = RDF.Uri(download_ddf_url)
 259
 260     status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
 261     if not model.contains_statement(status_is_ddf):
 262         logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
 263         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
 264         add_ddf_statements(model, statusNode, ddf_text)
 265         model.add_statement(status_is_ddf)
 266
 267
 268 def add_ddf_statements(model, statusNode, ddf_string):
 269     """Convert a ddf text file into RDF Statements
 270     """
 271     ddf_lines = ddf_string.split('\n')
 272     # first line is header
 273     header = ddf_lines[0].split()
 274     attributes = [ ddfNS[x] for x in header ]
 275     statements = []
 276
 277     for ddf_line in ddf_lines[1:]:
 278         ddf_line = ddf_line.strip()
 279         if len(ddf_line) == 0:
 280             continue
 281         if ddf_line.startswith("#"):
 282             continue
 283
 284         ddf_record = ddf_line.split('\t')
 285         files = ddf_record[0].split(',')
 286         file_attributes = ddf_record[1:]
 287
 288         for f in files:
 289             fileNode = RDF.Node()
 290             add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
 291             add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
 292             add_stmt(model, fileNode, ddfNS['filename'], f)
 293
 294             for predicate, object in zip( attributes[1:], file_attributes):
 295                 add_stmt(model, fileNode, predicate, object)
 296
 297
 298 def load_encode_libraries(model, htswapi):
 299     """Get libraries associated with encode.
 300     """
 301     encodeUrl = os.path.join(htswapi.root_url + "/library/?affiliations__id__exact=44")
 302     rdfaParser = RDF.Parser(name='rdfa')
 303     print encodeUrl
 304     rdfaParser.parse_into_model(model, encodeUrl)
 305     query = RDF.Statement(None, libOntNS['library_id'], None)
 306     libraries = model.find_statements(query)
 307     for statement in libraries:
 308         libraryUrn = statement.subject
 309         load_library_detail(model, libraryUrn)
 310
 311
 312 def load_library_detail(model, libraryUrn):
 313     """Grab detail information from library page
 314     """
 315     rdfaParser = RDF.Parser(name='rdfa')
 316     query = RDF.Statement(libraryUrn, libOntNS['date'], None)
 317     results = list(model.find_statements(query))
 318     if len(results) == 0:
 319         logger.info("Loading {0}".format(str(libraryUrn)))
 320         rdfaParser.parse_into_model(model, libraryUrn.uri)
 321     elif len(results) == 1:
 322         pass # Assuming that a loaded dataset has one record
 323     else:
 324         logging.warning("Many dates for {0}".format(libraryUrn))
 325
 326 def get_library_id(name):
 327     """Guess library ID from library name
 328     """
 329     match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
 330     library_id = None
 331     if match is not None:
 332         library_id = match.group('id')
 333     return library_id
 334
 335
 336 def get_contents(element):
 337     """Return contents or none.
 338     """
 339     if len(element.contents) == 0:
 340         return None
 341
 342     a = element.find('a')
 343     if a is not None:
 344         return a.contents[0].encode(CHARSET)
 345
 346     return element.contents[0].encode(CHARSET)
 347
 348
 349 def get_date_contents(element):
 350     data = get_contents(element)
 351     if data:
 352         return datetime.strptime(data, "%Y-%m-%d %H:%M")
 353     else:
 354         return None
 355
 356 def sparql_query(model, query_filename):
 357     """Execute sparql query from file
 358     """
 359     query_body = open(query_filename,'r').read()
 360     query = RDF.SPARQLQuery(query_body)
 361     results = query.execute(model)
 362     for row in results:
 363         output = []
 364         for k,v in row.items()[::-1]:
 365             print "{0}: {1}".format(k,v)
 366         print
 367
 368
 369 def load_into_model(model, parser_name, filename):
 370     if not os.path.exists(filename):
 371         raise IOError("Can't find {0}".format(filename))
 372
 373     data = open(filename, 'r').read()
 374     rdf_parser = RDF.Parser(name=parser_name)
 375     ns_uri = submitOntologyNS[''].uri
 376     rdf_parser.parse_string_into_model(model, data, ns_uri)
 377
 378 def add_stmt(model, subject, predicate, object):
 379     """Convienence create RDF Statement and add to a model
 380     """
 381     return model.add_statement(
 382         RDF.Statement(subject, predicate, object)
 383     )
 384
 385 def login(cookie=None):
 386     """Login if we don't have a cookie
 387     """
 388     if cookie is not None:
 389         return cookie
 390
 391     keys = keyring.get_keyring()
 392     password = keys.get_password(LOGIN_URL, USERNAME)
 393     credentials = {'login': USERNAME,
 394                    'password': password}
 395     headers = {'Content-type': 'application/x-www-form-urlencoded'}
 396     http = httplib2.Http()
 397     response, content = http.request(LOGIN_URL,
 398                                      'POST',
 399                                      headers=headers,
 400                                      body=urllib.urlencode(credentials))
 401     logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
 402                                                     response['status']))
 403
 404     cookie = response.get('set-cookie', None)
 405     if cookie is None:
 406         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
 407     return cookie
 408
 409
 410 def get_url_as_soup(url, method, cookie=None):
 411     http = httplib2.Http()
 412     headers = {}
 413     if cookie is not None:
 414         headers['Cookie'] = cookie
 415     response, content = http.request(url, method, headers=headers)
 416     if response['status'] == '200':
 417         soup = BeautifulSoup(content,
 418                              fromEncoding="utf-8", # should read from header
 419                              convertEntities=BeautifulSoup.HTML_ENTITIES
 420                              )
 421         return soup
 422     else:
 423         msg = "error accessing {0}, status {1}"
 424         msg = msg.format(url, response['status'])
 425         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 426
 427 def get_url_as_text(url, method, cookie=None):
 428     http = httplib2.Http()
 429     headers = {}
 430     if cookie is not None:
 431         headers['Cookie'] = cookie
 432     response, content = http.request(url, method, headers=headers)
 433     if response['status'] == '200':
 434         return content
 435     else:
 436         msg = "error accessing {0}, status {1}"
 437         msg = msg.format(url, response['status'])
 438         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 439
################
#  old stuff
# Hand-maintained (submission name, library id) pairs for submissions whose
# names do not contain a library id.
# NOTE(review): nothing in the visible code reads this table -- confirm it
# is still needed.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
 457
 458
 459
 460 def select_by_library_id(submission_list):
 461     subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
 462     libraries = {}
 463     for lib_id, subobj in subl:
 464         libraries.setdefault(lib_id, []).append(subobj)
 465
 466     for submission in libraries.values():
 467         submission.sort(key=attrgetter('date'), reverse=True)
 468
 469     return libraries
 470
 471 def library_to_freeze(selected_libraries):
 472     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
 473     lib_ids = sorted(selected_libraries.keys())
 474     report = ['<html><table border="1">']
 475     report = ["""<html>
 476 <head>
 477 <style type="text/css">
 478  td {border-width:0 0 1px 1px; border-style:solid;}
 479 </style>
 480 </head>
 481 <body>
 482 <table>
 483 """]
 484     report.append('<thead>')
 485     report.append('<tr><td>Library ID</td><td>Name</td>')
 486     for f in freezes:
 487         report.append('<td>{0}</td>'.format(f))
 488     report.append('</tr>')
 489     report.append('</thead>')
 490     report.append('<tbody>')
 491     for lib_id in lib_ids:
 492         report.append('<tr>')
 493         lib_url = libraryNS[lib_id].uri
 494         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
 495         submissions = selected_libraries[lib_id]
 496         report.append('<td>{0}</td>'.format(submissions[0].name))
 497         batched = {}
 498         for sub in submissions:
 499             date = date_to_freeze(sub.date)
 500             batched.setdefault(date, []).append(sub)
 501         print lib_id, batched
 502         for d in freezes:
 503             report.append('<td>')
 504             for s in batched.get(d, []):
 505                 show_url = submissionNS[s.subid].uri
 506                 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
 507                 report.append("{0}:{1}".format(subid, s.status))
 508             report.append('</td>')
 509         else:
 510             report.append('<td></td>')
 511         report.append("</tr>")
 512     report.append('</tbody>')
 513     report.append("</table></html>")
 514     return "\n".join(report)
 515
 516
 517 def date_to_freeze(d):
 518     freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
 519                 (datetime(2010, 7, 30), '2010-Jul'),
 520                 (datetime(2011, 1, 30), '2011-Jan'),
 521                 ]
 522     for end, name in freezes:
 523         if d < end:
 524             return name
 525     else:
 526         return None
 527
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()
 530