Attempt to download DAF data for an encodesubmit submission
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2 """
3 Gather information about our submissions into a single RDF store
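
Example usage (options are defined in make_parser below; host and
authentication options come from htsworkflow.util.api):

    encode_find.py --update             # scrape encodesubmit and htsw into the model
    encode_find.py --sparql QUERY       # run a SPARQL query against the model
    encode_find.py --print-rdf         # serialize the current model state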
4 """
5
6 from BeautifulSoup import BeautifulSoup
7 from datetime import datetime
8 import httplib2
9 from operator import attrgetter
10 from optparse import OptionParser, OptionGroup
11 # python keyring
12 import keyring
13 import logging
14 import os
15 import re
16 # redland rdf lib
17 import RDF
18 import sys
19 import urllib
20 import urlparse
21
22 from htsworkflow.submission import daf
23
24 from htsworkflow.util import api
25 from htsworkflow.util.rdfhelp import \
26      dafTermOntology, \
27      dublinCoreNS, \
28      get_model, \
29      get_serializer, \
30      sparql_query, \
31      submissionOntology, \
32      libraryOntology, \
33      load_into_model, \
34      rdfNS, \
35      rdfsNS, \
36      xsdNS
37 TYPE_N = rdfNS['type']
38
39 # URL mappings
40 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
41
42 from htsworkflow.submission.ucsc import \
43      daf_download_url, \
44      ddf_download_url, \
45      submission_view_url, \
46      UCSCEncodePipeline
47
48 DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
49 DDF_NS = RDF.NS(DOWNLOAD_DDF)
50
51 DBDIR = os.path.expanduser("~diane/proj/submission")
52
53 LOGGER = logging.getLogger("encode_find")
54
55 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
56 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
57
58 USERNAME = 'detrout'
59 CHARSET = 'utf-8'
60
61
62 def main(cmdline=None):
63     """
64     Parse command line arguments
65
66     Takes a list of command line arguments (without the program name) or None.
67     If None, the arguments are taken from sys.argv.
68     """
69     parser = make_parser()
70     opts, args = parser.parse_args(cmdline)
71
72     if opts.debug:
73         logging.basicConfig(level=logging.DEBUG)
74     elif opts.verbose:
75         logging.basicConfig(level=logging.INFO)
76
77     htsw_authdata = api.make_auth_from_opts(opts, parser)
78     htswapi = api.HtswApi(opts.host, htsw_authdata)
79
80     cookie = None
81     model = get_model(opts.load_model, DBDIR)
82
83     if opts.load_rdf is not None:
84         ns_uri = submissionOntology[''].uri
85         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
86
87     if len(args) == 0:
88         limit = None
89     else:
90         limit = args
91
92     if opts.update:
93         cookie = login(cookie=cookie)
94         load_my_submissions(model, limit=limit, cookie=cookie)
95         load_encode_libraries(model, htswapi)
96
97     if opts.sparql is not None:
98         sparql_query(model, opts.sparql)
99
100     if opts.find_submission_with_no_library:
101         find_submissions_with_no_library(model)
102
103     if opts.print_rdf:
104         serializer = get_serializer(name=opts.rdf_parser_name)
105         print serializer.serialize_model_to_string(model)
106
107
108 def make_parser():
109     """Construct option parser
110     """
111     parser = OptionParser()
112     commands = OptionGroup(parser, "Commands")
113     commands.add_option('--load-model', default=None,
114       help="Load model database")
115     commands.add_option('--load-rdf', default=None,
116       help="load rdf statements into model")
117     commands.add_option('--print-rdf', action="store_true", default=False,
118       help="print ending model state")
119     commands.add_option('--update', action="store_true", default=False,
120       help="Query remote data sources and update our database")
121     #commands.add_option('--update-ucsc-status', default=None,
122     #  help="download status from ucsc, requires filename for extra rules")
123     #commands.add_option('--update-ddfs', action="store_true", default=False,
124     #  help="download ddf information for known submission")
125     #commands.add_option('--update-library', default=None,
126     #  help="download library info from htsw, "\
127     #       "requires filename for extra rules")
128     parser.add_option_group(commands)
129
130     queries = OptionGroup(parser, "Queries")
131     queries.add_option('--sparql', default=None,
132       help="execute arbitrary sparql query")
133     queries.add_option('--find-submission-with-no-library', default=False,
134       action="store_true",
135       help="find submissions with no library ID")
136     parser.add_option_group(queries)
137
138     options = OptionGroup(parser, "Options")
139     options.add_option("--rdf-parser-name", default="turtle",
140       help="set rdf file parser type")
141     options.add_option("-v", "--verbose", action="store_true", default=False)
142     options.add_option("--debug", action="store_true", default=False)
143     parser.add_option_group(options)
144
145     api.add_auth_options(parser)
146
147     return parser
148
149
150 def load_my_submissions(model, limit=None, cookie=None):
151     """Parse all of our submissions from UCSC into the model.
152     It uses the global USER_URL to determine whose submissions to scrape.
153     cookie holds the session cookie; if None, we attempt to log in.
154     """
155     if cookie is None:
156         cookie = login()
157
158     soup = get_url_as_soup(USER_URL, 'GET', cookie)
159     projects = soup.find('table', attrs={'id': 'projects'})
160     table_row = projects.findNext('tr')
161     # first record is header
162     table_row = table_row.findNext()
163     name_n = submissionOntology['name']
164     species_n = submissionOntology['species']
165     library_urn = submissionOntology['library_urn']
166
167     while table_row is not None:
168         cell = table_row.findAll('td')
169         if cell is not None and len(cell) > 1:
170             submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
171             if limit is None or submission_id in limit:
172                 subUrn = RDF.Uri(submission_view_url(submission_id))
173
174                 add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
175
176                 name = get_contents(cell[4])
177                 add_stmt(model, subUrn, name_n, name)
178
179                 species = get_contents(cell[2])
180                 if species is not None:
181                     add_stmt(model, subUrn, species_n, species)
182
183                 library_id = get_library_id(name)
184                 if library_id is not None:
185                     add_submission_to_library_urn(model,
186                                                   subUrn,
187                                                   library_urn,
188                                                   library_id)
189
190                 add_submission_creation_date(model, subUrn, cookie)
191
192                 # grab changing attributes
193                 status = get_contents(cell[6]).strip()
194                 last_mod_datetime = get_date_contents(cell[8])
195                 last_mod = last_mod_datetime.isoformat()
196
197                 update_submission_detail(model, subUrn, status, last_mod,
198                                          cookie=cookie)
199
200                 LOGGER.info("Processed {0}".format(subUrn))
201
202         table_row = table_row.findNext('tr')
203
204
205 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
206     """Add a link from a UCSC submission to woldlab library if needed
207     """
208     libraryUrn = LIBRARY_NS[library_id + '/']
209     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
210     if not model.contains_statement(query):
211         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
212         LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
213         model.add_statement(link)
214     else:
215         LOGGER.debug("Found: {0}".format(str(query)))
216
217
218 def find_submissions_with_no_library(model):
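    """Print a turtle-style stub for each submission lacking a library_urn,
    leaving the library URL to be completed by hand.
    """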
219     missing_lib_query = RDF.SPARQLQuery("""
220 PREFIX submissionOntology:<{submissionOntology}>
221
222 SELECT
223  ?subid ?name
224 WHERE {{
225   ?subid submissionOntology:name ?name
226   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
227   FILTER  (!bound(?libid))
228 }}""".format(submissionOntology=submissionOntology[''].uri))
229
230     results = missing_lib_query.execute(model)
231     for row in results:
232         subid = row['subid']
233         name = row['name']
234         print "# {0}".format(name)
235         print "<{0}>".format(subid.uri)
236         print "  encodeSubmit:library_urn "\
237               "<http://jumpgate.caltech.edu/library/> ."
238         print ""
239
240
241 def add_submission_creation_date(model, subUrn, cookie):
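    """Record a submission's creation date, scraping the submission page
    only if the model does not already contain a date for it.
    """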
242     # in theory the submission page might have more information on it.
243     creationDateN = libraryOntology['date']
244     dateTimeType = xsdNS['dateTime']
245     query = RDF.Statement(subUrn, creationDateN, None)
246     creation_dates = list(model.find_statements(query))
247     if len(creation_dates) == 0:
248         LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
249         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
250         created_label = soup.find(text="Created: ")
251         if created_label:
252             created_date = get_date_contents(created_label.next)
253             created_date_node = RDF.Node(literal=created_date.isoformat(),
254                                          datatype=dateTimeType.uri)
255             add_stmt(model, subUrn, creationDateN, created_date_node)
256     else:
257         LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
258
259
260 def update_submission_detail(model, subUrn, status, recent_update, cookie):
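    """Create or refresh the status node for a submission.

    A new status node triggers a DAF/DDF download; an existing one is only
    refreshed when its last_modify_date matches recent_update.
    """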
261     HasStatusN = submissionOntology['has_status']
262     StatusN = submissionOntology['status']
263     LastModifyN = submissionOntology['last_modify_date']
264
265     status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
266     status_nodes = list(model.find_statements(status_nodes_query))
267
268     if len(status_nodes) == 0:
269         # has no status node, add one
270         LOGGER.info("Adding status node to {0}".format(subUrn))
271         status_node = create_status_node(subUrn, recent_update)
272         add_stmt(model, subUrn, HasStatusN, status_node)
273         add_stmt(model, status_node, TYPE_N, StatusN)
274         add_stmt(model, status_node, StatusN, status)
275         add_stmt(model, status_node, LastModifyN, recent_update)
276         update_ddf(model, subUrn, status_node, cookie=cookie)
277         update_daf(model, subUrn, status_node, cookie=cookie)
278     else:
279         LOGGER.info("Found {0} status nodes".format(len(status_nodes)))
280         for status_statement in status_nodes:
281             status_node = status_statement.object
282             last_modified_query = RDF.Statement(status_node,
283                                                 LastModifyN,
284                                                 None)
285             last_mod_nodes = model.find_statements(last_modified_query)
286             for last_mod_statement in last_mod_nodes:
287                 last_mod_date = str(last_mod_statement.object)
288                 if recent_update == str(last_mod_date):
289                     update_ddf(model, subUrn, status_node, cookie=cookie)
290                     update_daf(model, subUrn, status_node, cookie=cookie)
291                     break
292
293
294 def update_daf(model, submission_url, status_node, cookie):
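    """Download a submission's DAF and load it into the model, unless the
    status node is already typed as a DAF.
    """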
295     download_daf_uri = str(submission_url).replace('show', 'download_daf')
296     daf_uri = RDF.Uri(download_daf_uri)
297
298     status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
299     if not model.contains_statement(status_is_daf):
300         LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
301                                                      status_node))
302         daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
303         daf.fromstring_into_model(model, status_node, daf_text)
304
305
306 def update_ddf(model, subUrn, statusNode, cookie):
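    """Download a submission's DDF and add its records to the model, unless
    the status node is already marked as having a DDF.
    """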
307     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
308     ddfUrn = RDF.Uri(download_ddf_url)
309
310     status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
311     if not model.contains_statement(status_is_ddf):
312         LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
313         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
314         add_ddf_statements(model, statusNode, ddf_text)
315         model.add_statement(status_is_ddf)
316
317
318 def add_ddf_statements(model, statusNode, ddf_string):
319     """Convert a ddf text file into RDF Statements
320     """
321     ddf_lines = ddf_string.split('\n')
322     # first line is header
323     header = ddf_lines[0].split()
324     attributes = [DDF_NS[x] for x in header]
325
326     for ddf_line in ddf_lines[1:]:
327         ddf_line = ddf_line.strip()
328         if len(ddf_line) == 0:
329             continue
330         if ddf_line.startswith("#"):
331             continue
332
333         ddf_record = ddf_line.split('\t')
334         files = ddf_record[0].split(',')
335         file_attributes = ddf_record[1:]
336
337         for f in files:
338             fileNode = RDF.Node()
339             add_stmt(model,
340                      statusNode,
341                      submissionOntology['has_file'],
342                      fileNode)
343             add_stmt(model, fileNode, TYPE_N, DDF_NS['file'])
344             add_stmt(model, fileNode, DDF_NS['filename'], f)
345
346             for predicate, value in zip(attributes[1:], file_attributes):
347                 add_stmt(model, fileNode, predicate, value)
348
349
350 def load_encode_libraries(model, htswapi):
351     """Get libraries associated with encode.
352     """
353     encodeFilters = ["/library/?affiliations__id__exact=44",
354                      "/library/?affiliations__id__exact=80",
355                     ]
356
357     encodeUrls = [htswapi.root_url + u for u in encodeFilters]
358     rdfaParser = RDF.Parser(name='rdfa')
359     for encodeUrl in encodeUrls:
360         LOGGER.info("Scanning library url {0}".format(encodeUrl))
361         rdfaParser.parse_into_model(model, encodeUrl)
362         query = RDF.Statement(None, libraryOntology['library_id'], None)
363         libraries = model.find_statements(query)
364         for statement in libraries:
365             libraryUrn = statement.subject
366             LOGGER.info("Scanning {0}".format(str(libraryUrn)))
367             load_library_detail(model, libraryUrn)
368
369
370 def load_library_detail(model, libraryUrn):
371     """Grab detail information from library page
372     """
373     rdfaParser = RDF.Parser(name='rdfa')
374     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
375     results = list(model.find_statements(query))
376     log_message = "Found {0} statements for {1}"
377     LOGGER.debug(log_message.format(len(results), libraryUrn))
378     if len(results) == 0:
379         LOGGER.info("Loading {0}".format(str(libraryUrn)))
380         rdfaParser.parse_into_model(model, libraryUrn.uri)
381     elif len(results) == 1:
382         pass  # Assuming that a loaded dataset has one record
383     else:
384         LOGGER.warning("Multiple dates found for {0}".format(libraryUrn))
385
386
387 def get_library_id(name):
388     """Guess library ID from library name
389
390     >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
391     '11039'
392     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
393     '10150'
394     """
395     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
396     library_id = None
397     if match is not None:
398         library_id = match.group('id')
399     return library_id
400
401
402 def get_contents(element):
403     """Return an element's text (preferring the text of its first link), or None.
404     """
405     if len(element.contents) == 0:
406         return None
407
408     a = element.find('a')
409     if a is not None:
410         return a.contents[0].encode(CHARSET)
411
412     return element.contents[0].encode(CHARSET)
413
414
415 def create_status_node(submission_uri, timestamp):
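    """Build a URI node identifying a submission's status at a given timestamp."""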
416     submission_uri = daf.submission_uri_to_string(submission_uri)
417     status_uri = urlparse.urljoin(submission_uri, timestamp)
418     return RDF.Node(RDF.Uri(status_uri))
419
420 def get_date_contents(element):
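    """Parse an element's text as a 'YYYY-MM-DD HH:MM' datetime, or return None."""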
421     data = get_contents(element)
422     if data:
423         return datetime.strptime(data, "%Y-%m-%d %H:%M")
424     else:
425         return None
426
427
428 def add_stmt(model, subject, predicate, rdf_object):
429     """Convenience helper: build an RDF Statement and add it to the model
430     """
431     return model.add_statement(
432         RDF.Statement(subject, predicate, rdf_object))
433
434
435 def login(cookie=None):
436     """Login if we don't have a cookie
437     """
438     if cookie is not None:
439         return cookie
440
441     keys = keyring.get_keyring()
442     password = keys.get_password(LOGIN_URL, USERNAME)
443     credentials = {'login': USERNAME,
444                    'password': password}
445     headers = {'Content-type': 'application/x-www-form-urlencoded'}
446     http = httplib2.Http()
447     response, content = http.request(LOGIN_URL,
448                                      'POST',
449                                      headers=headers,
450                                      body=urllib.urlencode(credentials))
451     LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
452                                                     response['status']))
453
454     cookie = response.get('set-cookie', None)
455     if cookie is None:
456         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
457     return cookie
458
459
460 def get_url_as_soup(url, method, cookie=None):
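    """Fetch a URL (optionally sending a session cookie) and return it as BeautifulSoup."""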
461     http = httplib2.Http()
462     headers = {}
463     if cookie is not None:
464         headers['Cookie'] = cookie
465     response, content = http.request(url, method, headers=headers)
466     if response['status'] == '200':
467         soup = BeautifulSoup(content,
468                              fromEncoding="utf-8",  # should read from header
469                              convertEntities=BeautifulSoup.HTML_ENTITIES)
470         return soup
471     else:
472         msg = "error accessing {0}, status {1}"
473         msg = msg.format(url, response['status'])
474         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
475
476
477 def get_url_as_text(url, method, cookie=None):
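    """Fetch a URL (optionally sending a session cookie) and return the raw body."""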
478     http = httplib2.Http()
479     headers = {}
480     if cookie is not None:
481         headers['Cookie'] = cookie
482     response, content = http.request(url, method, headers=headers)
483     if response['status'] == '200':
484         return content
485     else:
486         msg = "error accessing {0}, status {1}"
487         msg = msg.format(url, response['status'])
488         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
489
490 ################
491 #  old stuff
492 SUBMISSIONS_LACKING_LIBID = [
493     ('1x75-Directional-HeLa-Rep1',    '11208'),
494     ('1x75-Directional-HeLa-Rep2',    '11207'),
495     ('1x75-Directional-HepG2-Rep1',   '11210'),
496     ('1x75-Directional-HepG2-Rep2',   '11209'),
497     ('1x75-Directional-H1-hESC-Rep1', '10947'),
498     ('1x75-Directional-H1-hESC-Rep2', '11009'),
499     ('1x75-Directional-HUVEC-Rep1',   '11206'),
500     ('1x75-Directional-HUVEC-Rep2',   '11205'),
501     ('1x75-Directional-K562-Rep1',    '11008'),
502     ('1x75-Directional-K562-Rep2',    '11007'),
503     ('1x75-Directional-NHEK-Rep1',    '11204'),
504     ('1x75-Directional-GM12878-Rep1', '11011'),
505     ('1x75-Directional-GM12878-Rep2', '11010'),
506     ]
507
508
509 def select_by_library_id(submission_list):
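    """Group submissions by library ID, newest first within each library."""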
510     subl = [(x.library_id, x) for x in submission_list if x.library_id]
511     libraries = {}
512     for lib_id, subobj in subl:
513         libraries.setdefault(lib_id, []).append(subobj)
514
515     for submissions in libraries.values():
516         submissions.sort(key=attrgetter('date'), reverse=True)
517
518     return libraries
519
520
521 def library_to_freeze(selected_libraries):
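    """Render an HTML table of libraries against the ENCODE freeze dates."""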
522     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
523     lib_ids = sorted(selected_libraries.keys())
525     report = ["""<html>
526 <head>
527 <style type="text/css">
528  td {border-width:0 0 1px 1px; border-style:solid;}
529 </style>
530 </head>
531 <body>
532 <table>
533 """]
534     report.append('<thead>')
535     report.append('<tr><td>Library ID</td><td>Name</td>')
536     for f in freezes:
537         report.append('<td>{0}</td>'.format(f))
538     report.append('</tr>')
539     report.append('</thead>')
540     report.append('<tbody>')
541     for lib_id in lib_ids:
542         report.append('<tr>')
543         lib_url = LIBRARY_NS[lib_id].uri
544         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
545         submissions = selected_libraries[lib_id]
546         report.append('<td>{0}</td>'.format(submissions[0].name))
547         batched = {}
548         for sub in submissions:
549             date = date_to_freeze(sub.date)
550             batched.setdefault(date, []).append(sub)
551         for d in freezes:
552             report.append('<td>')
553             for s in batched.get(d, []):
554                 show_url = submission_view_url(s.subid)
555                 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
556                 report.append("{0}:{1}".format(subid, s.status))
557             report.append('</td>')
560         report.append("</tr>")
561     report.append('</tbody>')
562     report.append("</table></body></html>")
563     return "\n".join(report)
564
565
566 def date_to_freeze(d):
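    """Map a datetime to the name of the first freeze ending after it, or None."""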
567     freezes = [(datetime(2010, 1, 30), '2010-Jan'),
568                (datetime(2010, 7, 30), '2010-Jul'),
569                (datetime(2011, 1, 30), '2011-Jan'),
570                ]
571     for end, name in freezes:
572         if d < end:
573             return name
574     return None
576
577 if __name__ == "__main__":
578     main()