#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""

from datetime import datetime
import hashlib
import httplib2
# python keyring
import keyring
import logging
from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

from htsworkflow.submission import daf
from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     submission_view_url, \
     UCSCEncodePipeline

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     dublinCoreNS, \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     rdfsNS, \
     xsdNS

TYPE_N = rdfNS['type']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
DDF_NS = RDF.NS(DOWNLOAD_DDF)

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

def main(cmdline=None):
    """
    Parse command line arguments and run the requested commands

    Takes a list of arguments (assuming arg[0] is the program name),
    or None, in which case sys.argv is used.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.update:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Query remote data sources and update our database")
    #commands.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #commands.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #commands.add_option('--update-library', default=None,
    #  help="download library info from htsw, "\
    #       "requires filename for extra rules")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of our submissions from the UCSC pipeline into model

    It looks at the global USER_URL to decide whose submissions to scrape.
    cookie holds the session cookie; if None, we attempt to log in.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # the first row is the table header, so skip it
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))


def find_submissions_with_no_library(model):
    """Print a Turtle stub for each submission that has no library link.
    """
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    results = missing_lib_query.execute(model)
    for row in results:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""

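# The printed stubs are meant to be completed by hand with the right library
# URL. A hypothetical submission 1234 named "FakeLib" would come out roughly
# as:
#
#   # FakeLib
#   <http://encodesubmit.ucsc.edu/pipeline/show/1234>
#     encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .
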
def add_submission_creation_date(model, subUrn, cookie):
    """Record when a submission was created, if we don't already know.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
        cells = tree.findall('.//td')
        created_label = [x for x in cells
                         if x.text_content().startswith('Created')]
        if len(created_label) == 1:
            created_date = get_date_contents(created_label[0].getnext())
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            add_stmt(model, subUrn, creationDateN, created_date_node)
        else:
            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
            LOGGER.warn(msg)
            raise Warning(msg)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Add or refresh the status node for a submission.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status nodes".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break

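# A sketch of the subgraph update_submission_detail builds; "subm:" stands
# for the submissionOntology namespace and the node names are hypothetical:
#
#   <submission> subm:has_status <submission-status> .
#   <submission-status> a subm:status ;
#       subm:status "reviewing" ;
#       subm:last_modify_date "2011-01-05T10:30:00" .
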
def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is the header
    header = ddf_lines[0].split()
    attributes = [DDF_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
            add_stmt(model, fileNode, DDF_NS['filename'], f)

            # skip attributes[0], the files column itself
            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)

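# A minimal, hypothetical DDF of the shape add_ddf_statements expects:
# a whitespace-split header line, then tab-separated records whose first
# column holds a comma-separated list of file names (tabs shown as spaces
# for readability):
#
#   files               view        cell
#   rep1.bam,rep2.bam   Alignments  GM12878
#
# Each file name becomes a fresh blank node carrying a DDF_NS filename
# triple, plus one triple per remaining header column (here: view, cell).
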
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
            load_library_detail(model, libraryUrn)


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id


def get_contents(element):
    """Return the text of an lxml element (or of its first link), or None.
    """
    text = element.text_content()
    if not text:
        return None

    a = element.find('.//a')
    if a is not None:
        return a.text_content().encode(CHARSET)

    return text.encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    """Build a URI node identifying a submission's status at a timestamp
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    status_uri = urlparse.urljoin(submission_uri, timestamp)
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    """Parse a cell like "2011-01-05 10:30" into a datetime, or None
    """
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie

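# login() reads the password from the system keyring. A one-time sketch of
# seeding it from a Python shell (the password value is a placeholder):
#
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'not-the-real-password')
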
def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    """Group submissions by library id, most recently modified first
    """
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # one cell per freeze, listing that freeze's submissions
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
    """Return the name of the first ENCODE freeze ending after d, or None
    """
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None

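# e.g. date_to_freeze(datetime(2010, 6, 1)) returns '2010-Jul', while any
# date on or after 2011-01-30 falls outside every freeze and returns None.
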
if __name__ == "__main__":
    main()