extra/ucsc_encode_submission/encode_find.py

   1 #!/usr/bin/env python
   2
   3 from BeautifulSoup import BeautifulSoup
   4 from datetime import datetime
   5 import httplib2
   6 from operator import attrgetter
   7 from optparse import OptionParser, OptionGroup
   8 # python keyring
   9 import keyring
  10 import logging
  11 import os
  12 import re
  13 # redland rdf lib
  14 import RDF
  15 import sys
  16 import urllib
  17 import urlparse
  18
  19 from htsworkflow.util import api
  20 from htsworkflow.util.rdfhelp import \
  21      dublinCoreNS, \
  22      get_model, \
  23      get_serializer, \
  24      sparql_query, \
  25      submissionOntology, \
  26      libraryOntology, \
  27      load_into_model, \
  28      rdfNS, \
  29      rdfsNS, \
  30      xsdNS
  31
  32 # URL mappings
  33 libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
  34
  35
  36 from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
  37 download_ddf = UCSCEncodePipeline+"download_ddf#"
  38 ddfNS = RDF.NS(download_ddf)
  39
  40 DBDIR = os.path.expanduser("~diane/proj/submission")
  41
  42 logger = logging.getLogger("encode_find")
  43
  44 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
  45 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
  46
  47 USERNAME = 'detrout'
  48 CHARSET = 'utf-8'
  49
  50 def main(cmdline=None):
  51     parser = make_parser()
  52     opts, args = parser.parse_args(cmdline)
  53
  54     if opts.debug:
  55         logging.basicConfig(level=logging.DEBUG)
  56     elif opts.verbose:
  57         logging.basicConfig(level=logging.INFO)
  58
  59     htsw_authdata = api.make_auth_from_opts(opts, parser)
  60     htswapi = api.HtswApi(opts.host, htsw_authdata)
  61
  62     cookie = None
  63     model = get_model(opts.load_model, DBDIR)
  64
  65     if opts.load_rdf is not None:
  66         ns_uri = submissionOntology[''].uri
  67         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
  68
  69     if opts.update:
  70         cookie = login(cookie=cookie)
  71         load_my_submissions(model, cookie=cookie)
  72         load_encode_libraries(model, htswapi)
  73
  74     if opts.sparql is not None:
  75         sparql_query(model, opts.sparql)
  76
  77     if opts.find_submission_with_no_library:
  78         missing = find_submissions_with_no_library(model)
  79
  80     if opts.print_rdf:
  81         serializer = get_serializer(name=opts.rdf_parser_name)
  82         print serializer.serialize_model_to_string(model)
  83
  84
  85 def make_parser():
  86     parser = OptionParser()
  87     commands = OptionGroup(parser, "Commands")
  88     commands.add_option('--load-model', default=None,
  89       help="Load model database")
  90     commands.add_option('--load-rdf', default=None,
  91       help="load rdf statements into model")
  92     commands.add_option('--print-rdf', action="store_true", default=False,
  93       help="print ending model state")
  94     commands.add_option('--update', action="store_true", default=False,
  95       help="Query remote data sources and update our database")
  96     #commands.add_option('--update-ucsc-status', default=None,
  97     #  help="download status from ucsc, requires filename for extra rules")
  98     #commands.add_option('--update-ddfs', action="store_true", default=False,
  99     #  help="download ddf information for known submission")
 100     #commands.add_option('--update-library', default=None,
 101     #  help="download library info from htsw, requires filename for extra rules")
 102     parser.add_option_group(commands)
 103
 104     queries = OptionGroup(parser, "Queries")
 105     queries.add_option('--sparql', default=None,
 106       help="execute arbitrary sparql query")
 107     queries.add_option('--find-submission-with-no-library', default=False,
 108       action="store_true",
 109       help="find submissions with no library ID")
 110     parser.add_option_group(queries)
 111
 112     options = OptionGroup(parser, "Options")
 113     options.add_option("--rdf-parser-name", default="turtle",
 114       help="set rdf file parser type")
 115     options.add_option("-v", "--verbose", action="store_true", default=False)
 116     options.add_option("--debug", action="store_true", default=False)
 117     parser.add_option_group(options)
 118
 119     api.add_auth_options(parser)
 120
 121     return parser
 122
 123 def load_my_submissions(model, cookie=None):
 124     if cookie is None:
 125         cookie = login()
 126
 127     soup = get_url_as_soup(USER_URL, 'GET', cookie)
 128     p = soup.find('table', attrs={'id':'projects'})
 129     tr = p.findNext('tr')
 130     # first record is header
 131     tr = tr.findNext()
 132     TypeN = rdfsNS['type']
 133     NameN = submissionOntology['name']
 134     SpeciesN = submissionOntology['species']
 135     LibraryURN = submissionOntology['library_urn']
 136
 137     while tr is not None:
 138         td = tr.findAll('td')
 139         if td is not None and len(td) > 1:
 140             subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
 141             subUrn = RDF.Uri(submission_view_url(subUrnText))
 142
 143             add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
 144
 145             name = get_contents(td[4])
 146             add_stmt(model, subUrn, NameN, name)
 147
 148             species = get_contents(td[2])
 149             if species is not None:
 150                 add_stmt(model, subUrn, SpeciesN, species)
 151
 152             library_id = get_library_id(name)
 153             if library_id is not None:
 154                 add_submission_to_library_urn(model,
 155                                               subUrn,
 156                                               LibraryURN,
 157                                               library_id)
 158
 159             add_submission_creation_date(model, subUrn, cookie)
 160
 161             # grab changing atttributes
 162             status = get_contents(td[6]).strip()
 163             last_mod_datetime = get_date_contents(td[8])
 164             last_mod = last_mod_datetime.isoformat()
 165
 166             update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
 167
 168             logging.info("Processed {0}".format( subUrn))
 169
 170         tr = tr.findNext('tr')
 171
 172
 173 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
 174     """Add a link from a UCSC submission to woldlab library if needed
 175     """
 176     libraryUrn = libraryNS[library_id+'/']
 177     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
 178     if not model.contains_statement(query):
 179         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
 180         logger.info("Adding Sub -> Lib link: {0}".format(link))
 181         model.add_statement(link)
 182     else:
 183         logger.debug("Found: {0}".format(str(query)))
 184
 185
 186 def find_submissions_with_no_library(model):
 187     missing_lib_query = RDF.SPARQLQuery("""
 188 PREFIX submissionOntology:<{submissionOntology}>
 189
 190 SELECT
 191  ?subid ?name
 192 WHERE {{
 193   ?subid submissionOntology:name ?name
 194   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
 195   FILTER  (!bound(?libid))
 196 }}""".format(submissionOntology=submissionOntology[''].uri)
 197 )
 198
 199     results = missing_lib_query.execute(model)
 200     for row in results:
 201         subid = row['subid']
 202         name = row['name']
 203         print "# {0}".format(name)
 204         print "<{0}>".format(subid.uri)
 205         print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
 206         print ""
 207
 208
 209 def add_submission_creation_date(model, subUrn, cookie):
 210     # in theory the submission page might have more information on it.
 211     creationDateN = libraryOntology['date']
 212     dateTimeType = xsdNS['dateTime']
 213     query = RDF.Statement(subUrn, creationDateN, None)
 214     creation_dates = list(model.find_statements(query))
 215     if len(creation_dates) == 0:
 216         logger.info("Getting creation date for: {0}".format(str(subUrn)))
 217         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
 218         created_label = soup.find(text="Created: ")
 219         if created_label:
 220             created_date = get_date_contents(created_label.next)
 221             created_date_node = RDF.Node(literal=created_date.isoformat(),
 222                                          datatype=dateTimeType.uri)
 223             add_stmt(model, subUrn, creationDateN, created_date_node)
 224     else:
 225         logger.debug("Found creation date for: {0}".format(str(subUrn)))
 226
 227 def update_submission_detail(model, subUrn, status, recent_update, cookie):
 228     HasStatusN = submissionOntology['has_status']
 229     StatusN = submissionOntology['status']
 230     LastModifyN = submissionOntology['last_modify_date']
 231
 232     status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
 233     status_nodes = list(model.find_statements(status_nodes_query))
 234
 235     if len(status_nodes) == 0:
 236         # has no status node, add one
 237         logging.info("Adding status node to {0}".format(subUrn))
 238         status_blank = RDF.Node()
 239         add_stmt(model, subUrn, HasStatusN, status_blank)
 240         add_stmt(model, status_blank, rdfsNS['type'], StatusN)
 241         add_stmt(model, status_blank, StatusN, status)
 242         add_stmt(model, status_blank, LastModifyN, recent_update)
 243         update_ddf(model, subUrn, status_blank, cookie=cookie)
 244     else:
 245         logging.info("Found {0} status blanks".format(len(status_nodes)))
 246         for status_statement in status_nodes:
 247             status_blank = status_statement.object
 248             last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
 249             last_mod_nodes = model.find_statements(last_modified_query)
 250             for last_mod_statement in last_mod_nodes:
 251                 last_mod_date = str(last_mod_statement.object)
 252                 if recent_update == str(last_mod_date):
 253                     update_ddf(model, subUrn, status_blank, cookie=cookie)
 254                     break
 255
 256
 257
 258 def update_ddf(model, subUrn, statusNode, cookie):
 259     TypeN = rdfsNS['type']
 260
 261     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
 262     ddfUrn = RDF.Uri(download_ddf_url)
 263
 264     status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
 265     if not model.contains_statement(status_is_ddf):
 266         logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
 267         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
 268         add_ddf_statements(model, statusNode, ddf_text)
 269         model.add_statement(status_is_ddf)
 270
 271
 272 def add_ddf_statements(model, statusNode, ddf_string):
 273     """Convert a ddf text file into RDF Statements
 274     """
 275     ddf_lines = ddf_string.split('\n')
 276     # first line is header
 277     header = ddf_lines[0].split()
 278     attributes = [ ddfNS[x] for x in header ]
 279     statements = []
 280
 281     for ddf_line in ddf_lines[1:]:
 282         ddf_line = ddf_line.strip()
 283         if len(ddf_line) == 0:
 284             continue
 285         if ddf_line.startswith("#"):
 286             continue
 287
 288         ddf_record = ddf_line.split('\t')
 289         files = ddf_record[0].split(',')
 290         file_attributes = ddf_record[1:]
 291
 292         for f in files:
 293             fileNode = RDF.Node()
 294             add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
 295             add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
 296             add_stmt(model, fileNode, ddfNS['filename'], f)
 297
 298             for predicate, object in zip( attributes[1:], file_attributes):
 299                 add_stmt(model, fileNode, predicate, object)
 300
 301
 302 def load_encode_libraries(model, htswapi):
 303     """Get libraries associated with encode.
 304     """
 305     encodeFilters = ["/library/?affiliations__id__exact=44",
 306                      "/library/?affiliations__id__exact=80",]
 307
 308
 309     encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
 310     rdfaParser = RDF.Parser(name='rdfa')
 311     for encodeUrl in encodeUrls:
 312         logger.info("Scanning library url {0}".format(encodeUrl))
 313         rdfaParser.parse_into_model(model, encodeUrl)
 314         query = RDF.Statement(None, libraryOntology['library_id'], None)
 315         libraries = model.find_statements(query)
 316         for statement in libraries:
 317             libraryUrn = statement.subject
 318             logger.info("Scanning {0}".format(str(libraryUrn)))
 319             load_library_detail(model, libraryUrn)
 320
 321
 322 def load_library_detail(model, libraryUrn):
 323     """Grab detail information from library page
 324     """
 325     rdfaParser = RDF.Parser(name='rdfa')
 326     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
 327     results = list(model.find_statements(query))
 328     logger.debug("Found {0} statements for {1}".format(len(results), libraryUrn))
 329     if len(results) == 0:
 330         logger.info("Loading {0}".format(str(libraryUrn)))
 331         rdfaParser.parse_into_model(model, libraryUrn.uri)
 332     elif len(results) == 1:
 333         pass # Assuming that a loaded dataset has one record
 334     else:
 335         logging.warning("Many dates for {0}".format(libraryUrn))
 336
 337 def get_library_id(name):
 338     """Guess library ID from library name
 339
 340     >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
 341     '11039'
 342     >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
 343     '10150'
 344     """
 345     match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
 346     library_id = None
 347     if match is not None:
 348         library_id = match.group('id')
 349     return library_id
 350
 351
 352 def get_contents(element):
 353     """Return contents or none.
 354     """
 355     if len(element.contents) == 0:
 356         return None
 357
 358     a = element.find('a')
 359     if a is not None:
 360         return a.contents[0].encode(CHARSET)
 361
 362     return element.contents[0].encode(CHARSET)
 363
 364
 365 def get_date_contents(element):
 366     data = get_contents(element)
 367     if data:
 368         return datetime.strptime(data, "%Y-%m-%d %H:%M")
 369     else:
 370         return None
 371
 372
 373 def add_stmt(model, subject, predicate, object):
 374     """Convienence create RDF Statement and add to a model
 375     """
 376     return model.add_statement(
 377         RDF.Statement(subject, predicate, object)
 378     )
 379
 380
 381 def login(cookie=None):
 382     """Login if we don't have a cookie
 383     """
 384     if cookie is not None:
 385         return cookie
 386
 387     keys = keyring.get_keyring()
 388     password = keys.get_password(LOGIN_URL, USERNAME)
 389     credentials = {'login': USERNAME,
 390                    'password': password}
 391     headers = {'Content-type': 'application/x-www-form-urlencoded'}
 392     http = httplib2.Http()
 393     response, content = http.request(LOGIN_URL,
 394                                      'POST',
 395                                      headers=headers,
 396                                      body=urllib.urlencode(credentials))
 397     logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
 398                                                     response['status']))
 399
 400     cookie = response.get('set-cookie', None)
 401     if cookie is None:
 402         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
 403     return cookie
 404
 405
 406 def get_url_as_soup(url, method, cookie=None):
 407     http = httplib2.Http()
 408     headers = {}
 409     if cookie is not None:
 410         headers['Cookie'] = cookie
 411     response, content = http.request(url, method, headers=headers)
 412     if response['status'] == '200':
 413         soup = BeautifulSoup(content,
 414                              fromEncoding="utf-8", # should read from header
 415                              convertEntities=BeautifulSoup.HTML_ENTITIES
 416                              )
 417         return soup
 418     else:
 419         msg = "error accessing {0}, status {1}"
 420         msg = msg.format(url, response['status'])
 421         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 422
 423 def get_url_as_text(url, method, cookie=None):
 424     http = httplib2.Http()
 425     headers = {}
 426     if cookie is not None:
 427         headers['Cookie'] = cookie
 428     response, content = http.request(url, method, headers=headers)
 429     if response['status'] == '200':
 430         return content
 431     else:
 432         msg = "error accessing {0}, status {1}"
 433         msg = msg.format(url, response['status'])
 434         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 435
 436 ################
 437 #  old stuff
 438 SUBMISSIONS_LACKING_LIBID = [
 439     ('1x75-Directional-HeLa-Rep1',    '11208'),
 440     ('1x75-Directional-HeLa-Rep2',    '11207'),
 441     ('1x75-Directional-HepG2-Rep1',   '11210'),
 442     ('1x75-Directional-HepG2-Rep2',   '11209'),
 443     ('1x75-Directional-H1-hESC-Rep1', '10947'),
 444     ('1x75-Directional-H1-hESC-Rep2', '11009'),
 445     ('1x75-Directional-HUVEC-Rep1',   '11206'),
 446     ('1x75-Directional-HUVEC-Rep2',   '11205'),
 447     ('1x75-Directional-K562-Rep1',    '11008'),
 448     ('1x75-Directional-K562-Rep2',    '11007'),
 449     ('1x75-Directional-NHEK-Rep1',    '11204'),
 450     ('1x75-Directional-GM12878-Rep1', '11011'),
 451     ('1x75-Directional-GM12878-Rep2', '11010'),
 452     ]
 453
 454
 455
 456 def select_by_library_id(submission_list):
 457     subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
 458     libraries = {}
 459     for lib_id, subobj in subl:
 460         libraries.setdefault(lib_id, []).append(subobj)
 461
 462     for submission in libraries.values():
 463         submission.sort(key=attrgetter('date'), reverse=True)
 464
 465     return libraries
 466
 467 def library_to_freeze(selected_libraries):
 468     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
 469     lib_ids = sorted(selected_libraries.keys())
 470     report = ['<html><table border="1">']
 471     report = ["""<html>
 472 <head>
 473 <style type="text/css">
 474  td {border-width:0 0 1px 1px; border-style:solid;}
 475 </style>
 476 </head>
 477 <body>
 478 <table>
 479 """]
 480     report.append('<thead>')
 481     report.append('<tr><td>Library ID</td><td>Name</td>')
 482     for f in freezes:
 483         report.append('<td>{0}</td>'.format(f))
 484     report.append('</tr>')
 485     report.append('</thead>')
 486     report.append('<tbody>')
 487     for lib_id in lib_ids:
 488         report.append('<tr>')
 489         lib_url = libraryNS[lib_id].uri
 490         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
 491         submissions = selected_libraries[lib_id]
 492         report.append('<td>{0}</td>'.format(submissions[0].name))
 493         batched = {}
 494         for sub in submissions:
 495             date = date_to_freeze(sub.date)
 496             batched.setdefault(date, []).append(sub)
 497         print lib_id, batched
 498         for d in freezes:
 499             report.append('<td>')
 500             for s in batched.get(d, []):
 501                 show_url = submission_view_url(s.subid)
 502                 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
 503                 report.append("{0}:{1}".format(subid, s.status))
 504             report.append('</td>')
 505         else:
 506             report.append('<td></td>')
 507         report.append("</tr>")
 508     report.append('</tbody>')
 509     report.append("</table></html>")
 510     return "\n".join(report)
 511
 512
 513 def date_to_freeze(d):
 514     freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
 515                 (datetime(2010, 7, 30), '2010-Jul'),
 516                 (datetime(2011, 1, 30), '2011-Jan'),
 517                 ]
 518     for end, name in freezes:
 519         if d < end:
 520             return name
 521     else:
 522         return None
 523
 524 if __name__ == "__main__":
 525     main()
 526