Major updates to encode_find for extracting encode cell line life cycle.
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2
3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
5 import httplib2
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
8 # python keyring
9 import keyring
10 import logging
11 import os
12 import re
13 # redland rdf lib
14 import RDF 
15 import sys
16 import urllib
17
18 from htsworkflow.util import api
19
20 logger = logging.getLogger("encode_find")
21
22 libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
23 submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
24 submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
25 ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
26 libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
27
28 dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
29 rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
30 rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
31 xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
32
33 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
34 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
35
36 USERNAME = 'detrout'
37 CHARSET = 'utf-8'
38
39 def main(cmdline=None):
40     parser = make_parser()
41     opts, args = parser.parse_args(cmdline)
42
43     if opts.verbose:
44         logging.basicConfig(level=logging.INFO)
45
46     htsw_authdata = api.make_auth_from_opts(opts, parser)
47     htswapi = api.HtswApi(opts.host, htsw_authdata)
48     
49     cookie = None
50     model = get_model(opts.load_model)
51     
52     if opts.load_rdf is not None:
53         load_into_model(model, opts.rdf_parser_name, opts.load_rdf)
54         
55     if opts.update:
56         cookie = login(cookie=cookie)
57         load_my_submissions(model, cookie=cookie)
58         update_submission_detail(model, cookie=cookie)
59         load_libraries(model, htswapi)
60
61     if opts.sparql is not None:
62         sparql_query(model, opts.sparql)
63
64     if opts.find_submission_with_no_library:
65         missing = find_submissions_with_no_library(model)
66                 
67     if opts.print_rdf:
68         serializer = RDF.Serializer(name=opts.rdf_parser_name)
69         print serializer.serialize_model_to_string(model)
70
71
def make_parser():
    """Build the option parser: Commands, Queries, Options, plus auth options."""
    parser = OptionParser()

    command_group = OptionGroup(parser, "Commands")
    command_group.add_option('--load-model', default=None,
      help="Load model database")
    command_group.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    command_group.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    command_group.add_option('--update', action="store_true", default=False,
      help="Query remote data sources and update our database")
    parser.add_option_group(command_group)

    query_group = OptionGroup(parser, "Queries")
    query_group.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    query_group.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(query_group)

    option_group = OptionGroup(parser, "Options")
    option_group.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    option_group.add_option("-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(option_group)

    api.add_auth_options(parser)

    return parser
108
def get_model(model_name=None):
    """Return an RDF model: in-memory by default, bdb hash storage when named."""
    if model_name is None:
        return RDF.Model(RDF.MemoryStorage())
    # persistent storage lives under /tmp keyed by the given name
    storage = RDF.HashStorage(model_name, options="hash-type='bdb',dir='/tmp'")
    return RDF.Model(storage)
116         
def load_my_submissions(model, cookie=None):
    """Scrape the ENCODE submit 'show_user' page and record submissions.

    Logs in first when no session cookie is supplied.  Each data row of
    the 'projects' table becomes a Submission resource with name, status,
    last-modify date, species, and — when the name embeds a library id —
    a link to the woldlab library.
    """
    if cookie is None:
        cookie = login()
        
    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id':'projects'})
    tr = p.findNext('tr')
    # first record is header
    tr = tr.findNext()
    ClassP = rdfsNS['Class']
    NameP = submitOntologyNS['name']
    StatusP = submitOntologyNS['status']
    LastModifyP = submitOntologyNS['last_modify_date']
    SpeciesP = submitOntologyNS['species']
    LibraryURN = submitOntologyNS['library_urn']
    # typing saving
    add_stmt = model.add_statement
    Stmt = RDF.Statement
    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            # submission id is the text of the link inside the first cell
            subIdText = td[0].contents[0].contents[0].encode(CHARSET)
            subId = submissionNS[subIdText]
            submission_stmt = Stmt(subId, ClassP,
                                   submitOntologyNS['Submission'])
            if model.contains_statement(submission_stmt):
                logger.debug("Have {0}".format(str(submission_stmt)))
            else:
                logger.info("New submission {0}".format(str(submission_stmt)))
                add_stmt(submission_stmt)
                
                # column layout assumed: 2=species, 4=name, 6=status,
                # 8=last modified — TODO confirm against the live page markup
                name = get_contents(td[4])
                add_stmt(Stmt(subId, NameP, name))
                
                status = get_contents(td[6]).strip()
                add_stmt(Stmt(subId, StatusP, status))
                         
                last_mod_datetime = get_date_contents(td[8])
                last_mod = last_mod_datetime.isoformat()
                add_stmt(Stmt(subId, LastModifyP, last_mod))
    
                species = get_contents(td[2])
                if species is not None:
                    add_stmt(Stmt(subId, SpeciesP, species))
    
                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subId,
                                                  LibraryURN,
                                                  library_id)

        tr = tr.findNext('tr')
170
171
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed.

    Does nothing (beyond logging) if the statement already exists.
    """
    libraryUrn = libraryNS[library_id]
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        logger.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        # bug fix: the else branch previously referenced an undefined
        # name `result`, which raised NameError whenever the link existed
        logger.info("Found: {0}".format(str(link)))
183
184     
185 def find_submissions_with_no_library(model):
186     p = os.path.abspath(__file__)
187     sourcedir = os.path.dirname(p)
188     no_lib = open(os.path.join(sourcedir, "no-lib.sparql"),'r').read()
189     query = RDF.SPARQLQuery(no_lib)
190     results = query.execute(model)
191     for row in results:
192         subid = row['subid']
193         name = row['name']
194         print "# {0}".format(name)
195         print "<{0}>".format(subid.uri)
196         print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
197         print ""
198     
def update_submission_detail(model, cookie=None):
    """Look for submission IDs in our model and go get their ddfs.

    For each known Submission, fills in its creation date and downloads
    its DDF file into the model.
    """
    submissions = model.get_sources(rdfsNS['Class'],
                                    submitOntologyNS['Submission'])
    for subUrn in submissions:
        # fix: use the module logger, not the root logger, for consistency
        logger.info("Updating detail for: {0}".format(str(subUrn)))
        update_submission_creation_date(model, subUrn, cookie)
        download_ddf(model, subUrn, cookie=cookie)
208
209
def update_submission_creation_date(model, subUrn, cookie):
    """Record a submission's creation date in the model if not already stored.

    Scrapes the submission detail page for the 'Created: ' label and adds
    a xsd:dateTime-typed literal statement.
    """
    # in theory the submission page might have more information on it.
    # fix: `libNS` was an undefined name (NameError); the module defines
    # libOntNS for the LibraryOntology namespace
    creationDateP = libOntNS['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateP, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        logger.info("Getting creation date for: {0}".format(str(subUrn)))
        soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
        created_label = soup.find(text="Created: ")
        if created_label:
            created_date = get_date_contents(created_label.next)
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            model.add_statement(
                RDF.Statement(subUrn, creationDateP, created_date_node)
            )
227
228     
def download_ddf(model, subId, cookie=None):
    """Fetch a submission's DDF and load its statements, unless already seen."""
    if cookie is None:
        cookie = login()

    # the download url is the submission url with 'show' swapped out
    ddf_url = str(subId).replace('show', 'download_ddf')
    ddf_text = get_url_as_text(ddf_url, 'GET', cookie)
    seen_marker = RDF.Statement(RDF.Uri(ddf_url), rdfsNS['Class'], ddfNS['ddf'])
    if model.contains_statement(seen_marker):
        return
    for statement in parse_ddf(subId, ddf_text):
        model.add_statement(statement)
243
244
def parse_ddf(subId, ddf_blob):
    """Convert a ddf text file into RDF Statements.

    The first line names the columns; every following non-blank,
    non-comment line lists one or more files plus attribute values.
    """
    lines = ddf_blob.split('\n')
    column_preds = [ddfNS[name] for name in lines[0].split()]
    statements = []
    base_uri = str(subId.uri)
    # force it to look like a namespace
    if base_uri[-1] != '/':
        base_uri += '/'
    subIdNS = RDF.NS(base_uri)
    for raw_line in lines[1:]:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        fields = line.split('\t')
        filenames = fields[0].split(',')
        attribute_values = fields[1:]

        for name in filenames:
            # each file becomes a blank node hung off the submission
            file_node = RDF.Node()
            statements.append(RDF.Statement(subId,
                                            submitOntologyNS['has_file'],
                                            file_node))
            statements.append(RDF.Statement(file_node, rdfsNS['Class'],
                                            submitOntologyNS['File']))
            statements.append(RDF.Statement(file_node, ddfNS['filename'],
                                            name))
            # remaining columns pair up with the remaining header names
            for pred, value in zip(column_preds[1:], attribute_values):
                statements.append(RDF.Statement(file_node, pred, value))

    return statements
282
def load_libraries(model, htswapi):
    """Fetch library metadata from htsworkflow for every linked library_urn.

    Library attributes become statements on the library node; each entry
    of 'lane_set' becomes a blank 'has_lane' node with its own attributes.
    """
    query = RDF.SPARQLQuery("""
    SELECT distinct ?library_urn
    WHERE {
      ?subid <http://jumpgate.caltech.edu/wiki/EncodeSubmit#library_urn> ?library_urn .
    }""")
    results = query.execute(model)
    newmodel = model
    for row in results:
        lib_id = row['library_urn']
        lib_uri = str(row['library_urn'].uri)
        short_lib_id = lib_uri.replace(libraryNS._prefix,"")
        # fix: use the module logger, not the root logger, for consistency
        logger.info("Loading library info: {0}".format(short_lib_id))
        # NOTE(review): SL-prefixed libraries are skipped — presumably they
        # are not in the htsw database; confirm before relying on this
        if short_lib_id.startswith("SL"):
            continue
        lib_info = htswapi.get_library(short_lib_id)

        for lib_k, lib_v in lib_info.items():
            if lib_k != 'lane_set':
                attribute = lib_k.encode(CHARSET)
                newmodel.append(
                    RDF.Statement(lib_id,
                                  submitOntologyNS[attribute],
                                  str(lib_v)))
            else:
                for flowcell in lib_v:
                    blank = RDF.Node()
                    newmodel.append(
                        RDF.Statement(lib_id,
                                      submitOntologyNS['has_lane'],
                                      blank))
                    for fc_k, fc_v in flowcell.items():
                        newmodel.append(
                            RDF.Statement(blank,
                                          submitOntologyNS[fc_k.encode(CHARSET)],
                                          str(fc_v)))
325     
def get_library_id(name):
    """Guess library ID from library name.

    Looks for a 5-digit number or an 'SL' + 4-digit token preceded by a
    space or hyphen; returns None when no such token is present.
    """
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    return match.group('id') if match is not None else None
334
335
def get_contents(element):
    """Return the element's first text chunk encoded, or None when empty.

    Prefers the text of a nested anchor when one is present.
    """
    if not element.contents:
        return None

    anchor = element.find('a')
    source = element if anchor is None else anchor
    return source.contents[0].encode(CHARSET)
347     
348     
def get_date_contents(element):
    """Parse an element's text as 'YYYY-MM-DD HH:MM'; None when empty."""
    text = get_contents(element)
    if not text:
        return None
    return datetime.strptime(text, "%Y-%m-%d %H:%M")
355
356 def sparql_query(model, query_filename):
357     """Execute sparql query from file
358     """
359     query_body = open(query_filename,'r').read()
360     query = RDF.SPARQLQuery(query_body)
361     results = query.execute(model)
362     for row in results:
363         output = []
364         for k,v in row.items()[::-1]:
365             print "{0}: {1}".format(k,v)
366         print 
367
368         
def load_into_model(model, parser_name, filename):
    """Parse an RDF file into the model using the named redland parser.

    Raises IOError when the file does not exist.
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    # fix: close the data file instead of leaking the handle
    with open(filename, 'r') as stream:
        data = stream.read()
    rdf_parser = RDF.Parser(name=parser_name)
    # use the submission ontology namespace as the base URI
    ns_uri = submitOntologyNS[''].uri
    rdf_parser.parse_string_into_model(model, data, ns_uri)
377
378     
def login(cookie=None):
    """Login if we don't have a cookie.

    Pulls the password from the system keyring, POSTs the login form,
    and returns the session cookie.  Raises RuntimeError when the server
    does not hand one back (i.e. the login failed).
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    # fix: use the module logger, not the root logger, for consistency
    logger.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
402
403                 
def get_url_as_soup(url, method, cookie=None):
    """GET/POST a url and return the response body parsed by BeautifulSoup.

    Raises httplib2.HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8", # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES
                             )
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # fix: the exception was constructed but never raised, so errors
        # silently returned None to the caller
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
420
def get_url_as_text(url, method, cookie=None):
    """GET/POST a url and return the raw response body.

    Raises httplib2.HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # fix: the exception was constructed but never raised, so errors
        # silently returned None to the caller
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
433     
################
#  old stuff
# Hand-maintained (submission name, library id) pairs for submissions whose
# names lack an embedded library ID.  Nothing in this file references it —
# presumably kept for manual patching of the model; confirm before removing.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
451
452
453
def select_by_library_id(submission_list):
    """Group submissions by library_id, most recent first within each group.

    Submissions without a library_id are dropped.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    for group in libraries.values():
        group.sort(key=attrgetter('date'), reverse=True)

    return libraries
464
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries vs. ENCODE freezes.

    selected_libraries maps library id -> list of submission objects
    (as produced by select_by_library_id).  Each cell lists the
    submissions whose date falls in that freeze, with their status.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    # fix: removed a dead first assignment of `report` that was
    # immediately overwritten by this one
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket this library's submissions by the freeze they fall in
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submissionNS[s.subid].uri
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # fix: removed a `for/else` clause that unconditionally appended a
        # spurious empty <td> to every row, and a leftover debug print
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
509
510             
def date_to_freeze(d):
    """Return the ENCODE freeze label whose cutoff is after d, or None.

    Freezes are checked oldest-first, so d lands in the earliest freeze
    whose end date it precedes.
    """
    boundaries = [(datetime(2010, 1, 30), '2010-Jan'),
                  (datetime(2010, 7, 30), '2010-Jul'),
                  (datetime(2011, 1, 30), '2011-Jan')]
    for cutoff, label in boundaries:
        if d < cutoff:
            return label
    return None
521
# script entry point
if __name__ == "__main__":
    main()
524