3 Gather information about our submissions into a single RDF store
5 from __future__ import print_function
7 from datetime import datetime
12 from lxml.html import fromstring
13 from operator import attrgetter
14 from optparse import OptionParser, OptionGroup
21 from six.moves import urllib
23 from rdflib import BNode, Graph, Literal, Namespace, URIRef
24 from rdflib.namespace import RDF, RDFS, XSD
25 if not 'DJANGO_SETTINGS_MODULE' in os.environ:
26 os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
28 from htsworkflow.submission import daf, ucsc
30 from htsworkflow.util import api
31 from htsworkflow.util.rdfns import (
36 CREATION_DATE = libraryOntology['date']
39 LIBRARY_NS = Namespace("http://jumpgate.caltech.edu/library/")
41 from htsworkflow.submission.ucsc import \
44 get_encodedcc_file_index, \
45 submission_view_url, \
48 DCC_NS = Namespace(UCSCEncodePipeline + 'download_ddf#')
50 DBDIR = os.path.expanduser("~diane/proj/submission")
52 LOGGER = logging.getLogger("encode_find")
54 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
55 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
60 SL_MAP = {'SL2970': '02970',
def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # Logging level selection; the guarding if/elif lines (presumably
    # opts.debug / opts.verbose / default) are missing from this listing.
    logging.basicConfig(level=logging.DEBUG)
    logging.basicConfig(level=logging.INFO)
    logging.basicConfig(level=logging.ERROR)

    # Build an authenticated HTS-workflow API client from the options.
    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    # Open (or create) the persistent RDF store under DBDIR.
    model = get_model(opts.model, DBDIR)

    # Optionally preload RDF statements from a file into the model.
    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    # NOTE(review): lines missing from listing here — likely the
    # cookie/limit initialization used below; confirm against original.
    if opts.reload_libraries:
        reload_libraries(model, args)

    # --update implies all three update flags.
    # NOTE(review): the enclosing "if opts.update:" guard line appears
    # dropped from this listing.
    opts.update_submission = True
    opts.update_libraries = True
    opts.update_ucsc_downloads = True

    if opts.update_submission:
        # Log in to encodesubmit and scrape our submission status pages.
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        # NOTE(review): the "our_tracks = [" list opener is missing from
        # this listing.
        {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
        #{'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info )

    if opts.sparql is not None:
        # Run an ad-hoc SPARQL query and render the results as html.
        sparql_query(model, opts.sparql, 'html')

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    # NOTE(review): an "if opts.print_rdf:" guard line appears dropped
    # from this listing.
    serializer = get_serializer(name=opts.rdf_parser_name)
    print(serializer.serialize_model_to_string(model))
    # NOTE(review): the "def make_parser():" header line is missing from
    # this listing; the body below is its contents.
    """Construct option parser
    """
    parser = OptionParser()

    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Do all updates")
    # NOTE(review): several "default=False," continuation lines appear
    # dropped from this listing for the options below.
    commands.add_option('--update-submission', action="store_true",
                        help="download status from ucsc")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        help="download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        help="Delete and redownload library information. "\
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    # Mix in the shared host/apiid/apikey authentication options.
    api.add_auth_options(parser)
    # NOTE(review): the "return parser" line is missing from this listing.
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of my submissions from encodesubmit into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login
    """
    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            # limit, when given, restricts scraping to listed submission ids
            if limit is None or submission_id in limit:
                subUrn = URIRef(submission_view_url(submission_id))

                # NOTE(review): the add_stmt(...) calls that these two
                # continuation lines belong to are missing from this
                # listing (likely rdf:type and submission-id statements).
                submissionOntology['Submission'])
                Literal(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                    # NOTE(review): the remaining arguments of this call
                    # and an "else:" line are missing from this listing.
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                # NOTE(review): call continuation line(s) missing here.

                LOGGER.info("Processed {0}".format(subUrn))
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = (submissionUrn, predicate, libraryUrn)
    if not query in model:
        link = (submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        # NOTE(review): the line actually adding the triple (likely
        # "model.add(link)") and the following "else:" are missing from
        # this listing.
        LOGGER.debug("Found: {0}".format(str(query)))
def report_submissions_with_no_library(model):
    """Print a Turtle stub for each submission missing a library link."""
    missing = find_submissions_with_no_library(model)
    # NOTE(review): the loop header over `missing` and the subid/name
    # bindings are missing from this listing.
        print("# {0}".format(name))
        print("<{0}>".format(subid.uri))
        print(" encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> .")
def find_submissions_with_no_library(model):
    """Query the model for submissions that have no library_urn link."""
    # NOTE(review): the SELECT/WHERE opening lines of this SPARQL query
    # string are missing from this listing.
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>
?subid submissionOntology:name ?name
OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)

    return model.query(missing_lib_query_text)
def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    # A library is "unscanned" when it is referenced by a submission but
    # carries no rdf:type of its own yet.
    # NOTE(review): the "WHERE {{" line of this query appears dropped
    # from this listing.
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>
SELECT distinct ?submission ?library_urn
?submission submissionOntology:library_urn ?library_urn .
OPTIONAL {{ ?library_urn rdf:type ?library_type }}
FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    return model.query(unscanned_libraries)
def find_all_libraries(model):
    """Scan model for every library marked as
    """
    # NOTE(review): the docstring end, the 'query = """' opener, and the
    # WHERE line of this SPARQL query are missing from this listing.
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>
SELECT distinct ?library_urn
?library_urn rdf:type ?library_type .
}}""".format(libraryOntology=libraryOntology[''].uri)
    return model.query(query)
def add_submission_creation_date(model, subUrn, cookie):
    """Fetch and record a creation date for subUrn if none is stored."""
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    # NOTE(review): an "else:" line appears dropped from this listing.
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def get_creation_dates(model, subUrn):
    """Return every creation-date triple recorded for *subUrn*."""
    return list(model.triples((subUrn, CREATION_DATE, None)))
def parse_submission_page(model, submissionTree, subUrn):
    """Extract the 'Created' date from a submission detail page tree."""
    cells = submissionTree.findall('.//td')
    # The creation date sits in the cell following the 'Created' label.
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = Literal(created_date.isoformat(),
                                    datatype=XSD.dateTime)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    # NOTE(review): an "else:" line and the LOGGER call consuming `msg`
    # appear dropped from this listing.
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Add or refresh the status blank node for a submission.

    A brand-new status node triggers a DDF/DAF download; an existing one
    is only re-downloaded when its stored last-modified date matches
    *recent_update*.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = (subUrn, HasStatusN, None)
    status_nodes = list(model.triples(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, RDF['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    # NOTE(review): an "else:" line appears dropped from this listing.
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement[2]
            last_modified_query = (status_node, LastModifyN, None)
            # NOTE(review): find_statements is the old librdf API; the
            # rest of the file uses rdflib model.triples() — confirm.
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement[2])
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
def update_daf(model, submission_url, status_node, cookie):
    """Download a submission's DAF and load it into the model.

    The DAF's md5sum and parsed statements are attached to *status_node*,
    but only when the status node has not already been marked as a DAF.

    :param model: rdflib Graph being populated
    :param submission_url: the submission's "show" page URL
    :param status_node: URIRef of the status blank for this submission
    :param cookie: encodesubmit session cookie for the download
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')

    status_is_daf = (status_node, TYPE_N, dafTermOntology[''])
    if status_is_daf not in model:
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        # BUG FIX: hashlib.md5 requires bytes; under Python 3 the
        # downloaded text may be str, which used to raise TypeError.
        if isinstance(daf_text, str):
            daf_bytes = daf_text.encode('utf-8')
        else:
            daf_bytes = daf_text
        daf_hash = hashlib.md5(daf_bytes).hexdigest()
        # Wrap the hash as a Literal, consistent with the rest of the
        # file's rdflib usage.  NOTE(review): previously a bare str was
        # stored — confirm no query depends on that.
        daf_hash_stmt = (status_node, dafTermOntology['md5sum'],
                         Literal(daf_hash))
        model.add(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and load it into the model.

    Only fetches when *statusNode* has not yet been marked as a DDF;
    the marker triple is added after a successful load so the file is
    not re-downloaded next time.

    :param model: rdflib Graph being populated
    :param subUrn: the submission's "show" page URIRef
    :param statusNode: URIRef of the status blank for this submission
    :param cookie: encodesubmit session cookie for the download
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = (statusNode, TYPE_N, DCC_NS[''])
    # BUG FIX: this previously tested "status_is_ddf in model", which
    # meant the DDF was only fetched after the marker already existed —
    # i.e. never the first time.  update_daf() uses the "not in" form;
    # mirror it here.
    if status_is_ddf not in model:
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        # BUG FIX: rdflib Graph has add(), not the old librdf
        # add_statement() API.
        model.add(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    # Each header column becomes a predicate in the download_ddf namespace.
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            # NOTE(review): a "continue" line appears dropped here.
        if ddf_line.startswith("#"):
            # NOTE(review): a "continue" line appears dropped here.

        ddf_record = ddf_line.split('\t')
        # First column may list several comma-separated filenames.
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        # NOTE(review): the per-file loop header ("for f in files:"),
        # the fileNode creation, and the opening of the has_file
        # add_stmt call are missing from this listing.
            submissionOntology['has_file'],
        add_stmt(model, fileNode, RDF['type'], DCC_NS['file'])
        add_stmt(model, fileNode, DCC_NS['filename'], f)

        # Remaining columns pair up with their header predicates.
        for predicate, object in zip(attributes[1:], file_attributes):
            add_stmt(model, fileNode, predicate, object)
def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    # Affiliation ids 44/80 select the ENCODE-affiliated libraries.
    # NOTE(review): additional filter URL(s) and the list close appear
    # dropped from this listing.
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",

    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        # Pull RDFa-annotated library listings straight into the model.
        model.parse(source=encodeUrl, format='rdfa')
    query = (None, libraryOntology['library_id'], None)
    libraries = model.triples(query)
    for statement in libraries:
        libraryUrn = statement[0]
        load_library_detail(model, libraryUrn)
def load_unassigned_submitted_libraries(model):
    """Fetch details for submitted libraries that were never scanned."""
    for record in find_unscanned_submitted_libraries(model):
        urn = record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(urn))
        load_library_detail(model, urn)
def reload_libraries(model, library_list):
    """Delete and re-download library info; all libraries if list empty."""
    if len(library_list) == 0:
        # No ids given: refresh every library known to the model.
        queryset = find_all_libraries(model)
        libraries = ( str(s['library_urn']) for s in queryset )
    # NOTE(review): an "else:" line appears dropped from this listing.
        libraries = ( user_library_id_to_library_urn(l) for l in library_list )

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)
def user_library_id_to_library_urn(library_id):
    """Map a bare library id to LIBRARY_NS; pass full URLs through."""
    split_url = urllib.parse.urlsplit(library_id)
    # No scheme means the user gave a bare id rather than a full URL.
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    # NOTE(review): the else/return branch for full URLs is missing from
    # this listing.
def delete_library(model, library_urn):
    """Remove a library, and all of its lanes, from the model.

    :param model: rdflib Graph to mutate
    :param library_urn: Literal or URIRef naming the library
    :raises ValueError: when library_urn is not a Literal or URIRef
    """
    if not isinstance(library_urn, (Literal, URIRef)):
        # BUG FIX: message now matches the accepted types (it previously
        # said only "Literal"); consistent with delete_lane().
        raise ValueError("library urn must be a Literal or URIRef")

    LOGGER.info("Deleting {0}".format(str(library_urn)))
    lane_query = (library_urn, libraryOntology['has_lane'], None)
    # Materialize before mutating: removing triples while iterating
    # model.triples() can invalidate the underlying iterator.
    for lane in list(model.triples(lane_query)):
        delete_lane(model, lane[2])
    library_attrib_query = (library_urn, None, None)
    for library_attrib in list(model.triples(library_attrib_query)):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        model.remove(library_attrib)
def delete_lane(model, lane_urn):
    """Remove a lane, and its mapping nodes, from the model.

    :param model: rdflib Graph to mutate
    :param lane_urn: Literal or URIRef naming the lane
    :raises ValueError: when lane_urn is not a Literal or URIRef
    """
    if not isinstance(lane_urn, (Literal, URIRef)):
        raise ValueError("lane urn must be a Literal or URIRef")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = (lane_urn, None, None)
    # Materialize before mutating: removing triples while iterating
    # model.triples() can invalidate the underlying iterator.
    for lane_attrib in list(model.triples(lane_attrib_query)):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        model.remove(lane_attrib)
def delete_lane_mapping(model, lane_urn):
    """Remove every mapping node attached to a lane from the model.

    :param model: rdflib Graph to mutate
    :param lane_urn: Literal or URIRef naming the lane
    :raises ValueError: when lane_urn is not a Literal or URIRef
    """
    if not isinstance(lane_urn, (Literal, URIRef)):
        raise ValueError("lane urn must be a Literal or URIRef")

    lane_mapping_query = (lane_urn,
                          libraryOntology['has_mappings'],
                          None)
    # Materialize both levels before mutating: removing triples while
    # iterating model.triples() can invalidate the underlying iterator.
    for lane_mapping in list(model.triples(lane_mapping_query)):
        mapping_attrib_query = (lane_mapping[2], None, None)
        for mapping_attrib in list(model.triples(mapping_attrib_query)):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            model.remove(mapping_attrib)
def load_encodedcc_files(model, genome, composite):
    """Record file metadata for one UCSC composite track in the model."""
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        # NOTE(review): the early-return line appears dropped here.

    lib_term = submissionOntology['library_urn']
    sub_term = submissionOntology['submission_urn']
    for filename, attributes in file_index.items():
        # NOTE(review): the line binding "s" (the subject URI for this
        # file) is missing from this listing.
        model.add((s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            # NOTE(review): line(s) missing here (likely the generic
            # attribute statement).
            # labExpId links back to our library; subId to the submission.
            if name.lower() == 'labexpid':
                model.add((s, lib_term, LIBRARY_NS[value+'/']))
            elif name.lower() == 'subid':
                sub_url = URIRef(submission_view_url(value))
                model.add((s, sub_term, sub_url))
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page.

    Fetches and parses the library's RDFa page only when the model has
    no date statement for it yet (i.e. it has not been loaded before).

    :param model: rdflib Graph being populated
    :param libraryUrn: URIRef of the library page
    """
    query = (libraryUrn, libraryOntology['date'], None)
    # BUG FIX: find_statements() is the old librdf API; rdflib Graph
    # exposes triples().
    results = list(model.triples(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn), 'GET')
            # BUG FIX: RDF here is rdflib.namespace.RDF, which has no
            # Parser class (librdf leftover).  Parse the RDFa page with
            # Graph.parse(), as load_encode_assigned_libraries() does.
            model.parse(data=body, format='rdfa', publicID=libraryUrn)
        except httplib2.HttpLib2ErrorWithResponse as e:
            LOGGER.error(e)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    """
    # NOTE(review): the doctest expected-output lines appear dropped
    # from this listing.
    # Match either a 5-digit id or an SLnnnn id at a word boundary.
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)

    if match is not None:
        library_id = match.group('id')
        # Normalize known SLnnnn ids to their 5-digit equivalents.
        if library_id in SL_MAP:
            library_id = SL_MAP[library_id]
    # NOTE(review): the no-match fallback and "return library_id" line
    # are missing from this listing.
def get_contents(element):
    """Return contents or none.
    """
    # NOTE(review): this uses BeautifulSoup-style .contents/.find while
    # the rest of the file uses lxml, and CHARSET is not defined in the
    # visible file — possibly dead legacy code; confirm.
    if len(element.contents) == 0:
        # NOTE(review): the empty-contents return line appears dropped.

    a = element.find('a')
    # NOTE(review): the guard line (likely "if a is not None:") appears
    # dropped from this listing.
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Build the URIRef naming a submission's status node at *timestamp*.

    :param submission_uri: submission URI (any form daf understands)
    :param timestamp: isoformat last-modified string appended to the URI
    :return: URIRef of the status node
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        # BUG FIX: this was "sumbission_uri += '/'" (typo), which bound
        # a new variable and silently failed to append the separator.
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return URIRef(status_uri)
def get_date_contents(element):
    """Parse an element's text as a 'YYYY-MM-DD HH:MM' datetime."""
    data = element.text_content()
    # NOTE(review): a guard line (likely "if data:") and the no-data
    # fallback branch appear dropped from this listing.
    return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience wrapper: add one (subject, predicate, object) triple
    to *model* and return whatever the model's add() returns."""
    triple = (subject, predicate, rdf_object)
    return model.add(triple)
def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        # NOTE(review): the early-return line appears dropped here.

    # Look the password up in the system keyring.  NOTE(review):
    # USERNAME (and the keyring import) are not visible in this listing
    # — presumably defined near the top of the file; confirm.
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     # NOTE(review): method/headers
                                     # argument lines appear dropped.
                                     body=urllib.parse.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
    # NOTE(review): format-call continuation line(s) missing here.

    # The session cookie comes back in the response headers.
    cookie = response.get('set-cookie', None)
    # NOTE(review): the "if cookie is None:" guard appears dropped.
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    # NOTE(review): the "return cookie" line appears dropped.
def get_url_as_tree(url, method, cookie=None):
    """Request *url* and parse the response body into an lxml tree.

    Raises HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    # NOTE(review): a "headers = {}" line appears dropped from listing.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        # NOTE(review): "return tree" and the "else:" line appear
        # dropped from this listing.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        # NOTE(review): the "raise e" line appears dropped.
def get_url_as_text(url, method, cookie=None):
    """Request *url* and return the raw response body.

    Raises HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    # NOTE(review): a "headers = {}" line appears dropped from listing.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        # NOTE(review): "return content" and the "else:" line appear
        # dropped from this listing.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        # NOTE(review): the "raise e" line appears dropped.
# Hand-maintained (submission name, library id) pairs for early
# submissions whose names lack a parseable embedded library id.
# NOTE(review): the closing bracket of this list is missing from the
# visible listing.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
def select_by_library_id(submission_list):
    """Group submissions by library id, sorted newest-first per library."""
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    # NOTE(review): the "libraries = {}" initializer appears dropped.
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)
    # NOTE(review): the "return libraries" line appears dropped.
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries against the ENCODE freezes."""
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    # NOTE(review): the lines wrapping this inline <style> block in a
    # report.append(...) call are missing from this listing.
    <style type="text/css">
    td {border-width:0 0 1px 1px; border-style:solid;}

    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    # NOTE(review): a "for f in freezes:" header appears dropped here.
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id]
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # Bucket this library's submissions by the freeze they fall in.
        # NOTE(review): the "batched = {}" initializer appears dropped.
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # NOTE(review): a "for d in freezes:" header appears dropped.
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # NOTE(review): the context for this trailing empty cell is
        # unclear from the listing (likely an else branch).
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    """Map a datetime to the name of the ENCODE freeze containing it."""
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
    # NOTE(review): the list close is missing from this listing.
    for end, name in freezes:
        # NOTE(review): the comparison against *end* and the return
        # lines (including the no-match fallback) are missing from this
        # listing.
757 if __name__ == "__main__":