Move code out of ucsc_gather and into the htsworkflow tree.
[htsworkflow.git] / encode_submission / encode_find.py
1 #!/usr/bin/env python
2 """
3 Gather information about our submissions into a single RDF store
4 """
5
6 from datetime import datetime
7 import hashlib
8 import httplib2
9 import keyring
10 import logging
11 from lxml.html import fromstring
12 from operator import attrgetter
13 from optparse import OptionParser, OptionGroup
14 # python keyring
15 import os
16 import re
17 # redland rdf lib
18 import RDF
19 import sys
20 import urllib
21 import urlparse
22
23 from htsworkflow.submission import daf, ucsc
24
25 from htsworkflow.util import api
26 from htsworkflow.util.rdfhelp import \
27      dafTermOntology, \
28      dublinCoreNS, \
29      get_model, \
30      get_serializer, \
31      sparql_query, \
32      submissionOntology, \
33      libraryOntology, \
34      load_into_model, \
35      rdfNS, \
36      rdfsNS, \
37      xsdNS
38 TYPE_N = rdfNS['type']
39 CREATION_DATE = libraryOntology['date']
40
41 # URL mappings
42 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
43
44 from htsworkflow.submission.ucsc import \
45      daf_download_url, \
46      ddf_download_url, \
47      get_encodedcc_file_index, \
48      submission_view_url, \
49      UCSCEncodePipeline
50
51 DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
52
53 DBDIR = os.path.expanduser("~diane/proj/submission")
54
55 LOGGER = logging.getLogger("encode_find")
56
57 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
58 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
59
60 USERNAME = 'detrout'
61 CHARSET = 'utf-8'
62
63 SL_MAP = {'SL2970': '02970',
64           'SL2971': '02971',
65           'SL2973': '02973',}
66
def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # verbosity: --debug wins over --verbose; default is errors only
    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    # authenticated handle to the htsworkflow server (see htsworkflow.util.api)
    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    # open (or create) the RDF store under DBDIR
    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    # any positional arguments limit processing to those submission ids
    if len(args) == 0:
        limit = None
    else:
        limit = args

    # --reload-libraries is exclusive: do it and stop
    if opts.reload_libraries:
        reload_libraries(model, args)
        return

    # --update is shorthand for all three update passes below
    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        # the UCSC composite tracks our lab publishes to
        our_tracks = [
            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
            #{'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info )

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)
135
136
def make_parser():
    """Build the OptionParser for this tool.

    Options are grouped into commands (what to do), queries, and
    generic options; htsworkflow auth options are appended last.
    """
    parser = OptionParser()

    cmd_group = OptionGroup(parser, "Commands")
    cmd_group.add_option('--model', default=None,
                         help="Load model database")
    cmd_group.add_option('--load-rdf', default=None,
                         help="load rdf statements into model")
    cmd_group.add_option('--print-rdf', action="store_true", default=False,
                         help="print ending model state")
    cmd_group.add_option('--update', action="store_true", default=False,
                         help="Do all updates")
    cmd_group.add_option('--update-submission', action="store_true",
                         default=False,
                         help="download status from ucsc")
    cmd_group.add_option('--update-ucsc-downloads', action="store_true",
                         default=False,
                         help="Update download locations from UCSC")
    cmd_group.add_option('--update-libraries', action="store_true",
                         default=False,
                         help="download library info from htsw")
    cmd_group.add_option('--reload-libraries', action="store_true",
                         default=False,
                         help="Delete and redownload library information. "\
                              "Optionally list specific library IDs.")
    parser.add_option_group(cmd_group)

    query_group = OptionGroup(parser, "Queries")
    query_group.add_option('--sparql', default=None,
                           help="execute arbitrary sparql query")
    query_group.add_option('--find-submission-with-no-library',
                           default=False,
                           action="store_true",
                           help="find submissions with no library ID")
    parser.add_option_group(query_group)

    misc_group = OptionGroup(parser, "Options")
    misc_group.add_option("--rdf-parser-name", default="turtle",
                          help="set rdf file parser type")
    misc_group.add_option("-v", "--verbose", action="store_true",
                          default=False)
    misc_group.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(misc_group)

    api.add_auth_options(parser)

    return parser
183
184
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login

    limit, if given, is a collection of submission ids; only those
    submissions are processed.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # skip header
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            # column 0 holds the UCSC submission id
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                # column 4: submission name (usually embeds our library id)
                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                # column 2: species
                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing atttributes
                # column 6: status text, column 8: last-modified timestamp
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))
245
246
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    target = LIBRARY_NS[library_id + '/']
    statement = RDF.Statement(submissionUrn, predicate, target)
    if model.contains_statement(statement):
        LOGGER.debug("Found: {0}".format(str(statement)))
    else:
        LOGGER.info("Adding Sub -> Lib link: {0}".format(statement))
        model.add_statement(statement)
258
259
260 def report_submissions_with_no_library(model):
261     missing = find_submissions_with_no_library(model)
262     for row in results:
263         subid = row['subid']
264         name = row['name']
265         print "# {0}".format(name)
266         print "<{0}>".format(subid.uri)
267         print "  encodeSubmit:library_urn "\
268               "<http://jumpgate.caltech.edu/library/> ."
269         print ""
270
def find_submissions_with_no_library(model):
    """Query the model for submissions that have no library_urn link."""
    query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    return RDF.SPARQLQuery(query_text).execute(model)
285
286
def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    # a library with no rdf:type was never scanned with the rdfa parser
    sparql = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    return RDF.SPARQLQuery(sparql).execute(model)
302
def find_all_libraries(model):
    """Scan model for every urn typed as a libraryOntology Library.
    """
    # bug fix: the original query ended with an unterminated
    # "FILTER(regex(?libray" fragment, which is not valid SPARQL and
    # would fail to parse; restrict on rdf:type instead.
    libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>

SELECT distinct ?library_urn
WHERE {{
  ?library_urn rdf:type libraryOntology:Library .
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)
317
318
def add_submission_creation_date(model, subUrn, cookie):
    """Record a submission's creation date if we don't already have one."""
    # in theory the submission page might have more information on it.
    existing = get_creation_dates(model, subUrn)
    if existing:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
    else:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        page_tree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, page_tree, subUrn)
328
329
def get_creation_dates(model, subUrn):
    """Return all CREATION_DATE statements recorded for subUrn."""
    pattern = RDF.Statement(subUrn, CREATION_DATE, None)
    return list(model.find_statements(pattern))
334
335
def parse_submission_page(model, submissionTree, subUrn):
    """Pull the 'Created' timestamp out of a submission detail page.

    Adds a CREATION_DATE statement to the model; raises Warning when
    exactly one 'Created' label can't be found.
    """
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [td for td in cells
                     if td.text_content().startswith('Created')]
    if len(created_label) != 1:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)
    # the timestamp lives in the cell following the 'Created' label
    created_date = get_date_contents(created_label[0].getnext())
    date_node = RDF.Node(literal=created_date.isoformat(),
                         datatype=dateTimeType.uri)
    add_stmt(model, subUrn, CREATION_DATE, date_node)
350
351
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Record submission status and refresh its DAF/DDF when appropriate.

    status and recent_update are the values scraped from the UCSC
    submission table; a fresh blank status node is minted per
    (submission, timestamp) pair.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                # only re-fetch DAF/DDF for the status node whose
                # timestamp matches what UCSC currently reports
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break
384
385
def update_daf(model, submission_url, status_node, cookie):
    """Download and load a submission's DAF file unless already recorded.

    The md5 of the DAF text is stored on the status node so repeated
    runs can tell whether the DAF was already captured.
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    # fix: removed unused local `daf_uri` (RDF.Uri was built and discarded)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                     status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)
401
402
def update_ddf(model, subUrn, statusNode, cookie):
    """Download and load a submission's DDF file unless already recorded.

    Marks the status node with DCC_NS[''] so the download is only
    performed once per status.
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    # fix: removed unused local `ddfUrn` (RDF.Uri was built and discarded)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)
413
414
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    lines = ddf_string.split('\n')
    # first line is the header naming each attribute column
    header = lines[0].split()
    attributes = [DCC_NS[column] for column in header]

    for line in lines[1:]:
        line = line.strip()
        # skip blank lines and comments
        if not line:
            continue
        if line.startswith("#"):
            continue

        record = line.split('\t')
        # the first column may list several comma separated filenames
        filenames = record[0].split(',')
        file_attributes = record[1:]

        for filename in filenames:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], filename)

            # remaining columns line up with the header attributes
            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)
445
446
def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    # the two affiliation ids that mark a library as ENCODE
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    rdfaParser = RDF.Parser(name='rdfa')
    for encode_filter in encodeFilters:
        url = os.path.join(htswapi.root_url + encode_filter)
        LOGGER.info("Scanning library url {0}".format(url))
        rdfaParser.parse_into_model(model, url)
        pattern = RDF.Statement(None, libraryOntology['library_id'], None)
        for statement in model.find_statements(pattern):
            load_library_detail(model, statement.subject)
464
465
def load_unassigned_submitted_libraries(model):
    """Load details for submitted libraries that were never scanned."""
    for record in find_unscanned_submitted_libraries(model):
        library_urn = record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)
472
def reload_libraries(model, library_list):
    """Delete and re-download library details.

    With an empty library_list every library in the model is refreshed;
    otherwise only the listed library IDs/URLs are.
    """
    if library_list:
        targets = ( user_library_id_to_library_urn(lib)
                    for lib in library_list )
    else:
        # reload everything.
        targets = ( str(row['library_urn'])
                    for row in find_all_libraries(model) )

    for library_urn in targets:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)
484
def user_library_id_to_library_urn(library_id):
    """Expand a bare library id into a full library URN node.

    Input that already carries a URL scheme is returned unchanged.
    """
    parts = urlparse.urlsplit(library_id)
    if parts.scheme:
        return library_id
    return LIBRARY_NS[library_id]
491
def delete_library(model, library_urn):
    """Remove a library, its lanes, and all its attributes from the model."""
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be a RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    # lanes first, then the library's own statements
    lane_pattern = RDF.Statement(library_urn, libraryOntology['has_lane'],None)
    for lane_statement in model.find_statements(lane_pattern):
        delete_lane(model, lane_statement.object)
    attrib_pattern = RDF.Statement(library_urn, None, None)
    for attrib in model.find_statements(attrib_pattern):
        LOGGER.debug("Deleting {0}".format(str(attrib)))
        del model[attrib]
504
505
def delete_lane(model, lane_urn):
    """Remove a lane, its mappings, and all its attributes from the model."""
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    # mappings first, then the lane's own statements
    delete_lane_mapping(model, lane_urn)
    attrib_pattern = RDF.Statement(lane_urn, None, None)
    for attrib in model.find_statements(attrib_pattern):
        LOGGER.debug("Deleting {0}".format(str(attrib)))
        del model[attrib]
515
516
def delete_lane_mapping(model, lane_urn):
    """Remove every mapping record hanging off a lane."""
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    mapping_pattern = RDF.Statement(lane_urn,
                                    libraryOntology['has_mappings'],
                                    None)
    for mapping in model.find_statements(mapping_pattern):
        attrib_pattern = RDF.Statement(mapping.object,
                                       None,
                                       None)
        for attrib in model.find_statements(attrib_pattern):
            LOGGER.debug("Deleting {0}".format(str(attrib)))
            del model[attrib]
531
532
def load_encodedcc_files(model, genome, composite):
    """Record the files the DCC publishes for one genome/composite track."""
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    for filename, attributes in file_index.items():
        subject = RDF.Node(RDF.Uri(filename))
        model.add_statement(
            RDF.Statement(subject, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            predicate = RDF.Node(DCC_NS[name])
            obj = RDF.Node(value)
            model.add_statement(RDF.Statement(subject, predicate, obj))
546
547
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    # a library's date statement doubles as the "already loaded" marker
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
            # log and continue: one unreachable library page shouldn't
            # abort the whole scan
            LOGGER.error(str(e))
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
567
568
def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if match is None:
        return None
    library_id = match.group('id')
    # old SLxxxx identifiers are translated to their 5 digit equivalents
    return SL_MAP.get(library_id, library_id)
586
587
def get_contents(element):
    """Return contents or none.
    """
    # NOTE(review): uses a .contents attribute (BeautifulSoup-style API),
    # unlike the lxml accessors used elsewhere in this module.
    if not element.contents:
        return None

    anchor = element.find('a')
    if anchor is not None:
        return anchor.contents[0].encode(CHARSET)
    return element.contents[0].encode(CHARSET)
599
600
def create_status_node(submission_uri, timestamp):
    """Mint a status node URI of the form <submission_uri>/<timestamp>."""
    submission_uri = daf.submission_uri_to_string(submission_uri)
    # bug fix: the increment previously assigned to a misspelled name
    # ('sumbission_uri'), so the separating '/' was never actually added
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))
607
608
def get_date_contents(element):
    """Parse the element's text as 'YYYY-MM-DD HH:MM'; None when empty."""
    text = element.text_content()
    if not text:
        return None
    return datetime.strptime(text, "%Y-%m-%d %H:%M")
615
616
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience helper: build an RDF.Statement and add it to model.
    """
    statement = RDF.Statement(subject, predicate, rdf_object)
    return model.add_statement(statement)
622
623
def login(cookie=None):
    """Login if we don't have a cookie

    Returns the existing cookie untouched, otherwise POSTs credentials
    (password from the system keyring) to LOGIN_URL and returns the
    session cookie. Raises RuntimeError when no cookie comes back.
    """
    if cookie is not None:
        return cookie

    password = keyring.get_keyring().get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    http = httplib2.Http()
    response, content = http.request(
        LOGIN_URL,
        'POST',
        headers={'Content-type': 'application/x-www-form-urlencoded'},
        body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
647
648
def get_url_as_tree(url, method, cookie=None):
    """Fetch url and return the body parsed as an lxml tree.

    Raises HttpLib2ErrorWithResponse for any non-200 status.
    """
    headers = {'Cookie': cookie} if cookie is not None else {}
    response, content = httplib2.Http().request(url, method, headers=headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
    return fromstring(content, base_url=url)
663
664
def get_url_as_text(url, method, cookie=None):
    """Fetch url and return the raw response body.

    Raises HttpLib2ErrorWithResponse for any non-200 status.
    """
    headers = {'Cookie': cookie} if cookie is not None else {}
    response, content = httplib2.Http().request(url, method, headers=headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
    return content
678
################
#  old stuff
# Manually curated (submission name, library id) pairs for early
# submissions whose names lack a parseable library id.
# NOTE(review): appears unreferenced within this module -- kept for
# reference; confirm before removing.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
696
697
def select_by_library_id(submission_list):
    """Group submissions by library_id, each group sorted newest first.

    Submissions whose library_id is empty/None are dropped.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries
708
709
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries against the ENCODE freeze dates.

    selected_libraries maps library id -> submissions sorted newest
    first (as produced by select_by_library_id). Returns the HTML text.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    # fix: removed a dead initial assignment to `report` that was
    # immediately overwritten by the list below.
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket this library's submissions by the freeze they fall into
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # bug fix: a stray for/else here always appended an extra empty
        # <td></td>, giving every row one more column than the header.
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
753
754
def date_to_freeze(d):
    """Map a datetime to the name of the ENCODE freeze it falls under.

    Returns None for dates after the last freeze cutoff.
    """
    freeze_schedule = [(datetime(2010, 1, 30), '2010-Jan'),
                       (datetime(2010, 7, 30), '2010-Jul'),
                       (datetime(2011, 1, 30), '2011-Jan'),
                       ]
    for cutoff, label in freeze_schedule:
        if d < cutoff:
            return label
    return None
765
766 if __name__ == "__main__":
767     main()