remove unicode_literals from things going to redland RDF
encode_submission/encode_find.py
#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""
from __future__ import print_function

from datetime import datetime
import hashlib
import httplib2
# python keyring
import keyring
import logging
from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

if 'DJANGO_SETTINGS_MODULE' not in os.environ:
    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'

from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfns import *
from htsworkflow.util.rdfhelp import \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model
TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     get_encodedcc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

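# Map a few legacy SL-prefixed library names onto the zero-padded library
# ids used by the jumpgate library pages (consumed by get_library_id below).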
SL_MAP = {'SL2970': '02970',
          'SL2971': '02971',
          'SL2973': '02973',}

def main(cmdline=None):
    """
    Parse command line arguments and run the requested commands

    Takes a list of command line arguments or None;
    if None, optparse falls back to sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.reload_libraries:
        reload_libraries(model, args)
        return

    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        our_tracks = [
            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
            #{'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql, 'html')

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print(serializer.serialize_model_to_string(model))


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Do all updates")
    commands.add_option('--update-submission', action="store_true",
                        default=False,
      help="download status from ucsc")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        default=False,
      help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        default=False,
      help="download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        default=False,
                        help="Delete and redownload library information. "\
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of my submissions from encodesubmit into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # skip header
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(query)))


def report_submissions_with_no_library(model):
    missing = find_submissions_with_no_library(model)
    for row in missing:
        subid = row['subid']
        name = row['name']
        print("# {0}".format(name))
        print("<{0}>".format(subid.uri))
        print("  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> .")
        print("")

def find_submissions_with_no_library(model):
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    return missing_lib_query.execute(model)


def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)

def find_all_libraries(model):
    """Scan model for every urn typed with a library ontology term
    """
    # The FILTER below reconstructs a truncated query: keep any subject
    # whose rdf:type lives in the library ontology namespace.
    libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>

SELECT distinct ?library_urn
WHERE {{
  ?library_urn rdf:type ?library_type .
  FILTER(regex(str(?library_type), "{libraryOntology}"))
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
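    # Sketch of the expected DDF shape (tab-separated; the values below are
    # made up for illustration). The first column may list several
    # comma-separated files sharing one row of attributes:
    #
    #   files<TAB>view<TAB>cell
    #   rep1.bam,rep1.bam.bai<TAB>Alignments<TAB>GM12878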
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, obj in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, obj)


def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            load_library_detail(model, libraryUrn)


def load_unassigned_submitted_libraries(model):
    unassigned = find_unscanned_submitted_libraries(model)
    for query_record in unassigned:
        library_urn = query_record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)

def reload_libraries(model, library_list):
    if len(library_list) == 0:
        # reload everything.
        queryset = find_all_libraries(model)
        # keep the RDF.Node results; delete_library() rejects plain strings
        libraries = (s['library_urn'] for s in queryset)
    else:
        libraries = (user_library_id_to_library_urn(l) for l in library_list)

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)

def user_library_id_to_library_urn(library_id):
    split_url = urlparse.urlsplit(library_id)
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    else:
        # wrap full URLs in a node so delete_library() accepts them
        return RDF.Node(RDF.Uri(library_id))

def delete_library(model, library_urn):
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be a RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    lane_query = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
    for lane in model.find_statements(lane_query):
        delete_lane(model, lane.object)
    library_attrib_query = RDF.Statement(library_urn, None, None)
    for library_attrib in model.find_statements(library_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        del model[library_attrib]


def delete_lane(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = RDF.Statement(lane_urn, None, None)
    for lane_attrib in model.find_statements(lane_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        del model[lane_attrib]


def delete_lane_mapping(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    lane_mapping_query = RDF.Statement(lane_urn,
                                       libraryOntology['has_mappings'],
                                       None)
    for lane_mapping in model.find_statements(lane_mapping_query):
        mapping_attrib_query = RDF.Statement(lane_mapping.object,
                                             None,
                                             None)
        for mapping_attrib in model.find_statements(mapping_attrib_query):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            del model[mapping_attrib]


def load_encodedcc_files(model, genome, composite):
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    lib_term = submissionOntology['library_urn']
    sub_term = submissionOntology['submission_urn']
    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(filename))
        model.add_statement(
            RDF.Statement(s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))
            if name.lower() == 'labexpid':
                model.add_statement(
                    RDF.Statement(s, lib_term, LIBRARY_NS[value+'/']))
            elif name.lower() == 'subid':
                sub_url = RDF.Uri(submission_view_url(value))
                model.add_statement(
                    RDF.Statement(s, sub_term, sub_url))


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse as e:
            LOGGER.error(str(e))
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    if library_id in SL_MAP:
        library_id = SL_MAP[library_id]
    return library_id


def get_contents(element):
    """Return contents or none.

    NOTE: unused in this module; it still expects the old
    BeautifulSoup-style .contents API rather than an lxml element.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience wrapper to create an RDF Statement and add it to a model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
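
# One-time keyring setup sketch (assumes the same LOGIN_URL/USERNAME pair
# that login() reads back; replace the literal password):
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'your-password')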


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
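    # e.g. datetime(2010, 6, 1) falls before the 2010-07-30 cutoff, so it
    # maps to '2010-Jul'; dates past the last freeze return None.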
    for end, name in freezes:
        if d < end:
            return name
    return None

if __name__ == "__main__":
    main()