#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""

from datetime import datetime
import hashlib
import httplib2
# python keyring
import keyring
import logging
from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

if 'DJANGO_SETTINGS_MODULE' not in os.environ:
    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'

from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     dublinCoreNS, \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     rdfsNS, \
     xsdNS
TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     get_encodedcc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

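# SL_MAP translates legacy "SL" library names into the 5-digit library
# IDs used by the library pages (see get_library_id below).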
SL_MAP = {'SL2970': '02970',
          'SL2971': '02971',
          'SL2973': '02973'}


def main(cmdline=None):
    """
    Parse command line arguments and run the requested commands

    Takes a list of command line arguments (without the program name),
    or None, in which case optparse falls back to sys.argv[1:].
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.reload_libraries:
        reload_libraries(model, args)
        return

    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        our_tracks = [
            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHist'},
            #{'genome': 'mm9',  'composite': 'wgEncodeCaltechHistone'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechTfbs'}
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql, 'html')

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="Load RDF statements into the model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="Print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Do all updates")
    commands.add_option('--update-submission', action="store_true",
                        default=False,
      help="Download submission status from UCSC")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        default=False,
      help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        default=False,
      help="Download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        default=False,
                        help="Delete and redownload library information. "
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="Execute an arbitrary SPARQL query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="Find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="Set the RDF file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of my submissions from encodesubmit into the model

    It scrapes the page at the global USER_URL to figure out what to
    load. cookie holds the session cookie; if it is None, we log in.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
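    # Column layout assumed by the cell indexes below:
    #   cell[0]=submission id, cell[2]=species, cell[4]=name,
    #   cell[6]=status, cell[8]=last modified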
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # skip header
    for row in table_rows[1:]:
        cell = row.xpath('td')
        if len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))


def report_submissions_with_no_library(model):
    missing = find_submissions_with_no_library(model)
    for row in missing:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""

def find_submissions_with_no_library(model):
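    """Find submissions that lack a library_urn.

    The OPTIONAL / !bound(?libid) pattern below is SPARQL's
    negation-as-failure idiom: keep only the rows where no library_urn
    triple matched.
    """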
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    return missing_lib_query.execute(model)


def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)


def find_all_libraries(model):
    """Scan model for every library typed by the library ontology
    """
    libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>

SELECT distinct ?library_urn
WHERE {{
  ?library_urn rdf:type ?library_type .
  FILTER(regex(str(?library_type), "^{libraryOntology}"))
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
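    # A DDF is a tab-separated table: the first row names the attributes
    # and the first column of each record holds one or more comma-separated
    # filenames. Illustrative shape (fields are tab-separated):
    #
    #   filename          view    labExpId
    #   foo.bam,foo.bai   Signal  11039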
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)


def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                     ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            load_library_detail(model, libraryUrn)


def load_unassigned_submitted_libraries(model):
    unassigned = find_unscanned_submitted_libraries(model)
    for query_record in unassigned:
        library_urn = query_record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)


def reload_libraries(model, library_list):
    if len(library_list) == 0:
        # reload everything.
        queryset = find_all_libraries(model)
        libraries = (s['library_urn'] for s in queryset)
    else:
        libraries = (user_library_id_to_library_urn(l) for l in library_list)

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)


def user_library_id_to_library_urn(library_id):
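    """Convert a user-supplied library ID into a library URN node.

    A bare ID such as "11039" is expanded within LIBRARY_NS; anything
    that already carries a URL scheme is wrapped as a node unchanged.
    """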
    split_url = urlparse.urlsplit(library_id)
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    else:
        return RDF.Node(RDF.Uri(library_id))


def delete_library(model, library_urn):
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be an RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    lane_query = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
    for lane in model.find_statements(lane_query):
        delete_lane(model, lane.object)
    library_attrib_query = RDF.Statement(library_urn, None, None)
    for library_attrib in model.find_statements(library_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        del model[library_attrib]


def delete_lane(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be an RDF.Node")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = RDF.Statement(lane_urn, None, None)
    for lane_attrib in model.find_statements(lane_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        del model[lane_attrib]


def delete_lane_mapping(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be an RDF.Node")

    lane_mapping_query = RDF.Statement(lane_urn,
                                       libraryOntology['has_mappings'],
                                       None)
    for lane_mapping in model.find_statements(lane_mapping_query):
        mapping_attrib_query = RDF.Statement(lane_mapping.object,
                                             None,
                                             None)
        for mapping_attrib in model.find_statements(mapping_attrib_query):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            del model[mapping_attrib]


def load_encodedcc_files(model, genome, composite):
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    lib_term = submissionOntology['library_urn']
    sub_term = submissionOntology['submission_urn']
    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(filename))
        model.add_statement(
            RDF.Statement(s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))
            if name.lower() == 'labexpid':
                model.add_statement(
                    RDF.Statement(s, lib_term, LIBRARY_NS[value + '/']))
            elif name.lower() == 'subid':
                sub_url = RDF.Uri(submission_view_url(value))
                model.add_statement(
                    RDF.Statement(s, sub_term, sub_url))


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
            LOGGER.error(str(e))
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
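
    A name with no recognizable ID yields None:
    >>> get_library_id('elements') is None
    True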
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    if library_id in SL_MAP:
        library_id = SL_MAP[library_id]
    return library_id


def get_contents(element):
    """Return the text contents of an lxml element or None.
    """
    # prefer the text of a nested <a> tag when one is present
    a = element.find('a')
    if a is not None and a.text is not None:
        return a.text.encode(CHARSET)

    text = element.text_content()
    if text:
        return text.encode(CHARSET)
    return None


def create_status_node(submission_uri, timestamp):
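    """Append the timestamp to the submission URI to name a status node.

    For illustration (hypothetical values): a submission URI of
    http://encodesubmit.ucsc.edu/pipeline/show/1234/ and a timestamp of
    "2011-01-15T14:30:00" yield the status node
    http://encodesubmit.ucsc.edu/pipeline/show/1234/2011-01-15T14:30:00
    """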
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function: create an RDF Statement and add it to the model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
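
    The password is read from the system keyring under (LOGIN_URL,
    USERNAME). To seed it once (an illustrative snippet, not part of
    this script):

        import keyring
        keyring.set_password(LOGIN_URL, USERNAME, 'the-password')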
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
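    """Map a datetime to the name of the ENCODE freeze it falls in.

    A sketch of the behavior; dates past the last cutoff map to None.
    >>> date_to_freeze(datetime(2010, 3, 1))
    '2010-Jul'
    """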
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None


if __name__ == "__main__":
    main()