mark the example submission rule files as being raw, so the escapes don't get confused
[htsworkflow.git] / encode_submission / encode_find.py
#!/usr/bin/env python
"""
Gather information about our submissions into a single RDF store
"""

from datetime import datetime
import hashlib
import httplib2
# python keyring
import keyring
import logging
from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse

if 'DJANGO_SETTINGS_MODULE' not in os.environ:
    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'

from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfns import *
from htsworkflow.util.rdfhelp import \
     get_model, \
     get_serializer, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     load_into_model
TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     daf_download_url, \
     ddf_download_url, \
     get_encodedcc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

USERNAME = 'detrout'
CHARSET = 'utf-8'

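# Map SLxxxx-style library names to their zero-padded library ids
# (see the doctests in get_library_id below).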
SL_MAP = {'SL2970': '02970',
          'SL2971': '02971',
          'SL2973': '02973',}

def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None.
    If None, it looks at sys.argv.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

    if opts.reload_libraries:
        reload_libraries(model, args)
        return

    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        our_tracks = [
            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechHist'},
            #{'genome': 'mm9',  'composite': 'wgEncodeCaltechHistone'},
            {'genome': 'mm9',  'composite': 'wgEncodeCaltechTfbs'}
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql, 'html')

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
      help="Load model database")
    commands.add_option('--load-rdf', default=None,
      help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
      help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
      help="Do all updates")
    commands.add_option('--update-submission', action="store_true",
                        default=False,
      help="download status from ucsc")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        default=False,
      help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        default=False,
      help="download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        default=False,
                        help="Delete and redownload library information. "\
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
      help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
      action="store_true",
      help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
      help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of my submissions from encodesubmit into the model.

    Scrapes the submission list found at the global USER_URL.
    cookie is the session cookie; if None, we attempt to log in first.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    # skip header
    for row in table_rows[1:]:
        cell = row.xpath('td')
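        # Status-table columns used below: 0=submission id, 2=species,
        # 4=name, 6=status, 8=last modified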
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(query)))


def report_submissions_with_no_library(model):
    missing = find_submissions_with_no_library(model)
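    # Print a skeleton Turtle statement for each match so the missing
    # library URN can be filled in by hand and loaded back in with
    # --load-rdf.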
    for row in missing:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
        print ""

def find_submissions_with_no_library(model):
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    return missing_lib_query.execute(model)


def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type  }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)

def find_all_libraries(model):
    """Scan model for every subject that has been typed as a library
    """
    libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>

SELECT distinct ?library_urn
WHERE {{
  ?library_urn rdf:type ?library_type .
  FILTER(regex(?library_type, "library", "i"))
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']
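    # Each update gets its own status node keyed by timestamp (see
    # create_status_node), so earlier statuses stay in the model.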

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
                    break


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
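    # A DDF is tab-delimited: the header row names the columns, the
    # first column of each record is a comma-separated list of file
    # names, and the remaining columns apply to every file in the list.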
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)


def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                    ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            load_library_detail(model, libraryUrn)


def load_unassigned_submitted_libraries(model):
    unassigned = find_unscanned_submitted_libraries(model)
    for query_record in unassigned:
        library_urn = query_record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)

def reload_libraries(model, library_list):
    if len(library_list) == 0:
        # reload everything.
        queryset = find_all_libraries(model)
        # keep the RDF.Node so delete_library() gets the type it expects
        libraries = (s['library_urn'] for s in queryset)
    else:
        libraries = (user_library_id_to_library_urn(l) for l in library_list)

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)

def user_library_id_to_library_urn(library_id):
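    """Convert a bare library id into a node in LIBRARY_NS.

    Ids that already look like URLs are wrapped as URI nodes as-is.
    """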
    split_url = urlparse.urlsplit(library_id)
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    else:
        # wrap the URL so delete_library() gets the RDF.Node it expects
        return RDF.Node(RDF.Uri(library_id))

def delete_library(model, library_urn):
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be a RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    lane_query = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
    for lane in model.find_statements(lane_query):
        delete_lane(model, lane.object)
    library_attrib_query = RDF.Statement(library_urn, None, None)
    for library_attrib in model.find_statements(library_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        del model[library_attrib]


def delete_lane(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = RDF.Statement(lane_urn, None, None)
    for lane_attrib in model.find_statements(lane_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        del model[lane_attrib]


def delete_lane_mapping(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    lane_mapping_query = RDF.Statement(lane_urn,
                                       libraryOntology['has_mappings'],
                                       None)
    for lane_mapping in model.find_statements(lane_mapping_query):
        mapping_attrib_query = RDF.Statement(lane_mapping.object,
                                             None,
                                             None)
        for mapping_attrib in model.find_statements(mapping_attrib_query):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            del model[mapping_attrib]


def load_encodedcc_files(model, genome, composite):
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    lib_term = submissionOntology['library_urn']
    sub_term = submissionOntology['submission_urn']
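    # UCSC's file index exposes 'labExpId' (our library id) and 'subId'
    # (the pipeline submission id); use them to link each track file
    # back to its library and submission.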
    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(filename))
        model.add_statement(
            RDF.Statement(s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))
            if name.lower() == 'labexpid':
                model.add_statement(
                    RDF.Statement(s, lib_term, LIBRARY_NS[value + '/']))
            elif name.lower() == 'subid':
                sub_url = RDF.Uri(submission_view_url(value))
                model.add_statement(
                    RDF.Statement(s, sub_term, sub_url))


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
            LOGGER.error(str(e))
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    if library_id in SL_MAP:
        library_id = SL_MAP[library_id]
    return library_id


def get_contents(element):
    """Return contents or None.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
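    """Create a unique status node by appending the update timestamp
    to the submission URI.
    """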
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to a model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

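    # Assumes the password was stored in the system keyring ahead of
    # time, e.g. with keyring.set_password(LOGIN_URL, USERNAME, password)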
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e

################
#  old stuff
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
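    """Map a submission date to the ENCODE freeze it belongs to.

    >>> date_to_freeze(datetime(2010, 6, 1))
    '2010-Jul'
    >>> date_to_freeze(datetime(2012, 1, 1)) is None
    True
    """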
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    else:
        return None


if __name__ == "__main__":
    main()