3 Gather information about our submissions into a single RDF store
5 from __future__ import print_function
7 from datetime import datetime
12 from lxml.html import fromstring
13 from operator import attrgetter
14 from optparse import OptionParser, OptionGroup
21 from six.moves import urllib
23 from rdflib import BNode, Graph, Literal, Namespace, URIRef
24 from rdflib.namespace import RDF, RDFS, XSD
25 if not 'DJANGO_SETTINGS_MODULE' in os.environ:
26 os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
28 from htsworkflow.submission import daf, ucsc
30 from htsworkflow.util import api
31 from htsworkflow.util.rdfns import (
36 CREATION_DATE = libraryOntology['date']
39 LIBRARY_NS = Namespace("http://jumpgate.caltech.edu/library/")
41 from htsworkflow.submission.ucsc import \
44 get_encodedcc_file_index, \
45 submission_view_url, \
48 DCC_NS = Namespace(UCSCEncodePipeline + 'download_ddf#')
50 DBDIR = os.path.expanduser("~diane/proj/submission")
52 LOGGER = logging.getLogger("encode_find")
54 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
55 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
60 SL_MAP = {'SL2970': '02970',
def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # Logging level selection; the guarding if/elif lines (presumably
    # opts.debug / opts.verbose / default) are missing from this listing.
    logging.basicConfig(level=logging.DEBUG)
    logging.basicConfig(level=logging.INFO)
    logging.basicConfig(level=logging.ERROR)

    # Build an authenticated HTS-workflow API client from the options.
    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    # Open (or create) the persistent RDF store under DBDIR.
    model = get_model(opts.model, DBDIR)

    # Optionally preload RDF statements from a file into the model.
    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    # NOTE(review): lines missing from listing here — likely the
    # cookie/limit initialization used below; confirm against original.
    if opts.reload_libraries:
        reload_libraries(model, args)

    # --update implies all three update flags.
    # NOTE(review): the enclosing "if opts.update:" guard line appears
    # dropped from this listing.
    opts.update_submission = True
    opts.update_libraries = True
    opts.update_ucsc_downloads = True

    if opts.update_submission:
        # Log in to encodesubmit and scrape our submission status pages.
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        # NOTE(review): the "our_tracks = [" list opener is missing from
        # this listing.
        {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
        #{'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info )

    if opts.sparql is not None:
        # Run an ad-hoc SPARQL query and render the results as html.
        sparql_query(model, opts.sparql, 'html')

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    # NOTE(review): an "if opts.print_rdf:" guard line appears dropped
    # from this listing.
    serializer = get_serializer(name=opts.rdf_parser_name)
    print(serializer.serialize_model_to_string(model))
    # NOTE(review): the "def make_parser():" header line is missing from
    # this listing; the body below is its contents.
    """Construct option parser
    """
    parser = OptionParser()

    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Do all updates")
    # NOTE(review): several "default=False," continuation lines appear
    # dropped from this listing for the options below.
    commands.add_option('--update-submission', action="store_true",
                        help="download status from ucsc")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        help="download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        help="Delete and redownload library information. "\
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    # Mix in the shared host/apiid/apikey authentication options.
    api.add_auth_options(parser)
    # NOTE(review): the "return parser" line is missing from this listing.
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all of my submissions from encodesubmit into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login
    """
    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            # limit, when given, restricts scraping to listed submission ids
            if limit is None or submission_id in limit:
                subUrn = URIRef(submission_view_url(submission_id))

                # NOTE(review): the add_stmt(...) calls that these two
                # continuation lines belong to are missing from this
                # listing (likely rdf:type and submission-id statements).
                submissionOntology['Submission'])
                Literal(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                    # NOTE(review): the remaining arguments of this call
                    # and an "else:" line are missing from this listing.
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                # NOTE(review): call continuation line(s) missing here.

                LOGGER.info("Processed {0}".format(subUrn))
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = (submissionUrn, predicate, libraryUrn)
    if not query in model:
        link = (submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        # NOTE(review): the line actually adding the triple (likely
        # "model.add(link)") and the following "else:" are missing from
        # this listing.
        LOGGER.debug("Found: {0}".format(str(query)))
def report_submissions_with_no_library(model):
    """Print a Turtle stub for each submission missing a library link."""
    missing = find_submissions_with_no_library(model)
    # NOTE(review): the loop header over `missing` and the subid/name
    # bindings are missing from this listing.
        print("# {0}".format(name))
        print("<{0}>".format(subid.uri))
        print(" encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> .")
def find_submissions_with_no_library(model):
    """Query the model for submissions that have no library_urn link."""
    # NOTE(review): the SELECT/WHERE opening lines of this SPARQL query
    # string are missing from this listing.
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>
?subid submissionOntology:name ?name
OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)

    return model.query(missing_lib_query_text)
def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    # A library is "unscanned" when it is referenced by a submission but
    # carries no rdf:type of its own yet.
    # NOTE(review): the "WHERE {{" line of this query appears dropped
    # from this listing.
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>
SELECT distinct ?submission ?library_urn
?submission submissionOntology:library_urn ?library_urn .
OPTIONAL {{ ?library_urn rdf:type ?library_type }}
FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    return model.query(unscanned_libraries)
def find_all_libraries(model):
    """Scan model for every library marked as
    """
    # NOTE(review): the docstring end, the 'query = """' opener, and the
    # WHERE line of this SPARQL query are missing from this listing.
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>
SELECT distinct ?library_urn
?library_urn rdf:type ?library_type .
}}""".format(libraryOntology=libraryOntology[''].uri)
    return model.query(query)
def add_submission_creation_date(model, subUrn, cookie):
    """Fetch and record a creation date for subUrn if none is stored."""
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    # NOTE(review): an "else:" line appears dropped from this listing.
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def get_creation_dates(model, subUrn):
    """Return every creation-date triple recorded for *subUrn*."""
    return list(model.triples((subUrn, CREATION_DATE, None)))
def parse_submission_page(model, submissionTree, subUrn):
    """Extract the 'Created' date from a submission detail page tree."""
    cells = submissionTree.findall('.//td')
    # The creation date sits in the cell following the 'Created' label.
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = Literal(created_date.isoformat(),
                                    datatype=XSD.dateTime)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    # NOTE(review): an "else:" line and the LOGGER call consuming `msg`
    # appear dropped from this listing.
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Add or refresh the status blank node for a submission.

    A brand-new status node triggers a DDF/DAF download; an existing one
    is only re-downloaded when its stored last-modified date matches
    *recent_update*.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = (subUrn, HasStatusN, None)
    status_nodes = list(model.triples(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, RDF['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    # NOTE(review): an "else:" line appears dropped from this listing.
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement[2]
            last_modified_query = (status_node, LastModifyN, None)
            # NOTE(review): find_statements is the old librdf API; the
            # rest of the file uses rdflib model.triples() — confirm.
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement[2])
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
def update_daf(model, submission_url, status_node, cookie):
    """Download a submission's DAF and load it into the model.

    The DAF's md5sum and parsed statements are attached to *status_node*,
    but only when the status node has not already been marked as a DAF.

    :param model: rdflib Graph being populated
    :param submission_url: the submission's "show" page URL
    :param status_node: URIRef of the status blank for this submission
    :param cookie: encodesubmit session cookie for the download
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')

    status_is_daf = (status_node, TYPE_N, dafTermOntology[''])
    if status_is_daf not in model:
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        # BUG FIX: hashlib.md5 requires bytes; under Python 3 the
        # downloaded text may be str, which used to raise TypeError.
        if isinstance(daf_text, str):
            daf_bytes = daf_text.encode('utf-8')
        else:
            daf_bytes = daf_text
        daf_hash = hashlib.md5(daf_bytes).hexdigest()
        # Wrap the hash as a Literal, consistent with the rest of the
        # file's rdflib usage.  NOTE(review): previously a bare str was
        # stored — confirm no query depends on that.
        daf_hash_stmt = (status_node, dafTermOntology['md5sum'],
                         Literal(daf_hash))
        model.add(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and load it into the model.

    Only fetches when *statusNode* has not yet been marked as a DDF;
    the marker triple is added after a successful load so the file is
    not re-downloaded next time.

    :param model: rdflib Graph being populated
    :param subUrn: the submission's "show" page URIRef
    :param statusNode: URIRef of the status blank for this submission
    :param cookie: encodesubmit session cookie for the download
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = (statusNode, TYPE_N, DCC_NS[''])
    # BUG FIX: this previously tested "status_is_ddf in model", which
    # meant the DDF was only fetched after the marker already existed —
    # i.e. never the first time.  update_daf() uses the "not in" form;
    # mirror it here.
    if status_is_ddf not in model:
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        # BUG FIX: rdflib Graph has add(), not the old librdf
        # add_statement() API.
        model.add(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    # Each header column becomes a predicate in the download_ddf namespace.
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            # NOTE(review): a "continue" line appears dropped here.
        if ddf_line.startswith("#"):
            # NOTE(review): a "continue" line appears dropped here.

        ddf_record = ddf_line.split('\t')
        # First column may list several comma-separated filenames.
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        # NOTE(review): the per-file loop header ("for f in files:"),
        # the fileNode creation, and the opening of the has_file
        # add_stmt call are missing from this listing.
            submissionOntology['has_file'],
        add_stmt(model, fileNode, RDF['type'], DCC_NS['file'])
        add_stmt(model, fileNode, DCC_NS['filename'], f)

        # Remaining columns pair up with their header predicates.
        for predicate, object in zip(attributes[1:], file_attributes):
            add_stmt(model, fileNode, predicate, object)
def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    # Affiliation ids 44/80 select the ENCODE-affiliated libraries.
    # NOTE(review): additional filter URL(s) and the list close appear
    # dropped from this listing.
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",

    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        # Pull RDFa-annotated library listings straight into the model.
        model.parse(source=encodeUrl, format='rdfa')
    query = (None, libraryOntology['library_id'], None)
    libraries = model.triples(query)
    for statement in libraries:
        libraryUrn = statement[0]
        load_library_detail(model, libraryUrn)
def load_unassigned_submitted_libraries(model):
    """Fetch details for submitted libraries that were never scanned."""
    for record in find_unscanned_submitted_libraries(model):
        urn = record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(urn))
        load_library_detail(model, urn)
def reload_libraries(model, library_list):
    """Delete and re-download library info; all libraries if list empty."""
    if len(library_list) == 0:
        # No ids given: refresh every library known to the model.
        queryset = find_all_libraries(model)
        libraries = ( str(s['library_urn']) for s in queryset )
    # NOTE(review): an "else:" line appears dropped from this listing.
        libraries = ( user_library_id_to_library_urn(l) for l in library_list )

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)
def user_library_id_to_library_urn(library_id):
    """Map a bare library id to LIBRARY_NS; pass full URLs through."""
    split_url = urllib.parse.urlsplit(library_id)
    # No scheme means the user gave a bare id rather than a full URL.
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    # NOTE(review): the else/return branch for full URLs is missing from
    # this listing.
def delete_library(model, library_urn):
    """Remove a library, and all of its lanes, from the model.

    :param model: rdflib Graph to mutate
    :param library_urn: Literal or URIRef naming the library
    :raises ValueError: when library_urn is not a Literal or URIRef
    """
    if not isinstance(library_urn, (Literal, URIRef)):
        # BUG FIX: message now matches the accepted types (it previously
        # said only "Literal"); consistent with delete_lane().
        raise ValueError("library urn must be a Literal or URIRef")

    LOGGER.info("Deleting {0}".format(str(library_urn)))
    lane_query = (library_urn, libraryOntology['has_lane'], None)
    # Materialize before mutating: removing triples while iterating
    # model.triples() can invalidate the underlying iterator.
    for lane in list(model.triples(lane_query)):
        delete_lane(model, lane[2])
    library_attrib_query = (library_urn, None, None)
    for library_attrib in list(model.triples(library_attrib_query)):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        model.remove(library_attrib)
def delete_lane(model, lane_urn):
    """Remove a lane, and its mapping nodes, from the model.

    :param model: rdflib Graph to mutate
    :param lane_urn: Literal or URIRef naming the lane
    :raises ValueError: when lane_urn is not a Literal or URIRef
    """
    if not isinstance(lane_urn, (Literal, URIRef)):
        raise ValueError("lane urn must be a Literal or URIRef")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = (lane_urn, None, None)
    # Materialize before mutating: removing triples while iterating
    # model.triples() can invalidate the underlying iterator.
    for lane_attrib in list(model.triples(lane_attrib_query)):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        model.remove(lane_attrib)
def delete_lane_mapping(model, lane_urn):
    """Remove every mapping node attached to a lane from the model.

    :param model: rdflib Graph to mutate
    :param lane_urn: Literal or URIRef naming the lane
    :raises ValueError: when lane_urn is not a Literal or URIRef
    """
    if not isinstance(lane_urn, (Literal, URIRef)):
        raise ValueError("lane urn must be a Literal or URIRef")

    lane_mapping_query = (lane_urn,
                          libraryOntology['has_mappings'],
                          None)
    # Materialize both levels before mutating: removing triples while
    # iterating model.triples() can invalidate the underlying iterator.
    for lane_mapping in list(model.triples(lane_mapping_query)):
        mapping_attrib_query = (lane_mapping[2], None, None)
        for mapping_attrib in list(model.triples(mapping_attrib_query)):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            model.remove(mapping_attrib)
def load_encodedcc_files(model, genome, composite):
    """Record file metadata for one UCSC composite track in the model."""
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        # NOTE(review): the early-return line appears dropped here.

    lib_term = submissionOntology['library_urn']
    sub_term = submissionOntology['submission_urn']
    for filename, attributes in file_index.items():
        # NOTE(review): the line binding "s" (the subject URI for this
        # file) is missing from this listing.
        model.add((s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            # NOTE(review): line(s) missing here (likely the generic
            # attribute statement).
            # labExpId links back to our library; subId to the submission.
            if name.lower() == 'labexpid':
                model.add((s, lib_term, LIBRARY_NS[value+'/']))
            elif name.lower() == 'subid':
                sub_url = URIRef(submission_view_url(value))
                model.add((s, sub_term, sub_url))
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page.

    Fetches and parses the library's RDFa page only when the model has
    no date statement for it yet (i.e. it has not been loaded before).

    :param model: rdflib Graph being populated
    :param libraryUrn: URIRef of the library page
    """
    query = (libraryUrn, libraryOntology['date'], None)
    # BUG FIX: find_statements() is the old librdf API; rdflib Graph
    # exposes triples().
    results = list(model.triples(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn), 'GET')
            # BUG FIX: RDF here is rdflib.namespace.RDF, which has no
            # Parser class (librdf leftover).  Parse the RDFa page with
            # Graph.parse(), as load_encode_assigned_libraries() does.
            model.parse(data=body, format='rdfa', publicID=libraryUrn)
        except httplib2.HttpLib2ErrorWithResponse as e:
            LOGGER.error(e)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    """
    # NOTE(review): the doctest expected-output lines appear dropped
    # from this listing.
    # Match either a 5-digit id or an SLnnnn id at a word boundary.
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)

    if match is not None:
        library_id = match.group('id')
        # Normalize known SLnnnn ids to their 5-digit equivalents.
        if library_id in SL_MAP:
            library_id = SL_MAP[library_id]
    # NOTE(review): the no-match fallback and "return library_id" line
    # are missing from this listing.
def get_contents(element):
    """Return contents or none.
    """
    # NOTE(review): this uses BeautifulSoup-style .contents/.find while
    # the rest of the file uses lxml, and CHARSET is not defined in the
    # visible file — possibly dead legacy code; confirm.
    if len(element.contents) == 0:
        # NOTE(review): the empty-contents return line appears dropped.

    a = element.find('a')
    # NOTE(review): the guard line (likely "if a is not None:") appears
    # dropped from this listing.
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Build the URIRef naming a submission's status node at *timestamp*.

    :param submission_uri: submission URI (any form daf understands)
    :param timestamp: isoformat last-modified string appended to the URI
    :return: URIRef of the status node
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        # BUG FIX: this was "sumbission_uri += '/'" (typo), which bound
        # a new variable and silently failed to append the separator.
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return URIRef(status_uri)
def get_date_contents(element):
    """Parse an element's text as a 'YYYY-MM-DD HH:MM' datetime."""
    data = element.text_content()
    # NOTE(review): a guard line (likely "if data:") and the no-data
    # fallback branch appear dropped from this listing.
    return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience wrapper: add one (subject, predicate, object) triple
    to *model* and return whatever the model's add() returns."""
    triple = (subject, predicate, rdf_object)
    return model.add(triple)
def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        # NOTE(review): the early-return line appears dropped here.

    # Look the password up in the system keyring.  NOTE(review):
    # USERNAME (and the keyring import) are not visible in this listing
    # — presumably defined near the top of the file; confirm.
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     # NOTE(review): method/headers
                                     # argument lines appear dropped.
                                     body=urllib.parse.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
    # NOTE(review): format-call continuation line(s) missing here.

    # The session cookie comes back in the response headers.
    cookie = response.get('set-cookie', None)
    # NOTE(review): the "if cookie is None:" guard appears dropped.
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    # NOTE(review): the "return cookie" line appears dropped.
def get_url_as_tree(url, method, cookie=None):
    """Request *url* and parse the response body into an lxml tree.

    Raises HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    # NOTE(review): a "headers = {}" line appears dropped from listing.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        # NOTE(review): "return tree" and the "else:" line appear
        # dropped from this listing.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        # NOTE(review): the "raise e" line appears dropped.
def get_url_as_text(url, method, cookie=None):
    """Request *url* and return the raw response body.

    Raises HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    # NOTE(review): a "headers = {}" line appears dropped from listing.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        # NOTE(review): "return content" and the "else:" line appear
        # dropped from this listing.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        # NOTE(review): the "raise e" line appears dropped.
# Hand-maintained (submission name, library id) pairs for early
# submissions whose names lack a parseable embedded library id.
# NOTE(review): the closing bracket of this list is missing from the
# visible listing.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
def select_by_library_id(submission_list):
    """Group submissions by library id, sorted newest-first per library."""
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    # NOTE(review): the "libraries = {}" initializer appears dropped.
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)
    # NOTE(review): the "return libraries" line appears dropped.
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries against the ENCODE freezes."""
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    # NOTE(review): the lines wrapping this inline <style> block in a
    # report.append(...) call are missing from this listing.
    <style type="text/css">
    td {border-width:0 0 1px 1px; border-style:solid;}

    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    # NOTE(review): a "for f in freezes:" header appears dropped here.
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id]
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # Bucket this library's submissions by the freeze they fall in.
        # NOTE(review): the "batched = {}" initializer appears dropped.
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # NOTE(review): a "for d in freezes:" header appears dropped.
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # NOTE(review): the context for this trailing empty cell is
        # unclear from the listing (likely an else branch).
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    """Map a datetime to the name of the ENCODE freeze containing it."""
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
    # NOTE(review): the list close is missing from this listing.
    for end, name in freezes:
        # NOTE(review): the comparison against *end* and the return
        # lines (including the no-match fallback) are missing from this
        # listing.
757 if __name__ == "__main__":