3 Gather information about our submissions into a single RDF store
6 from datetime import datetime
11 from lxml.html import fromstring
12 from operator import attrgetter
13 from optparse import OptionParser, OptionGroup
23 from htsworkflow.submission import daf, ucsc
25 from htsworkflow.util import api
26 from htsworkflow.util.rdfhelp import \
38 TYPE_N = rdfNS['type']
39 CREATION_DATE = libraryOntology['date']
42 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
44 from htsworkflow.submission.ucsc import \
47 get_encodedcc_file_index, \
48 submission_view_url, \
51 DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
53 DBDIR = os.path.expanduser("~diane/proj/submission")
55 LOGGER = logging.getLogger("encode_find")
57 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
58 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
63 SL_MAP = {'SL2970': '02970',
def main(cmdline=None):
    """Sync submission and library information into the RDF model.

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)
    # NOTE(review): log level is presumably selected by opts.debug/opts.verbose
    # guards -- the conditionals are not visible in this view; confirm.
    logging.basicConfig(level=logging.DEBUG)
    logging.basicConfig(level=logging.INFO)
    logging.basicConfig(level=logging.ERROR)
    # build an authenticated htsworkflow API client from the auth options
    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)
    model = get_model(opts.model, DBDIR)
    if opts.load_rdf is not None:
        # seed the model from an RDF file before doing any updates
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
    if opts.reload_libraries:
        reload_libraries(model, args)
    # --update implies all three specific update flags
    # NOTE(review): these three lines appear to sit under an `if opts.update:`
    # guard that is not visible in this view; confirm.
    opts.update_submission = True
    opts.update_libraries = True
    opts.update_ucsc_downloads = True
    if opts.update_submission:
        # log in to encodesubmit and scrape our submission status pages
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)
    if opts.update_ucsc_downloads:
        # composites whose UCSC download indexes we track
        # NOTE(review): `our_tracks = [` opener not visible in this view
        {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
        #{'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
        {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info )
    if opts.sparql is not None:
        sparql_query(model, opts.sparql)
    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)
    # dump the final model state (presumably guarded by opts.print_rdf)
    serializer = get_serializer(name=opts.rdf_parser_name)
    print serializer.serialize_model_to_string(model)
    """Construct option parser

    Returns an OptionParser with Commands, Queries and Options groups
    plus the shared htsworkflow auth options.
    """
    parser = OptionParser()

    # commands: actions that modify or dump the model
    commands = OptionGroup(parser, "Commands")
    commands.add_option('--model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Do all updates")
    commands.add_option('--update-submission', action="store_true",
                        help="download status from ucsc")
    commands.add_option('--update-ucsc-downloads', action="store_true",
                        help="Update download locations from UCSC")
    commands.add_option('--update-libraries', action="store_true",
                        help="download library info from htsw")
    commands.add_option('--reload-libraries', action="store_true",
                        help="Delete and redownload library information. "\
                             "Optionally list specific library IDs.")
    parser.add_option_group(commands)

    # queries: read-only reporting against the model
    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    # misc behavior tweaks
    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    options.add_option("--debug", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login

    :param limit: optional collection of submission ids; others are skipped
    """
    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))
                # NOTE(review): the add_stmt calls these argument fragments
                # belong to are not visible in this view; confirm.
                submissionOntology['Submission'])
                RDF.Node(submission_id))
                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)
                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)
                # link the submission to its woldlab library when the name
                # contains a parsable library id
                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))
                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()
                update_submission_detail(model, subUrn, status, last_mod,
                LOGGER.info("Processed {0}".format(subUrn))
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed.

    :param model: RDF model to update
    :param submissionUrn: subject node for the UCSC submission
    :param predicate: predicate linking a submission to its library
    :param library_id: woldlab library id, resolved against LIBRARY_NS
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    # build the candidate statement once and reuse it for both the
    # containment test and the insert (the original built it twice)
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))
def report_submissions_with_no_library(model):
    """Print turtle stubs for submissions lacking a library_urn link."""
    missing = find_submissions_with_no_library(model)
    # NOTE(review): `name` and `subid` come from a loop over `missing`
    # whose header is not visible in this view; confirm.
    print "# {0}".format(name)
    print "<{0}>".format(subid.uri)
    print " encodeSubmit:library_urn "\
          "<http://jumpgate.caltech.edu/library/> ."
def find_submissions_with_no_library(model):
    """Run a SPARQL query for submissions with a name but no library_urn.

    Returns the query result iterator of bindings with ?subid and ?name.
    """
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>
?subid submissionOntology:name ?name
OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
    return missing_lib_query.execute(model)
def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded

    A library is "unscanned" when it is referenced by a submission but has
    no rdf:type statement of its own yet.
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>
SELECT distinct ?submission ?library_urn
?submission submissionOntology:library_urn ?library_urn .
OPTIONAL {{ ?library_urn rdf:type ?library_type }}
FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)
def find_all_libraries(model):
    """Scan model for every library marked as

    NOTE(review): the `libraries = """ + '"""' + """` assignment opening the
    query string is not visible in this view; confirm against the full file.
    """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>
SELECT distinct ?library_urn
?library_urn rdf:type ?library_type .
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)
def add_submission_creation_date(model, subUrn, cookie):
    """Ensure subUrn has a creation date, scraping its page if needed."""
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        # not recorded yet: fetch the submission detail page and parse it
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
        # NOTE(review): this debug line appears to belong to an else branch
        # whose keyword is not visible in this view; confirm.
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def get_creation_dates(model, subUrn):
    """Return every creation-date statement recorded for a submission."""
    date_query = RDF.Statement(subUrn, CREATION_DATE, None)
    return [statement for statement in model.find_statements(date_query)]
def parse_submission_page(model, submissionTree, subUrn):
    """Extract the 'Created' date from a submission page into the model."""
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    # find the table cell labeled 'Created'; the date is in the next cell
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        # store as a typed xsd:dateTime literal
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
        # NOTE(review): this msg appears to belong to an else branch whose
        # keyword (and the warning call) are not visible in this view.
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Create or refresh the status node attached to a submission.

    :param status: current status string scraped from the UCSC page
    :param recent_update: isoformat last-modified timestamp from the page
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
        # NOTE(review): the following appears to be the else branch (existing
        # status nodes); the else keyword is not visible in this view.
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            # look up the stored last-modified date for this status node
            last_modified_query = RDF.Statement(status_node,
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                # only refresh ddf/daf when the stored date matches the one
                # scraped from the status page
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
def update_daf(model, submission_url, status_node, cookie):
    """Download and record a submission's DAF if not already present.

    The daf url is derived by replacing 'show' with 'download_daf' in the
    submission url; its md5 is stored so later changes can be detected.
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        # record a checksum of the daf text alongside the parsed statements
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download and load a submission's DDF if not already recorded.

    The ddf url is derived by replacing 'show' with 'download_ddf' in the
    submission url.  Once loaded, statusNode is typed as a ddf so we don't
    re-download it on later runs.

    :param model: RDF model to update
    :param subUrn: submission uri the ddf belongs to
    :param statusNode: status node the ddf statements hang off of
    :param cookie: UCSC session cookie for the download request
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    # (removed unused local `ddfUrn = RDF.Uri(download_ddf_url)`)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements

    The first line is a whitespace-separated header naming each column;
    subsequent tab-separated records describe one or more files each.
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        # skip blank lines and comments (continue statements not visible
        # in this view)
        if len(ddf_line) == 0:
        if ddf_line.startswith("#"):

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        # the first column may name several files; each gets a blank node
        # carrying the remaining columns as attributes
        fileNode = RDF.Node()
        submissionOntology['has_file'],
        add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
        add_stmt(model, fileNode, DCC_NS['filename'], f)

        for predicate, object in zip(attributes[1:], file_attributes):
            add_stmt(model, fileNode, predicate, object)
def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.

    Scrapes the RDFa from the htsworkflow library-list pages filtered by the
    ENCODE affiliation ids, then loads each found library's detail page.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",

    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)

    # every subject with a library_id gets its detail page loaded
    query = RDF.Statement(None, libraryOntology['library_id'], None)
    libraries = model.find_statements(query)
    for statement in libraries:
        libraryUrn = statement.subject
        load_library_detail(model, libraryUrn)
def load_unassigned_submitted_libraries(model):
    """Load details for submitted libraries that haven't been scanned yet."""
    for record in find_unscanned_submitted_libraries(model):
        urn = record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(urn))
        load_library_detail(model, urn)
def reload_libraries(model, library_list):
    """Delete and re-fetch library details.

    With an empty library_list every library in the model is reloaded;
    otherwise only the listed user-supplied library ids are.
    """
    if len(library_list) == 0:
        queryset = find_all_libraries(model)
        libraries = ( str(s['library_urn']) for s in queryset )
        # NOTE(review): the following appears to be the else branch for an
        # explicit id list; the else keyword is not visible in this view.
        libraries = ( user_library_id_to_library_urn(l) for l in library_list )

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)
def user_library_id_to_library_urn(library_id):
    """Convert a user-supplied library id to a library URN.

    Bare ids (no url scheme) are resolved against LIBRARY_NS; the branch
    handling fully-qualified urls is not visible in this view.
    """
    split_url = urlparse.urlsplit(library_id)
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
def delete_library(model, library_urn):
    """Remove a library and all of its lanes from the model.

    Raises ValueError when library_urn is not an RDF.Node.
    """
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be a RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    # first tear down every lane hanging off this library...
    has_lane = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
    for lane_statement in model.find_statements(has_lane):
        delete_lane(model, lane_statement.object)
    # ...then drop every remaining statement about the library itself
    about_library = RDF.Statement(library_urn, None, None)
    for statement in model.find_statements(about_library):
        LOGGER.debug("Deleting {0}".format(str(statement)))
        del model[statement]
def delete_lane(model, lane_urn):
    """Remove a lane node, its mappings, and all its attributes.

    Raises ValueError when lane_urn is not an RDF.Node.
    """
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    # mappings first, then the lane's own statements
    delete_lane_mapping(model, lane_urn)
    about_lane = RDF.Statement(lane_urn, None, None)
    for statement in model.find_statements(about_lane):
        LOGGER.debug("Deleting {0}".format(str(statement)))
        del model[statement]
def delete_lane_mapping(model, lane_urn):
    """Delete the mapping nodes (and their attributes) attached to a lane."""
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    # find each mapping object hanging off the lane, then delete every
    # statement about that mapping (Statement call tails not visible here)
    lane_mapping_query = RDF.Statement(lane_urn,
                                       libraryOntology['has_mappings'],
    for lane_mapping in model.find_statements(lane_mapping_query):
        mapping_attrib_query = RDF.Statement(lane_mapping.object,
        for mapping_attrib in model.find_statements(mapping_attrib_query):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            del model[mapping_attrib]
def load_encodedcc_files(model, genome, composite):
    """Record UCSC encodedcc download-index entries for one composite."""
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    # bail out when the index couldn't be fetched (body not visible here)
    if file_index is None:

    for filename, attributes in file_index.items():
        # each downloadable file becomes a ucsc_track resource carrying its
        # index attributes in the DCC namespace
        s = RDF.Node(RDF.Uri(filename))
        RDF.Statement(s, TYPE_N, submissionOntology['ucsc_track']))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            model.add_statement(RDF.Statement(s,p,o))
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page

    Skips the fetch when the library already has a date statement loaded.
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        # scrape the library page's RDFa into the model; the try: opening
        # this handler is not visible in this view
        body = get_url_as_text(str(libraryUrn.uri), 'GET')
        rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
        # NOTE(review): warning appears to belong to a final else branch
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    """
    # accept either a 5-digit library id or an SLnnnn solexa id, at the
    # start of the name or after a space/hyphen
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if match is not None:
        library_id = match.group('id')
        # SL ids are translated via SL_MAP to their jumpgate library ids
        if library_id in SL_MAP:
            library_id = SL_MAP[library_id]
def get_contents(element):
    """Return contents or none.

    NOTE(review): guards between these statements (empty-contents return,
    anchor presence check) are not visible in this view; confirm.
    """
    if len(element.contents) == 0:

    # prefer the text of a nested anchor when one is present
    a = element.find('a')
    return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Build the URI node naming a status record for a submission.

    The status uri is <submission_uri>/<timestamp>; a trailing slash is
    appended to the submission uri when missing.

    :param submission_uri: submission node/uri, normalized via daf helper
    :param timestamp: string timestamp appended to form the status uri
    :return: RDF.Node wrapping the status uri
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        # BUG FIX: was misspelled `sumbission_uri`, which bound a new unused
        # variable and silently left the slash off the generated uri
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))
def get_date_contents(element):
    """Parse the element's text as a 'YYYY-MM-DD HH:MM' datetime.

    NOTE(review): handling of empty or unparsable text is not visible in
    this view -- confirm against the full file.
    """
    data = element.text_content()
    return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience helper: build an RDF.Statement and add it to model."""
    statement = RDF.Statement(subject, predicate, rdf_object)
    return model.add_statement(statement)
def login(cookie=None):
    """Login if we don't have a cookie

    Returns a session cookie usable on later encodesubmit requests
    (early-return for an existing cookie is not visible in this view).
    """
    if cookie is not None:

    # pull the password for USERNAME out of the system keyring
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
    # the session cookie from the response proves we're logged in
    cookie = response.get('set-cookie', None)
    # NOTE(review): this raise appears to be guarded by a cookie-is-None
    # check not visible in this view
    raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
def get_url_as_tree(url, method, cookie=None):
    """Fetch url and return it parsed as an lxml html tree.

    Raises httplib2.HttpLib2ErrorWithResponse on non-200 responses
    (the raise statement itself is not visible in this view).
    """
    http = httplib2.Http()
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        # non-200: build an error carrying the response for the caller
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
def get_url_as_text(url, method, cookie=None):
    """Fetch url and return the raw response body.

    Raises httplib2.HttpLib2ErrorWithResponse on non-200 responses
    (the return/raise statements are not visible in this view).
    """
    http = httplib2.Http()
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        # non-200: build an error carrying the response for the caller
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# (submission name, library id) pairs for old submissions whose names do
# not contain a parsable library id
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
def select_by_library_id(submission_list):
    """Group submissions by library id, newest first within each group.

    NOTE(review): the `libraries = {}` initialization and the return
    statement are not visible in this view; confirm.
    """
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        # newest submission first within each library's list
        submission.sort(key=attrgetter('date'), reverse=True)
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries against the ENCODE freeze dates."""
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    # NOTE(review): the following css lines sit inside a multi-line string
    # append whose delimiters are not visible in this view
    <style type="text/css">
    td {border-width:0 0 1px 1px; border-style:solid;}
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    # one column header per freeze (loop header not visible here)
    report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket each submission into its freeze period
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        report.append('<td>')
        for s in batched.get(d, []):
            show_url = submission_view_url(s.subid)
            subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
            report.append("{0}:{1}".format(subid, s.status))
        report.append('</td>')
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    """Map a datetime to the name of the ENCODE freeze period it falls in.

    NOTE(review): the list terminator and the return logic comparing d to
    each freeze end date are not visible in this view; confirm.
    """
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
    for end, name in freezes:
766 if __name__ == "__main__":