"""Gather information about our submissions into a single RDF store
"""
from datetime import datetime
import hashlib
import httplib2
import keyring
import logging
import os
import re
import urllib
import urlparse

from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup

import RDF

from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     get_model, \
     get_serializer, \
     load_into_model, \
     sparql_query, \
     submissionOntology, \
     libraryOntology, \
     rdfNS, \
     xsdNS

TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     get_encodedcc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

CHARSET = 'utf-8'

SL_MAP = {'SL2970': '02970',
          }


def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    # leftover positional arguments limit which submissions get scanned
    limit = args if len(args) > 0 else None

    if opts.reload_libraries:
        reload_libraries(model, args)
        return

    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        our_tracks = [
            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
            {'genome': 'mm9', 'composite': 'wgEncodeCaltechHist'},
            #{'genome': 'mm9', 'composite': 'wgEncodeCaltechHistone'},
            {'genome': 'mm9', 'composite': 'wgEncodeCaltechTfbs'},
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)


138 """Construct option parser
140 parser = OptionParser()
141 commands = OptionGroup(parser, "Commands")
142 commands.add_option('--model', default=None,
143 help="Load model database")
144 commands.add_option('--load-rdf', default=None,
145 help="load rdf statements into model")
146 commands.add_option('--print-rdf', action="store_true", default=False,
147 help="print ending model state")
148 commands.add_option('--update', action="store_true", default=False,
149 help="Do all updates")
150 commands.add_option('--update-submission', action="store_true",
152 help="download status from ucsc")
153 commands.add_option('--update-ucsc-downloads', action="store_true",
155 help="Update download locations from UCSC")
156 commands.add_option('--update-libraries', action="store_true",
158 help="download library info from htsw")
159 commands.add_option('--reload-libraries', action="store_true",
161 help="Delete and redownload library information. "\
162 "Optionally list specific library IDs.")
163 parser.add_option_group(commands)
165 queries = OptionGroup(parser, "Queries")
166 queries.add_option('--sparql', default=None,
167 help="execute arbitrary sparql query")
168 queries.add_option('--find-submission-with-no-library', default=False,
170 help="find submissions with no library ID")
171 parser.add_option_group(queries)
173 options = OptionGroup(parser, "Options")
174 options.add_option("--rdf-parser-name", default="turtle",
175 help="set rdf file parser type")
176 options.add_option("-v", "--verbose", action="store_true", default=False)
177 options.add_option("--debug", action="store_true", default=False)
178 parser.add_option_group(options)
180 api.add_auth_options(parser)
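# Typical invocations (illustrative; the authentication options, e.g.
# --host, come from api.add_auth_options):
#   encode_find.py --model encode.db --update
#   encode_find.py --model encode.db --find-submission-with-no-library
#   encode_find.py --model encode.db --print-rdf > model.turtle

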
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into model

    It will look at the global USER_URL to figure out who to scrape.
    cookie is the session cookie; if None, we will attempt to login.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


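# Sketch of what gets recorded per submission row (URIs and prefix names
# illustrative):
#   <http://encodesubmit.ucsc.edu/pipeline/show/1234>
#       a submissionOntology:Submission ;
#       ddf:subId "1234" ;
#       submissionOntology:name "1x75-Directional-K562-Rep1" .
# Creation date, status, and library links are added by the helpers below.

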
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(query)))


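# The resulting link looks like (URIs illustrative):
#   <http://encodesubmit.ucsc.edu/pipeline/show/1234>
#       submissionOntology:library_urn
#       <http://jumpgate.caltech.edu/library/11039/> .
# Note the trailing slash: libraryUrn is built from library_id + '/'.

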
def report_submissions_with_no_library(model):
    missing = find_submissions_with_no_library(model)
    for row in missing:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print " encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."


def find_submissions_with_no_library(model):
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    return missing_lib_query.execute(model)


def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)


def find_all_libraries(model):
    """Scan model for every library marked with a library type
    """
    libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX libraryOntology:<{libraryOntology}>

SELECT distinct ?library_urn
WHERE {{
  ?library_urn rdf:type ?library_type .
  FILTER(regex(str(?library_type), "library", "i"))
}}""".format(libraryOntology=libraryOntology[''].uri)
    query = RDF.SPARQLQuery(libraries)
    return query.execute(model)


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, object in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, object)


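# Sketch of the DDF layout this expects (column names illustrative; the
# tab-separated header row supplies the DCC_NS predicate names):
#   files            view   cell
#   a.fastq,b.fastq  Fastq  GM12878
# Each comma-separated filename in the first column becomes a blank
# DCC_NS['file'] node attached to the status node.

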
def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                     ]

    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            load_library_detail(model, libraryUrn)


def load_unassigned_submitted_libraries(model):
    unscanned = find_unscanned_submitted_libraries(model)
    for query_record in unscanned:
        library_urn = query_record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)


def reload_libraries(model, library_list):
    if len(library_list) == 0:
        # reload every library in the model; keep the RDF.Node results,
        # as delete_library requires nodes, not strings
        queryset = find_all_libraries(model)
        libraries = (s['library_urn'] for s in queryset)
    else:
        libraries = (user_library_id_to_library_urn(l) for l in library_list)

    for library_urn in libraries:
        delete_library(model, library_urn)
        load_library_detail(model, library_urn)


def user_library_id_to_library_urn(library_id):
    split_url = urlparse.urlsplit(library_id)
    if len(split_url.scheme) == 0:
        return LIBRARY_NS[library_id]
    else:
        return RDF.Node(RDF.Uri(library_id))


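# Examples (illustrative):
#   user_library_id_to_library_urn('11039')
#     -> node for <http://jumpgate.caltech.edu/library/11039>
#   user_library_id_to_library_urn('http://example.org/library/11039')
#     -> node for the given URL, unchanged

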
def delete_library(model, library_urn):
    if not isinstance(library_urn, RDF.Node):
        raise ValueError("library urn must be a RDF.Node")

    LOGGER.info("Deleting {0}".format(str(library_urn.uri)))
    lane_query = RDF.Statement(library_urn, libraryOntology['has_lane'], None)
    for lane in model.find_statements(lane_query):
        delete_lane(model, lane.object)
    library_attrib_query = RDF.Statement(library_urn, None, None)
    for library_attrib in model.find_statements(library_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(library_attrib)))
        del model[library_attrib]


def delete_lane(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    delete_lane_mapping(model, lane_urn)
    lane_attrib_query = RDF.Statement(lane_urn, None, None)
    for lane_attrib in model.find_statements(lane_attrib_query):
        LOGGER.debug("Deleting {0}".format(str(lane_attrib)))
        del model[lane_attrib]


def delete_lane_mapping(model, lane_urn):
    if not isinstance(lane_urn, RDF.Node):
        raise ValueError("lane urn must be a RDF.Node")

    lane_mapping_query = RDF.Statement(lane_urn,
                                       libraryOntology['has_mappings'],
                                       None)
    for lane_mapping in model.find_statements(lane_mapping_query):
        mapping_attrib_query = RDF.Statement(lane_mapping.object,
                                             None,
                                             None)
        for mapping_attrib in model.find_statements(mapping_attrib_query):
            LOGGER.debug("Deleting {0}".format(str(mapping_attrib)))
            del model[mapping_attrib]


def load_encodedcc_files(model, genome, composite):
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(filename))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))


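# Each indexed file becomes a subject with one DCC_NS predicate per index
# attribute, e.g. (filename and attribute names illustrative):
#   <http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/.../foo.fastq.tgz>
#       ddf:md5sum "..." ;
#       ddf:size "..." .

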
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
            LOGGER.error(e)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    if library_id in SL_MAP:
        library_id = SL_MAP[library_id]
    return library_id


def get_contents(element):
    """Return contents or None.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


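# Example (assumes daf.submission_uri_to_string returns the URI text
# unchanged):
#   create_status_node('http://encodesubmit.ucsc.edu/pipeline/show/1234',
#                      '2011-01-07T10:22:00')
#     -> <http://encodesubmit.ucsc.edu/pipeline/show/1234/2011-01-07T10:22:00>

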
def get_date_contents(element):
    data = element.text_content()
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to a model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


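# The password comes from the system keyring. Seed it once per machine with
# something like (USERNAME is the module-level constant defined elsewhere in
# this file):
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'the-password')

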
def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


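# Shape sketch (hypothetical submission objects carrying .library_id and
# .date attributes):
#   select_by_library_id([sub_a, sub_b, sub_c])
#     -> {'11039': [newest, ..., oldest], '11008': [...]}

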
def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)


def date_to_freeze(d):
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None


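# Examples:
#   date_to_freeze(datetime(2009, 12, 1)) -> '2010-Jan'
#   date_to_freeze(datetime(2010, 3, 1))  -> '2010-Jul'
#   date_to_freeze(datetime(2011, 6, 1))  -> None

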
if __name__ == "__main__":
    main()