"""Gather information about our submissions into a single RDF store
"""
from datetime import datetime
import hashlib
import httplib2
import keyring
import logging
import os
import re
import urllib

from lxml.html import fromstring
from operator import attrgetter
from optparse import OptionParser, OptionGroup

import RDF
from htsworkflow.submission import daf, ucsc

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
     dafTermOntology, \
     get_model, \
     get_serializer, \
     libraryOntology, \
     load_into_model, \
     rdfNS, \
     sparql_query, \
     submissionOntology, \
     xsdNS

TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
     get_encodedcc_file_index, \
     submission_view_url, \
     UCSCEncodePipeline

DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

SL_MAP = {'SL2970': '02970',
          }


def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None.
    If None, it looks at sys.argv.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.debug:
        logging.basicConfig(level=logging.DEBUG)
    elif opts.verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.ERROR)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)
    cookie = None

    model = get_model(opts.model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    limit = args if len(args) > 0 else None

    if opts.update:
        opts.update_submission = True
        opts.update_libraries = True
        opts.update_ucsc_downloads = True

    if opts.update_submission:
        cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)

    if opts.update_libraries:
        load_encode_assigned_libraries(model, htswapi)
        load_unassigned_submitted_libraries(model)

    if opts.update_ucsc_downloads:
        our_tracks = [
            {'genome': 'hg19', 'composite': 'wgEncodeCaltechRnaSeq'},
            {'genome': 'mm9', 'composite': 'wgEncodeCaltechHist'},
            #{'genome': 'mm9', 'composite': 'wgEncodeCaltechHistone'},
            {'genome': 'mm9', 'composite': 'wgEncodeCaltechTfbs'},
        ]
        for track_info in our_tracks:
            load_encodedcc_files(model, **track_info)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        report_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)
134 """Construct option parser
136 parser = OptionParser()
137 commands = OptionGroup(parser, "Commands")
138 commands.add_option('--model', default=None,
139 help="Load model database")
140 commands.add_option('--load-rdf', default=None,
141 help="load rdf statements into model")
142 commands.add_option('--print-rdf', action="store_true", default=False,
143 help="print ending model state")
144 commands.add_option('--update', action="store_true", default=False,
145 help="Do all updates")
146 commands.add_option('--update-submission', action="store_true",
148 help="download status from ucsc")
149 commands.add_option('--update-ucsc-downloads', action="store_true",
151 help="Update download locations from UCSC")
152 commands.add_option('--update-libraries', action="store_true",
154 help="download library info from htsw")
155 parser.add_option_group(commands)
157 queries = OptionGroup(parser, "Queries")
158 queries.add_option('--sparql', default=None,
159 help="execute arbitrary sparql query")
160 queries.add_option('--find-submission-with-no-library', default=False,
162 help="find submissions with no library ID")
163 parser.add_option_group(queries)
165 options = OptionGroup(parser, "Options")
166 options.add_option("--rdf-parser-name", default="turtle",
167 help="set rdf file parser type")
168 options.add_option("-v", "--verbose", action="store_true", default=False)
169 options.add_option("--debug", action="store_true", default=False)
170 parser.add_option_group(options)
172 api.add_auth_options(parser)
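
# A minimal usage sketch based on the options defined in make_parser(); the
# script name, database filename, and query text below are hypothetical
# examples, not values taken from this file:
#
#   encode_find.py --model submissions.db --update -v
#   encode_find.py --model submissions.db --find-submission-with-no-library
#   encode_find.py --model submissions.db \
#       --sparql 'SELECT ?s WHERE { ?s ?p ?o } LIMIT 10'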


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into the model.

    It will look at the global USER_URL to figure out who to scrape.
    cookie contains the session cookie; if None, it will attempt to log in.
    """
    if cookie is None:
        cookie = login()

    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model,
                         subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])
                add_stmt(model,
                         subUrn,
                         DCC_NS['subId'],
                         RDF.Node(submission_id))

                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)
                else:
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(query)))


def report_submissions_with_no_library(model):
    missing = find_submissions_with_no_library(model)
    for row in missing:
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print " encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."


def find_submissions_with_no_library(model):
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    return missing_lib_query.execute(model)


def find_unscanned_submitted_libraries(model):
    """Scan model for libraries that don't have library details loaded
    """
    unscanned_libraries = """
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX submissionOntology:<{submissionOntology}>

SELECT distinct ?submission ?library_urn
WHERE {{
  ?submission submissionOntology:library_urn ?library_urn .
  OPTIONAL {{ ?library_urn rdf:type ?library_type }}
  FILTER(!BOUND(?library_type))
}}""".format(submissionOntology=submissionOntology[''].uri)
    query = RDF.SPARQLQuery(unscanned_libraries)
    return query.execute(model)


def add_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def get_creation_dates(model, subUrn):
    query = RDF.Statement(subUrn, CREATION_DATE, None)
    creation_dates = list(model.find_statements(query))
    return creation_dates


def parse_submission_page(model, submissionTree, subUrn):
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    else:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
                                      daf_hash)
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)


def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            for predicate, object in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, object)
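
# A sketch of the DDF layout add_ddf_statements() expects: the first line is a
# whitespace-delimited header naming the columns, later lines are
# tab-delimited records whose first field is a comma-separated file list.
# The column names and values here are invented for illustration:
#
#   filename  view  replicate          <- header, split on whitespace
#   rep1.fastq.gz,rep1.bam  Signal  1  <- data row, fields separated by tabs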


def load_encode_assigned_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                     ]

    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            load_library_detail(model, libraryUrn)


def load_unassigned_submitted_libraries(model):
    unassigned = find_unscanned_submitted_libraries(model)
    for query_record in unassigned:
        library_urn = query_record['library_urn']
        LOGGER.warn("Unassigned, submitted library: {0}".format(library_urn))
        load_library_detail(model, library_urn)


def load_encodedcc_files(model, genome, composite):
    file_index = ucsc.get_encodedcc_file_index(genome, composite)
    if file_index is None:
        return

    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(filename))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            o = RDF.Node(value)
            model.add_statement(RDF.Statement(s, p, o))


def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        try:
            body = get_url_as_text(str(libraryUrn.uri), 'GET')
            rdfaParser.parse_string_into_model(model, body, libraryUrn.uri)
        except httplib2.HttpLib2ErrorWithResponse, e:
            LOGGER.error(e)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
    """Guess library ID from library name

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('2x75-GM12892-rep2-SL2970')
    '02970'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    if library_id in SL_MAP:
        library_id = SL_MAP[library_id]
    return library_id


def get_contents(element):
    """Return contents or none.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)


def create_status_node(submission_uri, timestamp):
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
    data = element.text_content()
    return datetime.strptime(data, "%Y-%m-%d %H:%M")


def add_stmt(model, subject, predicate, rdf_object):
    """Convenience function to create an RDF Statement and add it to the model
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie


def get_url_as_tree(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        return tree
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table border="1">
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
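
# A sketch of how these two helpers might be combined; the submission objects
# (with library_id, name, subid, status, and date attributes) are assumed to
# come from elsewhere and are not constructed in this module:
#
#   selected = select_by_library_id(submissions)
#   with open('submission_report.html', 'w') as out:
#       out.write(library_to_freeze(selected))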


def date_to_freeze(d):
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None


if __name__ == "__main__":
    main()