3 Gather information about our submissions into a single RDF store
6 from datetime import datetime
11 from lxml.html import fromstring
12 from operator import attrgetter
13 from optparse import OptionParser, OptionGroup
23 from htsworkflow.submission import daf, ucsc
25 from htsworkflow.util import api
26 from htsworkflow.util.rdfhelp import \
# Frequently used RDF predicate nodes from the shared ontologies.
TYPE_N = rdfNS['type']
CREATION_DATE = libraryOntology['date']

# Namespace for woldlab library pages (one URI per library id).
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

# NOTE(review): additional imported names after submission_view_url were
# elided from this view of the file.
from htsworkflow.submission.ucsc import \
     get_ucsc_file_index, \
     submission_view_url, \

# Namespace for terms parsed out of UCSC ddf files.
DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')

# Default location of the local RDF model database.
DBDIR = os.path.expanduser("~diane/proj/submission")

LOGGER = logging.getLogger("encode_find")

# UCSC ENCODE submission pipeline endpoints used for scraping.
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

# Template for the public download area of an ENCODE composite track.
GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
                   "{genome}/encodeDCC/{composite}/"
def main(cmdline=None):
    """
    Parse command line arguments

    Takes a list of arguments (assuming arg[0] is the program name) or None
    If None, it looks at sys.argv
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # NOTE(review): the verbosity conditionals (--debug / --verbose) guarding
    # these two basicConfig calls were elided from this view; only one call
    # takes effect per run.
    logging.basicConfig(level=logging.DEBUG)
    logging.basicConfig(level=logging.INFO)

    # Build an authenticated htsworkflow API client from command line options.
    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        ns_uri = submissionOntology[''].uri
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    # NOTE(review): the --update guard, 'cookie'/'limit' initialization and
    # 'our_tracks = [' list opener were elided from this view.
    cookie = login(cookie=cookie)
    load_my_submissions(model, limit=limit, cookie=cookie)
    load_encode_libraries(model, htswapi)
    # Composite tracks whose encodeDCC download areas we index.
    {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
    {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
    {'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
    {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
    for track_info in our_tracks:
        load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    # NOTE(review): presumably guarded by 'if opts.print_rdf:' (elided).
    serializer = get_serializer(name=opts.rdf_parser_name)
    print serializer.serialize_model_to_string(model)
121 """Construct option parser
123 parser = OptionParser()
124 commands = OptionGroup(parser, "Commands")
125 commands.add_option('--load-model', default=None,
126 help="Load model database")
127 commands.add_option('--load-rdf', default=None,
128 help="load rdf statements into model")
129 commands.add_option('--print-rdf', action="store_true", default=False,
130 help="print ending model state")
131 commands.add_option('--update', action="store_true", default=False,
132 help="Query remote data sources and update our database")
133 #commands.add_option('--update-ucsc-status', default=None,
134 # help="download status from ucsc, requires filename for extra rules")
135 #commands.add_option('--update-ddfs', action="store_true", default=False,
136 # help="download ddf information for known submission")
137 #commands.add_option('--update-library', default=None,
138 # help="download library info from htsw, "\
139 # "requires filename for extra rules")
140 parser.add_option_group(commands)
142 queries = OptionGroup(parser, "Queries")
143 queries.add_option('--sparql', default=None,
144 help="execute arbitrary sparql query")
145 queries.add_option('--find-submission-with-no-library', default=False,
147 help="find submissions with no library ID")
148 parser.add_option_group(queries)
150 options = OptionGroup(parser, "Options")
151 options.add_option("--rdf-parser-name", default="turtle",
152 help="set rdf file parser type")
153 options.add_option("-v", "--verbose", action="store_true", default=False)
154 options.add_option("--debug", action="store_true", default=False)
155 parser.add_option_group(options)
157 api.add_auth_options(parser)
def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into model
    It will look at the global USER_URL to figure out who to scrape
    cookie contains the session cookie, if none, will attempt to login
    """
    tree = get_url_as_tree(USER_URL, 'GET', cookie)
    table_rows = tree.xpath('//table[@id="projects"]/tr')
    # first record is header
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    for row in table_rows[1:]:
        cell = row.xpath('td')
        if cell is not None and len(cell) > 1:
            submission_id = str(cell[0].text_content())
            # 'limit' restricts scraping to an explicit set of submission ids.
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))
                # NOTE(review): the add_stmt calls these continuation lines
                # belong to were elided from this view.
                    submissionOntology['Submission'])
                    RDF.Node(submission_id))
                name = str(cell[4].text_content())
                add_stmt(model, subUrn, name_n, name)

                species = str(cell[2].text_content())
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                # Link the submission to its woldlab library when the name
                # embeds a recognizable library id.
                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                # NOTE(review): remaining call arguments and the 'else:'
                # header for the warning below were elided.
                    errmsg = 'Unable to find library id in {0} for {1}'
                    LOGGER.warn(errmsg.format(name, str(subUrn)))

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing atttributes
                status = str(cell[6].text_content()).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                # NOTE(review): trailing 'cookie=cookie)' argument elided.

                LOGGER.info("Processed {0}".format(subUrn))
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    # Only add the statement if it is not already in the model.
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    # NOTE(review): an 'else:' header was elided here; the debug line below
    # belongs to it (statement already present).
        LOGGER.debug("Found: {0}".format(str(query)))
def find_submissions_with_no_library(model):
    """Print stub turtle for submissions that lack a library_urn link.

    Output is meant to be hand-edited and loaded back into the model.
    """
    # NOTE(review): the SELECT/WHERE lines of this SPARQL query were elided
    # from this view; only the pattern body is visible.
    missing_lib_query_text = """
PREFIX submissionOntology:<{submissionOntology}>
?subid submissionOntology:name ?name
OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)

    results = missing_lib_query.execute(model)
    # NOTE(review): the result loop header and the subid/name bindings were
    # elided from this view.
    print "# {0}".format(name)
    print "<{0}>".format(subid.uri)
    print " encodeSubmit:library_urn "\
          "<http://jumpgate.caltech.edu/library/> ."
def add_submission_creation_date(model, subUrn, cookie):
    """Ensure the model records a creation date for *subUrn*.

    Checks the model first and only fetches + parses the submission's
    detail page (authenticated via *cookie*) when no creation-date
    statement is present yet.
    """
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        # BUG FIX: previously passed the undefined name 'cells' here, which
        # raised NameError; parse_submission_page expects the parsed tree.
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def get_creation_dates(model, subUrn):
    """Return all creation-date statements recorded for *subUrn*."""
    pattern = RDF.Statement(subUrn, CREATION_DATE, None)
    return list(model.find_statements(pattern))
def parse_submission_page(model, submissionTree, subUrn):
    """Extract the 'Created' timestamp from a submission detail page.

    Scans the page's table cells for one starting with 'Created' and
    records the adjacent cell's date as an xsd:dateTime statement.
    """
    cells = submissionTree.findall('.//td')
    dateTimeType = xsdNS['dateTime']
    created_label = [x for x in cells
                     if x.text_content().startswith('Created')]
    if len(created_label) == 1:
        # The date lives in the cell immediately following the label.
        created_date = get_date_contents(created_label[0].getnext())
        created_date_node = RDF.Node(literal=created_date.isoformat(),
                                     datatype=dateTimeType.uri)
        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
    # NOTE(review): an 'else:' header and the LOGGER.warn(msg) call were
    # elided here; 'msg' feeds that warning.
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Create or refresh the status node attached to a submission.

    Adds a brand-new status node (plus ddf/daf attachments) when none
    exists; otherwise refreshes attachments for status nodes whose
    last-modified date matches *recent_update*.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        add_stmt(model, status_node, rdfNS['type'], StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    # NOTE(review): an 'else:' header was elided here; the remainder of
    # this function handles previously recorded status nodes.
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            # NOTE(review): the continuation arguments of this Statement
            # (presumably LastModifyN, None) were elided from this view.
            last_modified_query = RDF.Statement(status_node,
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                # Only refresh attachments for the most recent status.
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
def update_daf(model, submission_url, status_node, cookie):
    """Download a submission's DAF and load it into the model if absent.

    The DAF is fetched from the pipeline's 'download_daf' view; its md5
    is recorded alongside the parsed statements.
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    # Skip the download if this status node already has DAF data attached.
    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        # NOTE(review): the closing arguments of this info() call were elided.
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf_hash = hashlib.md5(daf_text).hexdigest()
        # NOTE(review): the hash-node argument of this Statement was elided.
        daf_hash_stmt = RDF.Statement(status_node,
                                      dafTermOntology['md5sum'],
        model.add_statement(daf_hash_stmt)
        daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and load it into the model if absent."""
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    # NOTE(review): ddfUrn is not referenced in the visible body — possibly
    # used by elided code, otherwise dead; confirm before removing.
    ddfUrn = RDF.Uri(download_ddf_url)

    # Skip the download if this status node already has DDF data attached.
    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        # Mark the node so we do not fetch the same DDF again.
        model.add_statement(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements

    The first line names the columns; each following tab-separated record
    lists one or more files plus per-file attribute values.
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DCC_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        # Skip blank lines and comments.
        if len(ddf_line) == 0:
            # NOTE(review): 'continue' elided from this view.
        if ddf_line.startswith("#"):
            # NOTE(review): 'continue' elided from this view.

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        # NOTE(review): the 'for f in files:' loop header and the has_file
        # add_stmt call these lines belong to were elided from this view.
            fileNode = RDF.Node()
                     submissionOntology['has_file'],
            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
            add_stmt(model, fileNode, DCC_NS['filename'], f)

            # Attach the remaining ddf columns as file attributes.
            for predicate, object in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, object)
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.
    """
    # Affiliation ids selecting ENCODE-related libraries in htsworkflow.
    # NOTE(review): additional filter entries and the list's closing bracket
    # were elided from this view.
    encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
    rdfaParser = RDF.Parser(name='rdfa')
    for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
        # The library listing pages embed RDFa; parse it straight in.
        rdfaParser.parse_into_model(model, encodeUrl)
        query = RDF.Statement(None, libraryOntology['library_id'], None)
        libraries = model.find_statements(query)
        for statement in libraries:
            libraryUrn = statement.subject
            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
            load_library_detail(model, libraryUrn)
def load_encodedcc_files(model, base_url):
    """Index the files listed at a UCSC encodeDCC download area.

    One subject node per file URL, with each index attribute attached
    as a DCC_NS predicate.
    """
    # Normalize base_url so simple concatenation below forms valid URLs.
    if base_url[-1] != '/':
        # NOTE(review): "base_url += '/'" elided from this view.

    file_index = ucsc.get_ucsc_file_index(base_url)
    for filename, attributes in file_index.items():
        s = RDF.Node(RDF.Uri(base_url + filename))
        for name, value in attributes.items():
            p = RDF.Node(DCC_NS[name])
            # NOTE(review): the construction of object node 'o' was elided.
            model.add_statement(RDF.Statement(s,p,o))
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page
    """
    rdfaParser = RDF.Parser(name='rdfa')
    # Use the presence of a date statement as a "page already loaded" flag.
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
    if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass # Assuming that a loaded dataset has one record
    # NOTE(review): an 'else:' header was elided here; multiple dates get
    # the warning below.
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess library ID from library name

    Matches a 5-digit id or an 'SL' + 4-digit id preceded by a space,
    dash, or start of string.

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    # NOTE(review): the default assignment (library_id = None) was elided
    # from this view.
    if match is not None:
        library_id = match.group('id')
    # NOTE(review): 'return library_id' elided from this view.
def get_contents(element):
    """Return contents or none.
    """
    # Empty element: nothing to return.
    if len(element.contents) == 0:
        # NOTE(review): 'return None' elided from this view.

    # Prefer the text of a nested anchor when present.
    a = element.find('a')
    # NOTE(review): the 'if a is not None:' guard was elided from this view.
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Build the URI node naming a submission's status at *timestamp*.

    The node is <submission_uri>/<timestamp>, adding a trailing slash to
    the submission URI when it is missing.
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    if submission_uri[-1] != '/':
        # BUG FIX: was misspelled 'sumbission_uri', which silently created
        # a junk variable and left the URI without its '/' separator.
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))
def get_date_contents(element):
    """Parse an element's text as 'YYYY-MM-DD HH:MM' into a datetime."""
    data = element.text_content()
    # NOTE(review): one interior line was elided here (likely an 'if data:'
    # guard making the parse conditional) — confirm against the full source.
    return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience: create an RDF Statement and add it to a model.
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))
def login(cookie=None):
    """Login if we don't have a cookie

    Posts credentials to the UCSC pipeline login page and returns the
    resulting session cookie.
    """
    if cookie is not None:
        # NOTE(review): early 'return cookie' elided from this view.

    # Password comes from the system keyring, never from source control.
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    # NOTE(review): the 'POST' method and headers= arguments of this
    # request were elided from this view.
    response, content = http.request(LOGIN_URL,
                                     body=urllib.urlencode(credentials))
    # NOTE(review): the closing arguments of this debug() call were elided.
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
    cookie = response.get('set-cookie', None)
    # NOTE(review): the 'if cookie is None:' guard for this raise and the
    # final 'return cookie' were elided from this view.
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
def get_url_as_tree(url, method, cookie=None):
    """Fetch *url* and parse the response body into an lxml tree.

    Raises httplib2.HttpLib2ErrorWithResponse for non-200 responses.
    """
    http = httplib2.Http()
    # NOTE(review): 'headers = {}' initialization elided from this view.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        tree = fromstring(content, base_url=url)
        # NOTE(review): 'return tree' elided from this view.
    # NOTE(review): 'else:' header and the final 'raise e' were elided.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
def get_url_as_text(url, method, cookie=None):
    """Fetch *url* and return the raw response body.

    Raises httplib2.HttpLib2ErrorWithResponse for non-200 responses.
    """
    http = httplib2.Http()
    # NOTE(review): 'headers = {}' initialization elided from this view.
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        # NOTE(review): 'return content' elided from this view.
    # NOTE(review): 'else:' header and the final 'raise e' were elided.
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# Hand-maintained (name, library id) pairs for early submissions whose
# names do not embed a parsable library id.
# NOTE(review): any trailing entries and the list's closing bracket were
# elided from this view.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
def select_by_library_id(submission_list):
    """Group submissions by library id, newest first within each group."""
    # Drop submissions without a library id.
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    # NOTE(review): 'libraries = {}' initialization elided from this view.
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    # Most recently dated submission first in each group.
    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)
    # NOTE(review): 'return libraries' elided from this view.
def library_to_freeze(selected_libraries):
    """Render an HTML table of libraries versus ENCODE freeze periods."""
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    # NOTE(review): the report.append(...) triple-quoted string wrapping
    # this <style> block was partially elided from this view.
<style type="text/css">
td {border-width:0 0 1px 1px; border-style:solid;}
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    # NOTE(review): 'for f in freezes:' header elided from this view.
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # Bucket this library's submissions by freeze period.
        # NOTE(review): 'batched = {}' initialization elided from this view.
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        # NOTE(review): 'for d in freezes:' header elided from this view.
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    """Map a datetime to the ENCODE freeze period containing it."""
    # Freeze period end-dates, in chronological order.
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
    # NOTE(review): the list's closing bracket was elided from this view.
    for end, name in freezes:
        # NOTE(review): the comparison/return body of this loop (and the
        # fallthrough return) were elided from this view.
627 if __name__ == "__main__":