3 Gather information about our submissions into a single RDF store
6 from datetime import datetime
11 from lxml.html import fromstring
12 from operator import attrgetter
13 from optparse import OptionParser, OptionGroup
23 from htsworkflow.submission import daf
25 from htsworkflow.util import api
26 from htsworkflow.util.rdfhelp import \
# NOTE(review): this file is a line-numbered partial extract; the embedded
# numbers are original source line numbers and some lines are elided.

# Shorthand for the rdf:type predicate, used when typing resources below.
38 TYPE_N = rdfNS['type']

# Namespace for woldlab library pages on jumpgate.
41 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

43 from htsworkflow.submission.ucsc import \
46 submission_view_url, \

# Namespace for terms derived from downloaded DDF files.
49 DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
50 DDF_NS = RDF.NS(DOWNLOAD_DDF)

# Default location of the on-disk RDF model store.
# NOTE(review): hard-coded to one user's home directory — consider making
# this configurable via an option.
52 DBDIR = os.path.expanduser("~diane/proj/submission")

54 LOGGER = logging.getLogger("encode_find")

# UCSC ENCODE submission pipeline endpoints that are scraped below.
56 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
57 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
63 def main(cmdline=None):
65 Parse command line arguments
67 Takes a list of arguments (assuming arg[0] is the program name) or None
68 If None, it looks at sys.argv
# NOTE(review): docstring delimiters and several guard lines (64, 66, 69,
# 72-73, 75, 77, 80-81, 83, 87-93, 97, 100, 103-104) are elided from this
# extract.
70 parser = make_parser()
71 opts, args = parser.parse_args(cmdline)
# Presumably selected by opts.debug / opts.verbose checks on the elided
# lines 73 and 75 — TODO confirm; only the first basicConfig call in a
# process takes effect.
74 logging.basicConfig(level=logging.DEBUG)
76 logging.basicConfig(level=logging.INFO)
# Build an authenticated htsworkflow API client from command-line options.
78 htsw_authdata = api.make_auth_from_opts(opts, parser)
79 htswapi = api.HtswApi(opts.host, htsw_authdata)
# Open (or create) the persistent RDF model under DBDIR.
82 model = get_model(opts.load_model, DBDIR)
# Optionally preload RDF statements from a file into the model.
84 if opts.load_rdf is not None:
85 ns_uri = submissionOntology[''].uri
86 load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
# Scrape UCSC submission status and ENCODE library info into the model;
# the cookie/limit setup (and the guard, likely opts.update) sit on
# elided lines 87-93.
94 cookie = login(cookie=cookie)
95 load_my_submissions(model, limit=limit, cookie=cookie)
96 load_encode_libraries(model, htswapi)
# Run an ad-hoc SPARQL query if one was supplied.
98 if opts.sparql is not None:
99 sparql_query(model, opts.sparql)
101 if opts.find_submission_with_no_library:
102 find_submissions_with_no_library(model)
# Python 2 print statement: dump the whole model in the requested syntax
# (presumably guarded by opts.print_rdf on an elided line — confirm).
105 serializer = get_serializer(name=opts.rdf_parser_name)
106 print serializer.serialize_model_to_string(model)
# Builds the command-line OptionParser. NOTE(review): the enclosing
# `def make_parser():` line (109) and the trailing `return parser` are
# not visible in this extract.
110 """Construct option parser
112 parser = OptionParser()
# "Commands" group: actions that load or emit model data.
113 commands = OptionGroup(parser, "Commands")
114 commands.add_option('--load-model', default=None,
115 help="Load model database")
116 commands.add_option('--load-rdf', default=None,
117 help="load rdf statements into model")
118 commands.add_option('--print-rdf', action="store_true", default=False,
119 help="print ending model state")
120 commands.add_option('--update', action="store_true", default=False,
121 help="Query remote data sources and update our database")
# Disabled legacy options kept for reference; delete once permanently dead.
122 #commands.add_option('--update-ucsc-status', default=None,
123 # help="download status from ucsc, requires filename for extra rules")
124 #commands.add_option('--update-ddfs', action="store_true", default=False,
125 # help="download ddf information for known submission")
126 #commands.add_option('--update-library', default=None,
127 # help="download library info from htsw, "\
128 # "requires filename for extra rules")
129 parser.add_option_group(commands)
# "Queries" group: read-only reporting options.
131 queries = OptionGroup(parser, "Queries")
132 queries.add_option('--sparql', default=None,
133 help="execute arbitrary sparql query")
# NOTE(review): a boolean flag would normally carry action="store_true";
# line 135 is elided here, so it may be supplied there — confirm.
134 queries.add_option('--find-submission-with-no-library', default=False,
136 help="find submissions with no library ID")
137 parser.add_option_group(queries)
# Misc options: serialization format and logging verbosity.
139 options = OptionGroup(parser, "Options")
140 options.add_option("--rdf-parser-name", default="turtle",
141 help="set rdf file parser type")
142 options.add_option("-v", "--verbose", action="store_true", default=False)
143 options.add_option("--debug", action="store_true", default=False)
144 parser.add_option_group(options)
# Adds the shared htsworkflow authentication options (host/user/key).
146 api.add_auth_options(parser)
151 def load_my_submissions(model, limit=None, cookie=None):
152 """Parse all the submissions from UCSC into model
153 It will look at the global USER_URL to figure out who to scrape
154 cookie contains the session cookie, if none, will attempt to login
# NOTE(review): docstring close and login fallback (lines 155-158) are
# elided from this extract.
159 tree = get_url_as_tree(USER_URL, 'GET', cookie)
# Scrape the "projects" table; one row per submission.
160 table_rows = tree.xpath('//table[@id="projects"]/tr')
161 # first record is header
162 name_n = submissionOntology['name']
163 species_n = submissionOntology['species']
164 library_urn = submissionOntology['library_urn']
167 for row in table_rows[1:]:
168 cell = row.xpath('td')
169 if cell is not None and len(cell) > 1:
170 submission_id = str(cell[0].text_content())
# `limit`, when given, is a collection of submission ids to restrict to.
171 if limit is None or submission_id in limit:
172 subUrn = RDF.Uri(submission_view_url(submission_id))
174 add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
# Column 4 holds the submission name, column 2 the species.
176 name = str(cell[4].text_content())
177 add_stmt(model, subUrn, name_n, name)
179 species = str(cell[2].text_content())
# NOTE(review): str(...) never returns None, so this check always
# passes; an emptiness test was probably intended — confirm.
180 if species is not None:
181 add_stmt(model, subUrn, species_n, species)
# Try to recover the woldlab library id from the submission name.
183 library_id = get_library_id(name)
184 if library_id is not None:
185 add_submission_to_library_urn(model,
# (call-argument continuation lines 186-188 and the else: are elided)
190 errmsg = 'Unable to find library id in {0} for {1}'
# NOTE(review): LOGGER.warn is the deprecated alias of LOGGER.warning.
191 LOGGER.warn(errmsg.format(name, str(subUrn)))
193 add_submission_creation_date(model, subUrn, cookie)
195 # grab changing attributes
196 status = str(cell[6].text_content()).strip()
197 last_mod_datetime = get_date_contents(cell[8])
198 last_mod = last_mod_datetime.isoformat()
200 update_submission_detail(model, subUrn, status, last_mod,
# (continuation line 201-202 elided; presumably cookie=cookie)
203 LOGGER.info("Processed {0}".format(subUrn))
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library, if needed.

    Builds the library URI from ``library_id`` under LIBRARY_NS and adds
    ``(submissionUrn, predicate, libraryUrn)`` to the model unless the
    statement is already present.

    :param model: RDF model to update
    :param submissionUrn: RDF.Uri of the UCSC submission
    :param predicate: predicate node linking submission to library
    :param library_id: woldlab library id string (e.g. '11039')
    """
    libraryUrn = LIBRARY_NS[library_id + '/']
    # Original built the identical Statement twice (`query` and `link`);
    # one construction suffices for both the containment test and the add.
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        LOGGER.debug("Found: {0}".format(str(link)))
221 def find_submissions_with_no_library(model):
# SPARQL query for submissions that have a name but no library_urn link.
# NOTE(review): the SELECT/WHERE opening lines (224-227) are elided from
# this extract.
222 missing_lib_query_text = """
223 PREFIX submissionOntology:<{submissionOntology}>
228 ?subid submissionOntology:name ?name
229 OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
230 FILTER (!bound(?libid))
231 }}""".format(submissionOntology=submissionOntology[''].uri)
232 missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
234 results = missing_lib_query.execute(model)
# Python 2 print statements: emit Turtle-style stub triples suitable for
# hand-editing; the loop and ?name/?subid binding extraction occur on the
# elided lines 235-237.
238 print "# {0}".format(name)
239 print "<{0}>".format(subid.uri)
240 print " encodeSubmit:library_urn "\
241 "<http://jumpgate.caltech.edu/library/> ."
# Scrape a submission's "Created" date from its detail page and record it
# as an xsd:dateTime literal, unless the model already has one.
245 def add_submission_creation_date(model, subUrn, cookie):
246 # in theory the submission page might have more information on it.
247 creationDateN = libraryOntology['date']
248 dateTimeType = xsdNS['dateTime']
# Check whether a creation date is already stored for this submission.
249 query = RDF.Statement(subUrn, creationDateN, None)
250 creation_dates = list(model.find_statements(query))
251 if len(creation_dates) == 0:
252 LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
253 tree = get_url_as_tree(str(subUrn), 'GET', cookie)
# Look for the single table cell whose text starts with 'Created';
# its sibling cell holds the date.
254 cells = tree.findall('.//td')
255 created_label = [x for x in cells
256 if x.text_content().startswith('Created')]
257 if len(created_label) == 1:
258 created_date = get_date_contents(created_label[0].getnext())
259 created_date_node = RDF.Node(literal=created_date.isoformat(),
260 datatype=dateTimeType.uri)
261 add_stmt(model, subUrn, creationDateN, created_date_node)
# NOTE(review): what happens with msg (warn? raise?) is on the elided
# lines 264-266 — confirm before relying on it.
263 msg = 'Unable to find creation date for {0}'.format(str(subUrn))
267 LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
# Attach or refresh a status blank node on a submission. A new status node
# is created when none exists; otherwise existing status nodes whose
# last-modified date matches `recent_update` get their DDF/DAF re-fetched.
270 def update_submission_detail(model, subUrn, status, recent_update, cookie):
271 HasStatusN = submissionOntology['has_status']
272 StatusN = submissionOntology['status']
273 LastModifyN = submissionOntology['last_modify_date']
# Find all existing status nodes hanging off this submission.
275 status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
276 status_nodes = list(model.find_statements(status_nodes_query))
278 if len(status_nodes) == 0:
279 # has no status node, add one
280 LOGGER.info("Adding status node to {0}".format(subUrn))
281 status_node = create_status_node(subUrn, recent_update)
282 add_stmt(model, subUrn, HasStatusN, status_node)
283 add_stmt(model, status_node, rdfNS['type'], StatusN)
284 add_stmt(model, status_node, StatusN, status)
285 add_stmt(model, status_node, LastModifyN, recent_update)
286 update_ddf(model, subUrn, status_node, cookie=cookie)
287 update_daf(model, subUrn, status_node, cookie=cookie)
# NOTE(review): the else: introducing this branch (line 288) is elided.
289 LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
290 for status_statement in status_nodes:
291 status_node = status_statement.object
# Query continuation (lines 293-294, presumably LastModifyN, None) elided.
292 last_modified_query = RDF.Statement(status_node,
295 last_mod_nodes = model.find_statements(last_modified_query)
296 for last_mod_statement in last_mod_nodes:
297 last_mod_date = str(last_mod_statement.object)
# Refresh the DDF/DAF only for the status node matching the scraped
# last-modified timestamp.
298 if recent_update == str(last_mod_date):
299 update_ddf(model, subUrn, status_node, cookie=cookie)
300 update_daf(model, subUrn, status_node, cookie=cookie)
# Download a submission's DAF and load it into the model, once per status
# node; an md5 of the DAF text is recorded alongside.
304 def update_daf(model, submission_url, status_node, cookie):
305 download_daf_uri = str(submission_url).replace('show', 'download_daf')
# NOTE(review): daf_uri is not used in the visible lines; it may be used
# on an elided line (307, 311 or 316) — confirm before removing.
306 daf_uri = RDF.Uri(download_daf_uri)
# Guard: skip the download if this status node already carries DAF data.
308 status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
309 if not model.contains_statement(status_is_daf):
310 LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
# (format-argument continuation on elided line 311)
312 daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
313 daf_hash = hashlib.md5(daf_text).hexdigest()
314 daf_hash_stmt = RDF.Statement(status_node,
315 dafTermOntology['md5sum'],
# (third Statement argument, presumably daf_hash, on elided line 316)
317 model.add_statement(daf_hash_stmt)
# Parse the DAF text itself into RDF statements under the status node.
318 daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and load its contents into the model.

    Skips the download when ``statusNode`` is already typed as having DDF
    data; on success the status node is marked so the DDF is fetched only
    once per status snapshot.

    :param model: RDF model to update
    :param subUrn: submission 'show' URL; rewritten to the download URL
    :param statusNode: status blank/URI node the DDF data hangs off
    :param cookie: UCSC session cookie for the authenticated download
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
    # Removed the unused local `ddfUrn = RDF.Uri(download_ddf_url)` from
    # the original; nothing referenced it.

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        # Mark this status node as processed only after a successful load.
        model.add_statement(status_is_ddf)
333 def add_ddf_statements(model, statusNode, ddf_string):
334 """Convert a ddf text file into RDF Statements
# DDF format: first line is a whitespace-separated header naming the
# columns; following lines are tab-separated records whose first column
# is a comma-separated file list.
336 ddf_lines = ddf_string.split('\n')
337 # first line is header
338 header = ddf_lines[0].split()
# One DDF_NS predicate per header column.
339 attributes = [DDF_NS[x] for x in header]
341 for ddf_line in ddf_lines[1:]:
342 ddf_line = ddf_line.strip()
# Skip blank lines and '#' comments (the continue statements sit on the
# elided lines 344 and 346-347).
343 if len(ddf_line) == 0:
345 if ddf_line.startswith("#"):
348 ddf_record = ddf_line.split('\t')
349 files = ddf_record[0].split(',')
350 file_attributes = ddf_record[1:]
# One fresh blank node per file; the `for f in files:` loop header and the
# has_file add_stmt continuation are on elided lines (~352, 354-357).
353 fileNode = RDF.Node()
356 submissionOntology['has_file'],
358 add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
359 add_stmt(model, fileNode, DDF_NS['filename'], f)
# NOTE(review): `object` shadows the builtin; rename to obj when editing.
361 for predicate, object in zip(attributes[1:], file_attributes):
362 add_stmt(model, fileNode, predicate, object)
365 def load_encode_libraries(model, htswapi):
366 """Get libraries associated with encode.
# Affiliation-filtered library list URLs; at least one more filter entry
# and the closing bracket are on elided lines 370-371.
368 encodeFilters = ["/library/?affiliations__id__exact=44",
369 "/library/?affiliations__id__exact=80",
# NOTE(review): os.path.join with a single already-concatenated argument
# is a no-op; the '+' does the real work. Probably a leftover — confirm.
372 encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
# RDFa embedded in the library list pages is parsed straight into the model.
373 rdfaParser = RDF.Parser(name='rdfa')
374 for encodeUrl in encodeUrls:
375 LOGGER.info("Scanning library url {0}".format(encodeUrl))
376 rdfaParser.parse_into_model(model, encodeUrl)
# For every library found in the model, pull its per-library detail page.
377 query = RDF.Statement(None, libraryOntology['library_id'], None)
378 libraries = model.find_statements(query)
379 for statement in libraries:
380 libraryUrn = statement.subject
381 LOGGER.info("Scanning {0}".format(str(libraryUrn)))
382 load_library_detail(model, libraryUrn)
385 def load_library_detail(model, libraryUrn):
386 """Grab detail information from library page
# Uses an existing 'date' statement as the marker for "already loaded";
# only fetches the RDFa page when no date is recorded.
388 rdfaParser = RDF.Parser(name='rdfa')
389 query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
390 results = list(model.find_statements(query))
391 log_message = "Found {0} statements for {1}"
392 LOGGER.debug(log_message.format(len(results), libraryUrn))
393 if len(results) == 0:
394 LOGGER.info("Loading {0}".format(str(libraryUrn)))
395 rdfaParser.parse_into_model(model, libraryUrn.uri)
396 elif len(results) == 1:
397 pass # Assuming that a loaded dataset has one record
# NOTE(review): the else: introducing this warning (line 398) is elided.
399 LOGGER.warning("Many dates for {0}".format(libraryUrn))
402 def get_library_id(name):
403 """Guess library ID from library name
# Doctest expected outputs (lines 406, 408-409) and the docstring close
# are elided from this extract.
405 >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
407 >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
# Accepts either a bare 5-digit id or 'SL' + 4 digits, anchored at the
# start of the name or after a space/hyphen.
410 match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
412 if match is not None:
413 library_id = match.group('id')
# NOTE(review): the return statements (and presumably a None fallback)
# are on elided lines 414-415.
417 def get_contents(element):
418 """Return contents or none.
# NOTE(review): `.contents` and `.find('a')` are a BeautifulSoup-style
# API, while the rest of this file uses lxml (text_content/xpath). This
# may be dead legacy code — confirm it still has callers.
420 if len(element.contents) == 0:
# Prefer the text of a nested anchor when one exists.
423 a = element.find('a')
# NOTE(review): CHARSET is not defined anywhere in the visible extract.
425 return a.contents[0].encode(CHARSET)
427 return element.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Build the URI node naming one status snapshot of a submission.

    The node's URI is the submission URI with ``timestamp`` joined onto
    it, so each scrape date yields a distinct status resource.
    """
    base = daf.submission_uri_to_string(submission_uri)
    status_uri = urlparse.urljoin(base, timestamp)
    return RDF.Node(RDF.Uri(status_uri))
# Parse an element's text as a 'YYYY-MM-DD HH:MM' datetime.
436 def get_date_contents(element):
437 data = element.text_content()
# NOTE(review): line 438 is elided — presumably `if data:` guarding the
# parse, with a None fallback after; confirm before relying on it.
439 return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience helper: build an RDF Statement and add it to ``model``.

    :param model: RDF model receiving the statement
    :param subject: statement subject node/URI
    :param predicate: statement predicate node/URI
    :param rdf_object: statement object (node, URI, or literal)
    :return: whatever ``model.add_statement`` returns
    """
    # Original docstring misspelled "Convienence"; fixed here.
    return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))
451 def login(cookie=None):
452 """Login if we don't have a cookie
# An existing cookie short-circuits the login (the return on elided
# lines 455-456).
454 if cookie is not None:
# Pull the password for USERNAME from the system keyring; USERNAME is
# defined outside the visible extract.
457 keys = keyring.get_keyring()
458 password = keys.get_password(LOGIN_URL, USERNAME)
459 credentials = {'login': USERNAME,
460 'password': password}
# Form-encoded POST to the UCSC login endpoint (the method/headers
# arguments sit on elided lines 464-465).
461 headers = {'Content-type': 'application/x-www-form-urlencoded'}
462 http = httplib2.Http()
463 response, content = http.request(LOGIN_URL,
466 body=urllib.urlencode(credentials))
467 LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
# The session cookie comes back in the Set-Cookie response header; the
# guard before the raise (line 471) and the final return are elided.
470 cookie = response.get('set-cookie', None)
472 raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
# GET/POST a URL and return the response parsed as an lxml tree; raises
# on any non-200 status.
476 def get_url_as_tree(url, method, cookie=None):
477 http = httplib2.Http()
# `headers` is initialized on the elided line 478.
479 if cookie is not None:
480 headers['Cookie'] = cookie
481 response, content = http.request(url, method, headers=headers)
# httplib2 reports status as a string here.
482 if response['status'] == '200':
483 tree = fromstring(content, base_url=url)
# (return tree on elided line 484; else: on 485)
486 msg = "error accessing {0}, status {1}"
487 msg = msg.format(url, response['status'])
488 e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# (raise e on an elided line, presumably 489-490)
# Same shape as get_url_as_tree, but returns the raw response body;
# raises on any non-200 status.
491 def get_url_as_text(url, method, cookie=None):
492 http = httplib2.Http()
# `headers` is initialized on the elided line 493.
494 if cookie is not None:
495 headers['Cookie'] = cookie
496 response, content = http.request(url, method, headers=headers)
497 if response['status'] == '200':
# (return content on elided line 498; else: on 499)
500 msg = "error accessing {0}, status {1}"
501 msg = msg.format(url, response['status'])
502 e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# (raise e on an elided line, presumably 503)
# Manual (submission name, library id) pairs for submissions whose names
# don't contain a parseable library id. NOTE(review): the closing ']'
# (line ~520) is elided from this extract.
506 SUBMISSIONS_LACKING_LIBID = [
507 ('1x75-Directional-HeLa-Rep1', '11208'),
508 ('1x75-Directional-HeLa-Rep2', '11207'),
509 ('1x75-Directional-HepG2-Rep1', '11210'),
510 ('1x75-Directional-HepG2-Rep2', '11209'),
511 ('1x75-Directional-H1-hESC-Rep1', '10947'),
512 ('1x75-Directional-H1-hESC-Rep2', '11009'),
513 ('1x75-Directional-HUVEC-Rep1', '11206'),
514 ('1x75-Directional-HUVEC-Rep2', '11205'),
515 ('1x75-Directional-K562-Rep1', '11008'),
516 ('1x75-Directional-K562-Rep2', '11007'),
517 ('1x75-Directional-NHEK-Rep1', '11204'),
518 ('1x75-Directional-GM12878-Rep1', '11011'),
519 ('1x75-Directional-GM12878-Rep2', '11010'),
# Group submissions by their library_id, newest-dated first within each
# group. Submissions without a library_id are dropped.
523 def select_by_library_id(submission_list):
524 subl = [(x.library_id, x) for x in submission_list if x.library_id]
# `libraries` dict is initialized on the elided line 525.
526 for lib_id, subobj in subl:
527 libraries.setdefault(lib_id, []).append(subobj)
# Sort each group newest-first by submission date.
529 for submission in libraries.values():
530 submission.sort(key=attrgetter('date'), reverse=True)
# (return libraries on an elided line, presumably 531)
# Render an HTML table: one row per library, one column per ENCODE freeze,
# each cell listing the submissions whose date falls in that freeze.
535 def library_to_freeze(selected_libraries):
536 freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
537 lib_ids = sorted(selected_libraries.keys())
538 report = ['<html><table border="1">']
# Inline CSS block; its surrounding append/string lines (539-540, 543-547)
# are elided from this extract.
541 <style type="text/css">
542 td {border-width:0 0 1px 1px; border-style:solid;}
# Header row: Library ID, Name, then one column per freeze (the
# `for f in freezes:` header is on an elided line 550).
548 report.append('<thead>')
549 report.append('<tr><td>Library ID</td><td>Name</td>')
551 report.append('<td>{0}</td>'.format(f))
552 report.append('</tr>')
553 report.append('</thead>')
554 report.append('<tbody>')
555 for lib_id in lib_ids:
556 report.append('<tr>')
557 lib_url = LIBRARY_NS[lib_id].uri
558 report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
559 submissions = selected_libraries[lib_id]
# Name taken from the newest submission (groups are sorted newest-first
# by select_by_library_id).
560 report.append('<td>{0}</td>'.format(submissions[0].name))
# Bucket this library's submissions by freeze (batched dict init on
# elided line 561).
562 for sub in submissions:
563 date = date_to_freeze(sub.date)
564 batched.setdefault(date, []).append(sub)
# Per-freeze cells with linked submission ids and statuses (the
# `for d in freezes:` header is on an elided line 565).
566 report.append('<td>')
567 for s in batched.get(d, []):
568 show_url = submission_view_url(s.subid)
569 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
570 report.append("{0}:{1}".format(subid, s.status))
571 report.append('</td>')
# Trailing empty cell (context for lines 572-573 is elided).
573 report.append('<td></td>')
574 report.append("</tr>")
575 report.append('</tbody>')
576 report.append("</table></html>")
577 return "\n".join(report)
# Map a datetime to the name of the first ENCODE freeze window ending on
# or after it.
580 def date_to_freeze(d):
581 freezes = [(datetime(2010, 1, 30), '2010-Jan'),
582 (datetime(2010, 7, 30), '2010-Jul'),
583 (datetime(2011, 1, 30), '2011-Jan'),
# (closing bracket on elided line 584)
585 for end, name in freezes:
# NOTE(review): the comparison and return statements (lines 586-588) are
# elided; presumably `if d < end: return name` with a None/'unknown'
# fallback — confirm.
591 if __name__ == "__main__":