3 Gather information about our submissions into a single RDF store
6 from BeautifulSoup import BeautifulSoup
7 from datetime import datetime
9 from operator import attrgetter
10 from optparse import OptionParser, OptionGroup
22 from htsworkflow.submission import daf
24 from htsworkflow.util import api
25 from htsworkflow.util.rdfhelp import \
37 TYPE_N = rdfNS['type']
40 LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
42 from htsworkflow.submission.ucsc import \
45 submission_view_url, \
48 DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
49 DDF_NS = RDF.NS(DOWNLOAD_DDF)
51 DBDIR = os.path.expanduser("~diane/proj/submission")
53 LOGGER = logging.getLogger("encode_find")
55 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
56 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
62 def main(cmdline=None):
64 Parse command line arguments
66 Takes a list of arguments (assuming arg[0] is the program name) or None
67 If None, it looks at sys.argv
69 parser = make_parser()
70 opts, args = parser.parse_args(cmdline)
73 logging.basicConfig(level=logging.DEBUG)
75 logging.basicConfig(level=logging.INFO)
77 htsw_authdata = api.make_auth_from_opts(opts, parser)
78 htswapi = api.HtswApi(opts.host, htsw_authdata)
81 model = get_model(opts.load_model, DBDIR)
83 if opts.load_rdf is not None:
84 ns_uri = submissionOntology[''].uri
85 load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
93 cookie = login(cookie=cookie)
94 load_my_submissions(model, limit=limit, cookie=cookie)
95 load_encode_libraries(model, htswapi)
97 if opts.sparql is not None:
98 sparql_query(model, opts.sparql)
100 if opts.find_submission_with_no_library:
101 find_submissions_with_no_library(model)
104 serializer = get_serializer(name=opts.rdf_parser_name)
105 print serializer.serialize_model_to_string(model)
109 """Construct option parser
111 parser = OptionParser()
112 commands = OptionGroup(parser, "Commands")
113 commands.add_option('--load-model', default=None,
114 help="Load model database")
115 commands.add_option('--load-rdf', default=None,
116 help="load rdf statements into model")
117 commands.add_option('--print-rdf', action="store_true", default=False,
118 help="print ending model state")
119 commands.add_option('--update', action="store_true", default=False,
120 help="Query remote data sources and update our database")
121 #commands.add_option('--update-ucsc-status', default=None,
122 # help="download status from ucsc, requires filename for extra rules")
123 #commands.add_option('--update-ddfs', action="store_true", default=False,
124 # help="download ddf information for known submission")
125 #commands.add_option('--update-library', default=None,
126 # help="download library info from htsw, "\
127 # "requires filename for extra rules")
128 parser.add_option_group(commands)
130 queries = OptionGroup(parser, "Queries")
131 queries.add_option('--sparql', default=None,
132 help="execute arbitrary sparql query")
133 queries.add_option('--find-submission-with-no-library', default=False,
135 help="find submissions with no library ID")
136 parser.add_option_group(queries)
138 options = OptionGroup(parser, "Options")
139 options.add_option("--rdf-parser-name", default="turtle",
140 help="set rdf file parser type")
141 options.add_option("-v", "--verbose", action="store_true", default=False)
142 options.add_option("--debug", action="store_true", default=False)
143 parser.add_option_group(options)
145 api.add_auth_options(parser)
150 def load_my_submissions(model, limit=None, cookie=None):
151 """Parse all the submissions from UCSC into model
152 It will look at the global USER_URL to figure out who to scrape
153 cookie contains the session cookie, if none, will attempt to login
158 soup = get_url_as_soup(USER_URL, 'GET', cookie)
159 projects = soup.find('table', attrs={'id': 'projects'})
160 table_row = projects.findNext('tr')
161 # first record is header
162 table_row = table_row.findNext()
163 name_n = submissionOntology['name']
164 species_n = submissionOntology['species']
165 library_urn = submissionOntology['library_urn']
167 while table_row is not None:
168 cell = table_row.findAll('td')
169 if cell is not None and len(cell) > 1:
170 submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
171 if limit is None or submission_id in limit:
172 subUrn = RDF.Uri(submission_view_url(submission_id))
174 add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
176 name = get_contents(cell[4])
177 add_stmt(model, subUrn, name_n, name)
179 species = get_contents(cell[2])
180 if species is not None:
181 add_stmt(model, subUrn, species_n, species)
183 library_id = get_library_id(name)
184 if library_id is not None:
185 add_submission_to_library_urn(model,
190 add_submission_creation_date(model, subUrn, cookie)
192 # grab changing atttributes
193 status = get_contents(cell[6]).strip()
194 last_mod_datetime = get_date_contents(cell[8])
195 last_mod = last_mod_datetime.isoformat()
197 update_submission_detail(model, subUrn, status, last_mod,
200 logging.info("Processed {0}".format(subUrn))
202 table_row = table_row.findNext('tr')
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed

    The library node is LIBRARY_NS[library_id + '/'].  The statement
    (submissionUrn, predicate, library node) is added to model only when
    model does not already contain it; otherwise the existing link is
    just reported at debug level.
    """
    library_node = LIBRARY_NS[library_id + '/']
    link = RDF.Statement(submissionUrn, predicate, library_node)

    if model.contains_statement(link):
        LOGGER.debug("Found: {0}".format(str(link)))
        return

    LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
    model.add_statement(link)
218 def find_submissions_with_no_library(model):
219 missing_lib_query = RDF.SPARQLQuery("""
220 PREFIX submissionOntology:<{submissionOntology}>
225 ?subid submissionOntology:name ?name
226 OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
227 FILTER (!bound(?libid))
228 }}""".format(submissionOntology=submissionOntology[''].uri))
230 results = missing_lib_query.execute(model)
234 print "# {0}".format(name)
235 print "<{0}>".format(subid.uri)
236 print " encodeSubmit:library_urn"\
237 "<http://jumpgate.caltech.edu/library/> ."
241 def add_submission_creation_date(model, subUrn, cookie):
242 # in theory the submission page might have more information on it.
243 creationDateN = libraryOntology['date']
244 dateTimeType = xsdNS['dateTime']
245 query = RDF.Statement(subUrn, creationDateN, None)
246 creation_dates = list(model.find_statements(query))
247 if len(creation_dates) == 0:
248 LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
249 soup = get_url_as_soup(str(subUrn), 'GET', cookie)
250 created_label = soup.find(text="Created: ")
252 created_date = get_date_contents(created_label.next)
253 created_date_node = RDF.Node(literal=created_date.isoformat(),
254 datatype=dateTimeType.uri)
255 add_stmt(model, subUrn, creationDateN, created_date_node)
257 LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Record a submission's status and refresh its DDF/DAF information.

    model -- RDF model that is queried and updated
    subUrn -- node/URI of the submission
    status -- status text scraped for the submission
    recent_update -- last-modified timestamp as a string; it is compared
        by string equality against the stored last_modify_date values
    cookie -- session cookie passed through to update_ddf / update_daf

    When the submission has no has_status statement, a status node is
    created (see create_status_node) and described with its type, status
    and last_modify_date before its DDF and DAF are loaded.  Otherwise
    each existing status node whose stored last_modify_date equals
    recent_update gets its DDF and DAF refreshed.
    """
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        LOGGER.info("Adding status node to {0}".format(subUrn))
        status_node = create_status_node(subUrn, recent_update)
        add_stmt(model, subUrn, HasStatusN, status_node)
        # The node's class is stated with rdf:type (TYPE_N), the same
        # predicate update_ddf / update_daf use when they query the status
        # node's type; the RDFS namespace is not where 'type' lives.
        add_stmt(model, status_node, TYPE_N, StatusN)
        add_stmt(model, status_node, StatusN, status)
        add_stmt(model, status_node, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_node, cookie=cookie)
        update_daf(model, subUrn, status_node, cookie=cookie)
    else:
        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
            # Collect the dates up front: update_ddf / update_daf add
            # statements to model, so the query result must not still be
            # being iterated while they run.
            last_mod_dates = [str(stmt.object) for stmt
                              in model.find_statements(last_modified_query)]
            if recent_update in last_mod_dates:
                update_ddf(model, subUrn, status_node, cookie=cookie)
                update_daf(model, subUrn, status_node, cookie=cookie)
def update_daf(model, submission_url, status_node, cookie):
    """Download a submission's DAF and load it into model under status_node.

    model -- RDF model to update
    submission_url -- the submission's 'show' URL; the download URL is
        derived by replacing 'show' with 'download_daf' in its string form
    status_node -- node the DAF contents are attached to
    cookie -- session cookie sent with the download request

    Nothing is downloaded when model already states that status_node has
    rdf:type dafTermOntology[''].
    """
    download_daf_uri = str(submission_url).replace('show', 'download_daf')

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        # NOTE(review): unlike update_ddf, this function does not add
        # status_is_daf itself; presumably daf.fromstring_into_model
        # records the type -- confirm, otherwise the DAF is re-downloaded
        # on every run.
        daf.fromstring_into_model(model, status_node, daf_text)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download a submission's DDF and add its contents under statusNode.

    model -- RDF model to update
    subUrn -- the submission's 'show' URL; the download URL is derived by
        replacing 'show' with 'download_ddf' in its string form
    statusNode -- node the DDF file records are attached to
    cookie -- session cookie sent with the download request

    The download is skipped when model already states that statusNode has
    rdf:type DDF_NS['']; that marker statement is added after a
    successful load so later runs do not fetch the DDF again.
    """
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
    if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements

    model -- RDF model receiving the statements
    statusNode -- node that each file record is linked from (has_file)
    ddf_string -- full text of the DDF; the first line is the header,
        the remaining lines are tab separated records

    The first field of a record is a comma separated list of file names.
    Every file gets its own blank node carrying the file name plus one
    statement per remaining column, using DDF_NS[<header name>] as the
    predicate.  Blank lines and lines starting with '#' are ignored.
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [DDF_NS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0 or ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        for f in files:
            fileNode = RDF.Node()
            add_stmt(model,
                     statusNode,
                     submissionOntology['has_file'],
                     fileNode)
            # rdf:type (TYPE_N) is the predicate the rest of this module
            # queries types with; 'type' is not an RDFS term.
            add_stmt(model, fileNode, TYPE_N, DDF_NS['file'])
            add_stmt(model, fileNode, DDF_NS['filename'], f)

            # attributes[0] is the file-name column, already handled above
            for predicate, value in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, value)
350 def load_encode_libraries(model, htswapi):
351 """Get libraries associated with encode.
353 encodeFilters = ["/library/?affiliations__id__exact=44",
354 "/library/?affiliations__id__exact=80",
357 encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
358 rdfaParser = RDF.Parser(name='rdfa')
359 for encodeUrl in encodeUrls:
360 LOGGER.info("Scanning library url {0}".format(encodeUrl))
361 rdfaParser.parse_into_model(model, encodeUrl)
362 query = RDF.Statement(None, libraryOntology['library_id'], None)
363 libraries = model.find_statements(query)
364 for statement in libraries:
365 libraryUrn = statement.subject
366 LOGGER.info("Scanning {0}".format(str(libraryUrn)))
367 load_library_detail(model, libraryUrn)
def load_library_detail(model, libraryUrn):
    """Grab detail information from library page

    model -- RDF model to query and extend
    libraryUrn -- node of the library; its .uri is parsed as RDFa when the
        model holds no libraryOntology['date'] statement for it yet

    A library that already has exactly one date statement is taken to be
    loaded and left alone; more than one date is reported as a warning.
    """
    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    results = list(model.find_statements(query))
    date_count = len(results)
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(date_count, libraryUrn))

    if date_count == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
        # the parser is only needed on this branch
        rdfaParser = RDF.Parser(name='rdfa')
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif date_count > 1:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))
    # exactly one date: assuming that a loaded dataset has one record
def get_library_id(name):
    """Guess library ID from library name

    A library ID is either exactly five digits or 'SL' followed by exactly
    four digits, located at the start of the name or right after a space
    or a dash.  Longer digit runs (for example an eight digit date) are
    not mistaken for an ID.  Returns None when nothing matches.

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'
    >>> get_library_id('elements 20110217') is None
    True
    """
    # (?!\d) rejects candidates that are only the prefix of a longer number
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))(?!\d)", name)
    if match is None:
        return None
    return match.group('id')
def get_contents(element):
    """Return contents or none.

    Gives None for an element without contents.  Otherwise the first
    content item is returned encoded with CHARSET, taken from the first
    <a> found inside the element when there is one, else from the
    element itself.
    """
    if not element.contents:
        return None

    anchor = element.find('a')
    source = element if anchor is None else anchor
    return source.contents[0].encode(CHARSET)
def create_status_node(submission_uri, timestamp):
    """Return the RDF node for one status snapshot of a submission.

    The node's URI is the submission URI (as rendered by
    daf.submission_uri_to_string) joined with timestamp via
    urlparse.urljoin.
    """
    base_uri = daf.submission_uri_to_string(submission_uri)
    return RDF.Node(RDF.Uri(urlparse.urljoin(base_uri, timestamp)))
420 def get_date_contents(element):
421 data = get_contents(element)
423 return datetime.strptime(data, "%Y-%m-%d %H:%M")
def add_stmt(model, subject, predicate, rdf_object):
    """Convenience: create an RDF Statement and add it to a model

    Returns whatever model.add_statement returns.
    """
    statement = RDF.Statement(subject, predicate, rdf_object)
    return model.add_statement(statement)
435 def login(cookie=None):
436 """Login if we don't have a cookie
438 if cookie is not None:
441 keys = keyring.get_keyring()
442 password = keys.get_password(LOGIN_URL, USERNAME)
443 credentials = {'login': USERNAME,
444 'password': password}
445 headers = {'Content-type': 'application/x-www-form-urlencoded'}
446 http = httplib2.Http()
447 response, content = http.request(LOGIN_URL,
450 body=urllib.urlencode(credentials))
451 logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
454 cookie = response.get('set-cookie', None)
456 raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
def get_url_as_soup(url, method, cookie=None):
    """Request url and return the response body parsed by BeautifulSoup.

    url -- address to request
    method -- HTTP method name handed to httplib2
    cookie -- optional value for the Cookie request header

    Raises httplib2.HttpLib2ErrorWithResponse when the response status is
    anything other than '200'.
    """
    request_headers = {}
    if cookie is not None:
        request_headers['Cookie'] = cookie

    response, content = httplib2.Http().request(url, method,
                                                headers=request_headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return BeautifulSoup(content,
                         fromEncoding="utf-8",  # should read from header
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
def get_url_as_text(url, method, cookie=None):
    """Request url and return the raw response body.

    url -- address to request
    method -- HTTP method name handed to httplib2
    cookie -- optional value for the Cookie request header

    Raises httplib2.HttpLib2ErrorWithResponse when the response status is
    anything other than '200'.
    """
    request_headers = {}
    if cookie is not None:
        request_headers['Cookie'] = cookie

    response, content = httplib2.Http().request(url, method,
                                                headers=request_headers)
    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return content
492 SUBMISSIONS_LACKING_LIBID = [
493 ('1x75-Directional-HeLa-Rep1', '11208'),
494 ('1x75-Directional-HeLa-Rep2', '11207'),
495 ('1x75-Directional-HepG2-Rep1', '11210'),
496 ('1x75-Directional-HepG2-Rep2', '11209'),
497 ('1x75-Directional-H1-hESC-Rep1', '10947'),
498 ('1x75-Directional-H1-hESC-Rep2', '11009'),
499 ('1x75-Directional-HUVEC-Rep1', '11206'),
500 ('1x75-Directional-HUVEC-Rep2', '11205'),
501 ('1x75-Directional-K562-Rep1', '11008'),
502 ('1x75-Directional-K562-Rep2', '11007'),
503 ('1x75-Directional-NHEK-Rep1', '11204'),
504 ('1x75-Directional-GM12878-Rep1', '11011'),
505 ('1x75-Directional-GM12878-Rep2', '11010'),
def select_by_library_id(submission_list):
    """Group submissions by library id, newest submission first.

    submission_list -- iterable of objects with 'library_id' and 'date'
        attributes; entries whose library_id is false (None, '') are
        dropped

    Returns a dict mapping each library id to the list of its
    submissions sorted by date in descending order.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries
521 def library_to_freeze(selected_libraries):
522 freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
523 lib_ids = sorted(selected_libraries.keys())
524 report = ['<html><table border="1">']
527 <style type="text/css">
528 td {border-width:0 0 1px 1px; border-style:solid;}
534 report.append('<thead>')
535 report.append('<tr><td>Library ID</td><td>Name</td>')
537 report.append('<td>{0}</td>'.format(f))
538 report.append('</tr>')
539 report.append('</thead>')
540 report.append('<tbody>')
541 for lib_id in lib_ids:
542 report.append('<tr>')
543 lib_url = LIBRARY_NS[lib_id].uri
544 report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
545 submissions = selected_libraries[lib_id]
546 report.append('<td>{0}</td>'.format(submissions[0].name))
548 for sub in submissions:
549 date = date_to_freeze(sub.date)
550 batched.setdefault(date, []).append(sub)
552 report.append('<td>')
553 for s in batched.get(d, []):
554 show_url = submission_view_url(s.subid)
555 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
556 report.append("{0}:{1}".format(subid, s.status))
557 report.append('</td>')
559 report.append('<td></td>')
560 report.append("</tr>")
561 report.append('</tbody>')
562 report.append("</table></html>")
563 return "\n".join(report)
566 def date_to_freeze(d):
567 freezes = [(datetime(2010, 1, 30), '2010-Jan'),
568 (datetime(2010, 7, 30), '2010-Jul'),
569 (datetime(2011, 1, 30), '2011-Jan'),
571 for end, name in freezes:
577 if __name__ == "__main__":