3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
18 from htsworkflow.util import api
# Module-level logger; functions below should prefer this over the root
# `logging` module so output can be filtered per-module.
logger = logging.getLogger("encode_find")

# RDF namespaces used throughout this script.
# libraryNS/submissionNS mint URIs for woldlab libraries and UCSC submissions;
# the *Ontology namespaces hold the predicates we attach to them.
libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")

# Standard vocabularies (Dublin Core, RDF, RDFS, XML Schema datatypes).
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# UCSC ENCODE submission pipeline endpoints.
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
def main(cmdline=None):
    """Command-line entry point.

    Parses options, builds/loads the RDF model, optionally refreshes it
    from the UCSC pipeline and htsworkflow, then runs any requested
    queries and reports.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    # Honor the -v/--verbose flag declared in make_parser.
    if opts.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    model = get_model(opts.load_model)

    if opts.load_rdf is not None:
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf)

    # Only hit the remote services when an update was requested;
    # previously `cookie` was used without ever being initialized.
    if opts.update:
        cookie = login(cookie=None)
        load_my_submissions(model, cookie=cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    # --print-rdf dumps the final model state to stdout.
    if opts.print_rdf:
        serializer = RDF.Serializer(name=opts.rdf_parser_name)
        print(serializer.serialize_model_to_string(model))
def make_parser():
    """Build the OptionParser for this script.

    Options are grouped into Commands (actions to take), Queries, and
    general Options; htsworkflow auth options are appended at the end.
    """
    parser = OptionParser()

    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Query remote data sources and update our database")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library', default=False,
                       action="store_true",
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser
def get_model(model_name=None):
    """Return an RDF model.

    With no name an in-memory store is used; otherwise a Berkeley-DB
    backed hash store named `model_name` is created under /tmp.
    """
    if model_name is None:
        storage = RDF.MemoryStorage()
    else:
        storage = RDF.HashStorage(model_name, options="hash-type='bdb',dir='/tmp'")
    model = RDF.Model(storage)
    return model
def load_my_submissions(model, cookie=None):
    """Scrape the UCSC 'show_user' page and record each submission.

    NOTE(review): column positions (0=submission id, 2=species, 4=name,
    6=status, 8=last-modified) are assumed from the scraping code below;
    confirm against the live pipeline page.
    """
    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id': 'projects'})
    tr = p.findNext('tr')
    # first record is header; rows with fewer than 2 <td> cells
    # (e.g. the <th> header row) are skipped by the guard below.

    TypeN = rdfsNS['type']
    NameN = submitOntologyNS['name']
    SpeciesN = submitOntologyNS['species']
    LibraryURN = submitOntologyNS['library_urn']

    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
            subUrn = submissionNS[subUrnText]

            add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])

            name = get_contents(td[4])
            add_stmt(model, subUrn, NameN, name)

            species = get_contents(td[2])
            if species is not None:
                add_stmt(model, subUrn, SpeciesN, species)

            library_id = get_library_id(name)
            if library_id is not None:
                # NOTE(review): argument list reconstructed -- confirm.
                add_submission_to_library_urn(model,
                                              subUrn,
                                              LibraryURN,
                                              library_id)

            add_submission_creation_date(model, subUrn, cookie)

            # grab changing attributes
            status = get_contents(td[6]).strip()
            last_mod_datetime = get_date_contents(td[8])
            last_mod = last_mod_datetime.isoformat()

            update_submission_detail(model, subUrn, status, last_mod,
                                     cookie=cookie)

            logger.info("Processed {0}".format(subUrn))

        tr = tr.findNext('tr')
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed."""
    libraryUrn = libraryNS[library_id]
    # Build the statement once and reuse it for both the containment
    # check and the insert (previously an identical statement was
    # constructed a second time).
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        logger.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        logger.debug("Found: {0}".format(str(link)))
def find_submissions_with_no_library(model):
    """Print stub turtle statements for submissions lacking a library URN.

    NOTE(review): the SELECT/WHERE head of this query was reconstructed
    from the visible body -- confirm variable names against callers.
    """
    missing_lib_query = RDF.SPARQLQuery("""
PREFIX submissionOntology:<{submissionOntology}>

SELECT ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER (!bound(?libid))
}}""".format(submissionOntology=submitOntologyNS[''].uri))

    results = missing_lib_query.execute(model)
    for row in results:
        subid = row['subid']
        name = row['name']
        print("# {0}".format(name))
        print("<{0}>".format(subid.uri))
        print(" encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .")
def add_submission_creation_date(model, subUrn, cookie):
    """Record the submission's creation date if not already in the model.

    Fetches the submission detail page only when no libOntNS date
    statement exists for subUrn.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libOntNS['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        logger.info("Getting creation date for: {0}".format(str(subUrn)))
        soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
        created_label = soup.find(text="Created: ")
        # Guard against a page layout change; previously a missing label
        # would raise AttributeError on .next.
        if created_label is not None:
            created_date = get_date_contents(created_label.next)
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            add_stmt(model, subUrn, creationDateN, created_date_node)
    else:
        logger.debug("Found creation date for: {0}".format(str(subUrn)))
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Attach/refresh the status blank node for a submission.

    Creates a status node when none exists; otherwise re-fetches the ddf
    for status nodes whose last-modify date matches recent_update.
    """
    HasStatusN = submitOntologyNS['has_status']
    StatusN = submitOntologyNS['status']
    LastModifyN = submitOntologyNS['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
        # has no status node, add one
        logger.info("Adding status node to {0}".format(subUrn))
        status_blank = RDF.Node()
        add_stmt(model, subUrn, HasStatusN, status_blank)
        # Fixed: original referenced undefined names `rdfs` and `StatusT`.
        # NOTE(review): the type term 'Status' is inferred -- confirm
        # against the submission ontology.
        add_stmt(model, status_blank, rdfsNS['type'],
                 submitOntologyNS['Status'])
        add_stmt(model, status_blank, StatusN, status)
        add_stmt(model, status_blank, LastModifyN, recent_update)
        update_ddf(model, subUrn, status_blank, cookie=cookie)
    else:
        logger.info("Found {0} status blanks".format(len(status_nodes)))
        for status_statement in status_nodes:
            status_blank = status_statement.object
            last_modified_query = RDF.Statement(status_blank, LastModifyN,
                                                None)
            last_mod_nodes = model.find_statements(last_modified_query)
            for last_mod_statement in last_mod_nodes:
                last_mod_date = str(last_mod_statement.object)
                if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_blank, cookie=cookie)
def update_ddf(model, subUrn, statusNode, cookie):
    """Download and record the submission's ddf file if not already done.

    Marks statusNode with rdfs:type ddf:ddf once the ddf's statements
    have been added, so repeated calls are idempotent.
    """
    TypeN = rdfsNS['type']

    # The ddf download URL is the submission 'show' URL with the action
    # swapped for 'download_ddf'.
    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
    if not model.contains_statement(status_is_ddf):
        # Use the module logger for consistency with the rest of the file.
        logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
        add_ddf_statements(model, statusNode, ddf_text)
        model.add_statement(status_is_ddf)
def add_ddf_statements(model, statusNode, ddf_string):
    """Convert a ddf text file into RDF Statements attached to statusNode.

    The first line is a tab/space separated header naming the ddf
    attributes; each following line is a tab-separated record whose first
    column is a comma-separated list of file names.
    """
    ddf_lines = ddf_string.split('\n')
    # first line is header
    header = ddf_lines[0].split()
    attributes = [ddfNS[x] for x in header]

    for ddf_line in ddf_lines[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_record = ddf_line.split('\t')
        files = ddf_record[0].split(',')
        file_attributes = ddf_record[1:]

        # One file node per file listed in the first column; each shares
        # the record's remaining attributes.
        for f in files:
            fileNode = RDF.Node()
            add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
            add_stmt(model, fileNode, ddfNS['filename'], f)

            for predicate, object in zip(attributes[1:], file_attributes):
                add_stmt(model, fileNode, predicate, object)
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.

    Loads the RDFa-annotated library index page into the model, then
    loads detail for every library found.
    """
    # Fixed: os.path.join was being called with a single pre-concatenated
    # argument (a no-op) and is the wrong tool for URLs anyway.
    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
    rdfaParser = RDF.Parser(name='rdfa')

    rdfaParser.parse_into_model(model, encodeUrl)
    query = RDF.Statement(None, libOntNS['library_id'], None)
    libraries = model.find_statements(query)
    for statement in libraries:
        libraryUrn = statement.subject
        load_library_detail(model, libraryUrn)
def load_library_detail(model, libraryUrn):
    """Grab detail information from a library page if not already loaded.

    A library with an existing libOntNS date statement is assumed loaded.
    """
    query = RDF.Statement(libraryUrn, libOntNS['date'], None)
    results = list(model.find_statements(query))
    if len(results) == 0:
        logger.info("Loading {0}".format(str(libraryUrn)))
        # Only build the parser when we actually need to fetch.
        rdfaParser = RDF.Parser(name='rdfa')
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
    else:
        logger.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess the library ID embedded in a submission name.

    Looks for either a 5-digit number or an 'SL' + 4-digit code that
    immediately follows a space or dash. Returns the matched ID string,
    or None when no ID can be found.
    """
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id
def get_contents(element):
    """Return an element's text contents encoded as CHARSET, or None.

    If the element contains an anchor, the anchor's text is returned
    instead of the element's own first child.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def get_date_contents(element):
    """Parse an element's text as 'YYYY-MM-DD HH:MM'.

    Returns a datetime, or None when the element has no text.
    """
    data = get_contents(element)
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    return None
def sparql_query(model, query_filename):
    """Execute a sparql query read from a file, printing each result row."""
    # Close the query file deterministically instead of leaking the handle.
    with open(query_filename, 'r') as stream:
        query_body = stream.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    for row in results:
        for k, v in row.items()[::-1]:
            print("{0}: {1}".format(k, v))
def load_into_model(model, parser_name, filename):
    """Parse an RDF file into the model using the named parser.

    Raises IOError when the file does not exist. Statements are parsed
    relative to the submission ontology base URI.
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    # Close the data file deterministically instead of leaking the handle.
    with open(filename, 'r') as stream:
        data = stream.read()
    rdf_parser = RDF.Parser(name=parser_name)
    ns_uri = submitOntologyNS[''].uri
    rdf_parser.parse_string_into_model(model, data, ns_uri)
def add_stmt(model, subject, predicate, object):
    """Convenience: build an RDF Statement and add it to the model."""
    return model.add_statement(
        RDF.Statement(subject, predicate, object)
    )
def login(cookie=None):
    """Log into the UCSC submission pipeline, returning a session cookie.

    An already-obtained cookie is returned unchanged. Credentials come
    from the system keyring under (LOGIN_URL, USERNAME). Raises
    RuntimeError when the login response carries no Set-Cookie header.
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    # NOTE(review): request method/kwargs reconstructed from the headers
    # and body above -- confirm against the original.
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
def get_url_as_soup(url, method, cookie=None):
    """Fetch a URL and return it parsed as BeautifulSoup.

    Raises httplib2.HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8",  # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e
def get_url_as_text(url, method, cookie=None):
    """Fetch a URL and return the raw response body.

    Raises httplib2.HttpLib2ErrorWithResponse on any non-200 status.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e
# Known (submission name, library id) pairs for submissions whose name
# lacks a parseable library ID.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
]
def select_by_library_id(submission_list):
    """Group submissions by library_id, newest first within each group.

    Submissions with a false-y library_id are dropped. Returns a dict
    mapping library_id -> list of submissions sorted by .date descending.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    for submission_group in libraries.values():
        submission_group.sort(key=attrgetter('date'), reverse=True)

    return libraries
def library_to_freeze(selected_libraries):
    """Build an HTML table of libraries vs. ENCODE freeze dates.

    selected_libraries maps library_id -> submissions sorted newest
    first (see select_by_library_id). Each row shows the library, its
    newest submission name, and per-freeze submission status links.

    NOTE(review): the <head>/CSS block and the per-freeze column loops
    were reconstructed -- confirm the rendered layout.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['<html><table border="1">']
    report.append('''<head>
<style type="text/css">
td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
''')
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for freeze_name in freezes:
        report.append('<td>{0}</td>'.format(freeze_name))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # Bucket this library's submissions by the freeze they fall into.
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for freeze_name in freezes:
            report.append('<td>')
            for s in batched.get(freeze_name, []):
                show_url = submissionNS[s.subid].uri
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        # trailing empty cell for submissions outside any freeze
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    """Return the name of the ENCODE freeze window containing datetime d.

    Each freeze window ends on the listed cutoff date (exclusive).
    Dates after the last cutoff return None.
    NOTE(review): the post-cutoff fallback value was reconstructed --
    confirm callers tolerate None.
    """
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None
525 if __name__ == "__main__":