3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
19 from htsworkflow.util import api
20 from htsworkflow.util.rdfhelp import \
# --- Module-level namespaces and constants ---
# NOTE(review): this file is a numbered partial dump — each line starts with
# its original line number and some original lines are missing.
# RDF namespace for woldlab library pages.
33 libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
36 from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
# Namespace used for terms extracted from a submission's downloaded DDF file.
37 download_ddf = UCSCEncodePipeline+"download_ddf#"
38 ddfNS = RDF.NS(download_ddf)
# Default location of the on-disk RDF model database.
# NOTE(review): hard-coded to one user's home directory — confirm before reuse.
40 DBDIR = os.path.expanduser("~diane/proj/submission")
42 logger = logging.getLogger("encode_find")
# UCSC ENCODE submission pipeline endpoints (login + per-user submission list).
44 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
45 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
# Entry point: parse options, load/refresh the local RDF model of UCSC ENCODE
# submissions, run optional queries, and optionally serialize the model.
# NOTE(review): lines are missing from this dump — the if/else selecting
# DEBUG vs INFO logging (orig 53-58), the guard and cookie initialization
# before the login call (orig 68-69, `cookie` is used before any visible
# assignment), and the guard before the serializer block (orig 79-80).
50 def main(cmdline=None):
51 parser = make_parser()
52 opts, args = parser.parse_args(cmdline)
# presumably: DEBUG when --debug, INFO when --verbose — guards not visible.
55 logging.basicConfig(level=logging.DEBUG)
57 logging.basicConfig(level=logging.INFO)
# Build an authenticated htsworkflow API client from command-line options.
59 htsw_authdata = api.make_auth_from_opts(opts, parser)
60 htswapi = api.HtswApi(opts.host, htsw_authdata)
63 model = get_model(opts.load_model, DBDIR)
# Optionally pre-load RDF statements from a file into the model.
65 if opts.load_rdf is not None:
66 ns_uri = submissionOntology[''].uri
67 load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
# Refresh from remote sources (presumably gated by --update; guard missing).
70 cookie = login(cookie=cookie)
71 load_my_submissions(model, cookie=cookie)
72 load_encode_libraries(model, htswapi)
74 if opts.sparql is not None:
75 sparql_query(model, opts.sparql)
77 if opts.find_submission_with_no_library:
78 missing = find_submissions_with_no_library(model)
# Dump the final model state (presumably gated by --print-rdf; guard missing).
81 serializer = get_serializer(name=opts.rdf_parser_name)
82 print serializer.serialize_model_to_string(model)
# Body of make_parser(): builds the OptionParser with three option groups.
# NOTE(review): the `def make_parser():` header and the final `return parser`
# are missing from this dump, as is the `action="store_true"` line (orig 108)
# for --find-submission-with-no-library, which otherwise would behave as a
# value-taking option.
86 parser = OptionParser()
# Commands: mutate or populate the model database.
87 commands = OptionGroup(parser, "Commands")
88 commands.add_option('--load-model', default=None,
89 help="Load model database")
90 commands.add_option('--load-rdf', default=None,
91 help="load rdf statements into model")
92 commands.add_option('--print-rdf', action="store_true", default=False,
93 help="print ending model state")
94 commands.add_option('--update', action="store_true", default=False,
95 help="Query remote data sources and update our database")
96 #commands.add_option('--update-ucsc-status', default=None,
97 # help="download status from ucsc, requires filename for extra rules")
98 #commands.add_option('--update-ddfs', action="store_true", default=False,
99 # help="download ddf information for known submission")
100 #commands.add_option('--update-library', default=None,
101 # help="download library info from htsw, requires filename for extra rules")
102 parser.add_option_group(commands)
# Queries: read-only reports against the model.
104 queries = OptionGroup(parser, "Queries")
105 queries.add_option('--sparql', default=None,
106 help="execute arbitrary sparql query")
107 queries.add_option('--find-submission-with-no-library', default=False,
109 help="find submissions with no library ID")
110 parser.add_option_group(queries)
# Options: parser/verbosity configuration.
112 options = OptionGroup(parser, "Options")
113 options.add_option("--rdf-parser-name", default="turtle",
114 help="set rdf file parser type")
115 options.add_option("-v", "--verbose", action="store_true", default=False)
116 options.add_option("--debug", action="store_true", default=False)
117 parser.add_option_group(options)
# Adds --host/--username/--password style auth options shared with other tools.
119 api.add_auth_options(parser)
# Scrape the UCSC "show_user" page and record each submission in the model:
# type, name, species, linked library, creation date, and current status.
# NOTE(review): several lines are missing from this dump, including the
# remaining arguments of the add_submission_to_library_urn call (orig
# 155-158, presumably subUrn, LibraryURN, library_id).
123 def load_my_submissions(model, cookie=None):
127 soup = get_url_as_soup(USER_URL, 'GET', cookie)
128 p = soup.find('table', attrs={'id':'projects'})
129 tr = p.findNext('tr')
130 # first record is header
# Predicate nodes reused for every row.
132 TypeN = rdfsNS['type']
133 NameN = submissionOntology['name']
134 SpeciesN = submissionOntology['species']
135 LibraryURN = submissionOntology['library_urn']
# Walk every <tr>; rows with fewer than 2 cells (e.g. header) are skipped.
137 while tr is not None:
138 td = tr.findAll('td')
139 if td is not None and len(td) > 1:
# First cell holds the submission id; build its view URL as the subject URI.
140 subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
141 subUrn = RDF.Uri(submission_view_url(subUrnText))
143 add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
# Column layout assumed from the scrape: 4=name, 2=species, 6=status,
# 8=last-modified — TODO confirm against the live page.
145 name = get_contents(td[4])
146 add_stmt(model, subUrn, NameN, name)
148 species = get_contents(td[2])
149 if species is not None:
150 add_stmt(model, subUrn, SpeciesN, species)
# Try to recover a library id from the submission name.
152 library_id = get_library_id(name)
153 if library_id is not None:
154 add_submission_to_library_urn(model,
159 add_submission_creation_date(model, subUrn, cookie)
161 # grab changing atttributes
162 status = get_contents(td[6]).strip()
163 last_mod_datetime = get_date_contents(td[8])
164 last_mod = last_mod_datetime.isoformat()
166 update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
168 logging.info("Processed {0}".format( subUrn))
170 tr = tr.findNext('tr')
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to a woldlab library if needed.

    Builds the library URI from ``library_id`` and asserts
    (submissionUrn, predicate, libraryUrn) unless the model already
    contains that statement, in which case the existing link is only
    logged at debug level.
    """
    libraryUrn = libraryNS[library_id + '/']
    # The original constructed the identical RDF.Statement twice ("query"
    # and "link"); one construction serves both the lookup and the insert.
    link = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(link):
        logger.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        # NOTE(review): the dump omits the line between the add and this
        # debug call; an `else:` is assumed so existing links are only
        # logged, never re-added.
        logger.debug("Found: {0}".format(str(link)))
# Report submissions that have a name but no library_urn statement, printed
# as Turtle-ish stub triples the operator can fill in by hand.
# NOTE(review): the SPARQL SELECT/WHERE opening lines (orig 189-192) and the
# loop over `results` binding `name`/`subid` (orig 200-202) are missing from
# this dump.  Python 2 print statements.
186 def find_submissions_with_no_library(model):
187 missing_lib_query = RDF.SPARQLQuery("""
188 PREFIX submissionOntology:<{submissionOntology}>
193 ?subid submissionOntology:name ?name
# OPTIONAL + !bound is the classic SPARQL negation-as-failure idiom.
194 OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
195 FILTER (!bound(?libid))
196 }}""".format(submissionOntology=submissionOntology[''].uri)
199 results = missing_lib_query.execute(model)
# Emit a commented stub per hit; the library path is left for a human to fill.
203 print "# {0}".format(name)
204 print "<{0}>".format(subid.uri)
205 print " encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
# Record a submission's creation date in the model, fetching the submission
# page only when no date statement exists yet (creation date never changes,
# so one fetch per submission suffices).
# NOTE(review): orig line 219 (between the find and its use) and orig 224
# (presumably `else:` before the final debug) are missing from this dump.
209 def add_submission_creation_date(model, subUrn, cookie):
210 # in theory the submission page might have more information on it.
211 creationDateN = libraryOntology['date']
212 dateTimeType = xsdNS['dateTime']
# Wildcard object: any existing creation-date statement counts.
213 query = RDF.Statement(subUrn, creationDateN, None)
214 creation_dates = list(model.find_statements(query))
215 if len(creation_dates) == 0:
216 logger.info("Getting creation date for: {0}".format(str(subUrn)))
217 soup = get_url_as_soup(str(subUrn), 'GET', cookie)
# The date text follows the "Created: " label on the page.
218 created_label = soup.find(text="Created: ")
220 created_date = get_date_contents(created_label.next)
# Store as an xsd:dateTime typed literal.
221 created_date_node = RDF.Node(literal=created_date.isoformat(),
222 datatype=dateTimeType.uri)
223 add_stmt(model, subUrn, creationDateN, created_date_node)
225 logger.debug("Found creation date for: {0}".format(str(subUrn)))
# Maintain the status blank node attached to a submission: create it on first
# sight, and re-download the DDF when the stored last-modified date matches
# `recent_update` (i.e. the status snapshot corresponds to the current page).
# NOTE(review): orig lines 231, 234 (presumably blank) and 244 (presumably
# `else:` before the "Found ... status blanks" branch) are missing from this
# dump, so the exact branch structure is inferred.
227 def update_submission_detail(model, subUrn, status, recent_update, cookie):
228 HasStatusN = submissionOntology['has_status']
229 StatusN = submissionOntology['status']
230 LastModifyN = submissionOntology['last_modify_date']
# Wildcard object: find any existing status node for this submission.
232 status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
233 status_nodes = list(model.find_statements(status_nodes_query))
235 if len(status_nodes) == 0:
236 # has no status node, add one
237 logging.info("Adding status node to {0}".format(subUrn))
# Blank node carries type, status text, and last-modified date.
238 status_blank = RDF.Node()
239 add_stmt(model, subUrn, HasStatusN, status_blank)
240 add_stmt(model, status_blank, rdfsNS['type'], StatusN)
241 add_stmt(model, status_blank, StatusN, status)
242 add_stmt(model, status_blank, LastModifyN, recent_update)
243 update_ddf(model, subUrn, status_blank, cookie=cookie)
245 logging.info("Found {0} status blanks".format(len(status_nodes)))
246 for status_statement in status_nodes:
247 status_blank = status_statement.object
# Only refresh the DDF for the blank whose stored date matches the page.
248 last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
249 last_mod_nodes = model.find_statements(last_modified_query)
250 for last_mod_statement in last_mod_nodes:
251 last_mod_date = str(last_mod_statement.object)
252 if recent_update == str(last_mod_date):
253 update_ddf(model, subUrn, status_blank, cookie=cookie)
# Download and ingest a submission's DDF (data definition file) once per
# status node: the statusNode is tagged with the ddf namespace type after a
# successful ingest, so contains_statement acts as a "already done" marker.
# NOTE(review): orig lines 260 and 263 are missing — presumably blank lines.
258 def update_ddf(model, subUrn, statusNode, cookie):
259 TypeN = rdfsNS['type']
# The show URL and the DDF download URL differ only in the path verb.
261 download_ddf_url = str(subUrn).replace('show', 'download_ddf')
262 ddfUrn = RDF.Uri(download_ddf_url)
264 status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
265 if not model.contains_statement(status_is_ddf):
266 logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
267 ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
268 add_ddf_statements(model, statusNode, ddf_text)
# Mark as ingested only after the statements were added.
269 model.add_statement(status_is_ddf)
# Convert a tab-separated DDF text file into RDF statements hanging off
# statusNode: one file node per filename in the first column, with the
# remaining header columns attached as predicates.
# NOTE(review): this dump is missing the docstring closer (orig 274), the
# `continue` bodies for the blank-line and comment-line guards (orig 284,
# 286-287), and the `for f in files:` loop header (orig ~292) that must
# enclose the fileNode block below.
272 def add_ddf_statements(model, statusNode, ddf_string):
273 """Convert a ddf text file into RDF Statements
275 ddf_lines = ddf_string.split('\n')
276 # first line is header
277 header = ddf_lines[0].split()
# One ddfNS predicate per header column.
278 attributes = [ ddfNS[x] for x in header ]
281 for ddf_line in ddf_lines[1:]:
282 ddf_line = ddf_line.strip()
# Skip blank lines and comment lines (bodies missing from dump).
283 if len(ddf_line) == 0:
285 if ddf_line.startswith("#"):
# Column 0 is a comma-separated list of filenames; the rest are attributes.
288 ddf_record = ddf_line.split('\t')
289 files = ddf_record[0].split(',')
290 file_attributes = ddf_record[1:]
# (Inside the missing `for f in files:` loop.)
293 fileNode = RDF.Node()
294 add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
295 add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
296 add_stmt(model, fileNode, ddfNS['filename'], f)
# Pair remaining header predicates with this row's attribute values.
298 for predicate, object in zip( attributes[1:], file_attributes):
299 add_stmt(model, fileNode, predicate, object)
# Load ENCODE-affiliated libraries into the model: parse the two affiliation
# filter pages as RDFa, then load per-library detail for every library_id
# statement that landed in the model.
# NOTE(review): the docstring closer (orig 304) and lines 307-308 are missing
# from this dump.
302 def load_encode_libraries(model, htswapi):
303 """Get libraries associated with encode.
# Affiliation ids 44 and 80 select the ENCODE groups on the htsw server.
305 encodeFilters = ["/library/?affiliations__id__exact=44",
306 "/library/?affiliations__id__exact=80",]
309 encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
310 rdfaParser = RDF.Parser(name='rdfa')
311 for encodeUrl in encodeUrls:
312 logger.info("Scanning library url {0}".format(encodeUrl))
313 rdfaParser.parse_into_model(model, encodeUrl)
# Every subject with a library_id statement is a library worth detailing.
314 query = RDF.Statement(None, libraryOntology['library_id'], None)
315 libraries = model.find_statements(query)
316 for statement in libraries:
317 libraryUrn = statement.subject
318 logger.info("Scanning {0}".format(str(libraryUrn)))
319 load_library_detail(model, libraryUrn)
# Fetch a library's detail page (as RDFa) into the model, but only when the
# model has no date statement for it yet — one date record means the library
# was already loaded.
# NOTE(review): the docstring closer (orig 324) and orig 334 (presumably
# `else:` before the warning) are missing from this dump.
322 def load_library_detail(model, libraryUrn):
323 """Grab detail information from library page
325 rdfaParser = RDF.Parser(name='rdfa')
# Use the library's date statement as the "already loaded" marker.
326 query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
327 results = list(model.find_statements(query))
328 logger.debug("Found {0} statements for {1}".format(len(results), libraryUrn))
329 if len(results) == 0:
330 logger.info("Loading {0}".format(str(libraryUrn)))
331 rdfaParser.parse_into_model(model, libraryUrn.uri)
332 elif len(results) == 1:
333 pass # Assuming that a loaded dataset has one record
# More than one date is unexpected; warn but continue.
335 logging.warning("Many dates for {0}".format(libraryUrn))
def get_library_id(name):
    """Guess a library ID embedded in a library name.

    An ID is either five consecutive digits or ``SL`` followed by four
    digits, and must appear at the start of the name or immediately after
    a space or a dash.

    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
    '11039'
    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
    '10150'

    Returns the matched ID string, or None when no ID is found.
    (NOTE(review): the fall-through return was missing from the truncated
    dump and is restored here.)
    """
    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id
def get_contents(element):
    """Return the text of a BeautifulSoup element, or None when empty.

    Prefers the text of a nested ``<a>`` tag when one exists.  The value
    is returned as a byte string encoded with the module-level CHARSET.

    NOTE(review): reconstructed around lines missing from the truncated
    dump — the early ``return None`` and the ``if a is not None`` guard
    are assumed; confirm against the original file.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def get_date_contents(element):
    """Parse an element's text as a ``YYYY-MM-DD HH:MM`` datetime.

    Returns a datetime, or None when the element has no text.
    (NOTE(review): the empty-data guard was missing from the truncated
    dump and is restored here.)
    """
    data = get_contents(element)
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    return None
def add_stmt(model, subject, predicate, object):
    """Convenience: wrap the triple in an RDF.Statement and add it to model.

    Returns whatever ``model.add_statement`` returns.
    (The parameter name ``object`` shadows the builtin; kept unchanged for
    backward compatibility with keyword callers.  The closing parenthesis
    of the call was missing from the truncated dump and is restored.)
    """
    return model.add_statement(
        RDF.Statement(subject, predicate, object))
# Log in to the UCSC submission pipeline and return a session cookie; a
# caller-supplied cookie short-circuits the whole exchange.  The password
# comes from the system keyring for the module-level USERNAME (constant not
# visible in this dump).
# NOTE(review): lines are missing from this dump — the early return of the
# passed-in cookie (orig 385), the method/headers arguments of the request
# (orig 394-395), the status-format argument (orig 398-399), the response
# status check before raising (orig 401), and the final return of the cookie.
381 def login(cookie=None):
382 """Login if we don't have a cookie
384 if cookie is not None:
# Fetch the stored password for USERNAME from the OS keyring.
387 keys = keyring.get_keyring()
388 password = keys.get_password(LOGIN_URL, USERNAME)
389 credentials = {'login': USERNAME,
390 'password': password}
391 headers = {'Content-type': 'application/x-www-form-urlencoded'}
392 http = httplib2.Http()
393 response, content = http.request(LOGIN_URL,
396 body=urllib.urlencode(credentials))
397 logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
# The pipeline marks a successful login by setting a session cookie.
400 cookie = response.get('set-cookie', None)
402 raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
# Fetch a URL (optionally with a session cookie) and return the body parsed
# as a BeautifulSoup document; non-200 responses become an
# HttpLib2ErrorWithResponse.
# NOTE(review): lines missing from this dump — the headers dict
# initialization (orig 408), the `return soup` / `else:` lines (orig
# 416-418), and the final `raise e` (orig 422).
406 def get_url_as_soup(url, method, cookie=None):
407 http = httplib2.Http()
409 if cookie is not None:
410 headers['Cookie'] = cookie
411 response, content = http.request(url, method, headers=headers)
412 if response['status'] == '200':
413 soup = BeautifulSoup(content,
414 fromEncoding="utf-8", # should read from header
415 convertEntities=BeautifulSoup.HTML_ENTITIES
# Non-200: build a descriptive error with the response attached.
419 msg = "error accessing {0}, status {1}"
420 msg = msg.format(url, response['status'])
421 e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# Fetch a URL (optionally with a session cookie) and return the raw response
# body; non-200 responses become an HttpLib2ErrorWithResponse.
# NOTE(review): lines missing from this dump — the headers dict
# initialization (orig 425), the `return content` / `else:` lines (orig
# 430-431), and the final `raise e` (orig 435).
423 def get_url_as_text(url, method, cookie=None):
424 http = httplib2.Http()
426 if cookie is not None:
427 headers['Cookie'] = cookie
428 response, content = http.request(url, method, headers=headers)
429 if response['status'] == '200':
# Non-200: build a descriptive error with the response attached.
432 msg = "error accessing {0}, status {1}"
433 msg = msg.format(url, response['status'])
434 e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
# Manually curated (submission name, library id) pairs for submissions whose
# UCSC name does not embed a library ID that get_library_id can recover.
# NOTE(review): the closing bracket was missing from the truncated dump; the
# list is closed here after the last visible entry — additional entries may
# have existed in the original file.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
]
def select_by_library_id(submission_list):
    """Group submissions by library_id, most recently dated first.

    Submissions with a falsy ``library_id`` are dropped.  Returns a dict
    mapping library_id -> list of submission objects sorted by their
    ``date`` attribute in descending order.

    NOTE(review): the dict initialization and the final return were
    missing from the truncated dump and are restored here; the
    intermediate (library_id, submission) tuple list was folded into the
    grouping loop.
    """
    libraries = {}
    for submission in submission_list:
        if submission.library_id:
            libraries.setdefault(submission.library_id, []).append(submission)

    # Newest submission first within each library.
    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries
# Render the selected libraries as an HTML table: one row per library, one
# column per ENCODE freeze, each cell listing that freeze's submissions with
# their status.
# NOTE(review): lines missing from this dump — the <head>/<style> wrapper
# (orig 471-479), the `for f in freezes:` loop over header cells (orig 482),
# the `batched = {}` init (orig 493), the `for d in freezes:` loop with its
# membership test around the cell block (orig 498, 505), and the final
# closing of some rows.  Structure below is therefore partial.
467 def library_to_freeze(selected_libraries):
468 freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
469 lib_ids = sorted(selected_libraries.keys())
470 report = ['<html><table border="1">']
473 <style type="text/css">
474 td {border-width:0 0 1px 1px; border-style:solid;}
# Header row: Library ID, Name, then one column per freeze.
480 report.append('<thead>')
481 report.append('<tr><td>Library ID</td><td>Name</td>')
483 report.append('<td>{0}</td>'.format(f))
484 report.append('</tr>')
485 report.append('</thead>')
486 report.append('<tbody>')
487 for lib_id in lib_ids:
488 report.append('<tr>')
# Link the library id back to its jumpgate page.
489 lib_url = libraryNS[lib_id].uri
490 report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
491 submissions = selected_libraries[lib_id]
# Submissions are sorted newest-first, so index 0 is the current name.
492 report.append('<td>{0}</td>'.format(submissions[0].name))
# Bucket this library's submissions by the freeze they fall into.
494 for sub in submissions:
495 date = date_to_freeze(sub.date)
496 batched.setdefault(date, []).append(sub)
# Leftover debugging print (Python 2).
497 print lib_id, batched
499 report.append('<td>')
500 for s in batched.get(d, []):
501 show_url = submission_view_url(s.subid)
502 subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
503 report.append("{0}:{1}".format(subid, s.status))
504 report.append('</td>')
506 report.append('<td></td>')
507 report.append("</tr>")
508 report.append('</tbody>')
509 report.append("</table></html>")
510 return "\n".join(report)
def date_to_freeze(d):
    """Map a datetime to the name of the ENCODE freeze it falls in.

    Returns the name of the first freeze whose end date is after ``d``,
    or None when ``d`` is later than every known freeze.
    (NOTE(review): the comparison body and fall-through return were
    missing from the truncated dump and are restored here.)
    """
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None
# Script entry point.  NOTE(review): the dump truncates here — the guarded
# call (presumably `main()`) is missing.
524 if __name__ == "__main__":