From c0d258e82499bc9eb20845340e934f859b5268e5 Mon Sep 17 00:00:00 2001
From: Diane Trout
Date: Mon, 14 Feb 2011 23:28:58 -0800
Subject: [PATCH] Starting component to try and figure out what we've submitted.
---
 extra/ucsc_encode_submission/encode_find.py | 275 ++++++++++++++++++++
 1 file changed, 275 insertions(+)
 create mode 100644 extra/ucsc_encode_submission/encode_find.py

diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
new file mode 100644
index 0000000..7c614dc
--- /dev/null
+++ b/extra/ucsc_encode_submission/encode_find.py
@@ -0,0 +1,275 @@
#!/usr/bin/env python

from BeautifulSoup import BeautifulSoup
from datetime import datetime
import httplib2
from operator import attrgetter
from optparse import OptionParser
# python keyring
import keyring
import logging
import os
import re
# redland rdf lib
import RDF
import sys
import urllib

libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitNS = RDF.NS("http://jumpgate.caltech.edu/wiki/EncodeSubmit#")
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
DETAIL_URL = 'http://encodesubmit.ucsc.edu/pipeline/show/{0}'
LIBRARY_URL = 'http://jumpgate.caltech.edu/library/{0}'
USERNAME = 'detrout'


def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    cookie = login()
    if cookie is None:
        print "Failed to login"
        return 1

    submissions = my_submissions(cookie)
    for s in submissions:
        for t in s.triples():
            print t


def make_parser():
    parser = OptionParser()
    return parser


def login():
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    return cookie
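
# Illustrative usage, assuming a password for (LOGIN_URL, USERNAME) was
# stored ahead of time with keyring.set_password(LOGIN_URL, USERNAME, '...'):
#
#     cookie = login()
#     for s in my_submissions(cookie):
#         print unicode(s)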
+ """ + if len(element.contents) == 0: + return None + + a = element.find('a') + if a is not None: + return a.contents[0] + + return element.contents[0] + +def get_date_contents(element): + data = get_contents(element) + if data: + return datetime.strptime(data, "%Y-%m-%d %H:%M") + else: + return None + +SUBMISSIONS_LACKING_LIBID = [ + ('1x75-Directional-HeLa-Rep1', '11208'), + ('1x75-Directional-HeLa-Rep2', '11207'), + ('1x75-Directional-HepG2-Rep1', '11210'), + ('1x75-Directional-HepG2-Rep2', '11209'), + ('1x75-Directional-H1-hESC-Rep1', '10947'), + ('1x75-Directional-H1-hESC-Rep2', '11009'), + ('1x75-Directional-HUVEC-Rep1', '11206'), + ('1x75-Directional-HUVEC-Rep2', '11205'), + ('1x75-Directional-K562-Rep1', '11008'), + ('1x75-Directional-K562-Rep2', '11007'), + ('1x75-Directional-NHEK-Rep1', '11204'), + ] + +class Submission(object): + def __init__(self, subid, species, name, status, date, age, cookie=None): + self.cookie = cookie + self.subid = subid + self.species = species + self.name = name + self.status = status + self.date = date + self.age = age + self._library_id = None + self._created_date = None + + def triples(self): + subNode = submissionNS[self.subid.encode('utf-8')] + dateNode = self.date.strftime("%Y-%m-%d") + s = [RDF.Statement(subNode, submitNS['name'], + self.name.encode('utf-8')), + RDF.Statement(subNode, submitNS['status'], + self.status.encode('utf-8')), + RDF.Statement(subNode, submitNS['last_modify_date'], dateNode), + ] + if self.species is not None: + s.append(RDF.Statement(subNode, submitNS['species'], + self.species.encode('utf-8'))) + if self.library_id is not None: + libId = libraryNS[self.library_id.encode('utf-8')] + s.append(RDF.Statement(subNode, rdfsNS['seeAlso'], libId)) + + return s + + + def _get_library_id(self): + if self._library_id is None: + match = re.search(r"[ -](?P([\d]{5})|(SL[\d]{4}))", self.name) + if match is not None: + self._library_id = match.group('id') + else: + for dir_lib_name, lib_id in SUBMISSIONS_LACKING_LIBID: + if dir_lib_name in self.name: + self._library_id = lib_id + break + + return self._library_id + + library_id = property(_get_library_id) + + def _get_detail(self): + detail = DETAIL_URL.format(self.subid) + soup = get_url_as_soup(detail, 'GET', self.cookie) + + created_label = soup.find(text="Created: ") + if created_label: + self._created_date = get_date_contents(created_label.next) + + def _get_created_date(self): + if self._created_date is None: + self._get_detail() + return self._created_date + created_date = property(_get_created_date) + + def __unicode__(self): + return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name) + + def __repr__(self): + return u"".format(self.subid, self.name) + + +def select_by_library_id(submission_list): + subl = [ (x.library_id, x) for x in submission_list if x.library_id ] + libraries = {} + for lib_id, subobj in subl: + libraries.setdefault(lib_id, []).append(subobj) + + for submission in libraries.values(): + submission.sort(key=attrgetter('date'), reverse=True) + + return libraries + +def library_to_freeze(selected_libraries): + freezes = ['2010-Jan', '2010-Jul', '2011-Jan'] + lib_ids = sorted(selected_libraries.keys()) + report = [''] + report = [""" + + + + +
+"""] + report.append('') + report.append('') + for f in freezes: + report.append(''.format(f)) + report.append('') + report.append('') + report.append('') + for lib_id in lib_ids: + report.append('') + lib_url = LIBRARY_URL.format(lib_id) + report.append(''.format(lib_url, lib_id)) + submissions = selected_libraries[lib_id] + report.append(''.format(submissions[0].name)) + batched = {} + for sub in submissions: + date = date_to_freeze(sub.date) + batched.setdefault(date, []).append(sub) + print lib_id, batched + for d in freezes: + report.append('') + else: + report.append('') + report.append("") + report.append('') + report.append("

    def _get_detail(self):
        detail = DETAIL_URL.format(self.subid)
        soup = get_url_as_soup(detail, 'GET', self.cookie)

        created_label = soup.find(text="Created: ")
        if created_label:
            self._created_date = get_date_contents(created_label.next)

    def _get_created_date(self):
        if self._created_date is None:
            self._get_detail()
        return self._created_date
    created_date = property(_get_created_date)

    def __unicode__(self):
        return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name)

    def __repr__(self):
        return u"<Submission {0} {1}>".format(self.subid, self.name)


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ['']
    report = ["""
<html>
<head>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_URL.format(lib_id)
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        print lib_id, batched
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                subid = '<a href="{0}">{0}</a>'.format(s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        else:
            report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
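
# Illustrative way to wire the pieces together (main() currently only
# prints triples and does not build this report; the output file name
# here is made up):
#
#     libs = select_by_library_id(my_submissions(cookie))
#     open('freeze_report.html', 'w').write(library_to_freeze(libs))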

def date_to_freeze(d):
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    else:
        return None
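
# e.g. date_to_freeze(datetime(2010, 6, 1)) returns '2010-Jul'; a date
# after the last freeze, like datetime(2011, 6, 1), falls through to None.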
") + return "\n".join(report) + + +def date_to_freeze(d): + freezes = [ (datetime(2010, 1, 30), '2010-Jan'), + (datetime(2010, 7, 30), '2010-Jul'), + (datetime(2011, 1, 30), '2011-Jan'), + ] + for end, name in freezes: + if d < end: + return name + else: + return None + + +def get_url_as_soup(url, method, cookie=None): + http = httplib2.Http() + headers = {} + if cookie is not None: + headers['Cookie'] = cookie + response, content = http.request(url, method, headers=headers) + if response['status'] == '200': + soup = BeautifulSoup(content, + fromEncoding="utf-8", # should read from header + convertEntities=BeautifulSoup.HTML_ENTITIES + ) + return soup + else: + msg = "error accessing {0}, status {1}" + msg = msg.format(url, response['status']) + e = httplib2.HttpLib2ErrorWithResponse(msg, response, content) + +if __name__ == "__main__": + main() + -- 2.30.2