Starting component to try and figure out what we've submitted.
authorDiane Trout <diane@caltech.edu>
Tue, 15 Feb 2011 07:28:58 +0000 (23:28 -0800)
committerDiane Trout <diane@caltech.edu>
Tue, 15 Feb 2011 07:28:58 +0000 (23:28 -0800)
extra/ucsc_encode_submission/encode_find.py [new file with mode: 0644]

diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
new file mode 100644 (file)
index 0000000..7c614dc
--- /dev/null
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+
+from BeautifulSoup import BeautifulSoup
+from datetime import datetime
+import httplib2
+from operator import attrgetter
+from optparse import OptionParser
+# python keyring
+import keyring
+import logging
+import os
+import re
+# redland rdf lib
+import RDF 
+import sys
+import urllib
+
+libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
+submitNS = RDF.NS("http://jumpgate.caltech.edu/wiki/EncodeSubmit#")
+dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
+rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
+
+LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
+USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
+DETAIL_URL = 'http://encodesubmit.ucsc.edu/pipeline/show/{0}'
+LIBRARY_URL = 'http://jumpgate.caltech.edu/library/{0}'
+USERNAME = 'detrout'
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    cookie = login()
+    if cookie is None:
+        print "Failed to login"
+
+    submissions = my_submissions(cookie)
+    for s in submissions:
+        for t in s.triples():
+            print t
+            
+def make_parser():
+    parser = OptionParser()
+    return parser
+
+
+def login():
+    keys = keyring.get_keyring()
+    password = keys.get_password(LOGIN_URL, USERNAME)
+    credentials = {'login': USERNAME,
+                   'password': password}
+    headers = {'Content-type': 'application/x-www-form-urlencoded'}
+    http = httplib2.Http()
+    response, content = http.request(LOGIN_URL,
+                                     'POST',
+                                     headers=headers,
+                                     body=urllib.urlencode(credentials))
+    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
+                                                    response['status']))
+    
+    cookie = response.get('set-cookie', None)
+    return cookie
+
+def my_submissions(cookie):
+    soup = get_url_as_soup(USER_URL, 'GET', cookie)
+    p = soup.find('table', attrs={'id':'projects'})
+    tr = p.findNext('tr')
+    # first record is header
+    tr = tr.findNext()
+    submissions = []
+    while tr is not None:
+        td = tr.findAll('td')
+        if td is not None and len(td) > 1:
+            subid = td[0].contents[0].contents[0]
+            species = get_contents(td[2])
+            name = get_contents(td[4])
+            status = get_contents(td[6]).strip()
+            date = get_date_contents(td[8])
+            age = get_contents(td[10])
+            submissions.append(
+                Submission(subid, species, name, status, date, age, cookie)
+            )
+        tr = tr.findNext('tr')
+    return submissions
+
+def get_contents(element):
+    """Return contents or none.
+    """
+    if len(element.contents) == 0:
+        return None
+
+    a = element.find('a')
+    if a is not None:
+        return a.contents[0]
+
+    return element.contents[0]
+
+def get_date_contents(element):
+    data = get_contents(element)
+    if data:
+        return datetime.strptime(data, "%Y-%m-%d %H:%M")
+    else:
+        return None
+
+SUBMISSIONS_LACKING_LIBID = [
+    ('1x75-Directional-HeLa-Rep1',    '11208'),
+    ('1x75-Directional-HeLa-Rep2',    '11207'),
+    ('1x75-Directional-HepG2-Rep1',   '11210'),
+    ('1x75-Directional-HepG2-Rep2',   '11209'),
+    ('1x75-Directional-H1-hESC-Rep1', '10947'),
+    ('1x75-Directional-H1-hESC-Rep2', '11009'),
+    ('1x75-Directional-HUVEC-Rep1',   '11206'),
+    ('1x75-Directional-HUVEC-Rep2',   '11205'),
+    ('1x75-Directional-K562-Rep1',    '11008'),
+    ('1x75-Directional-K562-Rep2',    '11007'),
+    ('1x75-Directional-NHEK-Rep1',    '11204'),
+    ]
+
+class Submission(object):
+    def __init__(self, subid, species, name, status, date, age, cookie=None):
+        self.cookie = cookie
+        self.subid = subid
+        self.species = species
+        self.name = name
+        self.status = status
+        self.date = date
+        self.age = age
+        self._library_id = None
+        self._created_date = None
+
+    def triples(self):
+        subNode = submissionNS[self.subid.encode('utf-8')]
+        dateNode = self.date.strftime("%Y-%m-%d")
+        s = [RDF.Statement(subNode, submitNS['name'],
+                           self.name.encode('utf-8')),
+             RDF.Statement(subNode, submitNS['status'],
+                           self.status.encode('utf-8')),
+             RDF.Statement(subNode, submitNS['last_modify_date'], dateNode),
+             ]
+        if self.species is not None:
+            s.append(RDF.Statement(subNode, submitNS['species'],
+                                   self.species.encode('utf-8')))
+        if self.library_id is not None:
+             libId = libraryNS[self.library_id.encode('utf-8')]
+             s.append(RDF.Statement(subNode, rdfsNS['seeAlso'], libId))
+        
+        return s
+        
+
+    def _get_library_id(self):
+        if self._library_id is None:
+            match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", self.name)
+            if match is not None:
+                self._library_id = match.group('id')
+            else:
+                for dir_lib_name, lib_id in SUBMISSIONS_LACKING_LIBID:
+                    if dir_lib_name in self.name:
+                        self._library_id = lib_id
+                        break
+            
+        return self._library_id
+    
+    library_id = property(_get_library_id)
+
+    def _get_detail(self):
+        detail = DETAIL_URL.format(self.subid)
+        soup = get_url_as_soup(detail, 'GET', self.cookie)
+
+        created_label = soup.find(text="Created: ")
+        if created_label:
+            self._created_date = get_date_contents(created_label.next)
+            
+    def _get_created_date(self):
+        if self._created_date is None:
+            self._get_detail()
+        return self._created_date
+    created_date = property(_get_created_date)
+    
+    def __unicode__(self):
+        return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name)
+
+    def __repr__(self):
+        return u"<Submission ({0}) '{1}'>".format(self.subid, self.name)
+
+
+def select_by_library_id(submission_list):
+    subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
+    libraries = {}
+    for lib_id, subobj in subl:
+        libraries.setdefault(lib_id, []).append(subobj)
+
+    for submission in libraries.values():
+        submission.sort(key=attrgetter('date'), reverse=True)
+        
+    return libraries
+
+def library_to_freeze(selected_libraries):
+    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
+    lib_ids = sorted(selected_libraries.keys())
+    report = ['<html><table border="1">']
+    report = ["""<html>
+<head>
+<style type="text/css">
+ td {border-width:0 0 1px 1px; border-style:solid;}
+</style>
+</head>
+<body>
+<table>
+"""]
+    report.append('<thead>')
+    report.append('<tr><td>Library ID</td><td>Name</td>')
+    for f in freezes:
+        report.append('<td>{0}</td>'.format(f))
+    report.append('</tr>')
+    report.append('</thead>')
+    report.append('<tbody>')
+    for lib_id in lib_ids:
+        report.append('<tr>')
+        lib_url = LIBRARY_URL.format(lib_id)
+        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
+        submissions = selected_libraries[lib_id]
+        report.append('<td>{0}</td>'.format(submissions[0].name))
+        batched = {}
+        for sub in submissions:
+            date = date_to_freeze(sub.date)
+            batched.setdefault(date, []).append(sub)
+        print lib_id, batched
+        for d in freezes:
+            report.append('<td>')
+            for s in batched.get(d, []):
+                subid = '<a href="http://encodesubmit.ucsc.edu/pipeline/show/{0}">{0}</a>'.format(s.subid)
+                report.append("{0}:{1}".format(subid, s.status))
+            report.append('</td>')
+        else:
+            report.append('<td></td>')
+        report.append("</tr>")
+    report.append('</tbody>')
+    report.append("</table></html>")
+    return "\n".join(report)
+
+            
+def date_to_freeze(d):
+    freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
+                (datetime(2010, 7, 30), '2010-Jul'),
+                (datetime(2011, 1, 30), '2011-Jan'),
+                ]
+    for end, name in freezes:
+        if d < end:
+            return name
+    else:
+        return None
+    
+                
+def get_url_as_soup(url, method, cookie=None):
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        soup = BeautifulSoup(content,
+                             fromEncoding="utf-8", # should read from header
+                             convertEntities=BeautifulSoup.HTML_ENTITIES
+                             )
+        return soup
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+if __name__ == "__main__":
+    main()
+