#!/usr/bin/env python
"""encode_find.py - starting component to try and figure out what
we've submitted to the UCSC ENCODE submission pipeline.
"""

from BeautifulSoup import BeautifulSoup
from datetime import datetime
import httplib2
from operator import attrgetter
from optparse import OptionParser
# python keyring
import keyring
import logging
import os
import re
# redland rdf lib
import RDF
import sys
import urllib

libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitNS = RDF.NS("http://jumpgate.caltech.edu/wiki/EncodeSubmit#")
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
DETAIL_URL = 'http://encodesubmit.ucsc.edu/pipeline/show/{0}'
LIBRARY_URL = 'http://jumpgate.caltech.edu/library/{0}'
USERNAME = 'detrout'
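# Note: login() below reads the pipeline password from the system keyring,
# so it has to be stored there once beforehand.  A minimal sketch (the
# literal password is a placeholder), run from an interactive session:
#
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'your-pipeline-password')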

def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    cookie = login()
    if cookie is None:
        print "Failed to login"
        return 1

    submissions = my_submissions(cookie)
    for s in submissions:
        for t in s.triples():
            print t
    return 0


def make_parser():
    parser = OptionParser()
    return parser


def login():
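    """Log in to the ENCODE submission pipeline.

    Returns the session cookie string on success, or None if the
    login response carried no set-cookie header.
    """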
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    return cookie


def my_submissions(cookie):
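    """Scrape the list of submissions from the pipeline's show_user page.

    Returns a list of Submission objects, one per data row of the
    'projects' table.
    """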
    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id': 'projects'})
    tr = p.findNext('tr')
    # first record is header, so advance to the first data row
    tr = tr.findNext('tr')
    submissions = []
    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            subid = td[0].contents[0].contents[0]
            species = get_contents(td[2])
            name = get_contents(td[4])
            status = get_contents(td[6]).strip()
            date = get_date_contents(td[8])
            age = get_contents(td[10])
            submissions.append(
                Submission(subid, species, name, status, date, age, cookie)
            )
        tr = tr.findNext('tr')
    return submissions

def get_contents(element):
    """Return the text contents of an element, or None if it is empty.

    If the element contains a link, return the link's text instead.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0]

    return element.contents[0]

def get_date_contents(element):
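    """Parse an element's text as a '%Y-%m-%d %H:%M' datetime, or None."""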
    data = get_contents(element)
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None

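# Hand-maintained table for submission names that don't embed a library id;
# Submission._get_library_id falls back to this when its regex finds nothing.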
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
]

class Submission(object):
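    """One row of the ENCODE submission pipeline's status table."""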
    def __init__(self, subid, species, name, status, date, age, cookie=None):
        self.cookie = cookie
        self.subid = subid
        self.species = species
        self.name = name
        self.status = status
        self.date = date
        self.age = age
        self._library_id = None
        self._created_date = None

    def triples(self):
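        """Return a list of RDF statements describing this submission."""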
        subNode = submissionNS[self.subid.encode('utf-8')]
        dateNode = self.date.strftime("%Y-%m-%d")
        s = [RDF.Statement(subNode, submitNS['name'],
                           self.name.encode('utf-8')),
             RDF.Statement(subNode, submitNS['status'],
                           self.status.encode('utf-8')),
             RDF.Statement(subNode, submitNS['last_modify_date'], dateNode),
             ]
        if self.species is not None:
            s.append(RDF.Statement(subNode, submitNS['species'],
                                   self.species.encode('utf-8')))
        if self.library_id is not None:
            libId = libraryNS[self.library_id.encode('utf-8')]
            s.append(RDF.Statement(subNode, rdfsNS['seeAlso'], libId))

        return s

    def _get_library_id(self):
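        """Guess the library id embedded in the submission name.

        Falls back to the SUBMISSIONS_LACKING_LIBID table when the
        name contains no recognizable id.
        """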
        if self._library_id is None:
            match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", self.name)
            if match is not None:
                self._library_id = match.group('id')
            else:
                for dir_lib_name, lib_id in SUBMISSIONS_LACKING_LIBID:
                    if dir_lib_name in self.name:
                        self._library_id = lib_id
                        break

        return self._library_id

    library_id = property(_get_library_id)

    def _get_detail(self):
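        """Fetch this submission's detail page and cache its created date."""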
        detail = DETAIL_URL.format(self.subid)
        soup = get_url_as_soup(detail, 'GET', self.cookie)

        created_label = soup.find(text="Created: ")
        if created_label:
            self._created_date = get_date_contents(created_label.next)

    def _get_created_date(self):
        if self._created_date is None:
            self._get_detail()
        return self._created_date
    created_date = property(_get_created_date)

    def __unicode__(self):
        return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name)

    def __repr__(self):
        # __repr__ must return a str (not unicode) under python 2
        return "<Submission ({0}) '{1}'>".format(self.subid, self.name)

def select_by_library_id(submission_list):
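    """Group a list of submissions by library id.

    Returns a dict mapping each library id to its submissions,
    sorted newest first.
    """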
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries

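# A sketch of how the pieces are meant to chain into the freeze report
# (main() doesn't wire this up yet; cookie comes from login()):
#
#   submissions = my_submissions(cookie)
#   print library_to_freeze(select_by_library_id(submissions))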
def library_to_freeze(selected_libraries):
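    """Render the selected libraries as an HTML table with one column
    per ENCODE freeze.
    """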
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_URL.format(lib_id)
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket this library's submissions by the freeze they fall in
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        logging.debug("{0} {1}".format(lib_id, batched))
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                subid = '<a href="{0}">{1}</a>'.format(
                    DETAIL_URL.format(s.subid), s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)

def date_to_freeze(d):
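    """Map a date to the name of the ENCODE freeze it falls in.

    Dates after the last freeze window map to None; e.g. a date of
    2010-03-01 falls in the '2010-Jul' freeze.
    """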
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None

def get_url_as_soup(url, method, cookie=None):
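    """Fetch url and return its body parsed by BeautifulSoup.

    Raises HttpLib2ErrorWithResponse on any status other than 200.
    """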
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8",  # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

if __name__ == "__main__":
    sys.exit(main())