#!/usr/bin/env python
"""encode_find.py - starting component to try and figure out what
we've submitted to the UCSC ENCODE submission pipeline.
"""

from BeautifulSoup import BeautifulSoup
from datetime import datetime
import httplib2
from operator import attrgetter
from optparse import OptionParser
# python keyring
import keyring
import logging
import os
import re
# redland rdf lib
import RDF
import sys
import urllib

libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitNS = RDF.NS("http://jumpgate.caltech.edu/wiki/EncodeSubmit#")
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")

LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
DETAIL_URL = 'http://encodesubmit.ucsc.edu/pipeline/show/{0}'
LIBRARY_URL = 'http://jumpgate.caltech.edu/library/{0}'
USERNAME = 'detrout'
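# Note: login() below reads the pipeline password from the system keyring,
# so it has to be stored there once beforehand.  A minimal sketch (the
# literal password is a placeholder), run from an interactive session:
#
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 'your-pipeline-password')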

def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    cookie = login()
    if cookie is None:
        print "Failed to login"
        return 1

    submissions = my_submissions(cookie)
    for s in submissions:
        for t in s.triples():
            print t
    return 0


def make_parser():
    parser = OptionParser()
    return parser


def login():
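    """Log in to the ENCODE submission pipeline.

    Returns the session cookie string on success, or None if the
    login response carried no set-cookie header.
    """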
    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    cookie = response.get('set-cookie', None)
    return cookie


def my_submissions(cookie):
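    """Scrape the list of submissions from the pipeline's show_user page.

    Returns a list of Submission objects, one per data row of the
    'projects' table.
    """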
    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id': 'projects'})
    tr = p.findNext('tr')
    # first record is header, so advance to the first data row
    tr = tr.findNext('tr')
    submissions = []
    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            subid = td[0].contents[0].contents[0]
            species = get_contents(td[2])
            name = get_contents(td[4])
            status = get_contents(td[6]).strip()
            date = get_date_contents(td[8])
            age = get_contents(td[10])
            submissions.append(
                Submission(subid, species, name, status, date, age, cookie)
            )
        tr = tr.findNext('tr')
    return submissions

def get_contents(element):
    """Return the text contents of an element, or None if it is empty.

    If the element contains a link, return the link's text instead.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0]

    return element.contents[0]

def get_date_contents(element):
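    """Parse an element's text as a '%Y-%m-%d %H:%M' datetime, or None."""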
    data = get_contents(element)
    if data:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None

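# Hand-maintained table for submission names that don't embed a library id;
# Submission._get_library_id falls back to this when its regex finds nothing.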
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
]

class Submission(object):
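    """One row of the ENCODE submission pipeline's status table."""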
    def __init__(self, subid, species, name, status, date, age, cookie=None):
        self.cookie = cookie
        self.subid = subid
        self.species = species
        self.name = name
        self.status = status
        self.date = date
        self.age = age
        self._library_id = None
        self._created_date = None

    def triples(self):
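        """Return a list of RDF statements describing this submission."""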
        subNode = submissionNS[self.subid.encode('utf-8')]
        dateNode = self.date.strftime("%Y-%m-%d")
        s = [RDF.Statement(subNode, submitNS['name'],
                           self.name.encode('utf-8')),
             RDF.Statement(subNode, submitNS['status'],
                           self.status.encode('utf-8')),
             RDF.Statement(subNode, submitNS['last_modify_date'], dateNode),
             ]
        if self.species is not None:
            s.append(RDF.Statement(subNode, submitNS['species'],
                                   self.species.encode('utf-8')))
        if self.library_id is not None:
            libId = libraryNS[self.library_id.encode('utf-8')]
            s.append(RDF.Statement(subNode, rdfsNS['seeAlso'], libId))

        return s

    def _get_library_id(self):
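        """Guess the library id embedded in the submission name.

        Falls back to the SUBMISSIONS_LACKING_LIBID table when the
        name contains no recognizable id.
        """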
        if self._library_id is None:
            match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", self.name)
            if match is not None:
                self._library_id = match.group('id')
            else:
                for dir_lib_name, lib_id in SUBMISSIONS_LACKING_LIBID:
                    if dir_lib_name in self.name:
                        self._library_id = lib_id
                        break

        return self._library_id

    library_id = property(_get_library_id)

    def _get_detail(self):
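        """Fetch this submission's detail page and cache its created date."""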
        detail = DETAIL_URL.format(self.subid)
        soup = get_url_as_soup(detail, 'GET', self.cookie)

        created_label = soup.find(text="Created: ")
        if created_label:
            self._created_date = get_date_contents(created_label.next)

    def _get_created_date(self):
        if self._created_date is None:
            self._get_detail()
        return self._created_date
    created_date = property(_get_created_date)

    def __unicode__(self):
        return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name)

    def __repr__(self):
        # __repr__ must return a str (not unicode) under python 2
        return "<Submission ({0}) '{1}'>".format(self.subid, self.name)

def select_by_library_id(submission_list):
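    """Group a list of submissions by library id.

    Returns a dict mapping each library id to its submissions,
    sorted newest first.
    """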
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submissions in libraries.values():
        submissions.sort(key=attrgetter('date'), reverse=True)

    return libraries

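# A sketch of how the pieces are meant to chain into the freeze report
# (main() doesn't wire this up yet; cookie comes from login()):
#
#   submissions = my_submissions(cookie)
#   print library_to_freeze(select_by_library_id(submissions))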
def library_to_freeze(selected_libraries):
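    """Render the selected libraries as an HTML table with one column
    per ENCODE freeze.
    """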
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = LIBRARY_URL.format(lib_id)
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        # bucket this library's submissions by the freeze they fall in
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        logging.debug("{0} {1}".format(lib_id, batched))
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                subid = '<a href="{0}">{1}</a>'.format(
                    DETAIL_URL.format(s.subid), s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)

def date_to_freeze(d):
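    """Map a date to the name of the ENCODE freeze it falls in.

    Dates after the last freeze window map to None; e.g. a date of
    2010-03-01 falls in the '2010-Jul' freeze.
    """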
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None

def get_url_as_soup(url, method, cookie=None):
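    """Fetch url and return its body parsed by BeautifulSoup.

    Raises HttpLib2ErrorWithResponse on any status other than 200.
    """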
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8",  # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

if __name__ == "__main__":
    sys.exit(main())