Further clean up ddf generation.
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
1 #!/usr/bin/env python
2
3 from BeautifulSoup import BeautifulSoup
4 from datetime import datetime
5 import httplib2
6 from operator import attrgetter
7 from optparse import OptionParser, OptionGroup
8 # python keyring
9 import keyring
10 import logging
11 import os
12 import re
13 # redland rdf lib
14 import RDF 
15 import sys
16 import urllib
17 import urlparse
18
19 from htsworkflow.util import api
20 from htsworkflow.util.rdfhelp import \
21      dublinCoreNS, \
22      get_model, \
23      get_serializer, \
24      sparql_query, \
25      submitOntology, \
26      libraryOntology, \
27      load_into_model, \
28      rdfNS, \
29      rdfsNS, \
30      xsdNS
31
# URL mappings
# Namespace used to build woldlab library URIs from a library id.
libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")


from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
# Namespace for ddf attributes: the UCSC pipeline URL joined with
# "download_ddf#", so ddf column names become fragment identifiers.
download_ddf = urlparse.urljoin(UCSCEncodePipeline, "download_ddf#", allow_fragments=True)
ddfNS = RDF.NS(download_ddf)

# Directory handed to get_model() in main().
DBDIR = os.path.expanduser("~diane/proj/submission")

logger = logging.getLogger("encode_find")

# Pages on the UCSC submission site: the login form target and the
# per-user submission list scraped by load_my_submissions().
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'

# Account name used for the keyring lookup and the login form.
USERNAME = 'detrout'
# Encoding applied to text scraped from the HTML pages.
CHARSET = 'utf-8'
49
def main(cmdline=None):
    """Command line entry point: parse options and run the requested steps.

    cmdline is an optional list of arguments handed to the option parser
    (optparse falls back to sys.argv when it is None).

    Steps run in a fixed order, each only when its option was given:
    load rdf from a file, update from the remote sources, run a sparql
    query, report submissions without a library, print the model.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.verbose:
        logging.basicConfig(level=logging.INFO)

    auth = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, auth)

    model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
        load_into_model(model,
                        opts.rdf_parser_name,
                        opts.load_rdf,
                        submitOntology[''].uri)

    if opts.update:
        # no cookie yet, so login() performs a real login
        session_cookie = login(cookie=None)
        load_my_submissions(model, cookie=session_cookie)
        load_encode_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = get_serializer(name=opts.rdf_parser_name)
        print(serializer.serialize_model_to_string(model))
81
82
def make_parser():
    """Build the optparse parser for this script.

    Options are organised in three groups (Commands, Queries, Options);
    api.add_auth_options() then adds its own options to the parser.
    Returns the configured OptionParser.
    """
    parser = OptionParser()

    command_group = OptionGroup(parser, "Commands")
    command_group.add_option(
        '--load-model', default=None,
        help="Load model database")
    command_group.add_option(
        '--load-rdf', default=None,
        help="load rdf statements into model")
    command_group.add_option(
        '--print-rdf', action="store_true", default=False,
        help="print ending model state")
    command_group.add_option(
        '--update', action="store_true", default=False,
        help="Query remote data sources and update our database")
    # Disabled commands, kept for reference:
    #command_group.add_option('--update-ucsc-status', default=None,
    #  help="download status from ucsc, requires filename for extra rules")
    #command_group.add_option('--update-ddfs', action="store_true", default=False,
    #  help="download ddf information for known submission")
    #command_group.add_option('--update-library', default=None,
    #  help="download library info from htsw, requires filename for extra rules")
    parser.add_option_group(command_group)

    query_group = OptionGroup(parser, "Queries")
    query_group.add_option(
        '--sparql', default=None,
        help="execute arbitrary sparql query")
    query_group.add_option(
        '--find-submission-with-no-library', default=False,
        action="store_true",
        help="find submissions with no library ID")
    parser.add_option_group(query_group)

    option_group = OptionGroup(parser, "Options")
    option_group.add_option(
        "--rdf-parser-name", default="turtle",
        help="set rdf file parser type")
    option_group.add_option(
        "-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(option_group)

    api.add_auth_options(parser)

    return parser
119
def load_my_submissions(model, cookie=None):
    """Scrape the UCSC user page and record every submission in model.

    model  -- RDF model the statements are added to
    cookie -- session cookie for the UCSC site; when None, login() is
              called to obtain one

    For each row of the "projects" table this adds the submission type,
    name and species, links the submission to a library when an id can be
    guessed from the name, records the creation date and refreshes the
    status detail.

    Raises RuntimeError when the projects table cannot be found.
    """
    if cookie is None:
        cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    projects = None
    if soup is not None:
        projects = soup.find('table', attrs={'id': 'projects'})
    if projects is None:
        raise RuntimeError(
            "Unable to find the projects table at {0}".format(USER_URL))

    # NOTE(review): rdfsNS is presumably the RDF Schema namespace, while
    # "type" is normally an rdf: term (rdfNS is imported as well).  Left
    # unchanged because other functions in this file use the same predicate.
    TypeN = rdfsNS['type']
    NameN = submitOntology['name']
    SpeciesN = submitOntology['species']
    LibraryURN = submitOntology['library_urn']

    row = projects.findNext('tr')
    if row is not None:
        # first record is header
        row = row.findNext()

    while row is not None:
        cells = row.findAll('td')
        # Cells up to index 8 are read below, so shorter rows are skipped
        # instead of failing with an IndexError.
        if cells is not None and len(cells) > 8:
            subUrnText = cells[0].contents[0].contents[0].encode(CHARSET)
            subUrn = RDF.Uri(submission_view_url(subUrnText))

            add_stmt(model, subUrn, TypeN, submitOntology['Submission'])

            name = get_contents(cells[4])
            if name is not None:
                add_stmt(model, subUrn, NameN, name)

            species = get_contents(cells[2])
            if species is not None:
                add_stmt(model, subUrn, SpeciesN, species)

            # the library id can only be guessed when there is a name
            library_id = None
            if name is not None:
                library_id = get_library_id(name)
            if library_id is not None:
                add_submission_to_library_urn(model,
                                              subUrn,
                                              LibraryURN,
                                              library_id)

            add_submission_creation_date(model, subUrn, cookie)

            # grab changing attributes
            status = get_contents(cells[6])
            last_mod_datetime = get_date_contents(cells[8])
            if status is None or last_mod_datetime is None:
                logger.warning(
                    "No status or modification date for {0}".format(subUrn))
            else:
                update_submission_detail(model,
                                         subUrn,
                                         status.strip(),
                                         last_mod_datetime.isoformat(),
                                         cookie=cookie)

            logger.info("Processed {0}".format(subUrn))

        row = row.findNext('tr')
168
169
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Link a UCSC submission to its woldlab library unless already linked.

    The library node is built from libraryNS and library_id; the statement
    (submissionUrn, predicate, library) is only added when the model does
    not contain it yet.
    """
    link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
    if model.contains_statement(link):
        logger.debug("Found: {0}".format(str(link)))
        return

    logger.info("Adding Sub -> Lib link: {0}".format(link))
    model.add_statement(link)
181
182     
def find_submissions_with_no_library(model):
    """Print a stub statement for every submission lacking a library link.

    Runs a SPARQL query for submissions that have a name but no
    library_urn and prints, per hit, the name as a comment followed by a
    library_urn statement with the bare library namespace as its object.
    Nothing is returned.
    """
    query_template = """
PREFIX submissionOntology:<{submissionOntology}>

SELECT 
 ?subid ?name
WHERE {{
  ?subid submissionOntology:name ?name
  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
  FILTER  (!bound(?libid))
}}"""
    query_text = query_template.format(
        submissionOntology=submitOntology[''].uri)
    missing_lib_query = RDF.SPARQLQuery(query_text)

    for row in missing_lib_query.execute(model):
        print("# {0}".format(row['name']))
        print("<{0}>".format(row['subid'].uri))
        print("  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> .")
        print("")
204     
205
def add_submission_creation_date(model, subUrn, cookie):
    """Record the creation date of a submission if the model has none.

    model  -- RDF model to query and update
    subUrn -- node identifying the submission; its page is fetched
    cookie -- session cookie used for the request

    The submission page is only downloaded when no date statement exists
    for subUrn.  If the page, the "Created: " label or a parsable date is
    missing, nothing is added.
    """
    # in theory the submission page might have more information on it.
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
    if list(model.find_statements(query)):
        logger.debug("Found creation date for: {0}".format(str(subUrn)))
        return

    logger.info("Getting creation date for: {0}".format(str(subUrn)))
    # NOTE(review): load_my_submissions passes an RDF.Uri here; confirm
    # that it exposes a .uri attribute.
    soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
    if soup is None:
        logger.warning("No page returned for: {0}".format(str(subUrn)))
        return

    created_label = soup.find(text="Created: ")
    if not created_label:
        return

    created_date = get_date_contents(created_label.next)
    if created_date is None:
        # get_date_contents returns None when the element has no text
        logger.warning("No creation date found for: {0}".format(str(subUrn)))
        return

    created_date_node = RDF.Node(literal=created_date.isoformat(),
                                 datatype=dateTimeType.uri)
    add_stmt(model, subUrn, creationDateN, created_date_node)
223
def update_submission_detail(model, subUrn, status, recent_update, cookie):
    """Attach status information to a submission and refresh its ddf.

    When the submission has no has_status statement yet, a blank status
    node is created carrying the status text and last modification date,
    and the ddf is loaded for it.  Otherwise every existing status node
    whose last_modify_date equals recent_update gets its ddf checked.

    NOTE(review): when status nodes exist but none matches recent_update,
    nothing is recorded for the new state -- confirm this is intended.
    """
    HasStatusN = submitOntology['has_status']
    StatusN = submitOntology['status']
    LastModifyN = submitOntology['last_modify_date']

    existing = list(
        model.find_statements(RDF.Statement(subUrn, HasStatusN, None)))

    if not existing:
        # has no status node, add one
        logging.info("Adding status node to {0}".format(subUrn))
        blank = RDF.Node()
        add_stmt(model, subUrn, HasStatusN, blank)
        add_stmt(model, blank, rdfsNS['type'], StatusN)
        add_stmt(model, blank, StatusN, status)
        add_stmt(model, blank, LastModifyN, recent_update)
        update_ddf(model, subUrn, blank, cookie=cookie)
        return

    logging.info("Found {0} status blanks".format(len(existing)))
    for has_status in existing:
        blank = has_status.object
        modified = model.find_statements(
            RDF.Statement(blank, LastModifyN, None))
        for modified_statement in modified:
            if str(modified_statement.object) == recent_update:
                update_ddf(model, subUrn, blank, cookie=cookie)
                break
252
253
254     
def update_ddf(model, subUrn, statusNode, cookie):
    """Download the ddf of a submission and attach it to a status node.

    model      -- RDF model to update
    subUrn     -- submission identifier; its string form with 'show'
                  replaced by 'download_ddf' is used as the download URL
    statusNode -- status node that receives the ddf file statements
    cookie     -- session cookie used for the download

    The download is skipped when statusNode is already typed as a ddf.
    The type marker is only added after the ddf statements were loaded,
    so a failed download is retried on the next run.
    """
    TypeN = rdfsNS['type']

    download_ddf_url = str(subUrn).replace('show', 'download_ddf')

    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
    if model.contains_statement(status_is_ddf):
        return

    logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
    ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
    if ddf_text is None:
        # nothing was downloaded; do not mark the node as having a ddf
        logger.warning('No ddf returned by {0}'.format(download_ddf_url))
        return

    add_ddf_statements(model, statusNode, ddf_text)
    model.add_statement(status_is_ddf)
267
268
def add_ddf_statements(model, statusNode, ddf_string):
    """Turn the text of a ddf file into RDF statements on statusNode.

    The first line is the header; its whitespace separated names become
    predicates in ddfNS.  Every other non-empty, non-comment line is a tab
    separated record whose first field is a comma separated list of
    files.  Each file gets its own blank node carrying the filename and
    the remaining fields of the record.
    """
    lines = ddf_string.split('\n')
    # first line is header
    column_predicates = [ddfNS[column] for column in lines[0].split()]

    for raw_line in lines[1:]:
        record_line = raw_line.strip()
        if not record_line or record_line.startswith("#"):
            continue

        fields = record_line.split('\t')
        for filename in fields[0].split(','):
            fileNode = RDF.Node()
            add_stmt(model, statusNode, submitOntology['has_file'], fileNode)
            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
            add_stmt(model, fileNode, ddfNS['filename'], filename)

            # the first column is the file list itself, so pair the
            # remaining columns with the remaining fields
            for predicate, value in zip(column_predicates[1:], fields[1:]):
                add_stmt(model, fileNode, predicate, value)
297
298
def load_encode_libraries(model, htswapi):
    """Get libraries associated with encode.

    model   -- RDF model the library statements are parsed into
    htswapi -- api object whose root_url is the base of the library list

    Parses the RDFa of the library list page into model, then loads the
    detail page of every subject that has a library_id statement.
    """
    # Plain concatenation: this is a URL, not a filesystem path.
    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
    # Logged rather than printed so stdout stays clean for --print-rdf.
    logger.info("Loading library list from {0}".format(encodeUrl))

    rdfaParser = RDF.Parser(name='rdfa')
    rdfaParser.parse_into_model(model, encodeUrl)

    query = RDF.Statement(None, libraryOntology['library_id'], None)
    # load_library_detail() adds statements to the same model, so work on
    # a snapshot instead of a live result stream.
    libraries = list(model.find_statements(query))
    for statement in libraries:
        load_library_detail(model, statement.subject)
311
312
def load_library_detail(model, libraryUrn):
    """Parse the library page into model unless it was loaded before.

    A library counts as loaded when the model holds a date statement for
    it; more than one date only triggers a warning.
    """
    rdfaParser = RDF.Parser(name='rdfa')
    date_query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
    date_count = len(list(model.find_statements(date_query)))

    if date_count > 1:
        logging.warning("Many dates for {0}".format(libraryUrn))
    elif date_count == 0:
        logger.info("Loading {0}".format(str(libraryUrn)))
        rdfaParser.parse_into_model(model, libraryUrn.uri)
    # exactly one date: assume the library detail is already present
326                         
def get_library_id(name):
    """Guess library ID from library name

    name -- submission name, or None when the page had no name

    A library id is either five digits or "SL" followed by four digits,
    and must be preceded by a space or a dash.  Returns the id as a
    string, or None when name is empty/None or holds no such id.
    """
    if not name:
        # get_contents() returns None for empty cells; re.search would
        # raise a TypeError on it
        return None

    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    if match is None:
        return None
    return match.group('id')
335
336
def get_contents(element):
    """Return the first piece of content of element, encoded, or None.

    When the element contains a link, the first content of that link is
    used instead.  The result is encoded with CHARSET.
    """
    if not element.contents:
        return None

    link = element.find('a')
    source = element if link is None else link
    return source.contents[0].encode(CHARSET)
348     
349     
def get_date_contents(element):
    """Parse the contents of element as a "YYYY-MM-DD HH:MM" datetime.

    Returns None when the element has no (or empty) contents.
    """
    text = get_contents(element)
    if not text:
        return None
    return datetime.strptime(text, "%Y-%m-%d %H:%M")
356
357         
def load_into_model(model, parser_name, filename, ns_uri=None):
    """Parse an RDF file into model.

    model       -- RDF model receiving the statements
    parser_name -- name of the RDF parser to use (e.g. "turtle")
    filename    -- path of the file to read
    ns_uri      -- base uri for the parser; defaults to the
                   submitOntology namespace uri, the value main() passes

    Raises IOError when filename does not exist.

    Note: this definition replaces the load_into_model imported from
    htsworkflow.util.rdfhelp at the top of the file, so it has to accept
    the four arguments main() calls it with.
    """
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    if ns_uri is None:
        ns_uri = submitOntology[''].uri

    # close the file as soon as its contents are read
    with open(filename, 'r') as stream:
        data = stream.read()

    rdf_parser = RDF.Parser(name=parser_name)
    rdf_parser.parse_string_into_model(model, data, ns_uri)
365
def add_stmt(model, subject, predicate, object):
    """Convenience helper: build an RDF Statement and add it to model.

    Returns whatever model.add_statement() returns.
    """
    statement = RDF.Statement(subject, predicate, object)
    return model.add_statement(statement)
372
def login(cookie=None):
    """Return a session cookie, logging in only when none is supplied.

    The password for USERNAME is read from the keyring and posted to
    LOGIN_URL as a url-encoded form.  Raises RuntimeError when the
    response carries no set-cookie header.
    """
    if cookie is not None:
        return cookie

    password = keyring.get_keyring().get_password(LOGIN_URL, USERNAME)
    form = urllib.urlencode({'login': USERNAME,
                             'password': password})
    response, content = httplib2.Http().request(
        LOGIN_URL,
        'POST',
        headers={'Content-type': 'application/x-www-form-urlencoded'},
        body=form)
    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                    response['status']))

    session_cookie = response.get('set-cookie', None)
    if session_cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return session_cookie
396
397                 
def get_url_as_soup(url, method, cookie=None):
    """Fetch url and return the response parsed as a BeautifulSoup tree.

    url    -- address to request
    method -- HTTP method, e.g. 'GET'
    cookie -- optional value for the Cookie request header

    Raises httplib2.HttpLib2ErrorWithResponse when the response status is
    not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)

    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # The exception used to be built but never raised, which made
        # the function return None on errors.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return BeautifulSoup(content,
                         fromEncoding="utf-8", # should read from header
                         convertEntities=BeautifulSoup.HTML_ENTITIES
                         )
414
def get_url_as_text(url, method, cookie=None):
    """Fetch url and return the raw response body.

    url    -- address to request
    method -- HTTP method, e.g. 'GET'
    cookie -- optional value for the Cookie request header

    Raises httplib2.HttpLib2ErrorWithResponse when the response status is
    not 200.
    """
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)

    if response['status'] != '200':
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        # The exception used to be built but never raised, which made
        # the function return None on errors.
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)

    return content
427     
################
#  old stuff
# (submission name, library id) pairs.  None of these names contains an
# id in the form get_library_id() looks for; presumably this is a manual
# mapping for those submissions.  Nothing visible in this file reads it.
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1',    '11208'),
    ('1x75-Directional-HeLa-Rep2',    '11207'),
    ('1x75-Directional-HepG2-Rep1',   '11210'),
    ('1x75-Directional-HepG2-Rep2',   '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1',   '11206'),
    ('1x75-Directional-HUVEC-Rep2',   '11205'),
    ('1x75-Directional-K562-Rep1',    '11008'),
    ('1x75-Directional-K562-Rep2',    '11007'),
    ('1x75-Directional-NHEK-Rep1',    '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
    ]
445
446
447
def select_by_library_id(submission_list):
    """Group submissions by library id, newest first within each group.

    Submissions whose library_id is false (None, empty) are left out.
    Returns a dict mapping library id to a list of submissions sorted by
    their date attribute in descending order.
    """
    by_library = {}
    for entry in submission_list:
        if not entry.library_id:
            continue
        by_library.setdefault(entry.library_id, []).append(entry)

    by_date = attrgetter('date')
    for group in by_library.values():
        group.sort(key=by_date, reverse=True)

    return by_library
458
def library_to_freeze(selected_libraries):
    """Render an HTML table of submissions per library and freeze.

    selected_libraries -- dict mapping library id to a non-empty list of
                          submissions, as built by select_by_library_id()

    The table has one row per library (sorted by id) with the library
    link, the name of its first submission and one cell per freeze
    listing the submissions that fall into it.  Submissions for which
    date_to_freeze() returns None are not shown.

    Returns the page as a single string.
    """
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    report = ["""<html>
<head>
<style type="text/css">
 td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table>
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in sorted(selected_libraries.keys()):
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))

        batched = {}
        for sub in submissions:
            batched.setdefault(date_to_freeze(sub.date), []).append(sub)
        logger.debug("{0} {1}".format(lib_id, batched))

        # Exactly one cell per freeze so rows line up with the header.
        # The former "else" on this loop always ran and added an extra
        # empty cell to every row.
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submission_view_url(s.subid)
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></body></html>")
    return "\n".join(report)
503
504             
def date_to_freeze(d):
    """Return the name of the first freeze whose cutoff lies after d.

    d is a datetime.  Cutoffs are checked in chronological order; None is
    returned when d is not before any of them.
    """
    cutoffs = (
        (datetime(2010, 1, 30), '2010-Jan'),
        (datetime(2010, 7, 30), '2010-Jul'),
        (datetime(2011, 1, 30), '2011-Jan'),
    )
    return next((label for cutoff, label in cutoffs if d < cutoff), None)
515
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()
518