from BeautifulSoup import BeautifulSoup
from datetime import datetime
import logging
from operator import attrgetter
from optparse import OptionParser, OptionGroup
import os
import re
import urllib

import httplib2
import keyring
import RDF

from htsworkflow.util import api
logger = logging.getLogger("encode_find")

libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")

dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# shorthand used when building statements below
Stmt = RDF.Statement
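# RDF.NS objects mint URIs by indexing, so the namespaces above are used like
# (values illustrative):
#   libraryNS['11011'] -> <http://jumpgate.caltech.edu/library/11011>
#   rdfsNS['Class']    -> <http://www.w3.org/2000/01/rdf-schema#Class>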
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
# ASSUMPTION: placeholder account name; replace with a real ENCODE
# submission pipeline login before running --update.
USERNAME = 'your-ucsc-username'
CHARSET = 'utf-8'
def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    if opts.verbose:
        logging.basicConfig(level=logging.INFO)

    htsw_authdata = api.make_auth_from_opts(opts, parser)
    htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
    model = get_model(opts.load_model)

    if opts.load_rdf is not None:
        load_into_model(model, opts.rdf_parser_name, opts.load_rdf)

    if opts.update:
        cookie = login(cookie=cookie)
        load_my_submissions(model, cookie=cookie)
        update_submission_detail(model, cookie=cookie)
        load_libraries(model, htswapi)

    if opts.sparql is not None:
        sparql_query(model, opts.sparql)

    if opts.find_submission_with_no_library:
        missing = find_submissions_with_no_library(model)

    if opts.print_rdf:
        serializer = RDF.Serializer(name=opts.rdf_parser_name)
        print serializer.serialize_model_to_string(model)
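# Typical invocations, assuming this module is run as a script (file and
# query names illustrative):
#   python encode_find.py --update --print-rdf > submissions.turtle
#   python encode_find.py --load-rdf=submissions.turtle --sparql=report.sparql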
def make_parser():
    parser = OptionParser()

    commands = OptionGroup(parser, "Commands")
    commands.add_option('--load-model', default=None,
                        help="Load model database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="Query remote data sources and update our database")
    #commands.add_option('--update-ucsc-status', default=None,
    #                    help="download status from ucsc, requires filename for extra rules")
    #commands.add_option('--update-ddfs', action="store_true", default=False,
    #                    help="download ddf information for known submission")
    #commands.add_option('--update-library', default=None,
    #                    help="download library info from htsw, requires filename for extra rules")
    parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
    queries.add_option('--sparql', default=None,
                       help="execute arbitrary sparql query")
    queries.add_option('--find-submission-with-no-library',
                       action="store_true", default=False,
                       help="find submissions with no library ID")
    parser.add_option_group(queries)

    options = OptionGroup(parser, "Options")
    options.add_option("--rdf-parser-name", default="turtle",
                       help="set rdf file parser type")
    options.add_option("-v", "--verbose", action="store_true", default=False)
    parser.add_option_group(options)

    api.add_auth_options(parser)

    return parser
def get_model(model_name=None):
    if model_name is None:
        storage = RDF.MemoryStorage()
    else:
        storage = RDF.HashStorage(model_name,
                                  options="hash-type='bdb',dir='/tmp'")
    model = RDF.Model(storage)
    return model
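# Usage sketch: get_model() holds triples in memory for one-off runs, while
# get_model('encode') (name illustrative) persists them in a BerkeleyDB hash
# under /tmp so later runs can reuse previously downloaded state.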
def load_my_submissions(model, cookie=None):
    if cookie is None:
        cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
    p = soup.find('table', attrs={'id': 'projects'})
    tr = p.findNext('tr')
    # first record is header, so skip it
    tr = tr.findNext('tr')
    ClassP = rdfsNS['Class']
    NameP = submitOntologyNS['name']
    StatusP = submitOntologyNS['status']
    LastModifyP = submitOntologyNS['last_modify_date']
    SpeciesP = submitOntologyNS['species']
    LibraryURN = submitOntologyNS['library_urn']

    add_stmt = model.add_statement

    while tr is not None:
        td = tr.findAll('td')
        if td is not None and len(td) > 1:
            subIdText = td[0].contents[0].contents[0].encode(CHARSET)
            subId = submissionNS[subIdText]
            submission_stmt = Stmt(subId, ClassP,
                                   submitOntologyNS['Submission'])
            if model.contains_statement(submission_stmt):
                logger.debug("Have {0}".format(str(submission_stmt)))
            else:
                logger.info("New submission {0}".format(str(submission_stmt)))
                add_stmt(submission_stmt)

                name = get_contents(td[4])
                add_stmt(Stmt(subId, NameP, name))

                status = get_contents(td[6]).strip()
                add_stmt(Stmt(subId, StatusP, status))

                last_mod_datetime = get_date_contents(td[8])
                last_mod = last_mod_datetime.isoformat()
                add_stmt(Stmt(subId, LastModifyP, last_mod))

                species = get_contents(td[2])
                if species is not None:
                    add_stmt(Stmt(subId, SpeciesP, species))

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subId,
                                                  LibraryURN,
                                                  library_id)

        tr = tr.findNext('tr')
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
    """Add a link from a UCSC submission to woldlab library if needed
    """
    libraryUrn = libraryNS[library_id]
    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
    if not model.contains_statement(query):
        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        logger.info("Adding Sub -> Lib link: {0}".format(link))
        model.add_statement(link)
    else:
        logger.info("Found: {0}".format(str(query)))
def find_submissions_with_no_library(model):
    p = os.path.abspath(__file__)
    sourcedir = os.path.dirname(p)
    no_lib = open(os.path.join(sourcedir, "no-lib.sparql"), 'r').read()
    query = RDF.SPARQLQuery(no_lib)
    results = query.execute(model)
    for row in results:
        # column names assumed to match no-lib.sparql's SELECT clause
        subid = row['subid']
        name = row['name']
        print "# {0}".format(name)
        print "<{0}>".format(subid.uri)
        print " encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
def update_submission_detail(model, cookie=None):
    """Look for submission IDs in our model and go get their ddfs
    """
    submissions = model.get_sources(rdfsNS['Class'],
                                    submitOntologyNS['Submission'])
    for subUrn in submissions:
        logger.info("Updating detail for: {0}".format(str(subUrn)))
        update_submission_creation_date(model, subUrn, cookie)
        download_ddf(model, subUrn, cookie=cookie)
def update_submission_creation_date(model, subUrn, cookie):
    # in theory the submission page might have more information on it.
    creationDateP = libOntNS['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateP, None)
    creation_dates = list(model.find_statements(query))
    if len(creation_dates) == 0:
        logger.info("Getting creation date for: {0}".format(str(subUrn)))
        soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
        created_label = soup.find(text="Created: ")
        if created_label is not None:
            created_date = get_date_contents(created_label.next)
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            model.add_statement(
                RDF.Statement(subUrn, creationDateP, created_date_node))
def download_ddf(model, subId, cookie=None):
    """Download the DDF for a submission and add its statements to the model.
    """
    if cookie is None:
        cookie = login()

    download_ddf_url = str(subId).replace('show', 'download_ddf')
    ddf = get_url_as_text(download_ddf_url, 'GET', cookie)
    ddfUrn = RDF.Uri(download_ddf_url)
    query = RDF.Statement(ddfUrn, rdfsNS['Class'], ddfNS['ddf'])
    if not model.contains_statement(query):
        statements = parse_ddf(subId, ddf)
        for s in statements:
            model.add_statement(s)
def parse_ddf(subId, ddf_blob):
    """Convert a ddf text file into RDF Statements
    """
    ddf_data = ddf_blob.split('\n')
    # first line is header
    header = ddf_data[0].split()
    attributes = [ddfNS[x] for x in header]
    statements = []

    subIdUri = str(subId.uri)
    # force it to look like a namespace
    if subIdUri[-1] != '/':
        subIdUri += '/'
    subIdNS = RDF.NS(subIdUri)
    for ddf_line in ddf_data[1:]:
        ddf_line = ddf_line.strip()
        if len(ddf_line) == 0:
            continue
        if ddf_line.startswith("#"):
            continue

        ddf_records = ddf_line.split('\t')
        files = ddf_records[0].split(',')
        file_attributes = ddf_records[1:]

        for f in files:
            # each file is described by an anonymous blank node
            blank = RDF.Node()
            statements += [RDF.Statement(subId,
                                         submitOntologyNS['has_file'],
                                         blank)]
            statements += [RDF.Statement(blank, rdfsNS['Class'],
                                         submitOntologyNS['File'])]
            statements += [RDF.Statement(blank, ddfNS['filename'], f)]
            file_uri_list = [blank] * len(file_attributes)
            for s, p, o in zip(file_uri_list, attributes[1:], file_attributes):
                statements += [RDF.Statement(s, p, o)]

    return statements
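# Sketch of the mapping parse_ddf performs (values invented).  Given the blob:
#   files        cell  antibody
#   a.bam,b.bam  K562  Input
# each file gets a blank node B with statements roughly like:
#   <subId> submitOntology:has_file B .
#   B rdfs:Class submitOntology:File ;
#     ddf:filename "a.bam" ; ddf:cell "K562" ; ddf:antibody "Input" .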
def load_libraries(model, htswapi):
    """Augment library nodes with details from the htsworkflow API.
    """
    # predicate must match submitOntologyNS['library_urn'] used above
    query = RDF.SPARQLQuery("""
    SELECT distinct ?library_urn
    WHERE {
      ?subid <http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#library_urn> ?library_urn .
    }""")
    results = query.execute(model)
    #newmodel = get_model()
    for row in results:
        lib_id = row['library_urn']
        lib_uri = str(row['library_urn'].uri)
        short_lib_id = lib_uri.replace(libraryNS._prefix, "")
        logger.info("Loading library info: {0}".format(short_lib_id))
        if short_lib_id.startswith("SL"):
            continue
        lib_info = htswapi.get_library(short_lib_id)

        for lib_k, lib_v in lib_info.items():
            if lib_k != 'lane_set':
                attribute = lib_k.encode(CHARSET)
                model.add_statement(
                    RDF.Statement(lib_id,
                                  submitOntologyNS[attribute],
                                  str(lib_v)))
            else:
                # ASSUMPTION: each flowcell entry hangs off a blank lane node
                for flowcell in lib_v:
                    lane = RDF.Node()
                    model.add_statement(
                        RDF.Statement(lib_id,
                                      submitOntologyNS['has_lane'],
                                      lane))
                    for fc_k, fc_v in flowcell.items():
                        model.add_statement(
                            RDF.Statement(lane,
                                          submitOntologyNS[fc_k.encode(CHARSET)],
                                          str(fc_v)))
    #serializer = RDF.Serializer('turtle')
    #print serializer.serialize_model_to_string(newmodel)
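# lib_info above is the dictionary returned by HtswApi.get_library: scalar
# entries become direct statements on the library node, while each flowcell
# dict in lane_set hangs off its own blank lane node.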
def get_library_id(name):
    """Guess library ID from library name
    """
    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
    library_id = None
    if match is not None:
        library_id = match.group('id')
    return library_id
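# Examples of names the regular expression accepts (illustrative):
#   get_library_id("Paired ends 11011")   -> '11011'
#   get_library_id("Brain tissue-SL1234") -> 'SL1234'
#   get_library_id("no id here")          -> None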
def get_contents(element):
    """Return contents or None.
    """
    if len(element.contents) == 0:
        return None

    a = element.find('a')
    if a is not None:
        return a.contents[0].encode(CHARSET)

    return element.contents[0].encode(CHARSET)
def get_date_contents(element):
    data = get_contents(element)
    if data is not None:
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None
def sparql_query(model, query_filename):
    """Execute sparql query from file
    """
    query_body = open(query_filename, 'r').read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    for row in results:
        for k, v in row.items()[::-1]:
            print "{0}: {1}".format(k, v)
        print ""
def load_into_model(model, parser_name, filename):
    if not os.path.exists(filename):
        raise IOError("Can't find {0}".format(filename))

    data = open(filename, 'r').read()
    rdf_parser = RDF.Parser(name=parser_name)
    ns_uri = submitOntologyNS[''].uri
    rdf_parser.parse_string_into_model(model, data, ns_uri)
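# e.g. load_into_model(model, 'turtle', 'submissions.turtle') reloads a dump
# produced by --print-rdf (file name illustrative).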
def login(cookie=None):
    """Login if we don't have a cookie
    """
    if cookie is not None:
        return cookie

    keys = keyring.get_keyring()
    password = keys.get_password(LOGIN_URL, USERNAME)
    credentials = {'login': USERNAME,
                   'password': password}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    logger.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
    if cookie is None:
        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
    return cookie
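# The password comes from the system keyring, so it must be stored once
# beforehand, e.g. (password invented):
#   import keyring
#   keyring.set_password(LOGIN_URL, USERNAME, 's3cr3t')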
def get_url_as_soup(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        soup = BeautifulSoup(content,
                             fromEncoding="utf-8",  # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        return soup
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
def get_url_as_text(url, method, cookie=None):
    http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
    else:
        msg = "error accessing {0}, status {1}"
        msg = msg.format(url, response['status'])
        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
SUBMISSIONS_LACKING_LIBID = [
    ('1x75-Directional-HeLa-Rep1', '11208'),
    ('1x75-Directional-HeLa-Rep2', '11207'),
    ('1x75-Directional-HepG2-Rep1', '11210'),
    ('1x75-Directional-HepG2-Rep2', '11209'),
    ('1x75-Directional-H1-hESC-Rep1', '10947'),
    ('1x75-Directional-H1-hESC-Rep2', '11009'),
    ('1x75-Directional-HUVEC-Rep1', '11206'),
    ('1x75-Directional-HUVEC-Rep2', '11205'),
    ('1x75-Directional-K562-Rep1', '11008'),
    ('1x75-Directional-K562-Rep2', '11007'),
    ('1x75-Directional-NHEK-Rep1', '11204'),
    ('1x75-Directional-GM12878-Rep1', '11011'),
    ('1x75-Directional-GM12878-Rep2', '11010'),
]
def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
    libraries = {}
    for lib_id, subobj in subl:
        libraries.setdefault(lib_id, []).append(subobj)

    for submission in libraries.values():
        submission.sort(key=attrgetter('date'), reverse=True)

    return libraries
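# Shape sketch (field values invented): two submissions sharing
# library_id '11011' yield {'11011': [newest_submission, older_submission]},
# sorted so the most recent date comes first.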
def library_to_freeze(selected_libraries):
    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
    lib_ids = sorted(selected_libraries.keys())
    report = ["""<html>
<head>
<style type="text/css">
td {border-width:0 0 1px 1px; border-style:solid;}
</style>
</head>
<body>
<table border="1">
"""]
    report.append('<thead>')
    report.append('<tr><td>Library ID</td><td>Name</td>')
    for f in freezes:
        report.append('<td>{0}</td>'.format(f))
    report.append('</tr>')
    report.append('</thead>')
    report.append('<tbody>')
    for lib_id in lib_ids:
        report.append('<tr>')
        lib_url = libraryNS[lib_id].uri
        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
        submissions = selected_libraries[lib_id]
        report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
            date = date_to_freeze(sub.date)
            batched.setdefault(date, []).append(sub)
        for d in freezes:
            report.append('<td>')
            for s in batched.get(d, []):
                show_url = submissionNS[s.subid].uri
                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                report.append("{0}:{1}".format(subid, s.status))
            report.append('</td>')
        report.append('<td></td>')
        report.append("</tr>")
    report.append('</tbody>')
    report.append("</table></html>")
    return "\n".join(report)
def date_to_freeze(d):
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
    for end, name in freezes:
        if d < end:
            return name
    return None
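# e.g. date_to_freeze(datetime(2010, 2, 1)) -> '2010-Jul': 2010-02-01 is past
# the 2010-Jan cutoff but before the 2010-Jul one; dates after the final
# cutoff fall through to None.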
if __name__ == "__main__":
    main()