#!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
from BeautifulSoup import BeautifulSoup
from datetime import datetime
import httplib2
import keyring
import logging
from operator import attrgetter
from optparse import OptionGroup, OptionParser
import os
import re
# redland rdf lib
import RDF
import sys
import urllib
import urlparse
from htsworkflow.submission import daf

from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
    dafTermOntology, \
    dublinCoreNS, \
    get_model, \
    get_serializer, \
    libraryOntology, \
    rdfNS, \
    rdfsNS, \
    submissionOntology, \
    xsdNS

TYPE_N = rdfNS['type']

# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")

from htsworkflow.submission.ucsc import \
    daf_download_url, \
    ddf_download_url, \
    submission_view_url, \
    UCSCEncodePipeline

DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
DDF_NS = RDF.NS(DOWNLOAD_DDF)

DBDIR = os.path.expanduser("~diane/proj/submission")
LOGGER = logging.getLogger("encode_find")
LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
USERNAME = 'detrout'
CHARSET = 'utf-8'


def main(cmdline=None):
    """
    Parse command line arguments.

    Takes a list of arguments (assuming arg[0] is the program name) or
    None; if None, it looks at sys.argv.
    """
parser = make_parser()
opts, args = parser.parse_args(cmdline)
htsw_authdata = api.make_auth_from_opts(opts, parser)
htswapi = api.HtswApi(opts.host, htsw_authdata)

    cookie = None
model = get_model(opts.load_model, DBDIR)

    if opts.load_rdf is not None:
ns_uri = submissionOntology[''].uri
load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)

    if len(args) == 0:
        limit = None
    else:
        limit = args

if opts.update:
cookie = login(cookie=cookie)
        load_my_submissions(model, limit=limit, cookie=cookie)
load_encode_libraries(model, htswapi)
if opts.sparql is not None:
sparql_query(model, opts.sparql)
if opts.find_submission_with_no_library:
        find_submissions_with_no_library(model)

if opts.print_rdf:
serializer = get_serializer(name=opts.rdf_parser_name)
print serializer.serialize_model_to_string(model)


def make_parser():
    """Construct option parser
    """
    parser = OptionParser()
    commands = OptionGroup(parser, "Commands")
    # reconstructed from their use in main()
    commands.add_option('--load-model', default=None,
                        help="load model from a database")
    commands.add_option('--load-rdf', default=None,
                        help="load rdf statements into the model")
    commands.add_option('--print-rdf', action="store_true", default=False,
                        help="print ending model state")
    commands.add_option('--update', action="store_true", default=False,
                        help="query remote data sources and update the model")
#commands.add_option('--update-ddfs', action="store_true", default=False,
# help="download ddf information for known submission")
#commands.add_option('--update-library', default=None,
    # help="download library info from htsw, "\
    #      "requires filename for extra rules")
parser.add_option_group(commands)

    queries = OptionGroup(parser, "Queries")
queries.add_option('--sparql', default=None,
help="execute arbitrary sparql query")
queries.add_option('--find-submission-with-no-library', default=False,
action="store_true",
                       help="find submissions with no library ID")
parser.add_option_group(queries)
options = OptionGroup(parser, "Options")
options.add_option("-v", "--verbose", action="store_true", default=False)
options.add_option("--debug", action="store_true", default=False)
parser.add_option_group(options)

    api.add_auth_options(parser)
return parser


def load_my_submissions(model, limit=None, cookie=None):
    """Parse all the submissions from UCSC into the model.

    It will look at the global USER_URL to figure out who to scrape;
    cookie contains the session cookie, and if it is None we will
    attempt to log in first.
    """
if cookie is None:
cookie = login()

    soup = get_url_as_soup(USER_URL, 'GET', cookie)
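    # the pipeline's show_user page lists all of a user's submissions
    # in a table with id "projects"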
    projects = soup.find('table', attrs={'id': 'projects'})
    table_row = projects.findNext('tr')
# first record is header
    table_row = table_row.findNext()
    name_n = submissionOntology['name']
    species_n = submissionOntology['species']
    library_urn = submissionOntology['library_urn']

    while table_row is not None:
        cell = table_row.findAll('td')
        if cell is not None and len(cell) > 1:
            submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
            if limit is None or submission_id in limit:
                subUrn = RDF.Uri(submission_view_url(submission_id))

                add_stmt(model, subUrn,
                         TYPE_N,
                         submissionOntology['Submission'])

                name = get_contents(cell[4])
                add_stmt(model, subUrn, name_n, name)

                species = get_contents(cell[2])
                if species is not None:
                    add_stmt(model, subUrn, species_n, species)

                library_id = get_library_id(name)
                if library_id is not None:
                    add_submission_to_library_urn(model,
                                                  subUrn,
                                                  library_urn,
                                                  library_id)

                add_submission_creation_date(model, subUrn, cookie)

                # grab changing attributes
                status = get_contents(cell[6]).strip()
                last_mod_datetime = get_date_contents(cell[8])
                last_mod = last_mod_datetime.isoformat()

                update_submission_detail(model, subUrn, status, last_mod,
                                         cookie=cookie)

                LOGGER.info("Processed {0}".format(subUrn))

        table_row = table_row.findNext('tr')


def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
"""Add a link from a UCSC submission to woldlab library if needed
"""
    libraryUrn = LIBRARY_NS[library_id + '/']
query = RDF.Statement(submissionUrn, predicate, libraryUrn)
if not model.contains_statement(query):
link = RDF.Statement(submissionUrn, predicate, libraryUrn)
        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
model.add_statement(link)
else:
        LOGGER.debug("Found: {0}".format(str(query)))


def find_submissions_with_no_library(model):
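    # find submissions that have a name but no library_urn property;
    # OPTIONAL + FILTER(!bound(...)) is the usual SPARQL idiom for
    # selecting records that are missing an attribute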
missing_lib_query = RDF.SPARQLQuery("""
PREFIX submissionOntology:<{submissionOntology}>
SELECT
?subid ?name
WHERE {{
?subid submissionOntology:name ?name
OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
FILTER (!bound(?libid))
}}""".format(submissionOntology=submissionOntology[''].uri))
results = missing_lib_query.execute(model)
for row in results:
        subid = row['subid']
        name = row['name']
print "# {0}".format(name)
print "<{0}>".format(subid.uri)
        print "  encodeSubmit:library_urn "\
              "<http://jumpgate.caltech.edu/library/> ."
print ""


def add_submission_creation_date(model, subUrn, cookie):
# in theory the submission page might have more information on it.
    # assumed predicates: creation dates reuse the library ontology's
    # date term, typed as xsd:dateTime
    creationDateN = libraryOntology['date']
    dateTimeType = xsdNS['dateTime']
    query = RDF.Statement(subUrn, creationDateN, None)
creation_dates = list(model.find_statements(query))
if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
soup = get_url_as_soup(str(subUrn), 'GET', cookie)
created_label = soup.find(text="Created: ")
        if created_label:
            created_date = get_date_contents(created_label.next)
            created_date_node = RDF.Node(literal=created_date.isoformat(),
                                         datatype=dateTimeType.uri)
            add_stmt(model, subUrn, creationDateN, created_date_node)
else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))


def update_submission_detail(model, subUrn, status, recent_update, cookie):
    HasStatusN = submissionOntology['has_status']
    StatusN = submissionOntology['status']
    LastModifyN = submissionOntology['last_modify_date']

    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
    status_nodes = list(model.find_statements(status_nodes_query))

    if len(status_nodes) == 0:
# has no status node, add one
logging.info("Adding status node to {0}".format(subUrn))
- status_blank = RDF.Node()
- add_stmt(model, subUrn, HasStatusN, status_blank)
- add_stmt(model, status_blank, rdfsNS['type'], StatusN)
- add_stmt(model, status_blank, StatusN, status)
- add_stmt(model, status_blank, LastModifyN, recent_update)
- update_ddf(model, subUrn, status_blank, cookie=cookie)
+ status_node = create_status_node(subUrn, recent_update)
+ add_stmt(model, subUrn, HasStatusN, status_node)
+ add_stmt(model, status_node, rdfsNS['type'], StatusN)
+ add_stmt(model, status_node, StatusN, status)
+ add_stmt(model, status_node, LastModifyN, recent_update)
+ update_ddf(model, subUrn, status_node, cookie=cookie)
+ update_daf(model, subUrn, status_node, cookie=cookie)
else:
logging.info("Found {0} status blanks".format(len(status_nodes)))
for status_statement in status_nodes:
            status_node = status_statement.object
            last_modified_query = RDF.Statement(status_node,
                                                LastModifyN,
                                                None)
last_mod_nodes = model.find_statements(last_modified_query)
for last_mod_statement in last_mod_nodes:
last_mod_date = str(last_mod_statement.object)
if recent_update == str(last_mod_date):
                    update_ddf(model, subUrn, status_node, cookie=cookie)
                    update_daf(model, subUrn, status_node, cookie=cookie)
break


def update_daf(model, submission_url, status_node, cookie):
    download_daf_uri = str(submission_url).replace('show', 'download_daf')
    daf_uri = RDF.Uri(download_daf_uri)

    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
    if not model.contains_statement(status_is_daf):
        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
                                                    status_node))
        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
        daf.fromstring_into_model(model, status_node, daf_text)


def update_ddf(model, subUrn, statusNode, cookie):
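    # the ddf download URL is the submission "show" page URL with the
    # action verb swapped out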
download_ddf_url = str(subUrn).replace('show', 'download_ddf')
ddfUrn = RDF.Uri(download_ddf_url)

    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
if not model.contains_statement(status_is_ddf):
        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
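        # ddf files are tab delimited; the first column of each record
        # is a comma separated list of file names and the header row
        # names the remaining attribute columns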
        ddf_lines = ddf_text.split('\n')
# first line is header
header = ddf_lines[0].split()
        attributes = [DDF_NS[x] for x in header]
for ddf_line in ddf_lines[1:]:
ddf_line = ddf_line.strip()
            if len(ddf_line) == 0:
                continue
if ddf_line.startswith("#"):
continue

            ddf_record = ddf_line.split('\t')
files = ddf_record[0].split(',')
file_attributes = ddf_record[1:]
for f in files:
fileNode = RDF.Node()
                add_stmt(model,
                         statusNode,
                         submissionOntology['has_file'],
                         fileNode)
                add_stmt(model, fileNode, rdfsNS['type'], DDF_NS['file'])
                add_stmt(model, fileNode, DDF_NS['filename'], f)

                for predicate, rdf_object in zip(attributes[1:],
                                                 file_attributes):
                    add_stmt(model, fileNode, predicate, rdf_object)
"""Get libraries associated with encode.
"""
encodeFilters = ["/library/?affiliations__id__exact=44",
                     "/library/?affiliations__id__exact=80",
                     ]
encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
rdfaParser = RDF.Parser(name='rdfa')
for encodeUrl in encodeUrls:
        LOGGER.info("Scanning library url {0}".format(encodeUrl))
rdfaParser.parse_into_model(model, encodeUrl)
query = RDF.Statement(None, libraryOntology['library_id'], None)
libraries = model.find_statements(query)
for statement in libraries:
libraryUrn = statement.subject
        LOGGER.info("Scanning {0}".format(str(libraryUrn)))
load_library_detail(model, libraryUrn)


def load_library_detail(model, libraryUrn):
    """Grab detail information from a library page.
    """
    rdfaParser = RDF.Parser(name='rdfa')
query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
results = list(model.find_statements(query))
    log_message = "Found {0} statements for {1}"
    LOGGER.debug(log_message.format(len(results), libraryUrn))
if len(results) == 0:
        LOGGER.info("Loading {0}".format(str(libraryUrn)))
rdfaParser.parse_into_model(model, libraryUrn.uri)
elif len(results) == 1:
        pass  # Assuming that a loaded dataset has one record
else:
        LOGGER.warning("Many dates for {0}".format(libraryUrn))


def get_library_id(name):
"""Guess library ID from library name
return a.contents[0].encode(CHARSET)
return element.contents[0].encode(CHARSET)
-
-
+
+
def create_status_node(submission_uri, timestamp):
    """Build a URI naming the status of a submission at a given time.
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    status_uri = urlparse.urljoin(submission_uri, timestamp)
    return RDF.Node(RDF.Uri(status_uri))


def get_date_contents(element):
data = get_contents(element)
    if data:
        # the submission pages appear to use "2011-02-09 14:20" style stamps
        return datetime.strptime(data, "%Y-%m-%d %H:%M")
    else:
        return None


def add_stmt(model, subject, predicate, rdf_object):
"""Convienence create RDF Statement and add to a model
"""
return model.add_statement(
        RDF.Statement(subject, predicate, rdf_object))


def login(cookie=None):
"""
if cookie is not None:
return cookie

    keys = keyring.get_keyring()
password = keys.get_password(LOGIN_URL, USERNAME)
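    # get_password returns None if no entry is stored for
    # LOGIN_URL / USERNAME in the system keyring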
    credentials = {'login': USERNAME,
                   'password': password,
                   # assumed submit field name for the login form
                   'submit': 'sign in'}
    headers = {'Content-type': 'application/x-www-form-urlencoded'}
    http = httplib2.Http()
    response, content = http.request(LOGIN_URL,
                                     'POST',
                                     headers=headers,
                                     body=urllib.urlencode(credentials))
    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                   response['status']))

    cookie = response.get('set-cookie', None)
if cookie is None:
raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
return cookie


def get_url_as_soup(url, method, cookie=None):
http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
response, content = http.request(url, method, headers=headers)
if response['status'] == '200':
soup = BeautifulSoup(content,
                             fromEncoding="utf-8",  # should read from header
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
return soup
else:
msg = "error accessing {0}, status {1}"
msg = msg.format(url, response['status'])
        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
        raise e


def get_url_as_text(url, method, cookie=None):
http = httplib2.Http()
    headers = {}
    if cookie is not None:
        headers['Cookie'] = cookie
    response, content = http.request(url, method, headers=headers)
    if response['status'] == '200':
        return content
msg = "error accessing {0}, status {1}"
msg = msg.format(url, response['status'])
    e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
    raise e


################
# old stuff
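# note: these helpers operate on the older submission objects (with
# .library_id, .date and .name attributes) rather than on the RDF model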
SUBMISSIONS_LACKING_LIBID = [
]


def select_by_library_id(submission_list):
    subl = [(x.library_id, x) for x in submission_list if x.library_id]
libraries = {}
for lib_id, subobj in subl:
libraries.setdefault(lib_id, []).append(subobj)
for submission in libraries.values():
submission.sort(key=attrgetter('date'), reverse=True)

    return libraries


def library_to_freeze(selected_libraries):
freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
lib_ids = sorted(selected_libraries.keys())
    # reconstructed header: library id, name, then one column per freeze
    report = ['<html><table border="1">']
    report.append('<thead><tr><td>Library</td><td>Name</td>')
    for freeze in freezes:
        report.append('<td>{0}</td>'.format(freeze))
    report.append('</tr></thead>')
    report.append('<tbody>')
for lib_id in lib_ids:
report.append('<tr>')
        lib_url = LIBRARY_NS[lib_id].uri
report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
submissions = selected_libraries[lib_id]
report.append('<td>{0}</td>'.format(submissions[0].name))
        batched = {}
        for sub in submissions:
date = date_to_freeze(sub.date)
batched.setdefault(date, []).append(sub)
for d in freezes:
report.append('<td>')
            for s in batched.get(d, []):
                # reconstructed: list each submission in this freeze window
                report.append('{0}<br/>'.format(s))
            report.append('</td>')
        report.append('</tr>')
    report.append('</tbody>')
    report.append("</table></html>")
return "\n".join(report)


def date_to_freeze(d):
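    # map a date to the name of the first freeze whose cutoff falls
    # after it; returns None for dates past the last freeze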
    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
               (datetime(2010, 7, 30), '2010-Jul'),
               (datetime(2011, 1, 30), '2011-Jan'),
               ]
for end, name in freezes:
if d < end:
            return name
    return None


if __name__ == "__main__":
main()