Attempt to download DAF data for an encodesubmit submission
[htsworkflow.git] / extra/ucsc_encode_submission/encode_find.py
index 15acea6e92b2399c5edf1e60aae70f2ca7dfe6d8..f116281f3f07dd563455cf2de18d685e2f583652 100644
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
 
 from BeautifulSoup import BeautifulSoup
 from datetime import datetime
@@ -11,13 +14,16 @@ import logging
 import os
 import re
 # redland rdf lib
-import RDF 
+import RDF
 import sys
 import urllib
 import urlparse
 
+from htsworkflow.submission import daf
+
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
      dublinCoreNS, \
      get_model, \
      get_serializer, \
@@ -28,18 +34,23 @@ from htsworkflow.util.rdfhelp import \
      rdfNS, \
      rdfsNS, \
      xsdNS
+TYPE_N = rdfNS['type']
 
 # URL mappings
-libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+     daf_download_url, \
+     ddf_download_url, \
+     submission_view_url, \
+     UCSCEncodePipeline
 
+DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
+DDF_NS = RDF.NS(DOWNLOAD_DDF)
 
-from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
-download_ddf = UCSCEncodePipeline+"download_ddf#"
-ddfNS = RDF.NS(download_ddf)
-               
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
-logger = logging.getLogger("encode_find")
+LOGGER = logging.getLogger("encode_find")
 
 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
@@ -47,7 +58,14 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
+
 def main(cmdline=None):
+    """
+    Parse command line arguments
+
+    Takes a list of command line arguments (excluding the program name) or
+    None; if None, sys.argv[1:] is used.
+    """
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
@@ -58,31 +76,38 @@ def main(cmdline=None):
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
-    
+
     cookie = None
     model = get_model(opts.load_model, DBDIR)
-    
+
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-        
+
+    if len(args) == 0:
+        limit = None
+    else:
+        limit = args
+
     if opts.update:
         cookie = login(cookie=cookie)
-        load_my_submissions(model, cookie=cookie)
+        load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
 
     if opts.find_submission_with_no_library:
-        missing = find_submissions_with_no_library(model)
-                
+        find_submissions_with_no_library(model)
+
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
         print serializer.serialize_model_to_string(model)
 
 
 def make_parser():
+    """Construct option parser
+    """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
     commands.add_option('--load-model', default=None,
@@ -98,15 +123,16 @@ def make_parser():
     #commands.add_option('--update-ddfs', action="store_true", default=False,
     #  help="download ddf information for known submission")
     #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, requires filename for extra rules")
+    #  help="download library info from htsw, "\
+    #       "requires filename for extra rules")
     parser.add_option_group(commands)
-                      
+
     queries = OptionGroup(parser, "Queries")
     queries.add_option('--sparql', default=None,
       help="execute arbitrary sparql query")
     queries.add_option('--find-submission-with-no-library', default=False,
       action="store_true",
-      help="find submissions with no library ID")    
+      help="find submissions with no library ID")
     parser.add_option_group(queries)
 
     options = OptionGroup(parser, "Options")
@@ -115,86 +141,91 @@ def make_parser():
     options.add_option("-v", "--verbose", action="store_true", default=False)
     options.add_option("--debug", action="store_true", default=False)
     parser.add_option_group(options)
-    
+
     api.add_auth_options(parser)
 
     return parser
 
-def load_my_submissions(model, cookie=None):
+
+def load_my_submissions(model, limit=None, cookie=None):
+    """Parse all the submissions from UCSC into model
+    It will look at the global USER_URL to figure out who to scrape
+    cookie contains the session cookie, if none, will attempt to login
+    """
     if cookie is None:
         cookie = login()
-        
+
     soup = get_url_as_soup(USER_URL, 'GET', cookie)
-    p = soup.find('table', attrs={'id':'projects'})
-    tr = p.findNext('tr')
+    projects = soup.find('table', attrs={'id': 'projects'})
+    table_row = projects.findNext('tr')
     # first record is header
-    tr = tr.findNext()
-    TypeN = rdfsNS['type']
-    NameN = submissionOntology['name']
-    SpeciesN = submissionOntology['species']
-    LibraryURN = submissionOntology['library_urn']
-
-    while tr is not None:
-        td = tr.findAll('td')
-        if td is not None and len(td) > 1:
-            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
-            subUrn = RDF.Uri(submission_view_url(subUrnText))
-
-            add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
-                
-            name = get_contents(td[4])
-            add_stmt(model, subUrn, NameN, name)
-                
-            species = get_contents(td[2])
-            if species is not None:
-                add_stmt(model, subUrn, SpeciesN, species)
-
-            library_id = get_library_id(name)
-            if library_id is not None:
-                add_submission_to_library_urn(model,
-                                              subUrn,
-                                              LibraryURN,
-                                              library_id)
-
-            add_submission_creation_date(model, subUrn, cookie)
-
-            # grab changing atttributes
-            status = get_contents(td[6]).strip()
-            last_mod_datetime = get_date_contents(td[8])
-            last_mod = last_mod_datetime.isoformat()
-
-            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
-
-            logging.info("Processed {0}".format( subUrn))
-            
-        tr = tr.findNext('tr')
+    table_row = table_row.findNext()
+    name_n = submissionOntology['name']
+    species_n = submissionOntology['species']
+    library_urn = submissionOntology['library_urn']
+
+    while table_row is not None:
+        cell = table_row.findAll('td')
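+        # Column layout assumed by the indexing below: 0 = submission id,
+        # 2 = species, 4 = name, 6 = status, 8 = last modified.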
+        if cell is not None and len(cell) > 1:
+            submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
+            if limit is None or submission_id in limit:
+                subUrn = RDF.Uri(submission_view_url(submission_id))
+
+                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+
+                name = get_contents(cell[4])
+                add_stmt(model, subUrn, name_n, name)
+
+                species = get_contents(cell[2])
+                if species is not None:
+                    add_stmt(model, subUrn, species_n, species)
+
+                library_id = get_library_id(name)
+                if library_id is not None:
+                    add_submission_to_library_urn(model,
+                                                  subUrn,
+                                                  library_urn,
+                                                  library_id)
+
+                add_submission_creation_date(model, subUrn, cookie)
+
+                # grab changing attributes
+                status = get_contents(cell[6]).strip()
+                last_mod_datetime = get_date_contents(cell[8])
+                last_mod = last_mod_datetime.isoformat()
+
+                update_submission_detail(model, subUrn, status, last_mod,
+                                         cookie=cookie)
+
+                logging.info("Processed {0}".format(subUrn))
+
+        table_row = table_row.findNext('tr')
 
 
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
-    libraryUrn = libraryNS[library_id+'/']
+    libraryUrn = LIBRARY_NS[library_id + '/']
     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
     if not model.contains_statement(query):
         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
-        logger.info("Adding Sub -> Lib link: {0}".format(link))
+        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
         model.add_statement(link)
     else:
-        logger.debug("Found: {0}".format(str(query)))
+        LOGGER.debug("Found: {0}".format(str(query)))
+
 
-    
 def find_submissions_with_no_library(model):
     missing_lib_query = RDF.SPARQLQuery("""
 PREFIX submissionOntology:<{submissionOntology}>
 
-SELECT 
+SELECT
  ?subid ?name
 WHERE {{
   ?subid submissionOntology:name ?name
   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
   FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
-)    
+}}""".format(submissionOntology=submissionOntology[''].uri))
 
     results = missing_lib_query.execute(model)
     for row in results:
@@ -202,9 +233,10 @@ WHERE {{
         name = row['name']
         print "# {0}".format(name)
         print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
+        print "  encodeSubmit:library_urn"\
+              "<http://jumpgate.caltech.edu/library/> ."
         print ""
-    
+
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
@@ -213,7 +245,7 @@ def add_submission_creation_date(model, subUrn, cookie):
     query = RDF.Statement(subUrn, creationDateN, None)
     creation_dates = list(model.find_statements(query))
     if len(creation_dates) == 0:
-        logger.info("Getting creation date for: {0}".format(str(subUrn)))
+        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
         created_label = soup.find(text="Created: ")
         if created_label:
@@ -222,7 +254,8 @@ def add_submission_creation_date(model, subUrn, cookie):
                                          datatype=dateTimeType.uri)
             add_stmt(model, subUrn, creationDateN, created_date_node)
     else:
-        logger.debug("Found creation date for: {0}".format(str(subUrn)))
+        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -235,33 +268,46 @@ def update_submission_detail(model, subUrn, status, recent_update, cookie):
     if len(status_nodes) == 0:
         # has no status node, add one
         logging.info("Adding status node to {0}".format(subUrn))
-        status_blank = RDF.Node()
-        add_stmt(model, subUrn, HasStatusN, status_blank)
-        add_stmt(model, status_blank, rdfsNS['type'], StatusN)
-        add_stmt(model, status_blank, StatusN, status)
-        add_stmt(model, status_blank, LastModifyN, recent_update)
-        update_ddf(model, subUrn, status_blank, cookie=cookie)
+        status_node = create_status_node(subUrn, recent_update)
+        add_stmt(model, subUrn, HasStatusN, status_node)
+        add_stmt(model, status_node, TYPE_N, StatusN)
+        add_stmt(model, status_node, StatusN, status)
+        add_stmt(model, status_node, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_node, cookie=cookie)
+        update_daf(model, subUrn, status_node, cookie=cookie)
     else:
         logging.info("Found {0} status blanks".format(len(status_nodes)))
         for status_statement in status_nodes:
-            status_blank = status_statement.object
-            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
+            status_node = status_statement.object
+            last_modified_query = RDF.Statement(status_node,
+                                                LastModifyN,
+                                                None)
             last_mod_nodes = model.find_statements(last_modified_query)
             for last_mod_statement in last_mod_nodes:
                 last_mod_date = str(last_mod_statement.object)
                 if recent_update == str(last_mod_date):
-                    update_ddf(model, subUrn, status_blank, cookie=cookie)
+                    update_ddf(model, subUrn, status_node, cookie=cookie)
+                    update_daf(model, subUrn, status_node, cookie=cookie)
                     break
 
 
-    
+def update_daf(model, submission_url, status_node, cookie):
+    download_daf_uri = str(submission_url).replace('show', 'download_daf')
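+    # Rewrite the submission "show" page URL into its daf download URL, e.g.
+    # (hypothetical id) .../pipeline/show/1234 -> .../pipeline/download_daf/1234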
+    daf_uri = RDF.Uri(download_daf_uri)
+
+    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+    if not model.contains_statement(status_is_daf):
+        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
+                                                    status_node))
+        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+        daf.fromstring_into_model(model, status_node, daf_text)
+
+
 def update_ddf(model, subUrn, statusNode, cookie):
-    TypeN = rdfsNS['type']
-    
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
-    
-    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
+
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
     if not model.contains_statement(status_is_ddf):
         logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -275,8 +321,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [ ddfNS[x] for x in header ]
-    statements = []
+    attributes = [DDF_NS[x] for x in header]
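+    # The ddf body is tab separated: column 0 holds a comma separated list of
+    # file names and the remaining columns line up with the header row, e.g. a
+    # hypothetical record "rep1.bam,rep2.bam<TAB>GM12878<TAB>Input".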
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -284,18 +329,21 @@ def add_ddf_statements(model, statusNode, ddf_string):
             continue
         if ddf_line.startswith("#"):
             continue
-        
+
         ddf_record = ddf_line.split('\t')
         files = ddf_record[0].split(',')
         file_attributes = ddf_record[1:]
 
         for f in files:
             fileNode = RDF.Node()
-            add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
-            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
-            add_stmt(model, fileNode, ddfNS['filename'], f)
-
-            for predicate, object in zip( attributes[1:], file_attributes):
+            add_stmt(model,
+                     statusNode,
+                     submissionOntology['has_file'],
+                     fileNode)
+            add_stmt(model, fileNode, TYPE_N, DDF_NS['file'])
+            add_stmt(model, fileNode, DDF_NS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
 
 
@@ -303,19 +351,19 @@ def load_encode_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
-                     "/library/?affiliations__id__exact=80",]
+                     "/library/?affiliations__id__exact=80",
+                    ]
 
-    
     encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
     rdfaParser = RDF.Parser(name='rdfa')
     for encodeUrl in encodeUrls:
-        logger.info("Scanning library url {0}".format(encodeUrl))
+        LOGGER.info("Scanning library url {0}".format(encodeUrl))
         rdfaParser.parse_into_model(model, encodeUrl)
         query = RDF.Statement(None, libraryOntology['library_id'], None)
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            logger.info("Scanning {0}".format(str(libraryUrn)))
+            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
@@ -325,15 +373,17 @@ def load_library_detail(model, libraryUrn):
     rdfaParser = RDF.Parser(name='rdfa')
     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
     results = list(model.find_statements(query))
-    logger.debug("Found {0} statements for {1}".format(len(results), libraryUrn))
+    log_message = "Found {0} statements for {1}"
+    LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
-        logger.info("Loading {0}".format(str(libraryUrn)))
+        LOGGER.info("Loading {0}".format(str(libraryUrn)))
         rdfaParser.parse_into_model(model, libraryUrn.uri)
     elif len(results) == 1:
-        pass # Assuming that a loaded dataset has one record
+        pass  # Assuming that a loaded dataset has one record
     else:
         logging.warning("Many dates for {0}".format(libraryUrn))
-                        
+
+
 def get_library_id(name):
     """Guess library ID from library name
 
@@ -360,8 +410,13 @@ def get_contents(element):
         return a.contents[0].encode(CHARSET)
 
     return element.contents[0].encode(CHARSET)
-    
-    
+
+
+def create_status_node(submission_uri, timestamp):
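+    """Build a deterministic URI node for a submission's status.
+
+    Sketch of the intent: join the submission URI with the update timestamp so
+    that re-running the scraper reuses the same status node instead of minting
+    a new blank node each time.
+    """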
+    submission_uri = daf.submission_uri_to_string(submission_uri)
+    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    return RDF.Node(RDF.Uri(status_uri))
+
+
 def get_date_contents(element):
     data = get_contents(element)
     if data:
@@ -369,13 +424,12 @@ def get_date_contents(element):
     else:
         return None
 
-        
-def add_stmt(model, subject, predicate, object):
+
+def add_stmt(model, subject, predicate, rdf_object):
     """Convienence create RDF Statement and add to a model
     """
     return model.add_statement(
-        RDF.Statement(subject, predicate, object)
-    )
+        RDF.Statement(subject, predicate, rdf_object))
 
 
 def login(cookie=None):
@@ -383,7 +437,7 @@ def login(cookie=None):
     """
     if cookie is not None:
         return cookie
-    
+
     keys = keyring.get_keyring()
     password = keys.get_password(LOGIN_URL, USERNAME)
     credentials = {'login': USERNAME,
@@ -396,13 +450,13 @@ def login(cookie=None):
                                      body=urllib.urlencode(credentials))
     logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                     response['status']))
-    
+
     cookie = response.get('set-cookie', None)
     if cookie is None:
         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
     return cookie
 
-                
+
 def get_url_as_soup(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -411,15 +465,15 @@ def get_url_as_soup(url, method, cookie=None):
     response, content = http.request(url, method, headers=headers)
     if response['status'] == '200':
         soup = BeautifulSoup(content,
-                             fromEncoding="utf-8", # should read from header
-                             convertEntities=BeautifulSoup.HTML_ENTITIES
-                             )
+                             fromEncoding="utf-8",  # should read from header
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
         return soup
     else:
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 
+
 def get_url_as_text(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -432,7 +486,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-    
+
 ################
 #  old stuff
 SUBMISSIONS_LACKING_LIBID = [
@@ -452,18 +506,18 @@ SUBMISSIONS_LACKING_LIBID = [
     ]
 
 
-
 def select_by_library_id(submission_list):
-    subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
+    subl = [(x.library_id, x) for x in submission_list if x.library_id]
     libraries = {}
     for lib_id, subobj in subl:
         libraries.setdefault(lib_id, []).append(subobj)
 
     for submission in libraries.values():
         submission.sort(key=attrgetter('date'), reverse=True)
-        
+
     return libraries
 
+
 def library_to_freeze(selected_libraries):
     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
     lib_ids = sorted(selected_libraries.keys())
@@ -486,7 +540,7 @@ def library_to_freeze(selected_libraries):
     report.append('<tbody>')
     for lib_id in lib_ids:
         report.append('<tr>')
-        lib_url = libraryNS[lib_id].uri
+        lib_url = LIBRARY_NS[lib_id].uri
         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
         submissions = selected_libraries[lib_id]
         report.append('<td>{0}</td>'.format(submissions[0].name))
@@ -494,7 +548,6 @@ def library_to_freeze(selected_libraries):
         for sub in submissions:
             date = date_to_freeze(sub.date)
             batched.setdefault(date, []).append(sub)
-        print lib_id, batched
         for d in freezes:
             report.append('<td>')
             for s in batched.get(d, []):
@@ -509,12 +562,12 @@ def library_to_freeze(selected_libraries):
     report.append("</table></html>")
     return "\n".join(report)
 
-            
+
 def date_to_freeze(d):
-    freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
-                (datetime(2010, 7, 30), '2010-Jul'),
-                (datetime(2011, 1, 30), '2011-Jan'),
-                ]
+    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+               (datetime(2010, 7, 30), '2010-Jul'),
+               (datetime(2011, 1, 30), '2011-Jan'),
+               ]
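+    # A date maps to the first freeze whose cutoff it precedes, e.g. a
+    # (hypothetical) datetime(2010, 3, 1) maps to '2010-Jul'.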
     for end, name in freezes:
         if d < end:
             return name
@@ -523,4 +576,3 @@ def date_to_freeze(d):
 
 if __name__ == "__main__":
     main()
-