Attempt to download DAF data for an encodesubmit submission
[htsworkflow.git] / extra/ucsc_encode_submission/encode_find.py
index 15acea6e92b2399c5edf1e60aae70f2ca7dfe6d8..f116281f3f07dd563455cf2de18d685e2f583652 100644
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
 
 from BeautifulSoup import BeautifulSoup
 from datetime import datetime
@@ -11,13 +14,16 @@ import logging
 import os
 import re
 # redland rdf lib
-import RDF 
+import RDF
 import sys
 import urllib
 import urlparse
 
+from htsworkflow.submission import daf
+
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
      dublinCoreNS, \
      get_model, \
      get_serializer, \
@@ -28,18 +34,23 @@ from htsworkflow.util.rdfhelp import \
      rdfNS, \
      rdfsNS, \
      xsdNS
+TYPE_N = rdfNS['type']
 
 # URL mappings
-libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+     daf_download_url, \
+     ddf_download_url, \
+     submission_view_url, \
+     UCSCEncodePipeline
 
+DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
+DDF_NS = RDF.NS(DOWNLOAD_DDF)
 
-from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
-download_ddf = UCSCEncodePipeline+"download_ddf#"
-ddfNS = RDF.NS(download_ddf)
-               
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
-logger = logging.getLogger("encode_find")
+LOGGER = logging.getLogger("encode_find")
 
 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
@@ -47,7 +58,14 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
+
 def main(cmdline=None):
+    """
+    Parse command line arguments
+
+    Takes a list of command line arguments (excluding the program name) or
+    None; if None, sys.argv[1:] is used.
+    """
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
@@ -58,31 +76,38 @@ def main(cmdline=None):
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
-    
+
     cookie = None
     model = get_model(opts.load_model, DBDIR)
-    
+
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-        
+
+    if len(args) == 0:
+        limit = None
+    else:
+        limit = args
+
     if opts.update:
         cookie = login(cookie=cookie)
-        load_my_submissions(model, cookie=cookie)
+        load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
 
     if opts.find_submission_with_no_library:
-        missing = find_submissions_with_no_library(model)
-                
+        find_submissions_with_no_library(model)
+
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
         print serializer.serialize_model_to_string(model)
 
 
 def make_parser():
+    """Construct option parser
+    """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
     commands.add_option('--load-model', default=None,
@@ -98,15 +123,16 @@ def make_parser():
     #commands.add_option('--update-ddfs', action="store_true", default=False,
     #  help="download ddf information for known submission")
     #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, requires filename for extra rules")
+    #  help="download library info from htsw, "\
+    #       "requires filename for extra rules")
     parser.add_option_group(commands)
-                      
+
     queries = OptionGroup(parser, "Queries")
     queries.add_option('--sparql', default=None,
       help="execute arbitrary sparql query")
     queries.add_option('--find-submission-with-no-library', default=False,
       action="store_true",
-      help="find submissions with no library ID")    
+      help="find submissions with no library ID")
     parser.add_option_group(queries)
 
     options = OptionGroup(parser, "Options")
@@ -115,86 +141,91 @@ def make_parser():
     options.add_option("-v", "--verbose", action="store_true", default=False)
     options.add_option("--debug", action="store_true", default=False)
     parser.add_option_group(options)
-    
+
     api.add_auth_options(parser)
 
     return parser
 
-def load_my_submissions(model, cookie=None):
+
+def load_my_submissions(model, limit=None, cookie=None):
+    """Parse all the submissions from UCSC into model
+    It will look at the global USER_URL to figure out who to scrape
+    cookie contains the session cookie, if none, will attempt to login
+    """
     if cookie is None:
         cookie = login()
-        
+
     soup = get_url_as_soup(USER_URL, 'GET', cookie)
-    p = soup.find('table', attrs={'id':'projects'})
-    tr = p.findNext('tr')
+    projects = soup.find('table', attrs={'id': 'projects'})
+    table_row = projects.findNext('tr')
     # first record is header
-    tr = tr.findNext()
-    TypeN = rdfsNS['type']
-    NameN = submissionOntology['name']
-    SpeciesN = submissionOntology['species']
-    LibraryURN = submissionOntology['library_urn']
-
-    while tr is not None:
-        td = tr.findAll('td')
-        if td is not None and len(td) > 1:
-            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
-            subUrn = RDF.Uri(submission_view_url(subUrnText))
-
-            add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
-                
-            name = get_contents(td[4])
-            add_stmt(model, subUrn, NameN, name)
-                
-            species = get_contents(td[2])
-            if species is not None:
-                add_stmt(model, subUrn, SpeciesN, species)
-
-            library_id = get_library_id(name)
-            if library_id is not None:
-                add_submission_to_library_urn(model,
-                                              subUrn,
-                                              LibraryURN,
-                                              library_id)
-
-            add_submission_creation_date(model, subUrn, cookie)
-
-            # grab changing atttributes
-            status = get_contents(td[6]).strip()
-            last_mod_datetime = get_date_contents(td[8])
-            last_mod = last_mod_datetime.isoformat()
-
-            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
-
-            logging.info("Processed {0}".format( subUrn))
-            
-        tr = tr.findNext('tr')
+    table_row = table_row.findNext()
+    name_n = submissionOntology['name']
+    species_n = submissionOntology['species']
+    library_urn = submissionOntology['library_urn']
+
+    while table_row is not None:
+        cell = table_row.findAll('td')
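+        # Column layout assumed by the indexing below: 0 = submission id,
+        # 2 = species, 4 = name, 6 = status, 8 = last modified.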
+        if cell is not None and len(cell) > 1:
+            submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
+            if limit is None or submission_id in limit:
+                subUrn = RDF.Uri(submission_view_url(submission_id))
+
+                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+
+                name = get_contents(cell[4])
+                add_stmt(model, subUrn, name_n, name)
+
+                species = get_contents(cell[2])
+                if species is not None:
+                    add_stmt(model, subUrn, species_n, species)
+
+                library_id = get_library_id(name)
+                if library_id is not None:
+                    add_submission_to_library_urn(model,
+                                                  subUrn,
+                                                  library_urn,
+                                                  library_id)
+
+                add_submission_creation_date(model, subUrn, cookie)
+
+                # grab changing attributes
+                status = get_contents(cell[6]).strip()
+                last_mod_datetime = get_date_contents(cell[8])
+                last_mod = last_mod_datetime.isoformat()
+
+                update_submission_detail(model, subUrn, status, last_mod,
+                                         cookie=cookie)
+
+                logging.info("Processed {0}".format(subUrn))
+
+        table_row = table_row.findNext('tr')
 
 
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
-    libraryUrn = libraryNS[library_id+'/']
+    libraryUrn = LIBRARY_NS[library_id + '/']
     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
     if not model.contains_statement(query):
         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
-        logger.info("Adding Sub -> Lib link: {0}".format(link))
+        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
         model.add_statement(link)
     else:
-        logger.debug("Found: {0}".format(str(query)))
+        LOGGER.debug("Found: {0}".format(str(query)))
+
 
-    
 def find_submissions_with_no_library(model):
     missing_lib_query = RDF.SPARQLQuery("""
 PREFIX submissionOntology:<{submissionOntology}>
 
-SELECT 
+SELECT
  ?subid ?name
 WHERE {{
   ?subid submissionOntology:name ?name
   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
   FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
-)    
+}}""".format(submissionOntology=submissionOntology[''].uri))
 
     results = missing_lib_query.execute(model)
     for row in results:
@@ -202,9 +233,10 @@ WHERE {{
         name = row['name']
         print "# {0}".format(name)
         print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
+        print "  encodeSubmit:library_urn"\
+              "<http://jumpgate.caltech.edu/library/> ."
         print ""
-    
+
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
@@ -213,7 +245,7 @@ def add_submission_creation_date(model, subUrn, cookie):
     query = RDF.Statement(subUrn, creationDateN, None)
     creation_dates = list(model.find_statements(query))
     if len(creation_dates) == 0:
-        logger.info("Getting creation date for: {0}".format(str(subUrn)))
+        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
         created_label = soup.find(text="Created: ")
         if created_label:
@@ -222,7 +254,8 @@ def add_submission_creation_date(model, subUrn, cookie):
                                          datatype=dateTimeType.uri)
             add_stmt(model, subUrn, creationDateN, created_date_node)
     else:
-        logger.debug("Found creation date for: {0}".format(str(subUrn)))
+        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -235,33 +268,46 @@ def update_submission_detail(model, subUrn, status, recent_update, cookie):
     if len(status_nodes) == 0:
         # has no status node, add one
         logging.info("Adding status node to {0}".format(subUrn))
-        status_blank = RDF.Node()
-        add_stmt(model, subUrn, HasStatusN, status_blank)
-        add_stmt(model, status_blank, rdfsNS['type'], StatusN)
-        add_stmt(model, status_blank, StatusN, status)
-        add_stmt(model, status_blank, LastModifyN, recent_update)
-        update_ddf(model, subUrn, status_blank, cookie=cookie)
+        status_node = create_status_node(subUrn, recent_update)
+        add_stmt(model, subUrn, HasStatusN, status_node)
+        add_stmt(model, status_node, TYPE_N, StatusN)
+        add_stmt(model, status_node, StatusN, status)
+        add_stmt(model, status_node, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_node, cookie=cookie)
+        update_daf(model, subUrn, status_node, cookie=cookie)
     else:
         logging.info("Found {0} status blanks".format(len(status_nodes)))
         for status_statement in status_nodes:
-            status_blank = status_statement.object
-            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
+            status_node = status_statement.object
+            last_modified_query = RDF.Statement(status_node,
+                                                LastModifyN,
+                                                None)
             last_mod_nodes = model.find_statements(last_modified_query)
             for last_mod_statement in last_mod_nodes:
                 last_mod_date = str(last_mod_statement.object)
                 if recent_update == str(last_mod_date):
-                    update_ddf(model, subUrn, status_blank, cookie=cookie)
+                    update_ddf(model, subUrn, status_node, cookie=cookie)
+                    update_daf(model, subUrn, status_node, cookie=cookie)
                     break
 
 
-    
+def update_daf(model, submission_url, status_node, cookie):
+    download_daf_uri = str(submission_url).replace('show', 'download_daf')
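+    # Rewrite the submission "show" page URL into its daf download URL, e.g.
+    # (hypothetical id) .../pipeline/show/1234 -> .../pipeline/download_daf/1234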
+    daf_uri = RDF.Uri(download_daf_uri)
+
+    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+    if not model.contains_statement(status_is_daf):
+        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
+                                                    status_node))
+        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+        daf.fromstring_into_model(model, status_node, daf_text)
+
+
 def update_ddf(model, subUrn, statusNode, cookie):
-    TypeN = rdfsNS['type']
-    
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
-    
-    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
+
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
     if not model.contains_statement(status_is_ddf):
         logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -275,8 +321,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [ ddfNS[x] for x in header ]
-    statements = []
+    attributes = [DDF_NS[x] for x in header]
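+    # The ddf body is tab separated: column 0 holds a comma separated list of
+    # file names and the remaining columns line up with the header row, e.g. a
+    # hypothetical record "rep1.bam,rep2.bam<TAB>GM12878<TAB>Input".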
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -284,18 +329,21 @@ def add_ddf_statements(model, statusNode, ddf_string):
             continue
         if ddf_line.startswith("#"):
             continue
-        
+
         ddf_record = ddf_line.split('\t')
         files = ddf_record[0].split(',')
         file_attributes = ddf_record[1:]
 
         for f in files:
             fileNode = RDF.Node()
-            add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
-            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
-            add_stmt(model, fileNode, ddfNS['filename'], f)
-
-            for predicate, object in zip( attributes[1:], file_attributes):
+            add_stmt(model,
+                     statusNode,
+                     submissionOntology['has_file'],
+                     fileNode)
+            add_stmt(model, fileNode, TYPE_N, DDF_NS['file'])
+            add_stmt(model, fileNode, DDF_NS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
 
 
@@ -303,19 +351,19 @@ def load_encode_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
-                     "/library/?affiliations__id__exact=80",]
+                     "/library/?affiliations__id__exact=80",
+                    ]
 
-    
     encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
     rdfaParser = RDF.Parser(name='rdfa')
     for encodeUrl in encodeUrls:
-        logger.info("Scanning library url {0}".format(encodeUrl))
+        LOGGER.info("Scanning library url {0}".format(encodeUrl))
         rdfaParser.parse_into_model(model, encodeUrl)
         query = RDF.Statement(None, libraryOntology['library_id'], None)
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            logger.info("Scanning {0}".format(str(libraryUrn)))
+            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
@@ -325,15 +373,17 @@ def load_library_detail(model, libraryUrn):
     rdfaParser = RDF.Parser(name='rdfa')
     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
     results = list(model.find_statements(query))
-    logger.debug("Found {0} statements for {1}".format(len(results), libraryUrn))
+    log_message = "Found {0} statements for {1}"
+    LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
-        logger.info("Loading {0}".format(str(libraryUrn)))
+        LOGGER.info("Loading {0}".format(str(libraryUrn)))
         rdfaParser.parse_into_model(model, libraryUrn.uri)
     elif len(results) == 1:
-        pass # Assuming that a loaded dataset has one record
+        pass  # Assuming that a loaded dataset has one record
     else:
         logging.warning("Many dates for {0}".format(libraryUrn))
-                        
+
+
 def get_library_id(name):
     """Guess library ID from library name
 
@@ -360,8 +410,13 @@ def get_contents(element):
         return a.contents[0].encode(CHARSET)
 
     return element.contents[0].encode(CHARSET)
-    
-    
+
+
+def create_status_node(submission_uri, timestamp):
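+    """Build a deterministic URI node for a submission's status.
+
+    Sketch of the intent: join the submission URI with the update timestamp so
+    that re-running the scraper reuses the same status node instead of minting
+    a new blank node each time.
+    """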
+    submission_uri = daf.submission_uri_to_string(submission_uri)
+    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    return RDF.Node(RDF.Uri(status_uri))
+
+
 def get_date_contents(element):
     data = get_contents(element)
     if data:
@@ -369,13 +424,12 @@ def get_date_contents(element):
     else:
         return None
 
-        
-def add_stmt(model, subject, predicate, object):
+
+def add_stmt(model, subject, predicate, rdf_object):
     """Convienence create RDF Statement and add to a model
     """
     return model.add_statement(
-        RDF.Statement(subject, predicate, object)
-    )
+        RDF.Statement(subject, predicate, rdf_object))
 
 
 def login(cookie=None):
@@ -383,7 +437,7 @@ def login(cookie=None):
     """
     if cookie is not None:
         return cookie
-    
+
     keys = keyring.get_keyring()
     password = keys.get_password(LOGIN_URL, USERNAME)
     credentials = {'login': USERNAME,
@@ -396,13 +450,13 @@ def login(cookie=None):
                                      body=urllib.urlencode(credentials))
     logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                     response['status']))
-    
+
     cookie = response.get('set-cookie', None)
     if cookie is None:
         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
     return cookie
 
-                
+
 def get_url_as_soup(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -411,15 +465,15 @@ def get_url_as_soup(url, method, cookie=None):
     response, content = http.request(url, method, headers=headers)
     if response['status'] == '200':
         soup = BeautifulSoup(content,
-                             fromEncoding="utf-8", # should read from header
-                             convertEntities=BeautifulSoup.HTML_ENTITIES
-                             )
+                             fromEncoding="utf-8",  # should read from header
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
         return soup
     else:
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 
+
 def get_url_as_text(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -432,7 +486,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-    
+
 ################
 #  old stuff
 SUBMISSIONS_LACKING_LIBID = [
@@ -452,18 +506,18 @@ SUBMISSIONS_LACKING_LIBID = [
     ]
 
 
-
 def select_by_library_id(submission_list):
-    subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
+    subl = [(x.library_id, x) for x in submission_list if x.library_id]
     libraries = {}
     for lib_id, subobj in subl:
         libraries.setdefault(lib_id, []).append(subobj)
 
     for submission in libraries.values():
         submission.sort(key=attrgetter('date'), reverse=True)
-        
+
     return libraries
 
+
 def library_to_freeze(selected_libraries):
     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
     lib_ids = sorted(selected_libraries.keys())
@@ -486,7 +540,7 @@ def library_to_freeze(selected_libraries):
     report.append('<tbody>')
     for lib_id in lib_ids:
         report.append('<tr>')
-        lib_url = libraryNS[lib_id].uri
+        lib_url = LIBRARY_NS[lib_id].uri
         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
         submissions = selected_libraries[lib_id]
         report.append('<td>{0}</td>'.format(submissions[0].name))
@@ -494,7 +548,6 @@ def library_to_freeze(selected_libraries):
         for sub in submissions:
             date = date_to_freeze(sub.date)
             batched.setdefault(date, []).append(sub)
-        print lib_id, batched
         for d in freezes:
             report.append('<td>')
             for s in batched.get(d, []):
@@ -509,12 +562,12 @@ def library_to_freeze(selected_libraries):
     report.append("</table></html>")
     return "\n".join(report)
 
-            
+
 def date_to_freeze(d):
-    freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
-                (datetime(2010, 7, 30), '2010-Jul'),
-                (datetime(2011, 1, 30), '2011-Jan'),
-                ]
+    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+               (datetime(2010, 7, 30), '2010-Jul'),
+               (datetime(2011, 1, 30), '2011-Jan'),
+               ]
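+    # A date maps to the first freeze whose cutoff it precedes, e.g. a
+    # (hypothetical) datetime(2010, 3, 1) maps to '2010-Jul'.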
     for end, name in freezes:
         if d < end:
             return name
@@ -523,4 +576,3 @@ def date_to_freeze(d):
 
 if __name__ == "__main__":
     main()
-