Attempt to download DAF data for an encodesubmit submission
author    Diane Trout <diane@caltech.edu>
          Fri, 2 Sep 2011 23:28:39 +0000 (16:28 -0700)
committer Diane Trout <diane@caltech.edu>
          Fri, 2 Sep 2011 23:28:39 +0000 (16:28 -0700)
This required making htsworkflow/submission/daf.py more flexible, so it
no longer assumes it will be given a "submission log name".

Also there are some changes to make pep8/pylint happier.

extra/ucsc_encode_submission/encode_find.py
htsworkflow/submission/daf.py
htsworkflow/submission/test/test_daf.py
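
For reference, a minimal sketch of the new calling convention, assuming the
redland (RDF) Python bindings and the htsworkflow helpers touched below; the
status URI and the DAF fragment are hypothetical:

    import RDF
    from htsworkflow.submission import daf
    from htsworkflow.util.rdfhelp import get_model

    model = get_model()  # in-memory RDF model, as used in the tests

    # fromstring_into_model() now accepts any RDF subject node instead of
    # deriving one from a "submission log name".
    status_node = RDF.Node(RDF.Uri(  # hypothetical status URI
        "http://encodesubmit.ucsc.edu/pipeline/show/1234/2011-09-02T16:28:39"))
    daf_text = "grant  Hardison\nlab    Caltech-m\n"  # tiny hypothetical DAF
    daf.fromstring_into_model(model, status_node, daf_text)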

extra/ucsc_encode_submission/encode_find.py
index 15acea6e92b2399c5edf1e60aae70f2ca7dfe6d8..f116281f3f07dd563455cf2de18d685e2f583652 100644
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
 
 from BeautifulSoup import BeautifulSoup
 from datetime import datetime
@@ -11,13 +14,16 @@ import logging
 import os
 import re
 # redland rdf lib
-import RDF 
+import RDF
 import sys
 import urllib
 import urlparse
 
+from htsworkflow.submission import daf
+
 from htsworkflow.util import api
 from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
      dublinCoreNS, \
      get_model, \
      get_serializer, \
@@ -28,18 +34,23 @@ from htsworkflow.util.rdfhelp import \
      rdfNS, \
      rdfsNS, \
      xsdNS
+TYPE_N = rdfNS['type']
 
 # URL mappings
-libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+     daf_download_url, \
+     ddf_download_url, \
+     submission_view_url, \
+     UCSCEncodePipeline
 
+DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
+DDF_NS = RDF.NS(DOWNLOAD_DDF)
 
-from htsworkflow.submission.ucsc import submission_view_url, UCSCEncodePipeline
-download_ddf = UCSCEncodePipeline+"download_ddf#"
-ddfNS = RDF.NS(download_ddf)
-               
 DBDIR = os.path.expanduser("~diane/proj/submission")
 
-logger = logging.getLogger("encode_find")
+LOGGER = logging.getLogger("encode_find")
 
 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
@@ -47,7 +58,14 @@ USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
 USERNAME = 'detrout'
 CHARSET = 'utf-8'
 
+
 def main(cmdline=None):
+    """
+    Parse command line arguments
+
+    Takes a list of arguments (assuming arg[0] is the program name) or None
+    If None, it looks at sys.argv
+    """
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
@@ -58,31 +76,38 @@ def main(cmdline=None):
 
     htsw_authdata = api.make_auth_from_opts(opts, parser)
     htswapi = api.HtswApi(opts.host, htsw_authdata)
-    
+
     cookie = None
     model = get_model(opts.load_model, DBDIR)
-    
+
     if opts.load_rdf is not None:
         ns_uri = submissionOntology[''].uri
         load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-        
+
+    if len(args) == 0:
+        limit = None
+    else:
+        limit = args
+
     if opts.update:
         cookie = login(cookie=cookie)
-        load_my_submissions(model, cookie=cookie)
+        load_my_submissions(model, limit=limit, cookie=cookie)
         load_encode_libraries(model, htswapi)
 
     if opts.sparql is not None:
         sparql_query(model, opts.sparql)
 
     if opts.find_submission_with_no_library:
-        missing = find_submissions_with_no_library(model)
-                
+        find_submissions_with_no_library(model)
+
     if opts.print_rdf:
         serializer = get_serializer(name=opts.rdf_parser_name)
         print serializer.serialize_model_to_string(model)
 
 
 def make_parser():
+    """Construct option parser
+    """
     parser = OptionParser()
     commands = OptionGroup(parser, "Commands")
     commands.add_option('--load-model', default=None,
@@ -98,15 +123,16 @@ def make_parser():
     #commands.add_option('--update-ddfs', action="store_true", default=False,
     #  help="download ddf information for known submission")
     #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, requires filename for extra rules")
+    #  help="download library info from htsw, "\
+    #       "requires filename for extra rules")
     parser.add_option_group(commands)
-                      
+
     queries = OptionGroup(parser, "Queries")
     queries.add_option('--sparql', default=None,
       help="execute arbitrary sparql query")
     queries.add_option('--find-submission-with-no-library', default=False,
       action="store_true",
-      help="find submissions with no library ID")    
+      help="find submissions with no library ID")
     parser.add_option_group(queries)
 
     options = OptionGroup(parser, "Options")
@@ -115,86 +141,91 @@ def make_parser():
     options.add_option("-v", "--verbose", action="store_true", default=False)
     options.add_option("--debug", action="store_true", default=False)
     parser.add_option_group(options)
-    
+
     api.add_auth_options(parser)
 
     return parser
 
-def load_my_submissions(model, cookie=None):
+
+def load_my_submissions(model, limit=None, cookie=None):
+    """Parse all the submissions from UCSC into model
+    It will look at the global USER_URL to figure out who to scrape
+    cookie contains the session cookie, if none, will attempt to login
+    """
     if cookie is None:
         cookie = login()
-        
+
     soup = get_url_as_soup(USER_URL, 'GET', cookie)
-    p = soup.find('table', attrs={'id':'projects'})
-    tr = p.findNext('tr')
+    projects = soup.find('table', attrs={'id': 'projects'})
+    table_row = projects.findNext('tr')
     # first record is header
-    tr = tr.findNext()
-    TypeN = rdfsNS['type']
-    NameN = submissionOntology['name']
-    SpeciesN = submissionOntology['species']
-    LibraryURN = submissionOntology['library_urn']
-
-    while tr is not None:
-        td = tr.findAll('td')
-        if td is not None and len(td) > 1:
-            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
-            subUrn = RDF.Uri(submission_view_url(subUrnText))
-
-            add_stmt(model, subUrn, TypeN, submissionOntology['Submission'])
-                
-            name = get_contents(td[4])
-            add_stmt(model, subUrn, NameN, name)
-                
-            species = get_contents(td[2])
-            if species is not None:
-                add_stmt(model, subUrn, SpeciesN, species)
-
-            library_id = get_library_id(name)
-            if library_id is not None:
-                add_submission_to_library_urn(model,
-                                              subUrn,
-                                              LibraryURN,
-                                              library_id)
-
-            add_submission_creation_date(model, subUrn, cookie)
-
-            # grab changing atttributes
-            status = get_contents(td[6]).strip()
-            last_mod_datetime = get_date_contents(td[8])
-            last_mod = last_mod_datetime.isoformat()
-
-            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
-
-            logging.info("Processed {0}".format( subUrn))
-            
-        tr = tr.findNext('tr')
+    table_row = table_row.findNext()
+    name_n = submissionOntology['name']
+    species_n = submissionOntology['species']
+    library_urn = submissionOntology['library_urn']
+
+    while table_row is not None:
+        cell = table_row.findAll('td')
+        if cell is not None and len(cell) > 1:
+            submission_id = cell[0].contents[0].contents[0].encode(CHARSET)
+            if limit is None or submission_id in limit:
+                subUrn = RDF.Uri(submission_view_url(submission_id))
+
+                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+
+                name = get_contents(cell[4])
+                add_stmt(model, subUrn, name_n, name)
+
+                species = get_contents(cell[2])
+                if species is not None:
+                    add_stmt(model, subUrn, species_n, species)
+
+                library_id = get_library_id(name)
+                if library_id is not None:
+                    add_submission_to_library_urn(model,
+                                                  subUrn,
+                                                  library_urn,
+                                                  library_id)
+
+                add_submission_creation_date(model, subUrn, cookie)
+
+                # grab changing attributes
+                status = get_contents(cell[6]).strip()
+                last_mod_datetime = get_date_contents(cell[8])
+                last_mod = last_mod_datetime.isoformat()
+
+                update_submission_detail(model, subUrn, status, last_mod,
+                                         cookie=cookie)
+
+                logging.info("Processed {0}".format(subUrn))
+
+        table_row = table_row.findNext('tr')
 
 
 def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
     """Add a link from a UCSC submission to woldlab library if needed
     """
-    libraryUrn = libraryNS[library_id+'/']
+    libraryUrn = LIBRARY_NS[library_id + '/']
     query = RDF.Statement(submissionUrn, predicate, libraryUrn)
     if not model.contains_statement(query):
         link = RDF.Statement(submissionUrn, predicate, libraryUrn)
-        logger.info("Adding Sub -> Lib link: {0}".format(link))
+        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
         model.add_statement(link)
     else:
-        logger.debug("Found: {0}".format(str(query)))
+        LOGGER.debug("Found: {0}".format(str(query)))
+
 
-    
 def find_submissions_with_no_library(model):
     missing_lib_query = RDF.SPARQLQuery("""
 PREFIX submissionOntology:<{submissionOntology}>
 
-SELECT 
+SELECT
  ?subid ?name
 WHERE {{
   ?subid submissionOntology:name ?name
   OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
   FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
-)    
+}}""".format(submissionOntology=submissionOntology[''].uri))
 
     results = missing_lib_query.execute(model)
     for row in results:
@@ -202,9 +233,10 @@ WHERE {{
         name = row['name']
         print "# {0}".format(name)
         print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
+        print "  encodeSubmit:library_urn"\
+              " <http://jumpgate.caltech.edu/library/> ."
         print ""
-    
+
 
 def add_submission_creation_date(model, subUrn, cookie):
     # in theory the submission page might have more information on it.
@@ -213,7 +245,7 @@ def add_submission_creation_date(model, subUrn, cookie):
     query = RDF.Statement(subUrn, creationDateN, None)
     creation_dates = list(model.find_statements(query))
     if len(creation_dates) == 0:
-        logger.info("Getting creation date for: {0}".format(str(subUrn)))
+        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
         soup = get_url_as_soup(str(subUrn), 'GET', cookie)
         created_label = soup.find(text="Created: ")
         if created_label:
@@ -222,7 +254,8 @@ def add_submission_creation_date(model, subUrn, cookie):
                                          datatype=dateTimeType.uri)
             add_stmt(model, subUrn, creationDateN, created_date_node)
     else:
-        logger.debug("Found creation date for: {0}".format(str(subUrn)))
+        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
 
 def update_submission_detail(model, subUrn, status, recent_update, cookie):
     HasStatusN = submissionOntology['has_status']
@@ -235,33 +268,46 @@ def update_submission_detail(model, subUrn, status, recent_update, cookie):
     if len(status_nodes) == 0:
         # has no status node, add one
         logging.info("Adding status node to {0}".format(subUrn))
-        status_blank = RDF.Node()
-        add_stmt(model, subUrn, HasStatusN, status_blank)
-        add_stmt(model, status_blank, rdfsNS['type'], StatusN)
-        add_stmt(model, status_blank, StatusN, status)
-        add_stmt(model, status_blank, LastModifyN, recent_update)
-        update_ddf(model, subUrn, status_blank, cookie=cookie)
+        status_node = create_status_node(subUrn, recent_update)
+        add_stmt(model, subUrn, HasStatusN, status_node)
+        add_stmt(model, status_node, rdfsNS['type'], StatusN)
+        add_stmt(model, status_node, StatusN, status)
+        add_stmt(model, status_node, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_node, cookie=cookie)
+        update_daf(model, subUrn, status_node, cookie=cookie)
     else:
         logging.info("Found {0} status blanks".format(len(status_nodes)))
         for status_statement in status_nodes:
-            status_blank = status_statement.object
-            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
+            status_node = status_statement.object
+            last_modified_query = RDF.Statement(status_node,
+                                                LastModifyN,
+                                                None)
             last_mod_nodes = model.find_statements(last_modified_query)
             for last_mod_statement in last_mod_nodes:
                 last_mod_date = str(last_mod_statement.object)
                 if recent_update == str(last_mod_date):
-                    update_ddf(model, subUrn, status_blank, cookie=cookie)
+                    update_ddf(model, subUrn, status_node, cookie=cookie)
+                    update_daf(model, subUrn, status_node, cookie=cookie)
                     break
 
 
-    
+def update_daf(model, submission_url, status_node, cookie):
+    download_daf_uri = str(submission_url).replace('show', 'download_daf')
+    daf_uri = RDF.Uri(download_daf_uri)
+
+    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+    if not model.contains_statement(status_is_daf):
+        logging.info('Adding daf to {0}, {1}'.format(submission_url,
+                                                     status_node))
+        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+        daf.fromstring_into_model(model, status_node, daf_text)
+
+
 def update_ddf(model, subUrn, statusNode, cookie):
-    TypeN = rdfsNS['type']
-    
     download_ddf_url = str(subUrn).replace('show', 'download_ddf')
     ddfUrn = RDF.Uri(download_ddf_url)
-    
-    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS[''])
+
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
     if not model.contains_statement(status_is_ddf):
         logging.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
         ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
@@ -275,8 +321,7 @@ def add_ddf_statements(model, statusNode, ddf_string):
     ddf_lines = ddf_string.split('\n')
     # first line is header
     header = ddf_lines[0].split()
-    attributes = [ ddfNS[x] for x in header ]
-    statements = []
+    attributes = [DDF_NS[x] for x in header]
 
     for ddf_line in ddf_lines[1:]:
         ddf_line = ddf_line.strip()
@@ -284,18 +329,21 @@ def add_ddf_statements(model, statusNode, ddf_string):
             continue
         if ddf_line.startswith("#"):
             continue
-        
+
         ddf_record = ddf_line.split('\t')
         files = ddf_record[0].split(',')
         file_attributes = ddf_record[1:]
 
         for f in files:
             fileNode = RDF.Node()
-            add_stmt(model, statusNode, submissionOntology['has_file'], fileNode)
-            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
-            add_stmt(model, fileNode, ddfNS['filename'], f)
-
-            for predicate, object in zip( attributes[1:], file_attributes):
+            add_stmt(model,
+                     statusNode,
+                     submissionOntology['has_file'],
+                     fileNode)
+            add_stmt(model, fileNode, rdfsNS['type'], DDF_NS['file'])
+            add_stmt(model, fileNode, DDF_NS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
                 add_stmt(model, fileNode, predicate, object)
 
 
@@ -303,19 +351,19 @@ def load_encode_libraries(model, htswapi):
     """Get libraries associated with encode.
     """
     encodeFilters = ["/library/?affiliations__id__exact=44",
-                     "/library/?affiliations__id__exact=80",]
+                     "/library/?affiliations__id__exact=80",
+                    ]
 
-    
     encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
     rdfaParser = RDF.Parser(name='rdfa')
     for encodeUrl in encodeUrls:
-        logger.info("Scanning library url {0}".format(encodeUrl))
+        LOGGER.info("Scanning library url {0}".format(encodeUrl))
         rdfaParser.parse_into_model(model, encodeUrl)
         query = RDF.Statement(None, libraryOntology['library_id'], None)
         libraries = model.find_statements(query)
         for statement in libraries:
             libraryUrn = statement.subject
-            logger.info("Scanning {0}".format(str(libraryUrn)))
+            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
             load_library_detail(model, libraryUrn)
 
 
@@ -325,15 +373,17 @@ def load_library_detail(model, libraryUrn):
     rdfaParser = RDF.Parser(name='rdfa')
     query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
     results = list(model.find_statements(query))
-    logger.debug("Found {0} statements for {1}".format(len(results), libraryUrn))
+    log_message = "Found {0} statements for {1}"
+    LOGGER.debug(log_message.format(len(results), libraryUrn))
     if len(results) == 0:
-        logger.info("Loading {0}".format(str(libraryUrn)))
+        LOGGER.info("Loading {0}".format(str(libraryUrn)))
         rdfaParser.parse_into_model(model, libraryUrn.uri)
     elif len(results) == 1:
-        pass # Assuming that a loaded dataset has one record
+        pass  # Assuming that a loaded dataset has one record
     else:
         logging.warning("Many dates for {0}".format(libraryUrn))
-                        
+
+
 def get_library_id(name):
     """Guess library ID from library name
 
@@ -360,8 +410,13 @@ def get_contents(element):
         return a.contents[0].encode(CHARSET)
 
     return element.contents[0].encode(CHARSET)
-    
-    
+
+
+def create_status_node(submission_uri, timestamp):
+    submission_uri = daf.submission_uri_to_string(submission_uri)
+    status_uri = urlparse.urljoin(submission_uri, timestamp)
+    return RDF.Node(RDF.Uri(status_uri))
+
 def get_date_contents(element):
     data = get_contents(element)
     if data:
@@ -369,13 +424,12 @@ def get_date_contents(element):
     else:
         return None
 
-        
-def add_stmt(model, subject, predicate, object):
+
+def add_stmt(model, subject, predicate, rdf_object):
     """Convienence create RDF Statement and add to a model
     """
     return model.add_statement(
-        RDF.Statement(subject, predicate, object)
-    )
+        RDF.Statement(subject, predicate, rdf_object))
 
 
 def login(cookie=None):
@@ -383,7 +437,7 @@ def login(cookie=None):
     """
     if cookie is not None:
         return cookie
-    
+
     keys = keyring.get_keyring()
     password = keys.get_password(LOGIN_URL, USERNAME)
     credentials = {'login': USERNAME,
@@ -396,13 +450,13 @@ def login(cookie=None):
                                      body=urllib.urlencode(credentials))
     logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
                                                     response['status']))
-    
+
     cookie = response.get('set-cookie', None)
     if cookie is None:
         raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
     return cookie
 
-                
+
 def get_url_as_soup(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -411,15 +465,15 @@ def get_url_as_soup(url, method, cookie=None):
     response, content = http.request(url, method, headers=headers)
     if response['status'] == '200':
         soup = BeautifulSoup(content,
-                             fromEncoding="utf-8", # should read from header
-                             convertEntities=BeautifulSoup.HTML_ENTITIES
-                             )
+                             fromEncoding="utf-8",  # should read from header
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
         return soup
     else:
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 
+
 def get_url_as_text(url, method, cookie=None):
     http = httplib2.Http()
     headers = {}
@@ -432,7 +486,7 @@ def get_url_as_text(url, method, cookie=None):
         msg = "error accessing {0}, status {1}"
         msg = msg.format(url, response['status'])
         e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-    
+
 ################
 #  old stuff
 SUBMISSIONS_LACKING_LIBID = [
@@ -452,18 +506,18 @@ SUBMISSIONS_LACKING_LIBID = [
     ]
 
 
-
 def select_by_library_id(submission_list):
-    subl = [ (x.library_id, x) for x in submission_list if x.library_id ]
+    subl = [(x.library_id, x) for x in submission_list if x.library_id]
     libraries = {}
     for lib_id, subobj in subl:
         libraries.setdefault(lib_id, []).append(subobj)
 
     for submission in libraries.values():
         submission.sort(key=attrgetter('date'), reverse=True)
-        
+
     return libraries
 
+
 def library_to_freeze(selected_libraries):
     freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
     lib_ids = sorted(selected_libraries.keys())
@@ -486,7 +540,7 @@ def library_to_freeze(selected_libraries):
     report.append('<tbody>')
     for lib_id in lib_ids:
         report.append('<tr>')
-        lib_url = libraryNS[lib_id].uri
+        lib_url = LIBRARY_NS[lib_id].uri
         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
         submissions = selected_libraries[lib_id]
         report.append('<td>{0}</td>'.format(submissions[0].name))
@@ -494,7 +548,6 @@ def library_to_freeze(selected_libraries):
         for sub in submissions:
             date = date_to_freeze(sub.date)
             batched.setdefault(date, []).append(sub)
-        print lib_id, batched
         for d in freezes:
             report.append('<td>')
             for s in batched.get(d, []):
@@ -509,12 +562,12 @@ def library_to_freeze(selected_libraries):
     report.append("</table></html>")
     return "\n".join(report)
 
-            
+
 def date_to_freeze(d):
-    freezes = [ (datetime(2010, 1, 30), '2010-Jan'),
-                (datetime(2010, 7, 30), '2010-Jul'),
-                (datetime(2011, 1, 30), '2011-Jan'),
-                ]
+    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+               (datetime(2010, 7, 30), '2010-Jul'),
+               (datetime(2011, 1, 30), '2011-Jan'),
+               ]
     for end, name in freezes:
         if d < end:
             return name
@@ -523,4 +576,3 @@ def date_to_freeze(d):
 
 if __name__ == "__main__":
     main()
-    
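
A short sketch of the two encode_find.py behavior changes above, using a
hypothetical submission id and timestamp; limit=None keeps the old
scrape-everything behavior:

    import urlparse

    def want_submission(submission_id, limit=None):
        """Mirror of the filter used inside load_my_submissions."""
        return limit is None or submission_id in limit

    assert want_submission('1234')                # no limit: every row
    assert not want_submission('5678', ['1234'])  # not requested: skipped

    # Status nodes are now named URIs (submission URI + timestamp) rather
    # than blank nodes, so DAF/DDF statements can be attached to them.
    submission_uri = 'http://encodesubmit.ucsc.edu/pipeline/show/1234/'
    status_uri = urlparse.urljoin(submission_uri, '2011-09-02T16:28:39')
    assert status_uri == submission_uri + '2011-09-02T16:28:39'
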
htsworkflow/submission/daf.py
index b8a4177fd46fd164ea9f1940c1ecc99ce8c92eab..1be882b051eba71ffe274c8b2b905500c70e67ca 100644
@@ -24,47 +24,64 @@ from htsworkflow.util.hashfile import make_md5sum
 
 logger = logging.getLogger(__name__)
 
-#
-class ModelException(RuntimeError): pass
+
+class ModelException(RuntimeError):
+    """Assumptions about the RDF model failed"""
+    pass
+
+
 class MetadataLookupException(RuntimeError):
     """Problem accessing metadata"""
     pass
 
+
 # STATES
 DAF_HEADER = 1
 DAF_VIEW = 2
 
-def parse_into_model(model, submission_name, filename):
+
+def parse_into_model(model, subject, filename):
     """Read a DAF into RDF Model
 
-    requires a short submission name
+    requires a subject node to attach statements to
     """
     attributes = parse(filename)
-    add_to_model(model, attributes, submission_name)
+    add_to_model(model, attributes, subject)
 
-def fromstream_into_model(model, submission_name, daf_stream):
+
+def fromstream_into_model(model, subject, daf_stream):
+    """Load daf stream into model attached to node subject
+    """
     attributes = parse_stream(daf_stream)
-    add_to_model(model, attributes, submission_name)
-    
-def fromstring_into_model(model, submission_name, daf_string):
+    add_to_model(model, attributes, subject)
+
+
+def fromstring_into_model(model, subject, daf_string):
     """Read a string containing a DAF into RDF Model
 
-    requires a short submission name
+    requires a subject node to attach statements to
     """
     attributes = fromstring(daf_string)
-    add_to_model(model, attributes, submission_name)
-    
+    add_to_model(model, attributes, subject)
+
+
 def parse(filename):
-    stream = open(filename,'r')
-    attributes =  parse_stream(stream)
+    """Parse daf from a file
+    """
+    stream = open(filename, 'r')
+    attributes = parse_stream(stream)
     stream.close()
     return attributes
 
+
 def fromstring(daf_string):
+    """Parse UCSC daf from a provided string"""
     stream = StringIO(daf_string)
     return parse_stream(stream)
 
+
 def parse_stream(stream):
+    """Parse UCSC dat stored in a stream"""
     comment_re = re.compile("#.*$")
 
     state = DAF_HEADER
@@ -84,7 +101,7 @@ def parse_stream(stream):
             value = True
         elif value.lower() in ('no',):
             value = False
-            
+
         if len(name) == 0:
             if view_name is not None:
                 attributes['views'][view_name] = view_attributes
@@ -92,7 +109,7 @@ def parse_stream(stream):
                 view_attributes = {}
             state = DAF_HEADER
         elif state == DAF_HEADER and name == 'variables':
-            attributes[name] = [ x.strip() for x in value.split(',')]
+            attributes[name] = [x.strip() for x in value.split(',')]
         elif state == DAF_HEADER and name == 'view':
             view_name = value
             view_attributes['view'] = value
@@ -105,36 +122,51 @@ def parse_stream(stream):
     # save last block
     if view_name is not None:
         attributes['views'][view_name] = view_attributes
-        
+
     return attributes
 
+
 def _consume_whitespace(line, start=0):
+    """return index of next non whitespace character
+
+    returns length of string if it can't find anything
+    """
     for i in xrange(start, len(line)):
         if line[i] not in string.whitespace:
             return i
-        
+
     return len(line)
 
+
 def _extract_name_index(line, start=0):
+    """Used to find end of word by looking for a whitespace character
+
+    returns length of string if nothing matches
+    """
     for i in xrange(start, len(line)):
         if line[i] in string.whitespace:
             return i
-        
+
     return len(line)
 
+
 def _extract_value_index(line, start=0):
+    """Returns position of last non-whitespace character
+    """
     shortline = line.rstrip()
     return len(shortline)
 
-def convert_to_rdf_statements(attributes, name):
-    submission_uri = get_submission_uri(name)
-    subject = RDF.Node(submission_uri)
 
+def convert_to_rdf_statements(attributes, subject):
+    """Convert dictionary of DAF attributes into rdf statements
+
+    The statements are attached to the provided subject node
+    """
     statements = []
     for daf_key in attributes:
         predicate = dafTermOntology[daf_key]
         if daf_key == 'views':
-            statements.extend(_views_to_statements(name,
+            statements.extend(_views_to_statements(subject,
                                                    dafTermOntology,
                                                    attributes[daf_key]))
         elif daf_key == 'variables':
@@ -145,13 +177,15 @@ def convert_to_rdf_statements(attributes, name):
         else:
             value = attributes[daf_key]
             obj = toTypedNode(value)
-            statements.append(RDF.Statement(subject,predicate,obj))
+            statements.append(RDF.Statement(subject, predicate, obj))
 
     return statements
 
-def _views_to_statements(name, dafNS, views):
-    subject = RDF.Node(get_submission_uri(name))
-    viewNS = get_view_namespace(name)
+
+def _views_to_statements(subject, dafNS, views):
+    """Attach view attributes to new view nodes atached to provided subject
+    """
+    viewNS = get_view_namespace(subject)
 
     statements = []
     for view_name in views:
@@ -163,23 +197,38 @@ def _views_to_statements(name, dafNS, views):
         for view_attribute_name in view_attributes:
             predicate = dafNS[view_attribute_name]
             obj = toTypedNode(view_attributes[view_attribute_name])
-            statements.append(RDF.Statement(viewSubject, predicate, obj)) 
-            
+            statements.append(RDF.Statement(viewSubject, predicate, obj))
+
         #statements.extend(convert_to_rdf_statements(view, viewNode))
     return statements
 
-def add_to_model(model, attributes, name):
-    for statement in convert_to_rdf_statements(attributes, name):
+
+def add_to_model(model, attributes, subject):
+    for statement in convert_to_rdf_statements(attributes, subject):
         model.add_statement(statement)
-            
+
+
 def get_submission_uri(name):
-    return  submissionLog[name].uri
+    return submissionLog[name].uri
+
+
+def submission_uri_to_string(submission_uri):
+    if isinstance(submission_uri, RDF.Node):
+        submission_uri = str(submission_uri.uri)
+    elif isinstance(submission_uri, RDF.Uri):
+        submission_uri = str(submission_uri)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    return submission_uri
 
-def get_view_namespace(name):
-    submission_uri = get_submission_uri(name)
-    viewNS = RDF.NS(str(submission_uri) + '/view/')
+
+def get_view_namespace(submission_uri):
+    submission_uri = submission_uri_to_string(submission_uri)
+    view_uri = urlparse.urljoin(submission_uri, 'view/')
+    viewNS = RDF.NS(view_uri)
     return viewNS
 
+
 class DAFMapper(object):
     """Convert filenames to views in the UCSC Daf
     """
@@ -200,6 +249,9 @@ class DAFMapper(object):
             logger.error("We need a DAF or Model containing a DAF to work")
 
         self.name = name
+        self.submissionSet = get_submission_uri(self.name)
+        self.viewNS = get_view_namespace(self.submissionSet)
+
         if model is not None:
             self.model = model
         else:
@@ -207,46 +259,40 @@ class DAFMapper(object):
 
         if hasattr(daf_file, 'next'):
             # its some kind of stream
-            fromstream_into_model(self.model, name, daf_file)
+            fromstream_into_model(self.model, self.submissionSet, daf_file)
         else:
             # file
-            parse_into_model(self.model, name, daf_file)
+            parse_into_model(self.model, self.submissionSet, daf_file)
 
         self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
-        self.submissionSet = get_submission_uri(self.name)
-        self.submissionSetNS = RDF.NS(str(self.submissionSet)+'/')
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
         self.__view_map = None
 
-
     def add_pattern(self, view_name, filename_pattern):
         """Map a filename regular expression to a view name
         """
-        viewNS = get_view_namespace(self.name)
-
         obj = toTypedNode(filename_pattern)
         self.model.add_statement(
-            RDF.Statement(viewNS[view_name],
+            RDF.Statement(self.viewNS[view_name],
                           dafTermOntology['filename_re'],
                           obj))
 
-
     def import_submission_dir(self, submission_dir, library_id):
         """Import a submission directories and update our model as needed
         """
         #attributes = get_filename_attribute_map(paired)
         libNode = self.libraryNS[library_id + "/"]
-        
+
         self._add_library_details_to_model(libNode)
-        
+
         submission_files = os.listdir(submission_dir)
-        for f in submission_files:
-            self.construct_file_attributes(submission_dir, libNode, f)
+        for filename in submission_files:
+            self.construct_track_attributes(submission_dir, libNode, filename)
 
-        
-    def construct_file_attributes(self, submission_dir, libNode, pathname):
+    def construct_track_attributes(self, submission_dir, libNode, pathname):
         """Looking for the best extension
         The 'best' is the longest match
-        
+
         :Args:
         filename (str): the filename whose extention we are about to examine
         """
@@ -255,7 +301,7 @@ class DAFMapper(object):
         logger.debug("Searching for view")
         view = self.find_view(filename)
         if view is None:
-            logger.warn("Unrecognized file: %s" % (pathname,))
+            logger.warn("Unrecognized file: {0}".format(pathname))
             return None
         if str(view) == str(libraryOntology['ignore']):
             return None
@@ -263,55 +309,81 @@ class DAFMapper(object):
         submission_name = self.make_submission_name(submission_dir)
         submissionNode = self.get_submission_node(submission_dir)
         submission_uri = str(submissionNode.uri)
-        view_name = fromTypedNode(self.model.get_target(view, dafTermOntology['name']))
+        view_name = fromTypedNode(self.model.get_target(view,
+                                       dafTermOntology['name']))
         if view_name is None:
-            logging.warning('Could not find view name for {0}'.format(str(view)))
+            errmsg = 'Could not find view name for {0}'
+            logging.warning(errmsg.format(str(view)))
             return
-        
+
         view_name = str(view_name)
         submissionView = RDF.Node(RDF.Uri(submission_uri + '/' + view_name))
 
         self.model.add_statement(
-            RDF.Statement(self.submissionSet, dafTermOntology['has_submission'], submissionNode))
+            RDF.Statement(self.submissionSet,
+                          dafTermOntology['has_submission'],
+                          submissionNode))
         logger.debug("Adding statements to {0}".format(str(submissionNode)))
-        self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['has_view'], submissionView))
-        self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['name'], toTypedNode(submission_name)))
-        self.model.add_statement(RDF.Statement(submissionNode, rdfNS['type'], submissionOntology['submission']))
-        self.model.add_statement(RDF.Statement(submissionNode, submissionOntology['library'], libNode))
+        self.model.add_statement(RDF.Statement(submissionNode,
+                                               submissionOntology['has_view'],
+                                               submissionView))
+        self.model.add_statement(RDF.Statement(submissionNode,
+                                               submissionOntology['name'],
+                                               toTypedNode(submission_name)))
+        self.model.add_statement(
+            RDF.Statement(submissionNode,
+                          rdfNS['type'],
+                          submissionOntology['submission']))
+        self.model.add_statement(RDF.Statement(submissionNode,
+                                               submissionOntology['library'],
+                                               libNode))
 
         logger.debug("Adding statements to {0}".format(str(submissionView)))
-        # add trac specific information
+        # add track specific information
         self.model.add_statement(
             RDF.Statement(submissionView, dafTermOntology['view'], view))
         self.model.add_statement(
-            RDF.Statement(submissionView, dafTermOntology['paired'], toTypedNode(self._is_paired(libNode))))
+            RDF.Statement(submissionView,
+                          dafTermOntology['paired'],
+                          toTypedNode(self._is_paired(libNode))))
         self.model.add_statement(
-            RDF.Statement(submissionView, dafTermOntology['submission'], submissionNode))
+            RDF.Statement(submissionView,
+                          dafTermOntology['submission'],
+                          submissionNode))
 
-        # extra information 
+        # extra information
         terms = [dafTermOntology['type'],
                  dafTermOntology['filename_re'],
                  ]
         terms.extend((dafTermOntology[v] for v in self.get_daf_variables()))
 
+        # add file specific information
+        self.create_file_attributes(filename, submissionView, submission_uri, submission_dir)
+
+        logger.debug("Done.")
+
+    def create_file_attributes(self, filename, submissionView, submission_uri, submission_dir):
         # add file specific information
         logger.debug("Updating file md5sum")
         fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
         submission_pathname = os.path.join(submission_dir, filename)
-        md5 = make_md5sum(submission_pathname)
         self.model.add_statement(
-            RDF.Statement(submissionView, dafTermOntology['has_file'], fileNode))
+            RDF.Statement(submissionView,
+                          dafTermOntology['has_file'],
+                          fileNode))
         self.model.add_statement(
-            RDF.Statement(fileNode, dafTermOntology['filename'], filename))
+            RDF.Statement(fileNode,
+                          dafTermOntology['filename'],
+                          filename))
 
+        md5 = make_md5sum(submission_pathname)
         if md5 is None:
-            logging.warning("Unable to produce md5sum for %s" % ( submission_pathname))
+            errmsg = "Unable to produce md5sum for {0}"
+            logging.warning(errmsg.format(submission_pathname))
         else:
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
-        logger.debug("Done.")
-        
     def _add_library_details_to_model(self, libNode):
         parser = RDF.Parser(name='rdfa')
         new_statements = parser.parse_as_stream(libNode.uri)
@@ -328,7 +400,7 @@ class DAFMapper(object):
         results = ['view']
         if self.need_replicate():
             results.append('replicate')
-            
+
         for obj in self.model.get_targets(self.submissionSet, variableTerm):
             value = str(fromTypedNode(obj))
             results.append(value)
@@ -340,15 +412,15 @@ class DAFMapper(object):
         submission_dir_name = os.path.split(submission_dir)[1]
         if len(submission_dir_name) == 0:
             raise RuntimeError(
-                "Submission dir name too short: %s" %(submission_dir,))
+                "Submission dir name too short: {0}".format(submission_dir))
         return submission_dir_name
-        
+
     def get_submission_node(self, submission_dir):
         """Convert a submission directory name to a submission node
         """
         submission_name = self.make_submission_name(submission_dir)
         return self.submissionSetNS[submission_name]
-        
+
     def _get_library_attribute(self, libNode, attribute):
         if not isinstance(attribute, RDF.Node):
             attribute = libraryOntology[attribute]
@@ -362,7 +434,7 @@ class DAFMapper(object):
         #targets = self._search_same_as(libNode, attribute)
         #if targets is not None:
         #    return self._format_library_attribute(targets)
-        
+
         # we don't know anything about this attribute
         self._add_library_details_to_model(libNode)
 
@@ -371,7 +443,7 @@ class DAFMapper(object):
             return self._format_library_attribute(targets)
 
         return None
-    
+
     def _format_library_attribute(self, targets):
         if len(targets) == 0:
             return None
@@ -388,7 +460,7 @@ class DAFMapper(object):
             if len(targets) > 0:
                 return targets
         return None
-        
+
     def find_view(self, filename):
         """Search through potential DAF filename patterns
         """
@@ -409,9 +481,10 @@ class DAFMapper(object):
             return results[0]
         else:
             return None
-        
+
     def get_view_name(self, view):
-        names = list(self.model.get_targets(view, submissionOntology['view_name']))
+        view_term = submissionOntology['view_name']
+        names = list(self.model.get_targets(view, view_term))
         if len(names) == 1:
             return fromTypedNode(names[0])
         else:
@@ -419,8 +492,7 @@ class DAFMapper(object):
             msg = msg.format(str(view), len(names))
             logger.error(msg)
             raise RuntimeError(msg)
-        
-            
+
     def _get_filename_view_map(self):
         """Query our model for filename patterns
 
@@ -443,16 +515,19 @@ class DAFMapper(object):
 
     def _get_library_url(self):
         return str(self.libraryNS[''].uri)
+
     def _set_library_url(self, value):
         self.libraryNS = RDF.NS(str(value))
+
     library_url = property(_get_library_url, _set_library_url)
 
     def _is_paired(self, libNode):
         """Determine if a library is paired end"""
         library_type = self._get_library_attribute(libNode, 'library_type')
         if library_type is None:
-            raise ModelException("%s doesn't have a library type" % (str(libNode),))
-        
+            errmsg = "%s doesn't have a library type"
+            raise ModelException(errmsg % (str(libNode),))
+
         #single = (1,3,6)
         single = ['Single End', 'Small RNA', 'CSHL (lacking last nt)']
         paired = ['Paired End', 'Multiplexing', 'Barcoded']
@@ -475,5 +550,5 @@ class DAFMapper(object):
             replicate = self.model.get_target(view, replicateTerm)
             if fromTypedNode(replicate):
                 return True
-            
+
         return False
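
The new submission_uri_to_string()/get_view_namespace() pair accepts a plain
string, an RDF.Uri, or an RDF.Node. A sketch of the normalization, using the
SubmissionLog URL from the tests below:

    import RDF
    from htsworkflow.submission import daf

    url = "http://jumpgate.caltech.edu/wiki/SubmissionLog/cursub"

    # A trailing slash is added if missing, so view URIs come out stable.
    assert daf.submission_uri_to_string(url) == url + '/'
    assert daf.submission_uri_to_string(RDF.Uri(url + '/')) == url + '/'

    view_ns = daf.get_view_namespace(RDF.Node(RDF.Uri(url)))
    assert str(view_ns['Signal'].uri) == url + '/view/Signal'
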
htsworkflow/submission/test/test_daf.py
index 4f77799c0050a73d774e11476f77c7290b0410ca..3d31c95807ad38753c4c49735933916c924208e8 100644
@@ -20,7 +20,7 @@ import RDF
 test_daf = """# Lab and general info
 grant             Hardison
 lab               Caltech-m
-dataType          ChipSeq 
+dataType          ChipSeq
 variables         cell, antibody,sex,age,strain,control
 compositeSuffix   CaltechHistone
 assembly          mm9
@@ -44,7 +44,7 @@ required         no
 test_daf_no_rep = """# Lab and general info
 grant             Hardison
 lab               Caltech-m
-dataType          ChipSeq 
+dataType          ChipSeq
 variables         cell, antibody,sex,age,strain,control
 compositeSuffix   CaltechHistone
 assembly          mm9
@@ -63,7 +63,7 @@ class TestDAF(unittest.TestCase):
     def test_parse(self):
 
         parsed = daf.fromstring(test_daf)
-        
+
         self.failUnlessEqual(parsed['assembly'], 'mm9')
         self.failUnlessEqual(parsed['grant'], 'Hardison')
         self.failUnlessEqual(len(parsed['variables']), 6)
@@ -85,7 +85,7 @@ class TestDAF(unittest.TestCase):
 
         name = 'cursub'
         subNS = RDF.NS(str(submissionLog[name].uri))
-        daf.add_to_model(model, parsed, name)
+        daf.add_to_model(model, parsed, submissionLog[name].uri)
 
         signal_view_node = RDF.Node(subNS['/view/Signal'].uri)
 
@@ -101,18 +101,38 @@ class TestDAF(unittest.TestCase):
         name = model.get_target(signal_view_node, dafTermOntology['name'])
         self.failUnlessEqual(fromTypedNode(name), u'Signal')
 
+    def test_get_view_namespace_from_string(self):
+        url = "http://jumpgate.caltech.edu/wiki/SubmissionLog/cursub/"
+        target = RDF.NS(url + 'view/')
+        view_namespace = daf.get_view_namespace(url)
+        self.assertEqual(view_namespace[''], target[''])
+
+    def test_get_view_namespace_from_string_no_trailing_slash(self):
+        url = "http://jumpgate.caltech.edu/wiki/SubmissionLog/cursub"
+        target = RDF.NS(url + '/view/')
+        view_namespace = daf.get_view_namespace(url)
+        self.assertEqual(view_namespace[''], target[''])
+
+    def test_get_view_namespace_from_uri_node(self):
+        url = "http://jumpgate.caltech.edu/wiki/SubmissionLog/cursub/"
+        node = RDF.Node(RDF.Uri(url))
+        target = RDF.NS(url + 'view/')
+        view_namespace = daf.get_view_namespace(node)
+        self.assertEqual(view_namespace[''], target[''])
+
+
 def load_daf_mapper(name, extra_statements=None, ns=None, test_daf=test_daf):
     """Load test model in
     """
     model = get_model()
     if ns is None:
         ns="http://extra"
-        
+
     if extra_statements is not None:
         parser = RDF.Parser(name='turtle')
         parser.parse_string_into_model(model, extra_statements,
                                        ns)
-        
+
     test_daf_stream = StringIO(test_daf)
     mapper = daf.DAFMapper(name, daf_file = test_daf_stream, model=model)
     return mapper
@@ -121,7 +141,7 @@ def dump_model(model):
     writer = get_serializer()
     turtle =  writer.serialize_model_to_string(model)
     print turtle
-    
+
 class TestDAFMapper(unittest.TestCase):
     def test_create_mapper_add_pattern(self):
         name = 'testsub'
@@ -129,7 +149,7 @@ class TestDAFMapper(unittest.TestCase):
         pattern = '.bam\Z(?ms)'
         mapper.add_pattern('Signal', pattern)
 
-        s = RDF.Statement(daf.get_view_namespace(name)['Signal'],
+        s = RDF.Statement(mapper.viewNS['Signal'],
                           dafTermOntology['filename_re'],
                           None)
         search = list(mapper.model.find_statements(s))
@@ -140,7 +160,7 @@ class TestDAFMapper(unittest.TestCase):
                              str(dafTermOntology['filename_re']))
         #self.failUnlessEqual(search[0].object.literal_value['string'], pattern)
 
-        
+
     def test_find_one_view(self):
         name='testfind'
         extra = '''@prefix dafTerm:<http://jumpgate.caltech.edu/wiki/UcscDaf#> .
@@ -152,7 +172,7 @@ thisView:FastqRd1 dafTerm:filename_re ".*_r1\\\\.fastq" .
         daf_mapper = load_daf_mapper(name, extra_statements = extra)
 
         view = daf_mapper.find_view('filename_r1.fastq')
-        
+
         # dump_model(daf_mapper.model)
         view_root = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/{0}/view/'
         view_root = view_root.format(name)
@@ -185,7 +205,7 @@ thisView:Signal dafTerm:filename_re ".*\\\\.bam" ;
       submissionOntology:view_name "Signal" .
 thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
         submissionOntology:view_name "FastqRd1" .
-<%(libUrl)s> <%(libraryOntology)sgel_cut> "100"^^xsd:decimal . 
+<%(libUrl)s> <%(libraryOntology)sgel_cut> "100"^^xsd:decimal .
 ''' % {'libraryOntology': 'http://jumpgate.caltech.edu/wiki/LibraryOntology#',
        'libUrl': lib_url}
 
@@ -203,13 +223,12 @@ thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
         with mktempdir('analysis') as analysis_dir:
             path, analysis_name = os.path.split(analysis_dir)
             with mktempfile('.bam', dir=analysis_dir) as filename:
-                print 'dir', os.listdir(analysis_dir)
-                daf_mapper.construct_file_attributes(analysis_dir,
-                                                     libNode,
-                                                     filename)
-            
+                daf_mapper.construct_track_attributes(analysis_dir,
+                                                      libNode,
+                                                      filename)
+
         #dump_model(daf_mapper.model)
-        
+
         sub_root = "http://jumpgate.caltech.edu/wiki/SubmissionsLog/testfind/"
         submission_name = sub_root + analysis_name
         source = daf_mapper.model.get_source(rdfNS['type'], submissionOntology['submission'])
@@ -219,7 +238,7 @@ thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
         view = daf_mapper.model.get_target(source, submissionOntology['has_view'])
         self.failUnlessEqual(str(view.uri), view_name)
 
-        
+
     def test_library_url(self):
         daf_mapper = load_daf_mapper('urltest')
 
@@ -232,12 +251,12 @@ thisView:FastqRd1 dafTerm:filename_re ".*\\\\.fastq" ;
         daf_mapper = load_daf_mapper('test_rep')
         self.failUnlessEqual(daf_mapper.need_replicate(), True)
         self.failUnless('replicate' in daf_mapper.get_daf_variables())
-                        
+
     def test_daf_without_replicate(self):
         daf_mapper = load_daf_mapper('test_rep',test_daf=test_daf_no_rep)
         self.failUnlessEqual(daf_mapper.need_replicate(), False)
         self.failUnless('replicate' not in daf_mapper.get_daf_variables())
-        
+
 @contextmanager
 def mktempdir(prefix='tmp'):
     d = tempfile.mkdtemp(prefix=prefix)
@@ -255,7 +274,7 @@ def mktempfile(suffix='', prefix='tmp', dir=None):
     os.unlink(pathname)
     print "unmade", pathname
 
-    
+
 def suite():
     suite = unittest.makeSuite(TestDAF, 'test')
     suite.addTest(unittest.makeSuite(TestDAFMapper, 'test'))