Write the librdf model database somewhere other than /tmp
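
get_model() keeps a named model in a Berkeley DB hash store under
DBDIR (~diane/proj/submission) instead of letting it land in /tmp;
unnamed models stay in memory. A minimal sketch of the two cases
(the model name "encode" is just an example):

    model = get_model("encode")   # BDB hash files created under DBDIR
    model = get_model()           # in-memory storage, nothing on disk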
[htsworkflow.git] / extra / ucsc_encode_submission / encode_find.py
index 7c614dc2cd7622e39006d7fb64613df786d057dd..3ac4f0420d435b66439b9d2b8488ae5999e522e2 100644
@@ -4,7 +4,7 @@ from BeautifulSoup import BeautifulSoup
 from datetime import datetime
 import httplib2
 from operator import attrgetter
-from optparse import OptionParser
+from optparse import OptionParser, OptionGroup
 # python keyring
 import keyring
 import logging
@@ -15,75 +15,323 @@ import RDF
 import sys
 import urllib
 
+from htsworkflow.util import api
+
+DBDIR = os.path.expanduser("~diane/proj/submission")
+
+logger = logging.getLogger("encode_find")
+
 libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
 submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
-submitNS = RDF.NS("http://jumpgate.caltech.edu/wiki/EncodeSubmit#")
+submitOntologyNS = RDF.NS("http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#")
+ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
+libOntNS = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
+
 dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
 rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
 rdfsNS= RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
+xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")
 
 LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
 USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
-DETAIL_URL = 'http://encodesubmit.ucsc.edu/pipeline/show/{0}'
-LIBRARY_URL = 'http://jumpgate.caltech.edu/library/{0}'
+
 USERNAME = 'detrout'
+CHARSET = 'utf-8'
 
 def main(cmdline=None):
     parser = make_parser()
     opts, args = parser.parse_args(cmdline)
 
-    cookie = login()
-    if cookie is None:
-        print "Failed to login"
+    if opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    htsw_authdata = api.make_auth_from_opts(opts, parser)
+    htswapi = api.HtswApi(opts.host, htsw_authdata)
+    
+    cookie = None
+    model = get_model(opts.load_model)
+    
+    if opts.load_rdf is not None:
+        load_into_model(model, opts.rdf_parser_name, opts.load_rdf)
+        
+    if opts.update:
+        cookie = login(cookie=cookie)
+        load_my_submissions(model, cookie=cookie)
+        load_encode_libraries(model, htswapi)
+
+    if opts.sparql is not None:
+        sparql_query(model, opts.sparql)
+
+    if opts.find_submission_with_no_library:
+        find_submissions_with_no_library(model)
+
+    if opts.print_rdf:
+        serializer = RDF.Serializer(name=opts.rdf_parser_name)
+        print serializer.serialize_model_to_string(model)
+
 
-    submissions = my_submissions(cookie)
-    for s in submissions:
-        for t in s.triples():
-            print t
-            
 def make_parser():
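+    """Build the option parser: command, query, and misc option groups."""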
     parser = OptionParser()
-    return parser
-
+    commands = OptionGroup(parser, "Commands")
+    commands.add_option('--load-model', default=None,
+      help="Load model database")
+    commands.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    commands.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    commands.add_option('--update', action="store_true", default=False,
+      help="Query remote data sources and update our database")
+    #commands.add_option('--update-ucsc-status', default=None,
+    #  help="download status from ucsc, requires filename for extra rules")
+    #commands.add_option('--update-ddfs', action="store_true", default=False,
+    #  help="download ddf information for known submission")
+    #commands.add_option('--update-library', default=None,
+    #  help="download library info from htsw, requires filename for extra rules")
+    parser.add_option_group(commands)
+                      
+    queries = OptionGroup(parser, "Queries")
+    queries.add_option('--sparql', default=None,
+      help="execute arbitrary sparql query")
+    queries.add_option('--find-submission-with-no-library', default=False,
+      action="store_true",
+      help="find submissions with no library ID")    
+    parser.add_option_group(queries)
 
-def login():
-    keys = keyring.get_keyring()
-    password = keys.get_password(LOGIN_URL, USERNAME)
-    credentials = {'login': USERNAME,
-                   'password': password}
-    headers = {'Content-type': 'application/x-www-form-urlencoded'}
-    http = httplib2.Http()
-    response, content = http.request(LOGIN_URL,
-                                     'POST',
-                                     headers=headers,
-                                     body=urllib.urlencode(credentials))
-    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
-                                                    response['status']))
+    options = OptionGroup(parser, "Options")
+    options.add_option("--rdf-parser-name", default="turtle",
+      help="set rdf file parser type")
+    options.add_option("-v", "--verbose", action="store_true", default=False)
+    parser.add_option_group(options)
     
-    cookie = response.get('set-cookie', None)
-    return cookie
+    api.add_auth_options(parser)
+
+    return parser
 
-def my_submissions(cookie):
+def get_model(model_name=None):
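+    """Construct a redland model.
+
+    With no name the model is held in memory; with a name it is backed
+    by a Berkeley DB hash store kept under DBDIR.
+    """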
+    if model_name is None:
+        storage = RDF.MemoryStorage()
+    else:
+        storage = RDF.HashStorage(model_name,
+                      options="hash-type='bdb',dir='{0}'".format(DBDIR))
+    model = RDF.Model(storage)
+    return model
+        
+def load_my_submissions(model, cookie=None):
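+    """Scrape the UCSC show_user submission table into the model.
+
+    Records type, name, species, library link, creation date, and
+    current status for each submission row.
+    """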
+    if cookie is None:
+        cookie = login()
+        
     soup = get_url_as_soup(USER_URL, 'GET', cookie)
     p = soup.find('table', attrs={'id':'projects'})
     tr = p.findNext('tr')
     # first record is header
     tr = tr.findNext()
-    submissions = []
+    TypeN = rdfsNS['type']
+    NameN = submitOntologyNS['name']
+    SpeciesN = submitOntologyNS['species']
+    LibraryURN = submitOntologyNS['library_urn']
+
     while tr is not None:
         td = tr.findAll('td')
         if td is not None and len(td) > 1:
-            subid = td[0].contents[0].contents[0]
-            species = get_contents(td[2])
+            subUrnText = td[0].contents[0].contents[0].encode(CHARSET)
+            subUrn = submissionNS[subUrnText]
+
+            add_stmt(model, subUrn, TypeN, submitOntologyNS['Submission'])
+                
             name = get_contents(td[4])
+            add_stmt(model, subUrn, NameN, name)
+                
+            species = get_contents(td[2])
+            if species is not None:
+                add_stmt(model, subUrn, SpeciesN, species)
+
+            library_id = get_library_id(name)
+            if library_id is not None:
+                add_submission_to_library_urn(model,
+                                              subUrn,
+                                              LibraryURN,
+                                              library_id)
+
+            add_submission_creation_date(model, subUrn, cookie)
+
+            # grab changing attributes
             status = get_contents(td[6]).strip()
-            date = get_date_contents(td[8])
-            age = get_contents(td[10])
-            submissions.append(
-                Submission(subid, species, name, status, date, age, cookie)
-            )
+            last_mod_datetime = get_date_contents(td[8])
+            last_mod = last_mod_datetime.isoformat()
+
+            update_submission_detail(model, subUrn, status, last_mod, cookie=cookie)
+
+            logging.info("Processed {0}".format( subUrn))
+            
         tr = tr.findNext('tr')
-    return submissions
+
+
+def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
+    """Add a link from a UCSC submission to woldlab library if needed
+    """
+    libraryUrn = libraryNS[library_id]
+    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
+    if not model.contains_statement(query):
+        link = RDF.Statement(submissionUrn, predicate, libraryNS[library_id])
+        logger.info("Adding Sub -> Lib link: {0}".format(link))
+        model.add_statement(link)
+    else:
+        logger.debug("Found: {0}".format(str(query)))
+
+    
+def find_submissions_with_no_library(model):
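+    """Print a turtle stub for each submission that has no library_urn.
+    """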
+    missing_lib_query = RDF.SPARQLQuery("""
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT 
+ ?subid ?name
+WHERE {{
+  ?subid submissionOntology:name ?name
+  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
+  FILTER  (!bound(?libid))
+}}""".format(submissionOntology=submitOntologyNS[''].uri)
+)    
+
+    results = missing_lib_query.execute(model)
+    for row in results:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn <http://jumpgate.caltech.edu/library/> ."
+        print ""
+    
+
+def add_submission_creation_date(model, subUrn, cookie):
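+    """Record when a submission was created, fetching its detail page
+    only if we don't already have a creation date for it.
+    """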
+    # in theory the submission page might have more information on it.
+    creationDateN = libOntNS['date']
+    dateTimeType = xsdNS['dateTime']
+    query = RDF.Statement(subUrn, creationDateN, None)
+    creation_dates = list(model.find_statements(query))
+    if len(creation_dates) == 0:
+        logger.info("Getting creation date for: {0}".format(str(subUrn)))
+        soup = get_url_as_soup(str(subUrn.uri), 'GET', cookie)
+        created_label = soup.find(text="Created: ")
+        if created_label:
+            created_date = get_date_contents(created_label.next)
+            created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                         datatype=dateTimeType.uri)
+            add_stmt(model, subUrn, creationDateN, created_date_node)
+    else:
+        logger.debug("Found creation date for: {0}".format(str(subUrn)))
+
+def update_submission_detail(model, subUrn, status, recent_update, cookie):
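+    """Attach a status blank node (status + last-modified date) to subUrn;
+    when a status node with the same last-modified date already exists,
+    just re-check its DDF.
+    """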
+    HasStatusN = submitOntologyNS['has_status']
+    StatusN = submitOntologyNS['status']
+    LastModifyN = submitOntologyNS['last_modify_date']
+
+    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
+    status_nodes = list(model.find_statements(status_nodes_query))
+
+    if len(status_nodes) == 0:
+        # has no status node, add one
+        logging.info("Adding status node to {0}".format(subUrn))
+        status_blank = RDF.Node()
+        add_stmt(model, subUrn, HasStatusN, status_blank)
+        add_stmt(model, status_blank, rdfsNS['type'], StatusN)
+        add_stmt(model, status_blank, StatusN, status)
+        add_stmt(model, status_blank, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_blank, cookie=cookie)
+    else:
+        logging.info("Found {0} status blanks".format(len(status_nodes)))
+        for status_statement in status_nodes:
+            status_blank = status_statement.object
+            last_modified_query = RDF.Statement(status_blank, LastModifyN, None)
+            last_mod_nodes = model.find_statements(last_modified_query)
+            for last_mod_statement in last_mod_nodes:
+                last_mod_date = str(last_mod_statement.object)
+                if recent_update == last_mod_date:
+                    update_ddf(model, subUrn, status_blank, cookie=cookie)
+                    break
+
+
+def update_ddf(model, subUrn, statusNode, cookie):
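+    """Download the submission's DDF and attach its contents to
+    statusNode, unless the DDF was already recorded there.
+    """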
+    TypeN = rdfsNS['type']
+    
+    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
+    ddfUrn = RDF.Uri(download_ddf_url)
+    
+    status_is_ddf = RDF.Statement(statusNode, TypeN, ddfNS['ddf'])
+    if not model.contains_statement(status_is_ddf):
+        logger.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
+        add_ddf_statements(model, statusNode, ddf_text)
+        model.add_statement(status_is_ddf)
+
+
+def add_ddf_statements(model, statusNode, ddf_string):
+    """Convert a ddf text file into RDF Statements
+    """
+    ddf_lines = ddf_string.split('\n')
+    # first line is header
+    header = ddf_lines[0].split()
+    attributes = [ ddfNS[x] for x in header ]
+    statements = []
+
+    for ddf_line in ddf_lines[1:]:
+        ddf_line = ddf_line.strip()
+        if len(ddf_line) == 0:
+            continue
+        if ddf_line.startswith("#"):
+            continue
+        
+        ddf_record = ddf_line.split('\t')
+        files = ddf_record[0].split(',')
+        file_attributes = ddf_record[1:]
+
+        for f in files:
+            fileNode = RDF.Node()
+            add_stmt(model, statusNode, submitOntologyNS['has_file'], fileNode)
+            add_stmt(model, fileNode, rdfsNS['type'], ddfNS['file'])
+            add_stmt(model, fileNode, ddfNS['filename'], f)
+
+            for predicate, value in zip(attributes[1:], file_attributes):
+                add_stmt(model, fileNode, predicate, value)
+
+
+def load_encode_libraries(model, htswapi):
+    """Get libraries associated with encode.
+    """
+    encodeUrl = htswapi.root_url + "/library/?affiliations__id__exact=44"
+    rdfaParser = RDF.Parser(name='rdfa')
+    logger.info("Loading encode libraries from {0}".format(encodeUrl))
+    rdfaParser.parse_into_model(model, encodeUrl)
+    query = RDF.Statement(None, libOntNS['library_id'], None)
+    libraries = model.find_statements(query)
+    for statement in libraries:
+        libraryUrn = statement.subject
+        load_library_detail(model, libraryUrn)
+
+
+def load_library_detail(model, libraryUrn):
+    """Grab detail information from library page
+    """
+    rdfaParser = RDF.Parser(name='rdfa')
+    query = RDF.Statement(libraryUrn, libOntNS['date'], None)
+    results = list(model.find_statements(query))
+    if len(results) == 0:
+        logger.info("Loading {0}".format(str(libraryUrn)))
+        rdfaParser.parse_into_model(model, libraryUrn.uri)
+    elif len(results) == 1:
+        pass # Assuming that a loaded dataset has one record
+    else:
+        logger.warning("Many dates for {0}".format(libraryUrn))
+
+def get_library_id(name):
+    """Guess library ID from library name
+    """
+    match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", name)
+    library_id = None
+    if match is not None:
+        library_id = match.group('id')
+    return library_id
+
 
 def get_contents(element):
     """Return contents or none.
@@ -93,10 +341,11 @@ def get_contents(element):
 
     a = element.find('a')
     if a is not None:
-        return a.contents[0]
-
-    return element.contents[0]
+        return a.contents[0].encode(CHARSET)
 
+    return element.contents[0].encode(CHARSET)
+
+
 def get_date_contents(element):
     data = get_contents(element)
     if data:
@@ -104,6 +353,92 @@ def get_date_contents(element):
     else:
         return None
 
+def sparql_query(model, query_filename):
+    """Execute sparql query from file
+    """
+    query_body = open(query_filename,'r').read()
+    query = RDF.SPARQLQuery(query_body)
+    results = query.execute(model)
+    for row in results:
+        for k, v in row.items()[::-1]:
+            print "{0}: {1}".format(k, v)
+        print
+
+
+def load_into_model(model, parser_name, filename):
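+    """Parse an RDF file into the model with the named redland parser.
+    """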
+    if not os.path.exists(filename):
+        raise IOError("Can't find {0}".format(filename))
+    
+    data = open(filename, 'r').read()
+    rdf_parser = RDF.Parser(name=parser_name)
+    ns_uri = submitOntologyNS[''].uri
+    rdf_parser.parse_string_into_model(model, data, ns_uri)
+
+def add_stmt(model, subject, predicate, object):
+    """Convienence create RDF Statement and add to a model
+    """
+    return model.add_statement(
+        RDF.Statement(subject, predicate, object)
+    )
+
+def login(cookie=None):
+    """Login if we don't have a cookie
+    """
+    if cookie is not None:
+        return cookie
+    
+    keys = keyring.get_keyring()
+    password = keys.get_password(LOGIN_URL, USERNAME)
+    credentials = {'login': USERNAME,
+                   'password': password}
+    headers = {'Content-type': 'application/x-www-form-urlencoded'}
+    http = httplib2.Http()
+    response, content = http.request(LOGIN_URL,
+                                     'POST',
+                                     headers=headers,
+                                     body=urllib.urlencode(credentials))
+    logging.debug("Login to {0}, status {1}".format(LOGIN_URL,
+                                                    response['status']))
+    
+    cookie = response.get('set-cookie', None)
+    if cookie is None:
+        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
+    return cookie
+
+                
+def get_url_as_soup(url, method, cookie=None):
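+    """Fetch url with the given method and return a BeautifulSoup tree.
+    """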
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        soup = BeautifulSoup(content,
+                             fromEncoding="utf-8", # should read from header
+                             convertEntities=BeautifulSoup.HTML_ENTITIES
+                             )
+        return soup
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+def get_url_as_text(url, method, cookie=None):
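+    """Fetch url with the given method and return the raw body text.
+    """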
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        return content
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+################
+#  old stuff
 SUBMISSIONS_LACKING_LIBID = [
     ('1x75-Directional-HeLa-Rep1',    '11208'),
     ('1x75-Directional-HeLa-Rep2',    '11207'),
@@ -116,73 +451,10 @@ SUBMISSIONS_LACKING_LIBID = [
     ('1x75-Directional-K562-Rep1',    '11008'),
     ('1x75-Directional-K562-Rep2',    '11007'),
     ('1x75-Directional-NHEK-Rep1',    '11204'),
+    ('1x75-Directional-GM12878-Rep1', '11011'),
+    ('1x75-Directional-GM12878-Rep2', '11010'),
     ]
 
-class Submission(object):
-    def __init__(self, subid, species, name, status, date, age, cookie=None):
-        self.cookie = cookie
-        self.subid = subid
-        self.species = species
-        self.name = name
-        self.status = status
-        self.date = date
-        self.age = age
-        self._library_id = None
-        self._created_date = None
-
-    def triples(self):
-        subNode = submissionNS[self.subid.encode('utf-8')]
-        dateNode = self.date.strftime("%Y-%m-%d")
-        s = [RDF.Statement(subNode, submitNS['name'],
-                           self.name.encode('utf-8')),
-             RDF.Statement(subNode, submitNS['status'],
-                           self.status.encode('utf-8')),
-             RDF.Statement(subNode, submitNS['last_modify_date'], dateNode),
-             ]
-        if self.species is not None:
-            s.append(RDF.Statement(subNode, submitNS['species'],
-                                   self.species.encode('utf-8')))
-        if self.library_id is not None:
-             libId = libraryNS[self.library_id.encode('utf-8')]
-             s.append(RDF.Statement(subNode, rdfsNS['seeAlso'], libId))
-        
-        return s
-        
-
-    def _get_library_id(self):
-        if self._library_id is None:
-            match = re.search(r"[ -](?P<id>([\d]{5})|(SL[\d]{4}))", self.name)
-            if match is not None:
-                self._library_id = match.group('id')
-            else:
-                for dir_lib_name, lib_id in SUBMISSIONS_LACKING_LIBID:
-                    if dir_lib_name in self.name:
-                        self._library_id = lib_id
-                        break
-            
-        return self._library_id
-    
-    library_id = property(_get_library_id)
-
-    def _get_detail(self):
-        detail = DETAIL_URL.format(self.subid)
-        soup = get_url_as_soup(detail, 'GET', self.cookie)
-
-        created_label = soup.find(text="Created: ")
-        if created_label:
-            self._created_date = get_date_contents(created_label.next)
-            
-    def _get_created_date(self):
-        if self._created_date is None:
-            self._get_detail()
-        return self._created_date
-    created_date = property(_get_created_date)
-    
-    def __unicode__(self):
-        return u"{0}\t{1}\t{2}".format(self.subid, self.library_id, self.name)
-
-    def __repr__(self):
-        return u"<Submission ({0}) '{1}'>".format(self.subid, self.name)
 
 
 def select_by_library_id(submission_list):
@@ -218,7 +490,7 @@ def library_to_freeze(selected_libraries):
     report.append('<tbody>')
     for lib_id in lib_ids:
         report.append('<tr>')
-        lib_url = LIBRARY_URL.format(lib_id)
+        lib_url = libraryNS[lib_id].uri
         report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
         submissions = selected_libraries[lib_id]
         report.append('<td>{0}</td>'.format(submissions[0].name))
@@ -230,7 +502,8 @@ def library_to_freeze(selected_libraries):
         for d in freezes:
             report.append('<td>')
             for s in batched.get(d, []):
-                subid = '<a href="http://encodesubmit.ucsc.edu/pipeline/show/{0}">{0}</a>'.format(s.subid)
+                show_url = submissionNS[s.subid].uri
+                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
                 report.append("{0}:{1}".format(subid, s.status))
             report.append('</td>')
         else:
@@ -251,24 +524,6 @@ def date_to_freeze(d):
             return name
     else:
         return None
-    
-                
-def get_url_as_soup(url, method, cookie=None):
-    http = httplib2.Http()
-    headers = {}
-    if cookie is not None:
-        headers['Cookie'] = cookie
-    response, content = http.request(url, method, headers=headers)
-    if response['status'] == '200':
-        soup = BeautifulSoup(content,
-                             fromEncoding="utf-8", # should read from header
-                             convertEntities=BeautifulSoup.HTML_ENTITIES
-                             )
-        return soup
-    else:
-        msg = "error accessing {0}, status {1}"
-        msg = msg.format(url, response['status'])
-        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
 
 if __name__ == "__main__":
     main()