import urllib
import urlparse
-from htsworkflow.submission import daf
+from htsworkflow.submission import daf, ucsc
from htsworkflow.util import api
from htsworkflow.util.rdfhelp import \
rdfsNS, \
xsdNS
TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
# URL mappings
LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
from htsworkflow.submission.ucsc import \
daf_download_url, \
ddf_download_url, \
+ get_ucsc_file_index, \
submission_view_url, \
UCSCEncodePipeline
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
DBDIR = os.path.expanduser("~diane/proj/submission")
USERNAME = 'detrout'
CHARSET = 'utf-8'
-
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+ "{genome}/encodeDCC/{composite}/"
def main(cmdline=None):
"""
Parse command line arguments
cookie = login(cookie=cookie)
load_my_submissions(model, limit=limit, cookie=cookie)
load_encode_libraries(model, htswapi)
+ our_tracks = [
+ {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
+ ]
+ for track_info in our_tracks:
+ load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
if opts.sparql is not None:
sparql_query(model, opts.sparql)
if limit is None or submission_id in limit:
subUrn = RDF.Uri(submission_view_url(submission_id))
- add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
+ add_stmt(model,
+ subUrn,
+ TYPE_N,
+ submissionOntology['Submission'])
+ add_stmt(model,
+ subUrn,
+ DCC_NS['subId'],
+ RDF.Node(submission_id))
name = str(cell[4].text_content())
add_stmt(model, subUrn, name_n, name)
LOGGER.info("Processed {0}".format(subUrn))
-
-
def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
"""Add a link from a UCSC submission to woldlab library if needed
"""
def add_submission_creation_date(model, subUrn, cookie):
    """Ensure the model records a creation date for a UCSC submission.

    Looks for an existing creation-date statement about subUrn; if none
    is present, fetches the submission detail page (authenticated via
    cookie) and parses the 'Created' date out of it.
    """
    # in theory the submission page might have more information on it.
    creation_dates = get_creation_dates(model, subUrn)
    if len(creation_dates) == 0:
        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
        # BUG FIX: previously passed the undefined name 'cells' here,
        # which raised NameError; parse_submission_page expects the
        # parsed page tree.
        parse_submission_page(model, submissionTree, subUrn)
    else:
        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
def get_creation_dates(model, subUrn):
    """Return every creation-date statement about subUrn found in model."""
    search = RDF.Statement(subUrn, CREATION_DATE, None)
    return list(model.find_statements(search))
+
def parse_submission_page(model, submissionTree, subUrn):
    """Extract the 'Created' date from a submission detail page.

    Adds an xsd:dateTime creation-date statement about subUrn to model.
    Raises Warning (after logging) when the page has no single
    recognizable 'Created' cell.
    """
    created_labels = [cell for cell in submissionTree.findall('.//td')
                      if cell.text_content().startswith('Created')]
    # Guard clause: bail out unless exactly one 'Created' label was found.
    if len(created_labels) != 1:
        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
        LOGGER.warn(msg)
        raise Warning(msg)
    created_date = get_date_contents(created_labels[0].getnext())
    created_date_node = RDF.Node(literal=created_date.isoformat(),
                                 datatype=xsdNS['dateTime'].uri)
    add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+
def update_submission_detail(model, subUrn, status, recent_update, cookie):
HasStatusN = submissionOntology['has_status']
download_ddf_url = str(subUrn).replace('show', 'download_ddf')
ddfUrn = RDF.Uri(download_ddf_url)
- status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
+ status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
if not model.contains_statement(status_is_ddf):
LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
ddf_lines = ddf_string.split('\n')
# first line is header
header = ddf_lines[0].split()
- attributes = [DDF_NS[x] for x in header]
+ attributes = [DCC_NS[x] for x in header]
for ddf_line in ddf_lines[1:]:
ddf_line = ddf_line.strip()
statusNode,
submissionOntology['has_file'],
fileNode)
- add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
- add_stmt(model, fileNode, DDF_NS['filename'], f)
+ add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+ add_stmt(model, fileNode, DCC_NS['filename'], f)
for predicate, object in zip(attributes[1:], file_attributes):
add_stmt(model, fileNode, predicate, object)
load_library_detail(model, libraryUrn)
def load_encodedcc_files(model, base_url):
    """Load the files.txt index of an ENCODE DCC collection into model.

    Each indexed file becomes a subject URI (base_url + filename) with
    one DCC_NS statement per name/value attribute pair.
    """
    if base_url[-1] != '/':
        base_url = base_url + '/'

    for filename, attributes in ucsc.get_ucsc_file_index(base_url).items():
        subject = RDF.Node(RDF.Uri(base_url + filename))
        for attribute_name, attribute_value in attributes.items():
            model.add_statement(
                RDF.Statement(subject,
                              RDF.Node(DCC_NS[attribute_name]),
                              RDF.Node(attribute_value)))
+
def load_library_detail(model, libraryUrn):
"""Grab detail information from library page
"""
def create_status_node(submission_uri, timestamp):
    """Create an RDF node naming one status snapshot of a submission.

    submission_uri may be an RDF.Uri or a string; timestamp is an
    ISO-format timestamp string.  Returns an RDF.Node whose URI is
    <submission_uri>/<timestamp>.
    """
    submission_uri = daf.submission_uri_to_string(submission_uri)
    # BUG FIX: the separator was appended to a misspelled name
    # ('sumbission_uri'), so the '/' was silently dropped.
    if submission_uri[-1] != '/':
        submission_uri += '/'
    status_uri = submission_uri + timestamp
    return RDF.Node(RDF.Uri(status_uri))
--- /dev/null
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+print SOURCE_PATH
+
class TestEncodeFind(unittest.TestCase):
    def test_create_status_node_with_uri(self):
        # The status node URI should be <submission URL>/<timestamp>.
        submission_url = submission_view_url('5136')
        timestamp = '2011-12-19T12:42:53.048956'
        expected = submission_url + '/' + timestamp
        status = encode_find.create_status_node(RDF.Uri(submission_url),
                                                timestamp)
        self.assertEqual(str(status.uri), expected)

    def test_create_status_node_with_str(self):
        # Plain string submission URLs are accepted as well.
        submission_url = submission_view_url('5136')
        timestamp = '2011-12-19T12:42:53.048956'
        expected = submission_url + '/' + timestamp
        status = encode_find.create_status_node(submission_url, timestamp)
        self.assertEqual(str(status.uri), expected)

    def test_parse_submission_page(self):
        from lxml.html import parse
        timestamp = '2011-12-19T12:42:53.048956'
        submission_url = submission_view_url('5136')
        status = encode_find.create_status_node(submission_url, timestamp)
        page = parse(os.path.join(SOURCE_PATH, 'testdata',
                                  '5136SubDetail.html'))
        model = get_model()
        # No creation date before parsing, exactly one afterwards.
        self.assertEqual(len(encode_find.get_creation_dates(model, status)), 0)
        encode_find.parse_submission_page(model, page, status)
        dates = encode_find.get_creation_dates(model, status)
        self.assertEqual(len(dates), 1)
        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
def suite():
    """Collect this module's test methods into a unittest suite."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(TestEncodeFind)
+
+if __name__ == "__main__":
+ unittest.main(defaultTest="suite")
+"""Utilities for extracting information from the ENCODE DCC
+"""
import urlparse
+import urllib2
UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
+
def ddf_download_url(submission_id):
"""Return url to download a DDF for a submission
fragment = 'download_ddf/%s' % (submission_id,)
return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
def daf_download_url(submission_id):
"""Return url to download a DAF for a submission
fragment = 'download_daf/%s' % (submission_id,)
return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
def submission_view_url(submission_id):
    """Return url for viewing a submission's status page.
    """
    return urlparse.urljoin(UCSCEncodePipeline, 'show/%s' % (submission_id,))
+
+
def get_ucsc_file_index(base_url):
    """Fetch and parse the files.txt index of an ENCODE collection.

    Returns a dictionary mapping filename to its attribute dictionary.
    """
    if base_url[-1] != '/':
        base_url = base_url + '/'
    stream = urllib2.urlopen(base_url + 'files.txt')
    return parse_ucsc_file_index(stream)
+
+
def parse_ucsc_file_index(stream):
    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs

    Each line is expected to look like:
        filename<TAB>name1=value1; name2=value2; ...

    Blank lines and empty attribute segments (e.g. from a trailing ';')
    are skipped; attribute values may themselves contain '='.
    """
    file_index = {}
    for line in stream:
        line = line.strip()
        if not line:
            # tolerate blank/whitespace-only lines instead of crashing
            continue
        filename, attribute_line = line.split('\t', 1)
        attributes = {}
        for assignment in attribute_line.split(';'):
            if not assignment.strip():
                # trailing ';' produces an empty segment
                continue
            # split only on the first '=' so values may contain '='
            name, value = assignment.split('=', 1)
            attributes[name.strip()] = value.strip()

        file_index[filename] = attributes
    return file_index