Also rename it to encode_submission to make the directory listing fit better.
--- /dev/null
+I was building a variety of scripts to handle submitting our data to the
+UCSC ENCODE pipeline, some of them were pulling data out of the htsworkflow
+databases, and since I needed an official place to put the scripts
+
+I decided here.
--- /dev/null
+# Produce list of submissions associated with a cell/replicate
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
+
+construct { ?library ddf:treatment ?treatment ;
+ ddf:protocol ?pcr . }
+WHERE {
+ ?status ucscSubmission:has_file ?file .
+ ?submission ucscSubmission:has_status ?status ;
+ ucscSubmission:library_urn ?library ;
+ ucscSubmission:name ?name .
+ ?file ddf:treatment ?treatment ;
+ ddf:protocol ?pcr .
+}
+
--- /dev/null
+##
+## Override submission ID to library URN names for our libraries
+## whose names either lack, or have the wrong library ID string
+## embedded in them.
+##
+
+@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
+@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
+
+# woldlab-hepg2-rnaseq-2009dec
+<http://encodesubmit.ucsc.edu/pipeline/show/805>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part2
+<http://encodesubmit.ucsc.edu/pipeline/show/810>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part3
+<http://encodesubmit.ucsc.edu/pipeline/show/869>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
+<http://encodesubmit.ucsc.edu/pipeline/show/870>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab-hepg2-rnaseq-2010Jan-part4
+<http://encodesubmit.ucsc.edu/pipeline/show/897>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
+<http://encodesubmit.ucsc.edu/pipeline/show/898>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep1-2010Jan6
+<http://encodesubmit.ucsc.edu/pipeline/show/903>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep2-2010jan9
+<http://encodesubmit.ucsc.edu/pipeline/show/904>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab hESC 10886 rep1 2009Jan13
+<http://encodesubmit.ucsc.edu/pipeline/show/1026>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
+
+# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1483>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1626>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
+
+# woldlab jun 18 1x75-Directional-GM12878-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1631>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab jun 18 1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1632>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1633>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1634>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1635>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1636>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1637>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1638>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1639>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1645>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1646>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab June 2x75-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1856>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
+
+#2010 jul 9corrected fastqs
+<http://encodesubmit.ucsc.edu/pipeline/show/1874>
+ ucscSubmission:ignore "1" .
+# ucscSubmission:library_urn "
+
+# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
+<http://encodesubmit.ucsc.edu/pipeline/show/2926>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2930>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2931>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2932>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2933>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2934>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2935>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2936>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2937>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2938>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2939>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2940>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2941>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# "3438 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4607>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
+
+# "3439 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4608>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
+
+# "3437 Fastq re-submission"
+<http://encodesubmit.ucsc.edu/pipeline/show/4609>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
+
+# "1x75-Directional-HepG2-rep2-replace 3522"
+<http://encodesubmit.ucsc.edu/pipeline/show/4797>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# "1x75-Directional-HepG2-rep1 replacement of 3521"
+<http://encodesubmit.ucsc.edu/pipeline/show/4798>
+ ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
--- /dev/null
+#!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
+
+from datetime import datetime
+import hashlib
+import httplib2
+import keyring
+import logging
+from lxml.html import fromstring
+from operator import attrgetter
+from optparse import OptionParser, OptionGroup
+# python keyring
+import os
+import re
+# redland rdf lib
+import RDF
+import sys
+import urllib
+import urlparse
+
+from htsworkflow.submission import daf, ucsc
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+ dafTermOntology, \
+ dublinCoreNS, \
+ get_model, \
+ get_serializer, \
+ sparql_query, \
+ submissionOntology, \
+ libraryOntology, \
+ load_into_model, \
+ rdfNS, \
+ rdfsNS, \
+ xsdNS
+TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
+
+# URL mappings
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+ daf_download_url, \
+ ddf_download_url, \
+ get_ucsc_file_index, \
+ submission_view_url, \
+ UCSCEncodePipeline
+
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
+
+DBDIR = os.path.expanduser("~diane/proj/submission")
+
+LOGGER = logging.getLogger("encode_find")
+
+LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
+USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
+
+USERNAME = 'detrout'
+CHARSET = 'utf-8'
+
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+ "{genome}/encodeDCC/{composite}/"
+def main(cmdline=None):
+ """
+ Parse command line arguments
+
+ Takes a list of arguments (assuming arg[0] is the program name) or None
+ If None, it looks at sys.argv
+ """
+ parser = make_parser()
+ opts, args = parser.parse_args(cmdline)
+
+ if opts.debug:
+ logging.basicConfig(level=logging.DEBUG)
+ elif opts.verbose:
+ logging.basicConfig(level=logging.INFO)
+
+ htsw_authdata = api.make_auth_from_opts(opts, parser)
+ htswapi = api.HtswApi(opts.host, htsw_authdata)
+
+ cookie = None
+ model = get_model(opts.load_model, DBDIR)
+
+ if opts.load_rdf is not None:
+ ns_uri = submissionOntology[''].uri
+ load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
+
+ if len(args) == 0:
+ limit = None
+ else:
+ limit = args
+
+ if opts.update:
+ cookie = login(cookie=cookie)
+ load_my_submissions(model, limit=limit, cookie=cookie)
+ load_encode_libraries(model, htswapi)
+ our_tracks = [
+ {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
+ {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
+ ]
+ for track_info in our_tracks:
+ load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
+
+ if opts.sparql is not None:
+ sparql_query(model, opts.sparql)
+
+ if opts.find_submission_with_no_library:
+ find_submissions_with_no_library(model)
+
+ if opts.print_rdf:
+ serializer = get_serializer(name=opts.rdf_parser_name)
+ print serializer.serialize_model_to_string(model)
+
+
+def make_parser():
+ """Construct option parser
+ """
+ parser = OptionParser()
+ commands = OptionGroup(parser, "Commands")
+ commands.add_option('--load-model', default=None,
+ help="Load model database")
+ commands.add_option('--load-rdf', default=None,
+ help="load rdf statements into model")
+ commands.add_option('--print-rdf', action="store_true", default=False,
+ help="print ending model state")
+ commands.add_option('--update', action="store_true", default=False,
+ help="Query remote data sources and update our database")
+ #commands.add_option('--update-ucsc-status', default=None,
+ # help="download status from ucsc, requires filename for extra rules")
+ #commands.add_option('--update-ddfs', action="store_true", default=False,
+ # help="download ddf information for known submission")
+ #commands.add_option('--update-library', default=None,
+ # help="download library info from htsw, "\
+ # "requires filename for extra rules")
+ parser.add_option_group(commands)
+
+ queries = OptionGroup(parser, "Queries")
+ queries.add_option('--sparql', default=None,
+ help="execute arbitrary sparql query")
+ queries.add_option('--find-submission-with-no-library', default=False,
+ action="store_true",
+ help="find submissions with no library ID")
+ parser.add_option_group(queries)
+
+ options = OptionGroup(parser, "Options")
+ options.add_option("--rdf-parser-name", default="turtle",
+ help="set rdf file parser type")
+ options.add_option("-v", "--verbose", action="store_true", default=False)
+ options.add_option("--debug", action="store_true", default=False)
+ parser.add_option_group(options)
+
+ api.add_auth_options(parser)
+
+ return parser
+
+
+def load_my_submissions(model, limit=None, cookie=None):
+ """Parse all the submissions from UCSC into model
+ It will look at the global USER_URL to figure out who to scrape
+ cookie contains the session cookie, if none, will attempt to login
+ """
+ if cookie is None:
+ cookie = login()
+
+ tree = get_url_as_tree(USER_URL, 'GET', cookie)
+ table_rows = tree.xpath('//table[@id="projects"]/tr')
+ # first record is header
+ name_n = submissionOntology['name']
+ species_n = submissionOntology['species']
+ library_urn = submissionOntology['library_urn']
+
+ # skip header
+ for row in table_rows[1:]:
+ cell = row.xpath('td')
+ if cell is not None and len(cell) > 1:
+ submission_id = str(cell[0].text_content())
+ if limit is None or submission_id in limit:
+ subUrn = RDF.Uri(submission_view_url(submission_id))
+
+ add_stmt(model,
+ subUrn,
+ TYPE_N,
+ submissionOntology['Submission'])
+ add_stmt(model,
+ subUrn,
+ DCC_NS['subId'],
+ RDF.Node(submission_id))
+
+ name = str(cell[4].text_content())
+ add_stmt(model, subUrn, name_n, name)
+
+ species = str(cell[2].text_content())
+ if species is not None:
+ add_stmt(model, subUrn, species_n, species)
+
+ library_id = get_library_id(name)
+ if library_id is not None:
+ add_submission_to_library_urn(model,
+ subUrn,
+ library_urn,
+ library_id)
+ else:
+ errmsg = 'Unable to find library id in {0} for {1}'
+ LOGGER.warn(errmsg.format(name, str(subUrn)))
+
+ add_submission_creation_date(model, subUrn, cookie)
+
+ # grab changing atttributes
+ status = str(cell[6].text_content()).strip()
+ last_mod_datetime = get_date_contents(cell[8])
+ last_mod = last_mod_datetime.isoformat()
+
+ update_submission_detail(model, subUrn, status, last_mod,
+ cookie=cookie)
+
+ LOGGER.info("Processed {0}".format(subUrn))
+
+
+def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
+ """Add a link from a UCSC submission to woldlab library if needed
+ """
+ libraryUrn = LIBRARY_NS[library_id + '/']
+ query = RDF.Statement(submissionUrn, predicate, libraryUrn)
+ if not model.contains_statement(query):
+ link = RDF.Statement(submissionUrn, predicate, libraryUrn)
+ LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
+ model.add_statement(link)
+ else:
+ LOGGER.debug("Found: {0}".format(str(query)))
+
+
+def find_submissions_with_no_library(model):
+ missing_lib_query_text = """
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT
+ ?subid ?name
+WHERE {{
+ ?subid submissionOntology:name ?name
+ OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
+ FILTER (!bound(?libid))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+ missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
+
+ results = missing_lib_query.execute(model)
+ for row in results:
+ subid = row['subid']
+ name = row['name']
+ print "# {0}".format(name)
+ print "<{0}>".format(subid.uri)
+ print " encodeSubmit:library_urn "\
+ "<http://jumpgate.caltech.edu/library/> ."
+ print ""
+
+
+def add_submission_creation_date(model, subUrn, cookie):
+ # in theory the submission page might have more information on it.
+ creation_dates = get_creation_dates(model, subUrn)
+ if len(creation_dates) == 0:
+ LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
+ submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+ parse_submission_page(model, cells, subUrn)
+ else:
+ LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
+def get_creation_dates(model, subUrn):
+    """Return all CREATION_DATE (libraryOntology:date) statements for subUrn."""
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+ cells = submissionTree.findall('.//td')
+ dateTimeType = xsdNS['dateTime']
+ created_label = [x for x in cells
+ if x.text_content().startswith('Created')]
+ if len(created_label) == 1:
+ created_date = get_date_contents(created_label[0].getnext())
+ created_date_node = RDF.Node(literal=created_date.isoformat(),
+ datatype=dateTimeType.uri)
+ add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+ else:
+ msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+ LOGGER.warn(msg)
+ raise Warning(msg)
+
+
+def update_submission_detail(model, subUrn, status, recent_update, cookie):
+ HasStatusN = submissionOntology['has_status']
+ StatusN = submissionOntology['status']
+ LastModifyN = submissionOntology['last_modify_date']
+
+ status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
+ status_nodes = list(model.find_statements(status_nodes_query))
+
+ if len(status_nodes) == 0:
+ # has no status node, add one
+ LOGGER.info("Adding status node to {0}".format(subUrn))
+ status_node = create_status_node(subUrn, recent_update)
+ add_stmt(model, subUrn, HasStatusN, status_node)
+ add_stmt(model, status_node, rdfNS['type'], StatusN)
+ add_stmt(model, status_node, StatusN, status)
+ add_stmt(model, status_node, LastModifyN, recent_update)
+ update_ddf(model, subUrn, status_node, cookie=cookie)
+ update_daf(model, subUrn, status_node, cookie=cookie)
+ else:
+ LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
+ for status_statement in status_nodes:
+ status_node = status_statement.object
+ last_modified_query = RDF.Statement(status_node,
+ LastModifyN,
+ None)
+ last_mod_nodes = model.find_statements(last_modified_query)
+ for last_mod_statement in last_mod_nodes:
+ last_mod_date = str(last_mod_statement.object)
+ if recent_update == str(last_mod_date):
+ update_ddf(model, subUrn, status_node, cookie=cookie)
+ update_daf(model, subUrn, status_node, cookie=cookie)
+ break
+
+
+def update_daf(model, submission_url, status_node, cookie):
+ download_daf_uri = str(submission_url).replace('show', 'download_daf')
+ daf_uri = RDF.Uri(download_daf_uri)
+
+ status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+ if not model.contains_statement(status_is_daf):
+ LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
+ status_node))
+ daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+ daf_hash = hashlib.md5(daf_text).hexdigest()
+ daf_hash_stmt = RDF.Statement(status_node,
+ dafTermOntology['md5sum'],
+ daf_hash)
+ model.add_statement(daf_hash_stmt)
+ daf.fromstring_into_model(model, status_node, daf_text)
+
+
+def update_ddf(model, subUrn, statusNode, cookie):
+ download_ddf_url = str(subUrn).replace('show', 'download_ddf')
+ ddfUrn = RDF.Uri(download_ddf_url)
+
+ status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
+ if not model.contains_statement(status_is_ddf):
+ LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+ ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
+ add_ddf_statements(model, statusNode, ddf_text)
+ model.add_statement(status_is_ddf)
+
+
+def add_ddf_statements(model, statusNode, ddf_string):
+ """Convert a ddf text file into RDF Statements
+ """
+ ddf_lines = ddf_string.split('\n')
+ # first line is header
+ header = ddf_lines[0].split()
+ attributes = [DCC_NS[x] for x in header]
+
+ for ddf_line in ddf_lines[1:]:
+ ddf_line = ddf_line.strip()
+ if len(ddf_line) == 0:
+ continue
+ if ddf_line.startswith("#"):
+ continue
+
+ ddf_record = ddf_line.split('\t')
+ files = ddf_record[0].split(',')
+ file_attributes = ddf_record[1:]
+
+ for f in files:
+ fileNode = RDF.Node()
+ add_stmt(model,
+ statusNode,
+ submissionOntology['has_file'],
+ fileNode)
+ add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+ add_stmt(model, fileNode, DCC_NS['filename'], f)
+
+ for predicate, object in zip(attributes[1:], file_attributes):
+ add_stmt(model, fileNode, predicate, object)
+
+
+def load_encode_libraries(model, htswapi):
+ """Get libraries associated with encode.
+ """
+ encodeFilters = ["/library/?affiliations__id__exact=44",
+ "/library/?affiliations__id__exact=80",
+ ]
+
+ encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
+ rdfaParser = RDF.Parser(name='rdfa')
+ for encodeUrl in encodeUrls:
+ LOGGER.info("Scanning library url {0}".format(encodeUrl))
+ rdfaParser.parse_into_model(model, encodeUrl)
+ query = RDF.Statement(None, libraryOntology['library_id'], None)
+ libraries = model.find_statements(query)
+ for statement in libraries:
+ libraryUrn = statement.subject
+ LOGGER.info("Scanning {0}".format(str(libraryUrn)))
+ load_library_detail(model, libraryUrn)
+
+
+def load_encodedcc_files(model, base_url):
+ if base_url[-1] != '/':
+ base_url += '/'
+
+ file_index = ucsc.get_ucsc_file_index(base_url)
+ for filename, attributes in file_index.items():
+ s = RDF.Node(RDF.Uri(base_url + filename))
+ for name, value in attributes.items():
+ p = RDF.Node(DCC_NS[name])
+ o = RDF.Node(value)
+ model.add_statement(RDF.Statement(s,p,o))
+
+def load_library_detail(model, libraryUrn):
+ """Grab detail information from library page
+ """
+ rdfaParser = RDF.Parser(name='rdfa')
+ query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
+ results = list(model.find_statements(query))
+ log_message = "Found {0} statements for {1}"
+ LOGGER.debug(log_message.format(len(results), libraryUrn))
+ if len(results) == 0:
+ LOGGER.info("Loading {0}".format(str(libraryUrn)))
+ rdfaParser.parse_into_model(model, libraryUrn.uri)
+ elif len(results) == 1:
+ pass # Assuming that a loaded dataset has one record
+ else:
+ LOGGER.warning("Many dates for {0}".format(libraryUrn))
+
+
+def get_library_id(name):
+ """Guess library ID from library name
+
+ >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
+ '11039'
+ >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
+ '10150'
+ """
+ match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
+ library_id = None
+ if match is not None:
+ library_id = match.group('id')
+ return library_id
+
+
+def get_contents(element):
+    """Return the first child text of element (or of a nested <a>), or None.
+
+    The returned text is encoded with the module-level CHARSET.
+
+    NOTE(review): this uses the BeautifulSoup-style .contents/.find API,
+    but the rest of this module parses pages with lxml, whose elements
+    do not provide .contents -- presumably legacy code from an earlier
+    scraper; confirm whether anything still calls it.
+    """
+    if len(element.contents) == 0:
+        return None
+
+    a = element.find('a')
+    if a is not None:
+        return a.contents[0].encode(CHARSET)
+
+    return element.contents[0].encode(CHARSET)
+
+
+def create_status_node(submission_uri, timestamp):
+ submission_uri = daf.submission_uri_to_string(submission_uri)
+ if submission_uri[-1] != '/':
+ sumbission_uri += '/'
+ status_uri = submission_uri + timestamp
+ return RDF.Node(RDF.Uri(status_uri))
+
+
+def get_date_contents(element):
+ data = element.text_content()
+ if data:
+ return datetime.strptime(data, "%Y-%m-%d %H:%M")
+ else:
+ return None
+
+
+def add_stmt(model, subject, predicate, rdf_object):
+ """Convienence create RDF Statement and add to a model
+ """
+ return model.add_statement(
+ RDF.Statement(subject, predicate, rdf_object))
+
+
+def login(cookie=None):
+ """Login if we don't have a cookie
+ """
+ if cookie is not None:
+ return cookie
+
+ keys = keyring.get_keyring()
+ password = keys.get_password(LOGIN_URL, USERNAME)
+ credentials = {'login': USERNAME,
+ 'password': password}
+ headers = {'Content-type': 'application/x-www-form-urlencoded'}
+ http = httplib2.Http()
+ response, content = http.request(LOGIN_URL,
+ 'POST',
+ headers=headers,
+ body=urllib.urlencode(credentials))
+ LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
+ response['status']))
+
+ cookie = response.get('set-cookie', None)
+ if cookie is None:
+ raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
+ return cookie
+
+
+def get_url_as_tree(url, method, cookie=None):
+ http = httplib2.Http()
+ headers = {}
+ if cookie is not None:
+ headers['Cookie'] = cookie
+ response, content = http.request(url, method, headers=headers)
+ if response['status'] == '200':
+ tree = fromstring(content, base_url=url)
+ return tree
+ else:
+ msg = "error accessing {0}, status {1}"
+ msg = msg.format(url, response['status'])
+ e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+
+def get_url_as_text(url, method, cookie=None):
+ http = httplib2.Http()
+ headers = {}
+ if cookie is not None:
+ headers['Cookie'] = cookie
+ response, content = http.request(url, method, headers=headers)
+ if response['status'] == '200':
+ return content
+ else:
+ msg = "error accessing {0}, status {1}"
+ msg = msg.format(url, response['status'])
+ e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+################
+# old stuff
+SUBMISSIONS_LACKING_LIBID = [
+ ('1x75-Directional-HeLa-Rep1', '11208'),
+ ('1x75-Directional-HeLa-Rep2', '11207'),
+ ('1x75-Directional-HepG2-Rep1', '11210'),
+ ('1x75-Directional-HepG2-Rep2', '11209'),
+ ('1x75-Directional-H1-hESC-Rep1', '10947'),
+ ('1x75-Directional-H1-hESC-Rep2', '11009'),
+ ('1x75-Directional-HUVEC-Rep1', '11206'),
+ ('1x75-Directional-HUVEC-Rep2', '11205'),
+ ('1x75-Directional-K562-Rep1', '11008'),
+ ('1x75-Directional-K562-Rep2', '11007'),
+ ('1x75-Directional-NHEK-Rep1', '11204'),
+ ('1x75-Directional-GM12878-Rep1', '11011'),
+ ('1x75-Directional-GM12878-Rep2', '11010'),
+ ]
+
+
+def select_by_library_id(submission_list):
+ subl = [(x.library_id, x) for x in submission_list if x.library_id]
+ libraries = {}
+ for lib_id, subobj in subl:
+ libraries.setdefault(lib_id, []).append(subobj)
+
+ for submission in libraries.values():
+ submission.sort(key=attrgetter('date'), reverse=True)
+
+ return libraries
+
+
+def library_to_freeze(selected_libraries):
+ freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
+ lib_ids = sorted(selected_libraries.keys())
+ report = ['<html><table border="1">']
+ report = ["""<html>
+<head>
+<style type="text/css">
+ td {border-width:0 0 1px 1px; border-style:solid;}
+</style>
+</head>
+<body>
+<table>
+"""]
+ report.append('<thead>')
+ report.append('<tr><td>Library ID</td><td>Name</td>')
+ for f in freezes:
+ report.append('<td>{0}</td>'.format(f))
+ report.append('</tr>')
+ report.append('</thead>')
+ report.append('<tbody>')
+ for lib_id in lib_ids:
+ report.append('<tr>')
+ lib_url = LIBRARY_NS[lib_id].uri
+ report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
+ submissions = selected_libraries[lib_id]
+ report.append('<td>{0}</td>'.format(submissions[0].name))
+ batched = {}
+ for sub in submissions:
+ date = date_to_freeze(sub.date)
+ batched.setdefault(date, []).append(sub)
+ for d in freezes:
+ report.append('<td>')
+ for s in batched.get(d, []):
+ show_url = submission_view_url(s.subid)
+ subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
+ report.append("{0}:{1}".format(subid, s.status))
+ report.append('</td>')
+ else:
+ report.append('<td></td>')
+ report.append("</tr>")
+ report.append('</tbody>')
+ report.append("</table></html>")
+ return "\n".join(report)
+
+
+def date_to_freeze(d):
+ freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+ (datetime(2010, 7, 30), '2010-Jul'),
+ (datetime(2011, 1, 30), '2011-Jan'),
+ ]
+ for end, name in freezes:
+ if d < end:
+ return name
+ else:
+ return None
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+##
+## Find submissions that are currently "failed"
+##
+
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
+PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
+#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
+
+SELECT
+ ?subid ?subname ?liburn ?status
+WHERE {
+ ?subid submitOnt:name ?subname .
+ ?subid submitOnt:library_urn ?liburn .
+ ?subid submitOnt:has_status ?statusNode .
+ ?statusNode submitOnt:status ?status .
+ ?statusNode submitOnt:last_modify_date ?last_modify .
+ FILTER (regex(?status, "failed", "i"))
+}
--- /dev/null
+# Produce list of submissions associated with a cell/replicate
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
+WHERE {
+ ?subid ucscSubmission:name ?name .
+ OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
+ libraryOntology:date ?submission_date .
+ ?liburn libraryOntology:cell_line ?cell ;
+ libraryOntology:replicate ?replicate . }
+ #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
+ #filter(!bound(?liburn))
+}
+ORDER BY ?submission_date ?cell ?replicate ?liburn
--- /dev/null
+from optparse import OptionParser
+import os
+import sys
+from pprint import pprint
+
+def main(cmdline=None):
+ parser = make_parser()
+ opts, args = parser.parse_args(cmdline)
+
+ extensions = scan(args)
+ common_extensions = find_common_suffix(extensions)
+
+ if opts.rdf:
+ print_rdf(common_extensions)
+ else:
+ print common_extensions
+
+def make_parser():
+ parser = OptionParser("%prog: directory [directory...]")
+ parser.add_option('--rdf', action="store_true", default=False,
+ help="Produce rdf configuration file for ucsc_gather")
+ return parser
+
+def scan(toscan):
+ index = {}
+ for cur_scan_dir in toscan:
+ for path, dirnames, filenames in os.walk(cur_scan_dir):
+ for filename in filenames:
+ base, ext = os.path.splitext(filename)
+ if ext in ('.daf', 'ddf'):
+ continue
+ next_index = index
+ for c in filename[::-1]:
+ next_index = next_index.setdefault(c, {})
+ return index
+
+def find_common_suffix(index, tail=[]):
+ if len(tail) > 0 and len(index) > 1:
+ return "".join(tail[::-1])
+
+ results = []
+ for key, choice in index.items():
+ r = find_common_suffix(choice, tail+[key])
+ if r is not None:
+ results.append (r)
+
+ if len(results) == 0:
+ return None
+ elif len(results) == 1:
+ return results[0]
+ else:
+ return results
+
+def print_rdf(common_extensions):
+ import RDF
+ from htsworkflow.util import rdfhelp
+ model = rdfhelp.get_model()
+
+ viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
+ subView = RDF.NS(viewName)
+ fileReTerm = rdfhelp.dafTermOntology['filename_re']
+
+ count = 1
+ for ext in common_extensions:
+ s = RDF.Statement(subView['VIEW{0}'.format(count)],
+ fileReTerm,
+ '.*{0}$'.format(ext.replace('.', '\\.')))
+ model.add_statement(s)
+ count += 1
+
+ writer = rdfhelp.get_serializer()
+ writer.set_namespace('thisSubmissionView', subView._prefix)
+ print writer.serialize_model_to_string(model)
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+print SOURCE_PATH
+
class TestEncodeFind(unittest.TestCase):
    """Exercise encode_find's status-node creation and page parsing."""

    def test_create_status_node_with_uri(self):
        # create_status_node should accept an RDF.Uri submission id and
        # append '/<timestamp>' to it.
        subURL = submission_view_url('5136')
        submissionUri = RDF.Uri(subURL)
        timestamp = '2011-12-19T12:42:53.048956'
        manualUri = subURL + '/' + timestamp
        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
        self.assertEqual(str(nodeUri.uri), manualUri)

    def test_create_status_node_with_str(self):
        # ...and also a plain string URL, with the same result.
        subURL = submission_view_url('5136')
        timestamp = '2011-12-19T12:42:53.048956'
        manualUri = subURL + '/' + timestamp
        nodeUri = encode_find.create_status_node(subURL, timestamp)
        self.assertEqual(str(nodeUri.uri), manualUri)

    def test_parse_submission_page(self):
        # Parse the checked-in copy of submission 5136's detail page
        # (testdata/5136SubDetail.html) and verify exactly one creation
        # date lands in the model.
        timestamp = '2011-12-19T12:42:53.048956'
        subURL = submission_view_url('5136')
        subNode = encode_find.create_status_node(subURL, timestamp)
        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
        from lxml.html import parse
        tree = parse(test_file)
        model = get_model()
        dates = encode_find.get_creation_dates(model, subNode)
        self.assertEqual(len(dates), 0)
        encode_find.parse_submission_page(model, tree, subNode)
        dates = encode_find.get_creation_dates(model, subNode)
        self.assertEqual(len(dates), 1)
        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
def suite():
    """Return the TestSuite for this module (for unittest runners)."""
    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+ unittest.main(defaultTest="suite")
--- /dev/null
+import unittest
+
+import ucsc_gather
+
class testUCSCGather(unittest.TestCase):
    """Placeholder test case for ucsc_gather; no tests written yet."""
    # NOTE(review): lowercase class name violates PascalCase; renaming
    # would also require updating suite() below.
    pass
+
def suite():
    """Return the (currently empty) TestSuite for ucsc_gather tests."""
    return unittest.makeSuite(testUCSCGather,"test")
+
+if __name__ == "__main__":
+ unittest.main(defaultTest="suite")
--- /dev/null
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+ <meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />
+ <meta http-equiv="Content-Language" content="en-us" />
+ <title>
+ ENCODE DCC Data Submission Production
+ </title>
+<link href="/stylesheets/encode.css?1278401455" media="screen" rel="Stylesheet" type="text/css" />
+</head>
+<body id="encode-pipeline">
+
+<div id="container">
+ <img align="left" height=50 width=100 src="/images/encode_logo.png">
+
+ <div id="header">
+ ENCODE DCC Data Submission
+ <font size=-1 color="gray"> <em>
+ Production
+ </em></font>
+ </div>
+ <div id="user"> Logged In: <strong> detrout </strong>
+
+ </div>
+
+ <div id="nav">
+ <!-- <table width="100%"><tr>-->
+ <table cellpadding=0 cellspacing=0 width="100%"><tr>
+ <td align="left">
+ <a href="/pipeline/new">New Submission</a>
+ |
+ <a href="/pipeline/list">All Submissions</a>
+
+ |
+ <a href="/pipeline/show_active">Active Submissions</a>
+
+ |
+ <a href="/pipeline/show_user">My Submissions</a>
+
+ </td>
+ <td align="right">
+ <a href="/account/logout"> Log Out</a>
+ |
+ <a href="/account/change_profile">Change Profile</a>
+ |
+ <a href="/pipeline/show_tools">Tools</a>
+ </td>
+
+ </tr></table>
+ </div>
+
+ <div id="message">
+
+
+
+ </div>
+ <div id="content">
+ <p>
+
+
+
+<table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+
+<tr>
+<td>Submission: </td><td>wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2 resubmit</td><td> </td>
+<td>Created: </td><td>2011-12-07 15:23</td>
+ <td> </td>
+ <td>11 days ago</td>
+</tr>
+
+<tr>
+<td>DB: </td><td>hg19</td><td> </td>
+
+<td>Updated: </td><td>2011-12-08 14:54</td>
+ <td> </td>
+ <td>10 days ago</td>
+</tr>
+
+<tr>
+<td>Status: </td><td>approved</td>
+</tr>
+
+<tr>
+
+ <td> </td>
+</tr>
+
+
+</table>
+
+
+
+
+
+
+ <table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+ <tr>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <td> [
+ <a href="/pipeline/show_daf/5136">Show DAF</a> ] </td>
+
+
+
+ <td> [
+ <a href="/pipeline/show_ddf/5136">Show DDF</a> ] </td>
+
+
+ </tr>
+
+ </table>
+
+
+
+<p>
+
+<table cellspacing=2 cellpadding=2>
+<tr><td></td>
+<td align="left"><b>File</b></td>
+<td align="left"><b>Size</b></td>
+<td align="left"><b>Updated</b></td>
+
+<td></td></tr>
+
+
+
+
+
+ <tr style="margin: 10;">
+ <td><b>Archive</b></td><td>002_CaltechRnaSeq_Fastq_DAF.tar.gz</td>
+ <td align="right">1397</td>
+ <td>2011-12-08 14:08</td>
+
+
+
+
+ </tr>
+
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeq.daf</td>
+ <td align=right>4187</td>
+ <td>2011-12-08 14:08</td>
+ </tr>
+
+
+
+
+
+
+
+ <tr style="margin: 10;">
+
+ <td><b>Archive</b></td><td>001_wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2.fastq.tgz</td>
+ <td align="right">8833154623</td>
+ <td>2011-12-07 15:23</td>
+
+
+
+
+ </tr>
+
+
+ <tr>
+ <td> </td>
+
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l7_r2.fastq</td>
+ <td align=right>1629293100</td>
+ <td>2011-12-07 15:34</td>
+ </tr>
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l8_r2.fastq</td>
+
+ <td align=right>1628417888</td>
+ <td>2011-12-07 15:34</td>
+ </tr>
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_616L7AAXX_c152_l1_r2.fastq</td>
+ <td align=right>5152104576</td>
+
+ <td>2011-12-07 15:34</td>
+ </tr>
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l1_r2.fastq</td>
+ <td align=right>6094749091</td>
+ <td>2011-12-07 15:34</td>
+
+ </tr>
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l2_r2.fastq</td>
+ <td align=right>7483882081</td>
+ <td>2011-12-07 15:34</td>
+ </tr>
+
+
+ <tr>
+ <td> </td>
+ <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_ilmn200901_c202_l4_r2.fastq</td>
+ <td align=right>5282142818</td>
+ <td>2011-12-07 15:34</td>
+ </tr>
+
+
+
+</table>
+
+<p>
+
+
+
+
+ </div>
+ <div id="footer">
+ <A HREF="/help.html">Help</A>
+ |
+ <A HREF="/contact">Contact Us</A>
+ </div>
+
+</div>
+</body>
+</html>
--- /dev/null
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+import RDF
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+ dafTermOntology, \
+ fromTypedNode, \
+ get_model, \
+ get_serializer, \
+ load_into_model, \
+ sparql_query, \
+ submissionOntology
+from htsworkflow.submission.daf import \
+ DAFMapper, \
+ MetadataLookupException, \
+ get_submission_uri
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger('ucsc_gather')
+
def main(cmdline=None):
    """Entry point: drive the UCSC submission gathering steps.

    cmdline -- argument list without the program name, or None to use
    sys.argv.  Positional arguments are library->result-directory map
    files; each processing step is enabled by its command-line flag.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)
    submission_uri = None

    if opts.debug:
        logging.basicConfig(level = logging.DEBUG )
    elif opts.verbose:
        logging.basicConfig(level = logging.INFO )
    else:
        logging.basicConfig(level = logging.WARNING )

    apidata = api.make_auth_from_opts(opts, parser)

    model = get_model(opts.load_model)
    if opts.name:
        mapper = DAFMapper(opts.name, opts.daf, model)
        if opts.library_url is not None:
            mapper.library_url = opts.library_url
        submission_uri = get_submission_uri(opts.name)

    if opts.load_rdf is not None:
        if submission_uri is None:
            parser.error("Please specify the submission name")
        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)

    if opts.make_ddf and opts.daf is None:
        parser.error("Please specify your daf when making ddf files")

    # each positional argument maps library ids to result directories
    library_result_map = []
    for a in args:
        library_result_map.extend(read_library_result_map(a))

    if opts.make_tree_from is not None:
        make_tree_from(opts.make_tree_from, library_result_map)

    if opts.link_daf:
        if opts.daf is None:
            parser.error("Please specify daf filename with --daf")
        link_daf(opts.daf, library_result_map)

    if opts.fastq:
        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
                                       force=opts.force)
        extractor.build_fastqs(library_result_map)

    # NOTE(review): `mapper` is only bound when --name was given, so
    # --scan-submission or --make-ddf without --name raises NameError
    # here -- confirm whether --name should be validated up front.
    if opts.scan_submission:
        scan_submission_dirs(mapper, library_result_map)

    if opts.make_ddf:
        make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)

    if opts.sparql:
        sparql_query(model, opts.sparql)

    if opts.print_rdf:
        writer = get_serializer()
        print writer.serialize_model_to_string(model)
+
+
def make_parser():
    """Build the optparse parser: model options, commands, misc flags.

    Returns an OptionParser with an RDF-model group, a commands group,
    standalone flags, and the shared htsworkflow auth options.
    """
    parser = OptionParser()

    # options controlling the RDF model state
    model = OptionGroup(parser, 'model')
    model.add_option('--name', help="Set submission name")
    model.add_option('--load-model', default=None,
                     help="Load model database")
    model.add_option('--load-rdf', default=None,
                     help="load rdf statements into model")
    model.add_option('--sparql', default=None, help="execute sparql query")
    model.add_option('--print-rdf', action="store_true", default=False,
                     help="print ending model state")
    parser.add_option_group(model)
    # commands
    commands = OptionGroup(parser, 'commands')
    commands.add_option('--make-tree-from',
                        help="create directories & link data files",
                        default=None)
    commands.add_option('--fastq', default=False, action="store_true",
                        help="generate scripts for making fastq files")
    commands.add_option('--scan-submission', default=False, action="store_true",
                        help="Import metadata for submission into our model")
    commands.add_option('--link-daf', default=False, action="store_true",
                        help="link daf into submission directories")
    commands.add_option('--make-ddf', help='make the ddfs', default=False,
                        action="store_true")
    parser.add_option_group(commands)

    parser.add_option('--force', default=False, action="store_true",
                      help="Force regenerating fastqs")
    parser.add_option('--daf', default=None, help='specify daf name')
    parser.add_option('--library-url', default=None,
                      help="specify an alternate source for library information")
    # debugging
    parser.add_option('--verbose', default=False, action="store_true",
                      help='verbose logging')
    parser.add_option('--debug', default=False, action="store_true",
                      help='debug logging')

    # adds the shared htsworkflow host/authentication options
    api.add_auth_options(parser)

    return parser
+
def make_tree_from(source_path, library_result_map):
    """Create a tree using data files from source path.

    For each (library id, result directory) pair, create the result
    directory when needed and symlink in every file found in the
    matching directory under source_path.

    Fix: the original tested os.path.exists(source_lib_dir) with an
    empty `pass` body and then called os.listdir() unconditionally,
    crashing when a source directory was missing.  The existence check
    now actually guards the listing; missing source dirs are skipped.

    Raises IOError if a listed file disappears between listing and
    linking.
    """
    for lib_id, lib_path in library_result_map:
        if not os.path.exists(lib_path):
            logger.info("Making dir {0}".format(lib_path))
            os.mkdir(lib_path)
        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
        if os.path.exists(source_lib_dir):
            for filename in os.listdir(source_lib_dir):
                source_pathname = os.path.join(source_lib_dir, filename)
                target_pathname = os.path.join(lib_path, filename)
                if not os.path.exists(source_pathname):
                    raise IOError("{0} does not exist".format(source_pathname))
                # don't clobber links that already exist
                if not os.path.exists(target_pathname):
                    os.symlink(source_pathname, target_pathname)
                    logger.info(
                        'LINK {0} to {1}'.format(source_pathname, target_pathname))
+
+
def link_daf(daf_path, library_result_map):
    """Hard-link the daf file into each submission result directory.

    daf_path           -- path to the daf file to link
    library_result_map -- iterable of (library id, result directory)

    Raises RuntimeError when daf_path or a result directory is missing.
    """
    if not os.path.exists(daf_path):
        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))

    base_daf = os.path.basename(daf_path)

    for lib_id, result_dir in library_result_map:
        if not os.path.exists(result_dir):
            raise RuntimeError("Couldn't find target directory %s" % (result_dir,))
        submission_daf = os.path.join(result_dir, base_daf)
        if not os.path.exists(submission_daf):
            # Fix: dropped the redundant re-check of daf_path here --
            # its existence was already verified before the loop.
            os.link(daf_path, submission_daf)
+
+
def scan_submission_dirs(view_map, library_result_map):
    """Look through our submission directories and collect needed information

    Imports each result directory into view_map, logging and skipping
    (best-effort) any library whose metadata cannot be resolved.
    """
    for lib_id, result_dir in library_result_map:
        logger.info("Importing %s from %s" % (lib_id, result_dir))
        try:
            view_map.import_submission_dir(result_dir, lib_id)
        except MetadataLookupException as e:
            # Fix: `except E as e` replaces the Python-2-only
            # `except E, e` form (valid from 2.6, required by 3).
            logger.error("Skipping %s: %s" % (lib_id, str(e)))
+
def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
    """Write a ddf for every submission and optionally a condor DAG.

    Collects DAG fragments from make_ddf() for each result directory;
    when make_condor is set and fragments were produced, writes them to
    'submission.dagman' in the current directory -- unless that file
    already exists and force is False, in which case it only warns.
    """
    dag_fragment = []
    for lib_id, result_dir in library_result_map:
        submissionNode = view_map.get_submission_node(result_dir)
        dag_fragment.extend(
            make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
        )

    if make_condor and len(dag_fragment) > 0:
        dag_filename = 'submission.dagman'
        if not force and os.path.exists(dag_filename):
            logger.warn("%s exists, please delete" % (dag_filename,))
        else:
            # Fix: with-statement closes the file even if a write fails
            # (the original leaked the handle on error).
            with open(dag_filename, 'w') as stream:
                stream.write(os.linesep.join(dag_fragment))
                stream.write(os.linesep)
+
+
def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
    """
    Make ddf files, and bonus condor file

    Queries the RDF model for every file/view attached to
    submissionNode, writes one tab-separated ddf row per view (to
    outdir/<name>.ddf, or stdout when outdir is None) and, when
    make_condor is set, generates the archive/upload condor scripts.

    Returns the list of DAG fragment lines (empty when the submission
    has no name or make_condor is False).
    """
    # NOTE(review): the trailing ';' after "submissionOntology:library
    # ?library" (just before the OPTIONAL blocks) looks like a SPARQL
    # syntax slip -- confirm Redland accepts it.  Also ?labExpId and
    # ?labVersion both map to libraryOntology:library_id -- confirm
    # that duplication is intentional.
    query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>

select ?submitView ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
WHERE {
  ?file ucscDaf:filename ?files ;
        ucscDaf:md5sum ?md5sum .
  ?submitView ucscDaf:has_file ?file ;
              ucscDaf:view ?dafView ;
              ucscDaf:submission <%(submission)s> .
  ?dafView ucscDaf:name ?view .
  <%(submission)s> submissionOntology:library ?library ;

  OPTIONAL { ?library libraryOntology:antibody ?antibody }
  OPTIONAL { ?library libraryOntology:cell_line ?cell }
  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
  OPTIONAL { ?library ucscDaf:sex ?sex }
  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
  OPTIONAL { ?library libraryOntology:replicate ?replicate }
  OPTIONAL { ?library libraryOntology:condition ?treatment }
  OPTIONAL { ?library ucscDaf:protocol ?protocol }
  OPTIONAL { ?library ucscDaf:readType ?readType }
  OPTIONAL { ?library ucscDaf:strain ?strain }
  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
}
ORDER BY ?submitView"""
    dag_fragments = []

    # the submission's name drives the ddf filename
    name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
    if name is None:
        logger.error("Need name for %s" % (str(submissionNode)))
        return []

    ddf_name = name + '.ddf'
    if outdir is not None:
        outfile = os.path.join(outdir, ddf_name)
        output = open(outfile,'w')
    else:
        outfile = 'stdout:'
        output = sys.stdout

    formatted_query = query_template % {'submission': str(submissionNode.uri)}

    query = RDF.SPARQLQuery(formatted_query)
    results = query.execute(view_map.model)

    # filename goes first
    variables = view_map.get_daf_variables()
    output.write('\t'.join(variables))
    output.write(os.linesep)

    # collapse the per-file result rows into one row per view; 'files'
    # and 'md5sum' accumulate into lists, other columns overwrite
    all_views = {}
    all_files = []
    for row in results:
        viewname = fromTypedNode(row['view'])
        current = all_views.setdefault(viewname, {})
        for variable_name in variables:
            value = str(fromTypedNode(row[variable_name]))
            # NOTE(review): str() above turns a missing value into the
            # string 'None'; it is warned about but still written out.
            if value is None or value == 'None':
                logger.warn("{0}: {1} was None".format(outfile, variable_name))
            if variable_name in ('files', 'md5sum'):
                current.setdefault(variable_name,[]).append(value)
            else:
                current[variable_name] = value

    for view in all_views.keys():
        line = []
        for variable_name in variables:
            if variable_name in ('files', 'md5sum'):
                # multi-valued columns are comma-joined per ddf format
                line.append(','.join(all_views[view][variable_name]))
            else:
                line.append(all_views[view][variable_name])
        output.write("\t".join(line))
        output.write(os.linesep)
        all_files.extend(all_views[view]['files'])

    logger.info(
        "Examined {0}, found files: {1}".format(
            str(submissionNode), ", ".join(all_files)))

    # the daf and the ddf itself ride along in the uploaded archive
    all_files.append(daf_name)
    all_files.append(ddf_name)

    if make_condor:
        archive_condor = make_condor_archive_script(name, all_files, outdir)
        upload_condor = make_condor_upload_script(name, outdir)

        dag_fragments.extend(
            make_dag_fragment(name, archive_condor, upload_condor)
        )

    return dag_fragments
+
+
def read_library_result_map(filename):
    """
    Read a file that maps library id to result directory.
    Does not support spaces in filenames.

    For example:
      10000 result/foo/bar

    Blank lines and lines starting with '#' are ignored.
    Returns a list of (library_id, result_dir) tuples.
    """
    results = []
    # Fix: with-statement closes the file even on a parse error (the
    # original never closed the handle).
    with open(filename, 'r') as stream:
        for line in stream:
            line = line.rstrip()
            if not line.startswith('#') and len(line) > 0:
                library_id, result_dir = line.split()
                results.append((library_id, result_dir))
    return results
+
+
def make_condor_archive_script(name, files, outdir=None):
    """Write a condor submit file that tars up the submission's files.

    name   -- submission name; used to derive archive/script names
    files  -- filenames (relative to outdir) to include in the archive
    outdir -- directory holding the files; defaults to os.getcwd()

    Raises RuntimeError if any listed file is missing.
    Returns the path of the generated condor script.
    """
    script = """Universe = vanilla

Executable = /bin/tar
arguments = czvhf ../%(archivename)s %(filelist)s

Error = compress.out.$(Process).log
Output = compress.out.$(Process).log
Log = /tmp/submission-compress-%(user)s.log
initialdir = %(initialdir)s
environment="GZIP=-3"
request_memory = 20

queue
"""
    if outdir is None:
        outdir = os.getcwd()
    # refuse to generate a script that would fail at tar time
    for f in files:
        pathname = os.path.join(outdir, f)
        if not os.path.exists(pathname):
            raise RuntimeError("Missing %s from %s" % (f,outdir))

    context = {'archivename': make_submission_name(name),
               'filelist': " ".join(files),
               'initialdir': os.path.abspath(outdir),
               'user': os.getlogin()}

    condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
    # Fix: with-statement closes the script file even if the write
    # fails (the original leaked the handle on error).
    with open(condor_script, 'w') as condor_stream:
        condor_stream.write(script % context)
    return condor_script
+
+
def make_condor_upload_script(name, outdir=None):
    """Write a condor submit file that lftp-uploads the archive to UCSC.

    Credentials for encodeftp.cse.ucsc.edu are read from a netrc file.
    Returns the path of the generated condor script.
    """
    script = """Universe = vanilla

Executable = /usr/bin/lftp
arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s

Error = upload.out.$(Process).log
Output = upload.out.$(Process).log
Log = /tmp/submission-upload-%(user)s.log
initialdir = %(initialdir)s

queue
"""
    if outdir is None:
        outdir = os.getcwd()

    # NOTE(review): hard-coded to user diane's netrc -- confirm this
    # shouldn't be os.path.expanduser("~/.netrc") for the current user.
    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))

    encodeftp = 'encodeftp.cse.ucsc.edu'
    ftpuser = auth.hosts[encodeftp][0]
    ftppassword = auth.hosts[encodeftp][2]
    context = {'archivename': make_submission_name(name),
               'initialdir': os.path.abspath(outdir),
               'user': os.getlogin(),
               'ftpuser': ftpuser,
               'ftppassword': ftppassword,
               'ftphost': encodeftp}

    condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
    condor_stream = open(condor_script,'w')
    condor_stream.write(script % context)
    condor_stream.close()
    # owner read/write only: the generated script embeds the ftp
    # password in plain text
    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)

    return condor_script
+
+
def make_dag_fragment(ininame, archive_condor, upload_condor):
    """Build the DAGMan lines chaining the archive job to the upload job.

    Returns a list of three lines: two JOB declarations (with the
    condor scripts resolved against the current directory) and the
    PARENT/CHILD edge ordering archive before upload.
    """
    working_dir = os.getcwd()
    archive_path = os.path.join(working_dir, archive_condor)
    upload_path = os.path.join(working_dir, upload_condor)
    job_basename = make_base_name(ininame)

    return [
        'JOB %s_archive %s' % (job_basename, archive_path),
        'JOB %s_upload %s' % (job_basename, upload_path),
        'PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename),
    ]
+
+
def get_library_info(host, apidata, library_id):
    """Fetch library metadata for library_id from the htsworkflow API."""
    url = api.library_url(host, library_id)
    contents = api.retrieve_info(url, apidata)
    return contents
+
+
def make_submission_section(line_counter, files, attributes):
    """Format one [lineN] section of a submission ini file.

    Returns the section as a list of lines: the [line<counter>]
    header, a comma-joined files= entry, then one key=value line per
    attribute.
    """
    section = ["[line%s]" % (line_counter,)]
    section.append("files=%s" % (",".join(files),))
    section.extend("%s=%s" % (key, value)
                   for key, value in attributes.items())
    return section
+
+
def make_base_name(pathname):
    """Return pathname's filename stripped of directory and extension."""
    filename = os.path.basename(pathname)
    return os.path.splitext(filename)[0]
+
+
def make_submission_name(ininame):
    """Derive the submission archive (.tgz) name from an ini name."""
    return "%s.tgz" % (make_base_name(ininame),)
+
+
def make_ddf_name(pathname):
    """Derive the ddf filename corresponding to pathname."""
    return "%s.ddf" % (make_base_name(pathname),)
+
+
def make_condor_name(pathname, run_type=None):
    """Build a condor script name: <base>[.<run_type>].condor"""
    parts = [make_base_name(pathname)]
    if run_type is not None:
        parts.append(run_type)
    parts.append("condor")
    return ".".join(parts)
+
+
def parse_filelist(file_string):
    """Split a comma-separated string of filenames into a list."""
    names = file_string.split(",")
    return names
+
+
def validate_filelist(files):
    """
    Die if a file doesn't exist in a file list
    """
    for pathname in files:
        if os.path.exists(pathname):
            continue
        raise RuntimeError("%s does not exist" % (pathname,))
+
+if __name__ == "__main__":
+ main()
+++ /dev/null
-I was building a variety of scripts to handle submitting our data to the
-UCSC ENCODE pipeline, some of them were pulling data out of the htsworkflow
-databases, and since I needed an official place to put the scripts
-
-I decided here.
+++ /dev/null
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
-
-construct { ?library ddf:treatment ?treatment ;
- ddf:protocol ?pcr . }
-WHERE {
- ?status ucscSubmission:has_file ?file .
- ?submission ucscSubmission:has_status ?status ;
- ucscSubmission:library_urn ?library ;
- ucscSubmission:name ?name .
- ?file ddf:treatment ?treatment ;
- ddf:protocol ?pcr .
-}
-
+++ /dev/null
-##
-## Override submission ID to library URN names for our libraries
-## whose names either lack, or have the wrong library ID string
-## embedded in them.
-##
-
-@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
-@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
-
-# woldlab-hepg2-rnaseq-2009dec
-<http://encodesubmit.ucsc.edu/pipeline/show/805>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part2
-<http://encodesubmit.ucsc.edu/pipeline/show/810>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part3
-<http://encodesubmit.ucsc.edu/pipeline/show/869>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
-<http://encodesubmit.ucsc.edu/pipeline/show/870>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab-hepg2-rnaseq-2010Jan-part4
-<http://encodesubmit.ucsc.edu/pipeline/show/897>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
-<http://encodesubmit.ucsc.edu/pipeline/show/898>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep1-2010Jan6
-<http://encodesubmit.ucsc.edu/pipeline/show/903>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep2-2010jan9
-<http://encodesubmit.ucsc.edu/pipeline/show/904>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab hESC 10886 rep1 2009Jan13
-<http://encodesubmit.ucsc.edu/pipeline/show/1026>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
-
-# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1483>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1626>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
-
-# woldlab jun 18 1x75-Directional-GM12878-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1631>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab jun 18 1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1632>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1633>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1634>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1635>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1636>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1637>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1638>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1639>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1645>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1646>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab June 2x75-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1856>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
-
-#2010 jul 9corrected fastqs
-<http://encodesubmit.ucsc.edu/pipeline/show/1874>
- ucscSubmission:ignore "1" .
-# ucscSubmission:library_urn "
-
-# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
-<http://encodesubmit.ucsc.edu/pipeline/show/2926>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2930>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2931>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
-
-# 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2932>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2933>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2934>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2935>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2936>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2937>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2938>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2939>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2940>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2941>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# "3438 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4607>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
-
-# "3439 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4608>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
-
-# "3437 Fastq re-submission"
-<http://encodesubmit.ucsc.edu/pipeline/show/4609>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
-
-# "1x75-Directional-HepG2-rep2-replace 3522"
-<http://encodesubmit.ucsc.edu/pipeline/show/4797>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# "1x75-Directional-HepG2-rep1 replacement of 3521"
-<http://encodesubmit.ucsc.edu/pipeline/show/4798>
- ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
+++ /dev/null
-#!/usr/bin/env python
-"""
-Gather information about our submissions into a single RDF store
-"""
-
-from datetime import datetime
-import hashlib
-import httplib2
-import keyring
-import logging
-from lxml.html import fromstring
-from operator import attrgetter
-from optparse import OptionParser, OptionGroup
-# python keyring
-import os
-import re
-# redland rdf lib
-import RDF
-import sys
-import urllib
-import urlparse
-
-from htsworkflow.submission import daf, ucsc
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
- dafTermOntology, \
- dublinCoreNS, \
- get_model, \
- get_serializer, \
- sparql_query, \
- submissionOntology, \
- libraryOntology, \
- load_into_model, \
- rdfNS, \
- rdfsNS, \
- xsdNS
-TYPE_N = rdfNS['type']
-CREATION_DATE = libraryOntology['date']
-
-# URL mappings
-LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
-
-from htsworkflow.submission.ucsc import \
- daf_download_url, \
- ddf_download_url, \
- get_ucsc_file_index, \
- submission_view_url, \
- UCSCEncodePipeline
-
-DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
-
-DBDIR = os.path.expanduser("~diane/proj/submission")
-
-LOGGER = logging.getLogger("encode_find")
-
-LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
-USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
-
-USERNAME = 'detrout'
-CHARSET = 'utf-8'
-
-GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
- "{genome}/encodeDCC/{composite}/"
-def main(cmdline=None):
- """
- Parse command line arguments
-
- Takes a list of arguments (assuming arg[0] is the program name) or None
- If None, it looks at sys.argv
- """
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
-
- if opts.debug:
- logging.basicConfig(level=logging.DEBUG)
- elif opts.verbose:
- logging.basicConfig(level=logging.INFO)
-
- htsw_authdata = api.make_auth_from_opts(opts, parser)
- htswapi = api.HtswApi(opts.host, htsw_authdata)
-
- cookie = None
- model = get_model(opts.load_model, DBDIR)
-
- if opts.load_rdf is not None:
- ns_uri = submissionOntology[''].uri
- load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-
- if len(args) == 0:
- limit = None
- else:
- limit = args
-
- if opts.update:
- cookie = login(cookie=cookie)
- load_my_submissions(model, limit=limit, cookie=cookie)
- load_encode_libraries(model, htswapi)
- our_tracks = [
- {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
- {'genome':'mm9', 'composite':'wgEncodeCaltechHist'},
- {'genome':'mm9', 'composite':'wgEncodeCaltechHistone'},
- {'genome':'mm9', 'composite':'wgEncodeCaltechTfbs'}
- ]
- for track_info in our_tracks:
- load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
-
-
- if opts.sparql is not None:
- sparql_query(model, opts.sparql)
-
- if opts.find_submission_with_no_library:
- find_submissions_with_no_library(model)
-
- if opts.print_rdf:
- serializer = get_serializer(name=opts.rdf_parser_name)
- print serializer.serialize_model_to_string(model)
-
-
-def make_parser():
- """Construct option parser
- """
- parser = OptionParser()
- commands = OptionGroup(parser, "Commands")
- commands.add_option('--load-model', default=None,
- help="Load model database")
- commands.add_option('--load-rdf', default=None,
- help="load rdf statements into model")
- commands.add_option('--print-rdf', action="store_true", default=False,
- help="print ending model state")
- commands.add_option('--update', action="store_true", default=False,
- help="Query remote data sources and update our database")
- #commands.add_option('--update-ucsc-status', default=None,
- # help="download status from ucsc, requires filename for extra rules")
- #commands.add_option('--update-ddfs', action="store_true", default=False,
- # help="download ddf information for known submission")
- #commands.add_option('--update-library', default=None,
- # help="download library info from htsw, "\
- # "requires filename for extra rules")
- parser.add_option_group(commands)
-
- queries = OptionGroup(parser, "Queries")
- queries.add_option('--sparql', default=None,
- help="execute arbitrary sparql query")
- queries.add_option('--find-submission-with-no-library', default=False,
- action="store_true",
- help="find submissions with no library ID")
- parser.add_option_group(queries)
-
- options = OptionGroup(parser, "Options")
- options.add_option("--rdf-parser-name", default="turtle",
- help="set rdf file parser type")
- options.add_option("-v", "--verbose", action="store_true", default=False)
- options.add_option("--debug", action="store_true", default=False)
- parser.add_option_group(options)
-
- api.add_auth_options(parser)
-
- return parser
-
-
-def load_my_submissions(model, limit=None, cookie=None):
- """Parse all the submissions from UCSC into model
- It will look at the global USER_URL to figure out who to scrape
- cookie contains the session cookie, if none, will attempt to login
- """
- if cookie is None:
- cookie = login()
-
- tree = get_url_as_tree(USER_URL, 'GET', cookie)
- table_rows = tree.xpath('//table[@id="projects"]/tr')
- # first record is header
- name_n = submissionOntology['name']
- species_n = submissionOntology['species']
- library_urn = submissionOntology['library_urn']
-
- # skip header
- for row in table_rows[1:]:
- cell = row.xpath('td')
- if cell is not None and len(cell) > 1:
- submission_id = str(cell[0].text_content())
- if limit is None or submission_id in limit:
- subUrn = RDF.Uri(submission_view_url(submission_id))
-
- add_stmt(model,
- subUrn,
- TYPE_N,
- submissionOntology['Submission'])
- add_stmt(model,
- subUrn,
- DCC_NS['subId'],
- RDF.Node(submission_id))
-
- name = str(cell[4].text_content())
- add_stmt(model, subUrn, name_n, name)
-
- species = str(cell[2].text_content())
- if species is not None:
- add_stmt(model, subUrn, species_n, species)
-
- library_id = get_library_id(name)
- if library_id is not None:
- add_submission_to_library_urn(model,
- subUrn,
- library_urn,
- library_id)
- else:
- errmsg = 'Unable to find library id in {0} for {1}'
- LOGGER.warn(errmsg.format(name, str(subUrn)))
-
- add_submission_creation_date(model, subUrn, cookie)
-
- # grab changing atttributes
- status = str(cell[6].text_content()).strip()
- last_mod_datetime = get_date_contents(cell[8])
- last_mod = last_mod_datetime.isoformat()
-
- update_submission_detail(model, subUrn, status, last_mod,
- cookie=cookie)
-
- LOGGER.info("Processed {0}".format(subUrn))
-
-
-def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
- """Add a link from a UCSC submission to woldlab library if needed
- """
- libraryUrn = LIBRARY_NS[library_id + '/']
- query = RDF.Statement(submissionUrn, predicate, libraryUrn)
- if not model.contains_statement(query):
- link = RDF.Statement(submissionUrn, predicate, libraryUrn)
- LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
- model.add_statement(link)
- else:
- LOGGER.debug("Found: {0}".format(str(query)))
-
-
-def find_submissions_with_no_library(model):
- missing_lib_query_text = """
-PREFIX submissionOntology:<{submissionOntology}>
-
-SELECT
- ?subid ?name
-WHERE {{
- ?subid submissionOntology:name ?name
- OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
- FILTER (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
- missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
-
- results = missing_lib_query.execute(model)
- for row in results:
- subid = row['subid']
- name = row['name']
- print "# {0}".format(name)
- print "<{0}>".format(subid.uri)
- print " encodeSubmit:library_urn "\
- "<http://jumpgate.caltech.edu/library/> ."
- print ""
-
-
-def add_submission_creation_date(model, subUrn, cookie):
- # in theory the submission page might have more information on it.
- creation_dates = get_creation_dates(model, subUrn)
- if len(creation_dates) == 0:
- LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
- submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
- parse_submission_page(model, cells, subUrn)
- else:
- LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
-
-def get_creation_dates(model, subUrn):
- query = RDF.Statement(subUrn, CREATION_DATE, None)
- creation_dates = list(model.find_statements(query))
- return creation_dates
-
-def parse_submission_page(model, submissionTree, subUrn):
- cells = submissionTree.findall('.//td')
- dateTimeType = xsdNS['dateTime']
- created_label = [x for x in cells
- if x.text_content().startswith('Created')]
- if len(created_label) == 1:
- created_date = get_date_contents(created_label[0].getnext())
- created_date_node = RDF.Node(literal=created_date.isoformat(),
- datatype=dateTimeType.uri)
- add_stmt(model, subUrn, CREATION_DATE, created_date_node)
- else:
- msg = 'Unable to find creation date for {0}'.format(str(subUrn))
- LOGGER.warn(msg)
- raise Warning(msg)
-
-
-def update_submission_detail(model, subUrn, status, recent_update, cookie):
- HasStatusN = submissionOntology['has_status']
- StatusN = submissionOntology['status']
- LastModifyN = submissionOntology['last_modify_date']
-
- status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
- status_nodes = list(model.find_statements(status_nodes_query))
-
- if len(status_nodes) == 0:
- # has no status node, add one
- LOGGER.info("Adding status node to {0}".format(subUrn))
- status_node = create_status_node(subUrn, recent_update)
- add_stmt(model, subUrn, HasStatusN, status_node)
- add_stmt(model, status_node, rdfNS['type'], StatusN)
- add_stmt(model, status_node, StatusN, status)
- add_stmt(model, status_node, LastModifyN, recent_update)
- update_ddf(model, subUrn, status_node, cookie=cookie)
- update_daf(model, subUrn, status_node, cookie=cookie)
- else:
- LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
- for status_statement in status_nodes:
- status_node = status_statement.object
- last_modified_query = RDF.Statement(status_node,
- LastModifyN,
- None)
- last_mod_nodes = model.find_statements(last_modified_query)
- for last_mod_statement in last_mod_nodes:
- last_mod_date = str(last_mod_statement.object)
- if recent_update == str(last_mod_date):
- update_ddf(model, subUrn, status_node, cookie=cookie)
- update_daf(model, subUrn, status_node, cookie=cookie)
- break
-
-
-def update_daf(model, submission_url, status_node, cookie):
- download_daf_uri = str(submission_url).replace('show', 'download_daf')
- daf_uri = RDF.Uri(download_daf_uri)
-
- status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
- if not model.contains_statement(status_is_daf):
- LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
- status_node))
- daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
- daf_hash = hashlib.md5(daf_text).hexdigest()
- daf_hash_stmt = RDF.Statement(status_node,
- dafTermOntology['md5sum'],
- daf_hash)
- model.add_statement(daf_hash_stmt)
- daf.fromstring_into_model(model, status_node, daf_text)
-
-
-def update_ddf(model, subUrn, statusNode, cookie):
- download_ddf_url = str(subUrn).replace('show', 'download_ddf')
- ddfUrn = RDF.Uri(download_ddf_url)
-
- status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
- if not model.contains_statement(status_is_ddf):
- LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
- ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
- add_ddf_statements(model, statusNode, ddf_text)
- model.add_statement(status_is_ddf)
-
-
-def add_ddf_statements(model, statusNode, ddf_string):
- """Convert a ddf text file into RDF Statements
- """
- ddf_lines = ddf_string.split('\n')
- # first line is header
- header = ddf_lines[0].split()
- attributes = [DCC_NS[x] for x in header]
-
- for ddf_line in ddf_lines[1:]:
- ddf_line = ddf_line.strip()
- if len(ddf_line) == 0:
- continue
- if ddf_line.startswith("#"):
- continue
-
- ddf_record = ddf_line.split('\t')
- files = ddf_record[0].split(',')
- file_attributes = ddf_record[1:]
-
- for f in files:
- fileNode = RDF.Node()
- add_stmt(model,
- statusNode,
- submissionOntology['has_file'],
- fileNode)
- add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
- add_stmt(model, fileNode, DCC_NS['filename'], f)
-
- for predicate, object in zip(attributes[1:], file_attributes):
- add_stmt(model, fileNode, predicate, object)
-
-
-def load_encode_libraries(model, htswapi):
- """Get libraries associated with encode.
- """
- encodeFilters = ["/library/?affiliations__id__exact=44",
- "/library/?affiliations__id__exact=80",
- ]
-
- encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
- rdfaParser = RDF.Parser(name='rdfa')
- for encodeUrl in encodeUrls:
- LOGGER.info("Scanning library url {0}".format(encodeUrl))
- rdfaParser.parse_into_model(model, encodeUrl)
- query = RDF.Statement(None, libraryOntology['library_id'], None)
- libraries = model.find_statements(query)
- for statement in libraries:
- libraryUrn = statement.subject
- LOGGER.info("Scanning {0}".format(str(libraryUrn)))
- load_library_detail(model, libraryUrn)
-
-
-def load_encodedcc_files(model, base_url):
- if base_url[-1] != '/':
- base_url += '/'
-
- file_index = ucsc.get_ucsc_file_index(base_url)
- for filename, attributes in file_index.items():
- s = RDF.Node(RDF.Uri(base_url + filename))
- for name, value in attributes.items():
- p = RDF.Node(DCC_NS[name])
- o = RDF.Node(value)
- model.add_statement(RDF.Statement(s,p,o))
-
-def load_library_detail(model, libraryUrn):
- """Grab detail information from library page
- """
- rdfaParser = RDF.Parser(name='rdfa')
- query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
- results = list(model.find_statements(query))
- log_message = "Found {0} statements for {1}"
- LOGGER.debug(log_message.format(len(results), libraryUrn))
- if len(results) == 0:
- LOGGER.info("Loading {0}".format(str(libraryUrn)))
- rdfaParser.parse_into_model(model, libraryUrn.uri)
- elif len(results) == 1:
- pass # Assuming that a loaded dataset has one record
- else:
- LOGGER.warning("Many dates for {0}".format(libraryUrn))
-
-
-def get_library_id(name):
- """Guess library ID from library name
-
- >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
- '11039'
- >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
- '10150'
- """
- match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
- library_id = None
- if match is not None:
- library_id = match.group('id')
- return library_id
-
-
-def get_contents(element):
- """Return contents or none.
- """
- if len(element.contents) == 0:
- return None
-
- a = element.find('a')
- if a is not None:
- return a.contents[0].encode(CHARSET)
-
- return element.contents[0].encode(CHARSET)
-
-
-def create_status_node(submission_uri, timestamp):
- submission_uri = daf.submission_uri_to_string(submission_uri)
- if submission_uri[-1] != '/':
- sumbission_uri += '/'
- status_uri = submission_uri + timestamp
- return RDF.Node(RDF.Uri(status_uri))
-
-
-def get_date_contents(element):
- data = element.text_content()
- if data:
- return datetime.strptime(data, "%Y-%m-%d %H:%M")
- else:
- return None
-
-
-def add_stmt(model, subject, predicate, rdf_object):
- """Convienence create RDF Statement and add to a model
- """
- return model.add_statement(
- RDF.Statement(subject, predicate, rdf_object))
-
-
-def login(cookie=None):
- """Login if we don't have a cookie
- """
- if cookie is not None:
- return cookie
-
- keys = keyring.get_keyring()
- password = keys.get_password(LOGIN_URL, USERNAME)
- credentials = {'login': USERNAME,
- 'password': password}
- headers = {'Content-type': 'application/x-www-form-urlencoded'}
- http = httplib2.Http()
- response, content = http.request(LOGIN_URL,
- 'POST',
- headers=headers,
- body=urllib.urlencode(credentials))
- LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
- response['status']))
-
- cookie = response.get('set-cookie', None)
- if cookie is None:
- raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
- return cookie
-
-
-def get_url_as_tree(url, method, cookie=None):
- http = httplib2.Http()
- headers = {}
- if cookie is not None:
- headers['Cookie'] = cookie
- response, content = http.request(url, method, headers=headers)
- if response['status'] == '200':
- tree = fromstring(content, base_url=url)
- return tree
- else:
- msg = "error accessing {0}, status {1}"
- msg = msg.format(url, response['status'])
- e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-
-def get_url_as_text(url, method, cookie=None):
- http = httplib2.Http()
- headers = {}
- if cookie is not None:
- headers['Cookie'] = cookie
- response, content = http.request(url, method, headers=headers)
- if response['status'] == '200':
- return content
- else:
- msg = "error accessing {0}, status {1}"
- msg = msg.format(url, response['status'])
- e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-################
-# old stuff
-SUBMISSIONS_LACKING_LIBID = [
- ('1x75-Directional-HeLa-Rep1', '11208'),
- ('1x75-Directional-HeLa-Rep2', '11207'),
- ('1x75-Directional-HepG2-Rep1', '11210'),
- ('1x75-Directional-HepG2-Rep2', '11209'),
- ('1x75-Directional-H1-hESC-Rep1', '10947'),
- ('1x75-Directional-H1-hESC-Rep2', '11009'),
- ('1x75-Directional-HUVEC-Rep1', '11206'),
- ('1x75-Directional-HUVEC-Rep2', '11205'),
- ('1x75-Directional-K562-Rep1', '11008'),
- ('1x75-Directional-K562-Rep2', '11007'),
- ('1x75-Directional-NHEK-Rep1', '11204'),
- ('1x75-Directional-GM12878-Rep1', '11011'),
- ('1x75-Directional-GM12878-Rep2', '11010'),
- ]
-
-
-def select_by_library_id(submission_list):
- subl = [(x.library_id, x) for x in submission_list if x.library_id]
- libraries = {}
- for lib_id, subobj in subl:
- libraries.setdefault(lib_id, []).append(subobj)
-
- for submission in libraries.values():
- submission.sort(key=attrgetter('date'), reverse=True)
-
- return libraries
-
-
-def library_to_freeze(selected_libraries):
- freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
- lib_ids = sorted(selected_libraries.keys())
- report = ['<html><table border="1">']
- report = ["""<html>
-<head>
-<style type="text/css">
- td {border-width:0 0 1px 1px; border-style:solid;}
-</style>
-</head>
-<body>
-<table>
-"""]
- report.append('<thead>')
- report.append('<tr><td>Library ID</td><td>Name</td>')
- for f in freezes:
- report.append('<td>{0}</td>'.format(f))
- report.append('</tr>')
- report.append('</thead>')
- report.append('<tbody>')
- for lib_id in lib_ids:
- report.append('<tr>')
- lib_url = LIBRARY_NS[lib_id].uri
- report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
- submissions = selected_libraries[lib_id]
- report.append('<td>{0}</td>'.format(submissions[0].name))
- batched = {}
- for sub in submissions:
- date = date_to_freeze(sub.date)
- batched.setdefault(date, []).append(sub)
- for d in freezes:
- report.append('<td>')
- for s in batched.get(d, []):
- show_url = submission_view_url(s.subid)
- subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
- report.append("{0}:{1}".format(subid, s.status))
- report.append('</td>')
- else:
- report.append('<td></td>')
- report.append("</tr>")
- report.append('</tbody>')
- report.append("</table></html>")
- return "\n".join(report)
-
-
-def date_to_freeze(d):
- freezes = [(datetime(2010, 1, 30), '2010-Jan'),
- (datetime(2010, 7, 30), '2010-Jul'),
- (datetime(2011, 1, 30), '2011-Jan'),
- ]
- for end, name in freezes:
- if d < end:
- return name
- else:
- return None
-
-if __name__ == "__main__":
- main()
+++ /dev/null
-##
-## Find submissions that are currently "failed"
-##
-
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
-PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#">
-
-#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
-#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
-#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
-
-SELECT
- ?subid ?subname ?liburn ?status
-WHERE {
- ?subid submitOnt:name ?subname .
- ?subid submitOnt:library_urn ?liburn .
- ?subid submitOnt:has_status ?statusNode .
- ?statusNode submitOnt:status ?status .
- ?statusNode submitOnt:last_modify_date ?last_modify .
- FILTER (regex(?status, "failed", "i"))
-}
+++ /dev/null
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-
-SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
-WHERE {
- ?subid ucscSubmission:name ?name .
- OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
- libraryOntology:date ?submission_date .
- ?liburn libraryOntology:cell_line ?cell ;
- libraryOntology:replicate ?replicate . }
- #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
- #filter(!bound(?liburn))
-}
-ORDER BY ?submission_date ?cell ?replicate ?liburn
+++ /dev/null
-from optparse import OptionParser
-import os
-import sys
-from pprint import pprint
-
-def main(cmdline=None):
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
-
- extensions = scan(args)
- common_extensions = find_common_suffix(extensions)
-
- if opts.rdf:
- print_rdf(common_extensions)
- else:
- print common_extensions
-
-def make_parser():
- parser = OptionParser("%prog: directory [directory...]")
- parser.add_option('--rdf', action="store_true", default=False,
- help="Produce rdf configuration file for ucsc_gather")
- return parser
-
-def scan(toscan):
- index = {}
- for cur_scan_dir in toscan:
- for path, dirnames, filenames in os.walk(cur_scan_dir):
- for filename in filenames:
- base, ext = os.path.splitext(filename)
- if ext in ('.daf', 'ddf'):
- continue
- next_index = index
- for c in filename[::-1]:
- next_index = next_index.setdefault(c, {})
- return index
-
-def find_common_suffix(index, tail=[]):
- if len(tail) > 0 and len(index) > 1:
- return "".join(tail[::-1])
-
- results = []
- for key, choice in index.items():
- r = find_common_suffix(choice, tail+[key])
- if r is not None:
- results.append (r)
-
- if len(results) == 0:
- return None
- elif len(results) == 1:
- return results[0]
- else:
- return results
-
-def print_rdf(common_extensions):
- import RDF
- from htsworkflow.util import rdfhelp
- model = rdfhelp.get_model()
-
- viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
- subView = RDF.NS(viewName)
- fileReTerm = rdfhelp.dafTermOntology['filename_re']
-
- count = 1
- for ext in common_extensions:
- s = RDF.Statement(subView['VIEW{0}'.format(count)],
- fileReTerm,
- '.*{0}$'.format(ext.replace('.', '\\.')))
- model.add_statement(s)
- count += 1
-
- writer = rdfhelp.get_serializer()
- writer.set_namespace('thisSubmissionView', subView._prefix)
- print writer.serialize_model_to_string(model)
-
-if __name__ == "__main__":
- main()
+++ /dev/null
-#!/usr/bin/env python
-import os
-import unittest
-
-import RDF
-
-import encode_find
-from htsworkflow.submission.ucsc import submission_view_url
-from htsworkflow.util.rdfhelp import dump_model, get_model
-
-SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
-print SOURCE_PATH
-
-class TestEncodeFind(unittest.TestCase):
- def test_create_status_node_with_uri(self):
- subURL = submission_view_url('5136')
- submissionUri = RDF.Uri(subURL)
- timestamp = '2011-12-19T12:42:53.048956'
- manualUri = subURL + '/' + timestamp
- nodeUri = encode_find.create_status_node(submissionUri, timestamp)
- self.assertEqual(str(nodeUri.uri), manualUri)
-
- def test_create_status_node_with_str(self):
- subURL = submission_view_url('5136')
- timestamp = '2011-12-19T12:42:53.048956'
- manualUri = subURL + '/' + timestamp
- nodeUri = encode_find.create_status_node(subURL, timestamp)
- self.assertEqual(str(nodeUri.uri), manualUri)
-
- def test_parse_submission_page(self):
- timestamp = '2011-12-19T12:42:53.048956'
- subURL = submission_view_url('5136')
- subNode = encode_find.create_status_node(subURL, timestamp)
- test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
- from lxml.html import parse
- tree = parse(test_file)
- model = get_model()
- dates = encode_find.get_creation_dates(model, subNode)
- self.assertEqual(len(dates), 0)
- encode_find.parse_submission_page(model, tree, subNode)
- dates = encode_find.get_creation_dates(model, subNode)
- self.assertEqual(len(dates), 1)
- self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
-
-def suite():
- return unittest.makeSuite(TestEncodeFind, "test")
-
-if __name__ == "__main__":
- unittest.main(defaultTest="suite")
+++ /dev/null
-import unittest
-
-import ucsc_gather
-
-class testUCSCGather(unittest.TestCase):
- pass
-
-def suite():
- return unittest.makeSuite(testUCSCGather,"test")
-
-if __name__ == "__main__":
- unittest.main(defaultTest="suite")
+++ /dev/null
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
- <meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />
- <meta http-equiv="Content-Language" content="en-us" />
- <title>
- ENCODE DCC Data Submission Production
- </title>
-<link href="/stylesheets/encode.css?1278401455" media="screen" rel="Stylesheet" type="text/css" />
-</head>
-<body id="encode-pipeline">
-
-<div id="container">
- <img align="left" height=50 width=100 src="/images/encode_logo.png">
-
- <div id="header">
- ENCODE DCC Data Submission
- <font size=-1 color="gray"> <em>
- Production
- </em></font>
- </div>
- <div id="user"> Logged In: <strong> detrout </strong>
-
- </div>
-
- <div id="nav">
- <!-- <table width="100%"><tr>-->
- <table cellpadding=0 cellspacing=0 width="100%"><tr>
- <td align="left">
- <a href="/pipeline/new">New Submission</a>
- |
- <a href="/pipeline/list">All Submissions</a>
-
- |
- <a href="/pipeline/show_active">Active Submissions</a>
-
- |
- <a href="/pipeline/show_user">My Submissions</a>
-
- </td>
- <td align="right">
- <a href="/account/logout"> Log Out</a>
- |
- <a href="/account/change_profile">Change Profile</a>
- |
- <a href="/pipeline/show_tools">Tools</a>
- </td>
-
- </tr></table>
- </div>
-
- <div id="message">
-
-
-
- </div>
- <div id="content">
- <p>
-
-
-
-<table style="margin-top:10px;" cellpadding=1 cellspacing=1>
-
-<tr>
-<td>Submission: </td><td>wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2 resubmit</td><td> </td>
-<td>Created: </td><td>2011-12-07 15:23</td>
- <td> </td>
- <td>11 days ago</td>
-</tr>
-
-<tr>
-<td>DB: </td><td>hg19</td><td> </td>
-
-<td>Updated: </td><td>2011-12-08 14:54</td>
- <td> </td>
- <td>10 days ago</td>
-</tr>
-
-<tr>
-<td>Status: </td><td>approved</td>
-</tr>
-
-<tr>
-
- <td> </td>
-</tr>
-
-
-</table>
-
-
-
-
-
-
- <table style="margin-top:10px;" cellpadding=1 cellspacing=1>
- <tr>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- <td> [
- <a href="/pipeline/show_daf/5136">Show DAF</a> ] </td>
-
-
-
- <td> [
- <a href="/pipeline/show_ddf/5136">Show DDF</a> ] </td>
-
-
- </tr>
-
- </table>
-
-
-
-<p>
-
-<table cellspacing=2 cellpadding=2>
-<tr><td></td>
-<td align="left"><b>File</b></td>
-<td align="left"><b>Size</b></td>
-<td align="left"><b>Updated</b></td>
-
-<td></td></tr>
-
-
-
-
-
- <tr style="margin: 10;">
- <td><b>Archive</b></td><td>002_CaltechRnaSeq_Fastq_DAF.tar.gz</td>
- <td align="right">1397</td>
- <td>2011-12-08 14:08</td>
-
-
-
-
- </tr>
-
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeq.daf</td>
- <td align=right>4187</td>
- <td>2011-12-08 14:08</td>
- </tr>
-
-
-
-
-
-
-
- <tr style="margin: 10;">
-
- <td><b>Archive</b></td><td>001_wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2.fastq.tgz</td>
- <td align="right">8833154623</td>
- <td>2011-12-07 15:23</td>
-
-
-
-
- </tr>
-
-
- <tr>
- <td> </td>
-
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l7_r2.fastq</td>
- <td align=right>1629293100</td>
- <td>2011-12-07 15:34</td>
- </tr>
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l8_r2.fastq</td>
-
- <td align=right>1628417888</td>
- <td>2011-12-07 15:34</td>
- </tr>
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_616L7AAXX_c152_l1_r2.fastq</td>
- <td align=right>5152104576</td>
-
- <td>2011-12-07 15:34</td>
- </tr>
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l1_r2.fastq</td>
- <td align=right>6094749091</td>
- <td>2011-12-07 15:34</td>
-
- </tr>
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l2_r2.fastq</td>
- <td align=right>7483882081</td>
- <td>2011-12-07 15:34</td>
- </tr>
-
-
- <tr>
- <td> </td>
- <td> wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_ilmn200901_c202_l4_r2.fastq</td>
- <td align=right>5282142818</td>
- <td>2011-12-07 15:34</td>
- </tr>
-
-
-
-</table>
-
-<p>
-
-
-
-
- </div>
- <div id="footer">
- <A HREF="/help.html">Help</A>
- |
- <A HREF="/contact">Contact Us</A>
- </div>
-
-</div>
-</body>
-</html>
+++ /dev/null
-#!/usr/bin/env python
-from ConfigParser import SafeConfigParser
-import fnmatch
-from glob import glob
-import json
-import logging
-import netrc
-from optparse import OptionParser, OptionGroup
-import os
-from pprint import pprint, pformat
-import shlex
-from StringIO import StringIO
-import stat
-import sys
-import time
-import types
-import urllib
-import urllib2
-import urlparse
-
-import RDF
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
- dafTermOntology, \
- fromTypedNode, \
- get_model, \
- get_serializer, \
- load_into_model, \
- sparql_query, \
- submissionOntology
-from htsworkflow.submission.daf import \
- DAFMapper, \
- MetadataLookupException, \
- get_submission_uri
-from htsworkflow.submission.condorfastq import CondorFastqExtract
-
-logger = logging.getLogger('ucsc_gather')
-
-def main(cmdline=None):
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
- submission_uri = None
-
- if opts.debug:
- logging.basicConfig(level = logging.DEBUG )
- elif opts.verbose:
- logging.basicConfig(level = logging.INFO )
- else:
- logging.basicConfig(level = logging.WARNING )
-
- apidata = api.make_auth_from_opts(opts, parser)
-
- model = get_model(opts.load_model)
- if opts.name:
- mapper = DAFMapper(opts.name, opts.daf, model)
- if opts.library_url is not None:
- mapper.library_url = opts.library_url
- submission_uri = get_submission_uri(opts.name)
-
-
- if opts.load_rdf is not None:
- if submission_uri is None:
- parser.error("Please specify the submission name")
- load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
-
- if opts.make_ddf and opts.daf is None:
- parser.error("Please specify your daf when making ddf files")
-
- library_result_map = []
- for a in args:
- library_result_map.extend(read_library_result_map(a))
-
- if opts.make_tree_from is not None:
- make_tree_from(opts.make_tree_from, library_result_map)
-
- if opts.link_daf:
- if opts.daf is None:
- parser.error("Please specify daf filename with --daf")
- link_daf(opts.daf, library_result_map)
-
- if opts.fastq:
- extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
- force=opts.force)
- extractor.build_fastqs(library_result_map)
-
- if opts.scan_submission:
- scan_submission_dirs(mapper, library_result_map)
-
- if opts.make_ddf:
- make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)
-
- if opts.sparql:
- sparql_query(model, opts.sparql)
-
- if opts.print_rdf:
- writer = get_serializer()
- print writer.serialize_model_to_string(model)
-
-
-def make_parser():
- parser = OptionParser()
-
- model = OptionGroup(parser, 'model')
- model.add_option('--name', help="Set submission name")
- model.add_option('--load-model', default=None,
- help="Load model database")
- model.add_option('--load-rdf', default=None,
- help="load rdf statements into model")
- model.add_option('--sparql', default=None, help="execute sparql query")
- model.add_option('--print-rdf', action="store_true", default=False,
- help="print ending model state")
- parser.add_option_group(model)
- # commands
- commands = OptionGroup(parser, 'commands')
- commands.add_option('--make-tree-from',
- help="create directories & link data files",
- default=None)
- commands.add_option('--fastq', default=False, action="store_true",
- help="generate scripts for making fastq files")
- commands.add_option('--scan-submission', default=False, action="store_true",
- help="Import metadata for submission into our model")
- commands.add_option('--link-daf', default=False, action="store_true",
- help="link daf into submission directories")
- commands.add_option('--make-ddf', help='make the ddfs', default=False,
- action="store_true")
- parser.add_option_group(commands)
-
- parser.add_option('--force', default=False, action="store_true",
- help="Force regenerating fastqs")
- parser.add_option('--daf', default=None, help='specify daf name')
- parser.add_option('--library-url', default=None,
- help="specify an alternate source for library information")
- # debugging
- parser.add_option('--verbose', default=False, action="store_true",
- help='verbose logging')
- parser.add_option('--debug', default=False, action="store_true",
- help='debug logging')
-
- api.add_auth_options(parser)
-
- return parser
-
-def make_tree_from(source_path, library_result_map):
- """Create a tree using data files from source path.
- """
- for lib_id, lib_path in library_result_map:
- if not os.path.exists(lib_path):
- logger.info("Making dir {0}".format(lib_path))
- os.mkdir(lib_path)
- source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
- if os.path.exists(source_lib_dir):
- pass
- for filename in os.listdir(source_lib_dir):
- source_pathname = os.path.join(source_lib_dir, filename)
- target_pathname = os.path.join(lib_path, filename)
- if not os.path.exists(source_pathname):
- raise IOError("{0} does not exist".format(source_pathname))
- if not os.path.exists(target_pathname):
- os.symlink(source_pathname, target_pathname)
- logger.info(
- 'LINK {0} to {1}'.format(source_pathname, target_pathname))
-
-
-def link_daf(daf_path, library_result_map):
- if not os.path.exists(daf_path):
- raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
-
- base_daf = os.path.basename(daf_path)
-
- for lib_id, result_dir in library_result_map:
- if not os.path.exists(result_dir):
- raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
- submission_daf = os.path.join(result_dir, base_daf)
- if not os.path.exists(submission_daf):
- if not os.path.exists(daf_path):
- raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
- os.link(daf_path, submission_daf)
-
-
-def scan_submission_dirs(view_map, library_result_map):
- """Look through our submission directories and collect needed information
- """
- for lib_id, result_dir in library_result_map:
- logger.info("Importing %s from %s" % (lib_id, result_dir))
- try:
- view_map.import_submission_dir(result_dir, lib_id)
- except MetadataLookupException, e:
- logger.error("Skipping %s: %s" % (lib_id, str(e)))
-
-def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
- dag_fragment = []
- for lib_id, result_dir in library_result_map:
- submissionNode = view_map.get_submission_node(result_dir)
- dag_fragment.extend(
- make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
- )
-
- if make_condor and len(dag_fragment) > 0:
- dag_filename = 'submission.dagman'
- if not force and os.path.exists(dag_filename):
- logger.warn("%s exists, please delete" % (dag_filename,))
- else:
- f = open(dag_filename,'w')
- f.write( os.linesep.join(dag_fragment))
- f.write( os.linesep )
- f.close()
-
-
-def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
- """
- Make ddf files, and bonus condor file
- """
- query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-
-select ?submitView ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
-WHERE {
- ?file ucscDaf:filename ?files ;
- ucscDaf:md5sum ?md5sum .
- ?submitView ucscDaf:has_file ?file ;
- ucscDaf:view ?dafView ;
- ucscDaf:submission <%(submission)s> .
- ?dafView ucscDaf:name ?view .
- <%(submission)s> submissionOntology:library ?library ;
-
- OPTIONAL { ?library libraryOntology:antibody ?antibody }
- OPTIONAL { ?library libraryOntology:cell_line ?cell }
- OPTIONAL { <%(submission)s> ucscDaf:control ?control }
- OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
- OPTIONAL { ?library ucscDaf:sex ?sex }
- OPTIONAL { ?library libraryOntology:library_id ?labExpId }
- OPTIONAL { ?library libraryOntology:library_id ?labVersion }
- OPTIONAL { ?library libraryOntology:replicate ?replicate }
- OPTIONAL { ?library libraryOntology:condition ?treatment }
- OPTIONAL { ?library ucscDaf:protocol ?protocol }
- OPTIONAL { ?library ucscDaf:readType ?readType }
- OPTIONAL { ?library ucscDaf:strain ?strain }
- OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
- OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-}
-ORDER BY ?submitView"""
- dag_fragments = []
-
- name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
- if name is None:
- logger.error("Need name for %s" % (str(submissionNode)))
- return []
-
- ddf_name = name + '.ddf'
- if outdir is not None:
- outfile = os.path.join(outdir, ddf_name)
- output = open(outfile,'w')
- else:
- outfile = 'stdout:'
- output = sys.stdout
-
- formatted_query = query_template % {'submission': str(submissionNode.uri)}
-
- query = RDF.SPARQLQuery(formatted_query)
- results = query.execute(view_map.model)
-
- # filename goes first
- variables = view_map.get_daf_variables()
- # 'controlId',
- output.write('\t'.join(variables))
- output.write(os.linesep)
-
- all_views = {}
- all_files = []
- for row in results:
- viewname = fromTypedNode(row['view'])
- current = all_views.setdefault(viewname, {})
- for variable_name in variables:
- value = str(fromTypedNode(row[variable_name]))
- if value is None or value == 'None':
- logger.warn("{0}: {1} was None".format(outfile, variable_name))
- if variable_name in ('files', 'md5sum'):
- current.setdefault(variable_name,[]).append(value)
- else:
- current[variable_name] = value
-
- for view in all_views.keys():
- line = []
- for variable_name in variables:
- if variable_name in ('files', 'md5sum'):
- line.append(','.join(all_views[view][variable_name]))
- else:
- line.append(all_views[view][variable_name])
- output.write("\t".join(line))
- output.write(os.linesep)
- all_files.extend(all_views[view]['files'])
-
- logger.info(
- "Examined {0}, found files: {1}".format(
- str(submissionNode), ", ".join(all_files)))
-
- all_files.append(daf_name)
- all_files.append(ddf_name)
-
- if make_condor:
- archive_condor = make_condor_archive_script(name, all_files, outdir)
- upload_condor = make_condor_upload_script(name, outdir)
-
- dag_fragments.extend(
- make_dag_fragment(name, archive_condor, upload_condor)
- )
-
- return dag_fragments
-
-
-def read_library_result_map(filename):
- """
- Read a file that maps library id to result directory.
- Does not support spaces in filenames.
-
- For example:
- 10000 result/foo/bar
- """
- stream = open(filename,'r')
-
- results = []
- for line in stream:
- line = line.rstrip()
- if not line.startswith('#') and len(line) > 0 :
- library_id, result_dir = line.split()
- results.append((library_id, result_dir))
- return results
-
-
-def make_condor_archive_script(name, files, outdir=None):
- script = """Universe = vanilla
-
-Executable = /bin/tar
-arguments = czvhf ../%(archivename)s %(filelist)s
-
-Error = compress.out.$(Process).log
-Output = compress.out.$(Process).log
-Log = /tmp/submission-compress-%(user)s.log
-initialdir = %(initialdir)s
-environment="GZIP=-3"
-request_memory = 20
-
-queue
-"""
- if outdir is None:
- outdir = os.getcwd()
- for f in files:
- pathname = os.path.join(outdir, f)
- if not os.path.exists(pathname):
- raise RuntimeError("Missing %s from %s" % (f,outdir))
-
- context = {'archivename': make_submission_name(name),
- 'filelist': " ".join(files),
- 'initialdir': os.path.abspath(outdir),
- 'user': os.getlogin()}
-
- condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
- condor_stream = open(condor_script,'w')
- condor_stream.write(script % context)
- condor_stream.close()
- return condor_script
-
-
-def make_condor_upload_script(name, outdir=None):
- script = """Universe = vanilla
-
-Executable = /usr/bin/lftp
-arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
-
-Error = upload.out.$(Process).log
-Output = upload.out.$(Process).log
-Log = /tmp/submission-upload-%(user)s.log
-initialdir = %(initialdir)s
-
-queue
-"""
- if outdir is None:
- outdir = os.getcwd()
-
- auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
-
- encodeftp = 'encodeftp.cse.ucsc.edu'
- ftpuser = auth.hosts[encodeftp][0]
- ftppassword = auth.hosts[encodeftp][2]
- context = {'archivename': make_submission_name(name),
- 'initialdir': os.path.abspath(outdir),
- 'user': os.getlogin(),
- 'ftpuser': ftpuser,
- 'ftppassword': ftppassword,
- 'ftphost': encodeftp}
-
- condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
- condor_stream = open(condor_script,'w')
- condor_stream.write(script % context)
- condor_stream.close()
- os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
-
- return condor_script
-
-
-def make_dag_fragment(ininame, archive_condor, upload_condor):
- """
- Make the couple of fragments compress and then upload the data.
- """
- cur_dir = os.getcwd()
- archive_condor = os.path.join(cur_dir, archive_condor)
- upload_condor = os.path.join(cur_dir, upload_condor)
- job_basename = make_base_name(ininame)
-
- fragments = []
- fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
- fragments.append('JOB %s_upload %s' % (job_basename, upload_condor))
- fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
-
- return fragments
-
-
-def get_library_info(host, apidata, library_id):
- url = api.library_url(host, library_id)
- contents = api.retrieve_info(url, apidata)
- return contents
-
-
-def make_submission_section(line_counter, files, attributes):
- """
- Create a section in the submission ini file
- """
- inifile = [ "[line%s]" % (line_counter,) ]
- inifile += ["files=%s" % (",".join(files))]
-
- for k,v in attributes.items():
- inifile += ["%s=%s" % (k,v)]
- return inifile
-
-
-def make_base_name(pathname):
- base = os.path.basename(pathname)
- name, ext = os.path.splitext(base)
- return name
-
-
-def make_submission_name(ininame):
- name = make_base_name(ininame)
- return name + ".tgz"
-
-
-def make_ddf_name(pathname):
- name = make_base_name(pathname)
- return name + ".ddf"
-
-
-def make_condor_name(pathname, run_type=None):
- name = make_base_name(pathname)
- elements = [name]
- if run_type is not None:
- elements.append(run_type)
- elements.append("condor")
- return ".".join(elements)
-
-
-def parse_filelist(file_string):
- return file_string.split(",")
-
-
-def validate_filelist(files):
- """
- Die if a file doesn't exist in a file list
- """
- for f in files:
- if not os.path.exists(f):
- raise RuntimeError("%s does not exist" % (f,))
-
-if __name__ == "__main__":
- main()