Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
authorDiane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 18:53:40 +0000 (10:53 -0800)
committerDiane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 18:53:40 +0000 (10:53 -0800)
32 files changed:
encode_submission/README.txt [new file with mode: 0644]
encode_submission/__init__.py [new file with mode: 0644]
encode_submission/add-treatment-to-library.sparql [new file with mode: 0644]
encode_submission/dt-overrides.turtle [new file with mode: 0644]
encode_submission/encode_find.py [new file with mode: 0644]
encode_submission/failed-submissions.sparql [new file with mode: 0644]
encode_submission/find-lib-by-cell.sparql [new file with mode: 0644]
encode_submission/scan_extension.py [new file with mode: 0644]
encode_submission/test_encode_find.py [new file with mode: 0644]
encode_submission/test_ucsc_gather.py [new file with mode: 0644]
encode_submission/testdata/5136SubDetail.html [new file with mode: 0644]
encode_submission/ucsc_gather.py [new file with mode: 0644]
extra/ucsc_encode_submission/README.txt [deleted file]
extra/ucsc_encode_submission/add-treatment-to-library.sparql [deleted file]
extra/ucsc_encode_submission/dt-overrides.turtle [deleted file]
extra/ucsc_encode_submission/encode_find.py [deleted file]
extra/ucsc_encode_submission/failed-submissions.sparql [deleted file]
extra/ucsc_encode_submission/find-lib-by-cell.sparql [deleted file]
extra/ucsc_encode_submission/scan_extension.py [deleted file]
extra/ucsc_encode_submission/test_ucsc_gather.py [deleted file]
extra/ucsc_encode_submission/ucsc_gather.py [deleted file]
htsworkflow/frontend/experiments/__init__.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/models.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/urls.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/views.py [changed mode: 0755->0644]
htsworkflow/frontend/reports/reports.py [changed mode: 0755->0644]
htsworkflow/pipelines/qseq2fastq.py [changed mode: 0755->0644]
htsworkflow/pipelines/srf2fastq.py [changed mode: 0755->0644]
htsworkflow/submission/test/test_ucsc.py [new file with mode: 0644]
htsworkflow/submission/ucsc.py
htsworkflow/util/makebed.py [changed mode: 0755->0644]
htsworkflow/util/rdfhelp.py

diff --git a/encode_submission/README.txt b/encode_submission/README.txt
new file mode 100644 (file)
index 0000000..bab7a55
--- /dev/null
@@ -0,0 +1,9 @@
+I was building a variety of scripts to handle submitting our data to the
+UCSC ENCODE pipeline. Some of them pull data out of the htsworkflow
+databases, and I needed an official place to put the scripts.
+
+I decided on this directory.
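+
+Typical usage (each script's --help lists the full set of options), e.g.:
+
+  python encode_find.py -v --update --print-rdf > submissions.turtle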
diff --git a/encode_submission/__init__.py b/encode_submission/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/encode_submission/add-treatment-to-library.sparql b/encode_submission/add-treatment-to-library.sparql
new file mode 100644 (file)
index 0000000..c97dce2
--- /dev/null
@@ -0,0 +1,21 @@
+# Attach treatment and PCR protocol information from submitted files to their libraries
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
+
+construct { ?library ddf:treatment ?treatment ;
+                     ddf:protocol ?pcr . }
+WHERE {
+   ?status ucscSubmission:has_file ?file .
+   ?submission ucscSubmission:has_status ?status ;
+               ucscSubmission:library_urn ?library ;
+               ucscSubmission:name ?name .
+   ?file ddf:treatment ?treatment ;
+         ddf:protocol ?pcr .
+}
+
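+# Example invocation (assuming Redland's rasqal query tool is available):
+#   roqet add-treatment-to-library.sparql -D submissions.turtle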
diff --git a/encode_submission/dt-overrides.turtle b/encode_submission/dt-overrides.turtle
new file mode 100644 (file)
index 0000000..ffe2759
--- /dev/null
@@ -0,0 +1,181 @@
+##
+## Override the submission ID to library URN mapping for submissions
+## whose names either lack a library ID string or have the wrong one
+## embedded in them.
+##
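+## These statements can be loaded into the encode_find.py model
+## with its --load-rdf option.
+##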
+
+@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
+@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
+
+# woldlab-hepg2-rnaseq-2009dec
+<http://encodesubmit.ucsc.edu/pipeline/show/805>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part2
+<http://encodesubmit.ucsc.edu/pipeline/show/810>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part3
+<http://encodesubmit.ucsc.edu/pipeline/show/869>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
+<http://encodesubmit.ucsc.edu/pipeline/show/870>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab-hepg2-rnaseq-2010Jan-part4
+<http://encodesubmit.ucsc.edu/pipeline/show/897>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
+<http://encodesubmit.ucsc.edu/pipeline/show/898>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep1-2010Jan6
+<http://encodesubmit.ucsc.edu/pipeline/show/903>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep2-2010jan9
+<http://encodesubmit.ucsc.edu/pipeline/show/904>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab hESC 10886 rep1 2009Jan13
+<http://encodesubmit.ucsc.edu/pipeline/show/1026>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
+
+# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1483>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1626>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
+
+# woldlab jun 18 1x75-Directional-GM12878-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1631>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab jun 18  1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1632>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1633>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1634>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1635>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1636>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1637>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1638>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1639>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1645>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1646>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab June  2x75-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1856>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
+
+# 2010 jul 9 corrected fastqs
+<http://encodesubmit.ucsc.edu/pipeline/show/1874>
+     ucscSubmission:ignore "1" .
+#    ucscSubmission:library_urn "
+
+# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
+<http://encodesubmit.ucsc.edu/pipeline/show/2926>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2930>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2931>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2932>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2933>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2934>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2935>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2936>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2937>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2938>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2939>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2940>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2941>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# "3438 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4607>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
+
+# "3439 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4608>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
+
+# "3437 Fastq re-submission"
+<http://encodesubmit.ucsc.edu/pipeline/show/4609>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
+
+# "1x75-Directional-HepG2-rep2-replace 3522"
+<http://encodesubmit.ucsc.edu/pipeline/show/4797>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# "1x75-Directional-HepG2-rep1 replacement of 3521"
+<http://encodesubmit.ucsc.edu/pipeline/show/4798>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
diff --git a/encode_submission/encode_find.py b/encode_submission/encode_find.py
new file mode 100644 (file)
index 0000000..6608a05
--- /dev/null
@@ -0,0 +1,631 @@
+#!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
+
+from datetime import datetime
+import hashlib
+import httplib2
+# python keyring
+import keyring
+import logging
+from lxml.html import fromstring
+from operator import attrgetter
+from optparse import OptionParser, OptionGroup
+import os
+import re
+# redland rdf lib
+import RDF
+import sys
+import urllib
+import urlparse
+
+from htsworkflow.submission import daf, ucsc
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     dublinCoreNS, \
+     get_model, \
+     get_serializer, \
+     sparql_query, \
+     submissionOntology, \
+     libraryOntology, \
+     load_into_model, \
+     rdfNS, \
+     rdfsNS, \
+     xsdNS
+TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
+
+# URL mappings
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+     daf_download_url, \
+     ddf_download_url, \
+     get_ucsc_file_index, \
+     submission_view_url, \
+     UCSCEncodePipeline
+
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
+
+DBDIR = os.path.expanduser("~diane/proj/submission")
+
+LOGGER = logging.getLogger("encode_find")
+
+LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
+USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
+
+USERNAME = 'detrout'
+CHARSET = 'utf-8'
+
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
+def main(cmdline=None):
+    """
+    Parse command line arguments
+
+    Takes a list of arguments (assuming arg[0] is the program name) or None
+    If None, it looks at sys.argv
+    """
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    htsw_authdata = api.make_auth_from_opts(opts, parser)
+    htswapi = api.HtswApi(opts.host, htsw_authdata)
+
+    cookie = None
+    model = get_model(opts.load_model, DBDIR)
+
+    if opts.load_rdf is not None:
+        ns_uri = submissionOntology[''].uri
+        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
+
+    if len(args) == 0:
+        limit = None
+    else:
+        limit = args
+
+    if opts.update:
+        cookie = login(cookie=cookie)
+        load_my_submissions(model, limit=limit, cookie=cookie)
+        load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
+
+    if opts.sparql is not None:
+        sparql_query(model, opts.sparql)
+
+    if opts.find_submission_with_no_library:
+        find_submissions_with_no_library(model)
+
+    if opts.print_rdf:
+        serializer = get_serializer(name=opts.rdf_parser_name)
+        print serializer.serialize_model_to_string(model)
+
+
+def make_parser():
+    """Construct option parser
+    """
+    parser = OptionParser()
+    commands = OptionGroup(parser, "Commands")
+    commands.add_option('--load-model', default=None,
+      help="Load model database")
+    commands.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    commands.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    commands.add_option('--update', action="store_true", default=False,
+      help="Query remote data sources and update our database")
+    #commands.add_option('--update-ucsc-status', default=None,
+    #  help="download status from ucsc, requires filename for extra rules")
+    #commands.add_option('--update-ddfs', action="store_true", default=False,
+    #  help="download ddf information for known submission")
+    #commands.add_option('--update-library', default=None,
+    #  help="download library info from htsw, "\
+    #       "requires filename for extra rules")
+    parser.add_option_group(commands)
+
+    queries = OptionGroup(parser, "Queries")
+    queries.add_option('--sparql', default=None,
+      help="execute arbitrary sparql query")
+    queries.add_option('--find-submission-with-no-library', default=False,
+      action="store_true",
+      help="find submissions with no library ID")
+    parser.add_option_group(queries)
+
+    options = OptionGroup(parser, "Options")
+    options.add_option("--rdf-parser-name", default="turtle",
+      help="set rdf file parser type")
+    options.add_option("-v", "--verbose", action="store_true", default=False)
+    options.add_option("--debug", action="store_true", default=False)
+    parser.add_option_group(options)
+
+    api.add_auth_options(parser)
+
+    return parser
+
+
+def load_my_submissions(model, limit=None, cookie=None):
+    """Parse all the submissions from UCSC into model
+    It will look at the global USER_URL to figure out who to scrape
+    cookie contains the session cookie, if none, will attempt to login
+    """
+    if cookie is None:
+        cookie = login()
+
+    tree = get_url_as_tree(USER_URL, 'GET', cookie)
+    table_rows = tree.xpath('//table[@id="projects"]/tr')
+    # first record is header
+    name_n = submissionOntology['name']
+    species_n = submissionOntology['species']
+    library_urn = submissionOntology['library_urn']
+
+    # skip header
+    for row in table_rows[1:]:
+        cell = row.xpath('td')
+        if cell is not None and len(cell) > 1:
+            submission_id = str(cell[0].text_content())
+            if limit is None or submission_id in limit:
+                subUrn = RDF.Uri(submission_view_url(submission_id))
+
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
+
+                name = str(cell[4].text_content())
+                add_stmt(model, subUrn, name_n, name)
+
+                species = str(cell[2].text_content())
+                if species is not None:
+                    add_stmt(model, subUrn, species_n, species)
+
+                library_id = get_library_id(name)
+                if library_id is not None:
+                    add_submission_to_library_urn(model,
+                                                  subUrn,
+                                                  library_urn,
+                                                  library_id)
+                else:
+                    errmsg = 'Unable to find library id in {0} for {1}'
+                    LOGGER.warn(errmsg.format(name, str(subUrn)))
+
+                add_submission_creation_date(model, subUrn, cookie)
+
+                # grab changing attributes
+                status = str(cell[6].text_content()).strip()
+                last_mod_datetime = get_date_contents(cell[8])
+                last_mod = last_mod_datetime.isoformat()
+
+                update_submission_detail(model, subUrn, status, last_mod,
+                                         cookie=cookie)
+
+                LOGGER.info("Processed {0}".format(subUrn))
+
+
+def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
+    """Add a link from a UCSC submission to woldlab library if needed
+    """
+    libraryUrn = LIBRARY_NS[library_id + '/']
+    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
+    if not model.contains_statement(query):
+        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
+        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
+        model.add_statement(link)
+    else:
+        LOGGER.debug("Found: {0}".format(str(query)))
+
+
+def find_submissions_with_no_library(model):
+    missing_lib_query_text = """
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT
+ ?subid ?name
+WHERE {{
+  ?subid submissionOntology:name ?name
+  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
+  FILTER  (!bound(?libid))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
+
+    results = missing_lib_query.execute(model)
+    for row in results:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn "\
+              "<http://jumpgate.caltech.edu/library/> ."
+        print ""
+
+
+def add_submission_creation_date(model, subUrn, cookie):
+    # in theory the submission page might have more information on it.
+    creation_dates = get_creation_dates(model, subUrn)
+    if len(creation_dates) == 0:
+        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
+    else:
+        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
+
+def update_submission_detail(model, subUrn, status, recent_update, cookie):
+    HasStatusN = submissionOntology['has_status']
+    StatusN = submissionOntology['status']
+    LastModifyN = submissionOntology['last_modify_date']
+
+    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
+    status_nodes = list(model.find_statements(status_nodes_query))
+
+    if len(status_nodes) == 0:
+        # has no status node, add one
+        LOGGER.info("Adding status node to {0}".format(subUrn))
+        status_node = create_status_node(subUrn, recent_update)
+        add_stmt(model, subUrn, HasStatusN, status_node)
+        add_stmt(model, status_node, rdfNS['type'], StatusN)
+        add_stmt(model, status_node, StatusN, status)
+        add_stmt(model, status_node, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_node, cookie=cookie)
+        update_daf(model, subUrn, status_node, cookie=cookie)
+    else:
+        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
+        for status_statement in status_nodes:
+            status_node = status_statement.object
+            last_modified_query = RDF.Statement(status_node,
+                                                LastModifyN,
+                                                None)
+            last_mod_nodes = model.find_statements(last_modified_query)
+            for last_mod_statement in last_mod_nodes:
+                last_mod_date = str(last_mod_statement.object)
+                if recent_update == str(last_mod_date):
+                    update_ddf(model, subUrn, status_node, cookie=cookie)
+                    update_daf(model, subUrn, status_node, cookie=cookie)
+                    break
+
+
+def update_daf(model, submission_url, status_node, cookie):
+    download_daf_uri = str(submission_url).replace('show', 'download_daf')
+    daf_uri = RDF.Uri(download_daf_uri)
+
+    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+    if not model.contains_statement(status_is_daf):
+        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
+                                                     status_node))
+        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+        daf_hash = hashlib.md5(daf_text).hexdigest()
+        daf_hash_stmt = RDF.Statement(status_node,
+                                      dafTermOntology['md5sum'],
+                                      daf_hash)
+        model.add_statement(daf_hash_stmt)
+        daf.fromstring_into_model(model, status_node, daf_text)
+
+
+def update_ddf(model, subUrn, statusNode, cookie):
+    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
+    ddfUrn = RDF.Uri(download_ddf_url)
+
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
+    if not model.contains_statement(status_is_ddf):
+        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
+        add_ddf_statements(model, statusNode, ddf_text)
+        model.add_statement(status_is_ddf)
+
+
+def add_ddf_statements(model, statusNode, ddf_string):
+    """Convert a ddf text file into RDF Statements
+    """
+    ddf_lines = ddf_string.split('\n')
+    # first line is header
+    header = ddf_lines[0].split()
+    attributes = [DCC_NS[x] for x in header]
+
+    for ddf_line in ddf_lines[1:]:
+        ddf_line = ddf_line.strip()
+        if len(ddf_line) == 0:
+            continue
+        if ddf_line.startswith("#"):
+            continue
+
+        ddf_record = ddf_line.split('\t')
+        files = ddf_record[0].split(',')
+        file_attributes = ddf_record[1:]
+
+        for f in files:
+            fileNode = RDF.Node()
+            add_stmt(model,
+                     statusNode,
+                     submissionOntology['has_file'],
+                     fileNode)
+            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+            add_stmt(model, fileNode, DCC_NS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
+                add_stmt(model, fileNode, predicate, object)
+
+
+def load_encode_libraries(model, htswapi):
+    """Get libraries associated with encode.
+    """
+    encodeFilters = ["/library/?affiliations__id__exact=44",
+                     "/library/?affiliations__id__exact=80",
+                    ]
+
+    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
+    rdfaParser = RDF.Parser(name='rdfa')
+    for encodeUrl in encodeUrls:
+        LOGGER.info("Scanning library url {0}".format(encodeUrl))
+        rdfaParser.parse_into_model(model, encodeUrl)
+        query = RDF.Statement(None, libraryOntology['library_id'], None)
+        libraries = model.find_statements(query)
+        for statement in libraries:
+            libraryUrn = statement.subject
+            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
+            load_library_detail(model, libraryUrn)
+
+
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s,p,o))
+
+def load_library_detail(model, libraryUrn):
+    """Grab detail information from library page
+    """
+    rdfaParser = RDF.Parser(name='rdfa')
+    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
+    results = list(model.find_statements(query))
+    log_message = "Found {0} statements for {1}"
+    LOGGER.debug(log_message.format(len(results), libraryUrn))
+    if len(results) == 0:
+        LOGGER.info("Loading {0}".format(str(libraryUrn)))
+        rdfaParser.parse_into_model(model, libraryUrn.uri)
+    elif len(results) == 1:
+        pass  # Assuming that a loaded dataset has one record
+    else:
+        LOGGER.warning("Many dates for {0}".format(libraryUrn))
+
+
+def get_library_id(name):
+    """Guess library ID from library name
+
+    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
+    '11039'
+    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
+    '10150'
+    """
+    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
+    library_id = None
+    if match is not None:
+        library_id = match.group('id')
+    return library_id
+
+
+def get_contents(element):
+    """Return contents or none.
+    """
+    if len(element.contents) == 0:
+        return None
+
+    a = element.find('a')
+    if a is not None:
+        return a.contents[0].encode(CHARSET)
+
+    return element.contents[0].encode(CHARSET)
+
+
+def create_status_node(submission_uri, timestamp):
+    submission_uri = daf.submission_uri_to_string(submission_uri)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
+    return RDF.Node(RDF.Uri(status_uri))
+
+
+def get_date_contents(element):
+    data = element.text_content()
+    if data:
+        return datetime.strptime(data, "%Y-%m-%d %H:%M")
+    else:
+        return None
+
+
+def add_stmt(model, subject, predicate, rdf_object):
+    """Convienence create RDF Statement and add to a model
+    """
+    return model.add_statement(
+        RDF.Statement(subject, predicate, rdf_object))
+
+
+def login(cookie=None):
+    """Login if we don't have a cookie
+    """
+    if cookie is not None:
+        return cookie
+
+    keys = keyring.get_keyring()
+    password = keys.get_password(LOGIN_URL, USERNAME)
+    credentials = {'login': USERNAME,
+                   'password': password}
+    headers = {'Content-type': 'application/x-www-form-urlencoded'}
+    http = httplib2.Http()
+    response, content = http.request(LOGIN_URL,
+                                     'POST',
+                                     headers=headers,
+                                     body=urllib.urlencode(credentials))
+    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
+                                                    response['status']))
+
+    cookie = response.get('set-cookie', None)
+    if cookie is None:
+        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
+    return cookie
+
+
+def get_url_as_tree(url, method, cookie=None):
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        tree = fromstring(content, base_url=url)
+        return tree
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+
+def get_url_as_text(url, method, cookie=None):
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        return content
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+################
+#  old stuff
+SUBMISSIONS_LACKING_LIBID = [
+    ('1x75-Directional-HeLa-Rep1',    '11208'),
+    ('1x75-Directional-HeLa-Rep2',    '11207'),
+    ('1x75-Directional-HepG2-Rep1',   '11210'),
+    ('1x75-Directional-HepG2-Rep2',   '11209'),
+    ('1x75-Directional-H1-hESC-Rep1', '10947'),
+    ('1x75-Directional-H1-hESC-Rep2', '11009'),
+    ('1x75-Directional-HUVEC-Rep1',   '11206'),
+    ('1x75-Directional-HUVEC-Rep2',   '11205'),
+    ('1x75-Directional-K562-Rep1',    '11008'),
+    ('1x75-Directional-K562-Rep2',    '11007'),
+    ('1x75-Directional-NHEK-Rep1',    '11204'),
+    ('1x75-Directional-GM12878-Rep1', '11011'),
+    ('1x75-Directional-GM12878-Rep2', '11010'),
+    ]
+
+
+def select_by_library_id(submission_list):
+    subl = [(x.library_id, x) for x in submission_list if x.library_id]
+    libraries = {}
+    for lib_id, subobj in subl:
+        libraries.setdefault(lib_id, []).append(subobj)
+
+    for submission in libraries.values():
+        submission.sort(key=attrgetter('date'), reverse=True)
+
+    return libraries
+
+
+def library_to_freeze(selected_libraries):
+    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
+    lib_ids = sorted(selected_libraries.keys())
+    report = ["""<html>
+<head>
+<style type="text/css">
+ td {border-width:0 0 1px 1px; border-style:solid;}
+</style>
+</head>
+<body>
+<table>
+"""]
+    report.append('<thead>')
+    report.append('<tr><td>Library ID</td><td>Name</td>')
+    for f in freezes:
+        report.append('<td>{0}</td>'.format(f))
+    report.append('</tr>')
+    report.append('</thead>')
+    report.append('<tbody>')
+    for lib_id in lib_ids:
+        report.append('<tr>')
+        lib_url = LIBRARY_NS[lib_id].uri
+        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
+        submissions = selected_libraries[lib_id]
+        report.append('<td>{0}</td>'.format(submissions[0].name))
+        batched = {}
+        for sub in submissions:
+            date = date_to_freeze(sub.date)
+            batched.setdefault(date, []).append(sub)
+        for d in freezes:
+            report.append('<td>')
+            for s in batched.get(d, []):
+                show_url = submission_view_url(s.subid)
+                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
+                report.append("{0}:{1}".format(subid, s.status))
+            report.append('</td>')
+        report.append("</tr>")
+    report.append('</tbody>')
+    report.append("</table></html>")
+    return "\n".join(report)
+
+
+def date_to_freeze(d):
+    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+               (datetime(2010, 7, 30), '2010-Jul'),
+               (datetime(2011, 1, 30), '2011-Jan'),
+               ]
+    for end, name in freezes:
+        if d < end:
+            return name
+    else:
+        return None
+
+if __name__ == "__main__":
+    main()
diff --git a/encode_submission/failed-submissions.sparql b/encode_submission/failed-submissions.sparql
new file mode 100644 (file)
index 0000000..af4af4e
--- /dev/null
@@ -0,0 +1,22 @@
+##
+## Find submissions that are currently "failed"
+##
+
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
+PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
+#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
+
+SELECT 
+ ?subid ?subname ?liburn ?status
+WHERE {
+  ?subid submitOnt:name ?subname .
+  ?subid submitOnt:library_urn ?liburn .
+  ?subid submitOnt:has_status ?statusNode .
+  ?statusNode submitOnt:status ?status .
+  ?statusNode submitOnt:last_modify_date ?last_modify .
+  FILTER (regex(?status, "failed", "i"))
+} 
diff --git a/encode_submission/find-lib-by-cell.sparql b/encode_submission/find-lib-by-cell.sparql
new file mode 100644 (file)
index 0000000..c4585c5
--- /dev/null
@@ -0,0 +1,17 @@
+# Produce list of submissions associated with a cell/replicate
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
+WHERE {
+    ?subid ucscSubmission:name ?name .
+    OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
+                       libraryOntology:date ?submission_date .
+               ?liburn libraryOntology:cell_line ?cell ;
+                       libraryOntology:replicate ?replicate . }
+    #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
+    #filter(!bound(?liburn))
+}
+ORDER BY ?submission_date ?cell ?replicate ?liburn
diff --git a/encode_submission/scan_extension.py b/encode_submission/scan_extension.py
new file mode 100644 (file)
index 0000000..39f19c6
--- /dev/null
@@ -0,0 +1,78 @@
+from optparse import OptionParser
+import os
+import sys
+from pprint import pprint
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    extensions = scan(args)
+    common_extensions = find_common_suffix(extensions)
+
+    if opts.rdf:
+        print_rdf(common_extensions)
+    else:
+        print common_extensions
+        
+def make_parser():
+    parser = OptionParser("%prog: directory [directory...]")
+    parser.add_option('--rdf', action="store_true", default=False,
+                      help="Produce rdf configuration file for ucsc_gather")
+    return parser
+
+def scan(toscan):
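+    # Build a trie keyed on reversed filename characters; find_common_suffix
+    # walks it to recover the file extensions shared by the scanned trees.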
+    index = {}
+    for cur_scan_dir in toscan:
+        for path, dirnames, filenames in os.walk(cur_scan_dir):
+            for filename in filenames:
+                base, ext = os.path.splitext(filename)
+                if ext in ('.daf', '.ddf'):
+                    continue
+                next_index = index
+                for c in filename[::-1]:
+                    next_index = next_index.setdefault(c, {})
+    return index
+
+def find_common_suffix(index, tail=[]):
+    if len(tail) > 0 and len(index) > 1:
+        return "".join(tail[::-1])
+
+    results = []
+    for key, choice in index.items():
+        r = find_common_suffix(choice, tail+[key])
+        if r is not None:
+            results.append(r)
+        
+    if len(results) == 0:
+        return None
+    elif len(results) == 1:
+        return results[0]
+    else:
+        return results
+
+def print_rdf(common_extensions):
+    import RDF
+    from htsworkflow.util import rdfhelp
+    model = rdfhelp.get_model()
+
+    viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
+    subView = RDF.NS(viewName)
+    fileReTerm = rdfhelp.dafTermOntology['filename_re']
+
+    count = 1
+    for ext in common_extensions:
+        s = RDF.Statement(subView['VIEW{0}'.format(count)],
+                          fileReTerm,
+                          '.*{0}$'.format(ext.replace('.', '\\.')))
+        model.add_statement(s)
+        count += 1
+        
+    writer = rdfhelp.get_serializer()
+    writer.set_namespace('thisSubmissionView', subView._prefix)
+    print writer.serialize_model_to_string(model)
+
+if __name__ == "__main__":
+    main()
diff --git a/encode_submission/test_encode_find.py b/encode_submission/test_encode_find.py
new file mode 100644 (file)
index 0000000..98bdb46
--- /dev/null
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+
+class TestEncodeFind(unittest.TestCase):
+    def test_create_status_node_with_uri(self):
+        subURL = submission_view_url('5136')
+        submissionUri = RDF.Uri(subURL)
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_create_status_node_with_str(self):
+        subURL = submission_view_url('5136')
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(subURL, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_parse_submission_page(self):
+        timestamp = '2011-12-19T12:42:53.048956'
+        subURL = submission_view_url('5136')
+        subNode = encode_find.create_status_node(subURL, timestamp)
+        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
+        from lxml.html import parse
+        tree = parse(test_file)
+        model = get_model()
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 0)
+        encode_find.parse_submission_page(model, tree, subNode)
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 1)
+        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
+def suite():
+    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/encode_submission/test_ucsc_gather.py b/encode_submission/test_ucsc_gather.py
new file mode 100644 (file)
index 0000000..d7d54e6
--- /dev/null
@@ -0,0 +1,12 @@
+import unittest
+
+import ucsc_gather
+
+class testUCSCGather(unittest.TestCase):
+    pass
+
+def suite():
+    return unittest.makeSuite(testUCSCGather,"test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/encode_submission/testdata/5136SubDetail.html b/encode_submission/testdata/5136SubDetail.html
new file mode 100644 (file)
index 0000000..1daeb58
--- /dev/null
@@ -0,0 +1,245 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />
+  <meta http-equiv="Content-Language" content="en-us" />
+    <title>
+    ENCODE DCC Data Submission Production
+    </title>
+<link href="/stylesheets/encode.css?1278401455" media="screen" rel="Stylesheet" type="text/css" />
+</head>
+<body id="encode-pipeline">
+
+<div id="container">
+  <img align="left" height=50 width=100  src="/images/encode_logo.png">
+
+    <div id="header">
+        ENCODE DCC Data Submission
+        <font size=-1 color="gray"> <em>
+        Production
+        </em></font>
+    </div>
+  <div id="user"> Logged In: <strong> detrout </strong>
+
+  </div>
+
+  <div id="nav">
+    <!-- <table width="100%"><tr>-->
+    <table cellpadding=0 cellspacing=0 width="100%"><tr>
+    <td align="left">
+    <a href="/pipeline/new">New Submission</a>
+     |
+    <a href="/pipeline/list">All Submissions</a>
+
+     |
+    <a href="/pipeline/show_active">Active Submissions</a>
+
+     |
+    <a href="/pipeline/show_user">My Submissions</a>
+
+    </td>
+    <td align="right">
+    <a href="/account/logout"> Log Out</a>
+     |
+    <a href="/account/change_profile">Change Profile</a>
+     |
+    <a href="/pipeline/show_tools">Tools</a>
+    </td>
+
+    </tr></table>
+  </div>
+
+  <div id="message">
+
+
+
+  </div>
+  <div id="content">
+      <p>
+
+
+
+<table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+
+<tr>
+<td>Submission: </td><td>wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2 resubmit</td><td>&nbsp;&nbsp;&nbsp;</td>
+<td>Created: </td><td>2011-12-07 15:23</td>
+  <td>&nbsp;&nbsp;</td>
+  <td>11 days ago</td>
+</tr>
+
+<tr>
+<td>DB: </td><td>hg19</td><td>&nbsp;&nbsp;&nbsp;</td>
+
+<td>Updated: </td><td>2011-12-08 14:54</td>
+  <td>&nbsp;&nbsp;</td>
+  <td>10 days ago</td>
+</tr>
+
+<tr>
+<td>Status: </td><td>approved</td>
+</tr>
+
+<tr>
+
+  <td>&nbsp;</td>
+</tr>
+
+
+</table>
+
+
+
+
+
+
+  <table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+    <tr>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      <td> [
+      <a href="/pipeline/show_daf/5136">Show DAF</a>       ]  </td>
+
+
+
+      <td> [
+      <a href="/pipeline/show_ddf/5136">Show DDF</a>       ]  </td>
+
+
+    </tr>
+
+  </table>
+
+
+
+<p>
+
+<table cellspacing=2 cellpadding=2>
+<tr><td></td>
+<td align="left"><b>File</b></td>
+<td align="left"><b>Size</b></td>
+<td align="left"><b>Updated</b></td>
+
+<td></td></tr>
+
+
+
+
+
+  <tr style="margin: 10;">
+  <td><b>Archive</b></td><td>002_CaltechRnaSeq_Fastq_DAF.tar.gz</td>
+  <td align="right">1397</td>
+  <td>2011-12-08 14:08</td>
+
+
+
+
+  </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeq.daf</td>
+      <td align=right>4187</td>
+      <td>2011-12-08 14:08</td>
+    </tr>
+
+
+
+
+
+
+
+  <tr style="margin: 10;">
+
+  <td><b>Archive</b></td><td>001_wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2.fastq.tgz</td>
+  <td align="right">8833154623</td>
+  <td>2011-12-07 15:23</td>
+
+
+
+
+  </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l7_r2.fastq</td>
+      <td align=right>1629293100</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l8_r2.fastq</td>
+
+      <td align=right>1628417888</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_616L7AAXX_c152_l1_r2.fastq</td>
+      <td align=right>5152104576</td>
+
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l1_r2.fastq</td>
+      <td align=right>6094749091</td>
+      <td>2011-12-07 15:34</td>
+
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l2_r2.fastq</td>
+      <td align=right>7483882081</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_ilmn200901_c202_l4_r2.fastq</td>
+      <td align=right>5282142818</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+
+
+</table>
+
+<p>
+
+
+
+
+  </div>
+  <div id="footer">
+    <A HREF="/help.html">Help</A>
+     |
+    <A HREF="/contact">Contact Us</A>
+  </div>
+
+</div>
+</body>
+</html>
diff --git a/encode_submission/ucsc_gather.py b/encode_submission/ucsc_gather.py
new file mode 100644 (file)
index 0000000..fd8db12
--- /dev/null
@@ -0,0 +1,487 @@
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+import RDF
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     fromTypedNode, \
+     get_model, \
+     get_serializer, \
+     load_into_model, \
+     sparql_query, \
+     submissionOntology
+from htsworkflow.submission.daf import \
+     DAFMapper, \
+     MetadataLookupException, \
+     get_submission_uri
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger('ucsc_gather')
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+    mapper = None
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.load_model)
+    if opts.name:
+        mapper = DAFMapper(opts.name, opts.daf, model)
+        if opts.library_url is not None:
+            mapper.library_url = opts.library_url
+        submission_uri = get_submission_uri(opts.name)
+
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    if opts.make_ddf and opts.daf is None:
+        parser.error("Please specify your daf when making ddf files")
+
+    library_result_map = []
+    for a in args:
+        library_result_map.extend(read_library_result_map(a))
+
+    if opts.make_tree_from is not None:
+        make_tree_from(opts.make_tree_from, library_result_map)
+
+    if opts.link_daf:
+        if opts.daf is None:
+            parser.error("Please specify daf filename with --daf")
+        link_daf(opts.daf, library_result_map)
+
+    if opts.fastq:
+        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+                                       force=opts.force)
+        extractor.build_fastqs(library_result_map)
+
+    if opts.scan_submission:
+        if mapper is None:
+            parser.error("Please specify the submission name with --name")
+        scan_submission_dirs(mapper, library_result_map)
+
+    if opts.make_ddf:
+        if mapper is None:
+            parser.error("Please specify the submission name with --name")
+        make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--load-model', default=None,
+      help="Load model database")
+    model.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                      help="create directories & link data files",
+                      default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                      help="Import metadata for submission into our model")
+    commands.add_option('--link-daf', default=False, action="store_true",
+                        help="link daf into submission directories")
+    commands.add_option('--make-ddf', help='make the ddfs', default=False,
+                      action="store_true")
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+def make_tree_from(source_path, library_result_map):
+    """Create a tree using data files from source path.
+    """
+    for lib_id, lib_path in library_result_map:
+        if not os.path.exists(lib_path):
+            logger.info("Making dir {0}".format(lib_path))
+            os.mkdir(lib_path)
+        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
+        if not os.path.exists(source_lib_dir):
+            logger.warn("Missing source dir {0}".format(source_lib_dir))
+            continue
+        for filename in os.listdir(source_lib_dir):
+            source_pathname = os.path.join(source_lib_dir, filename)
+            target_pathname = os.path.join(lib_path, filename)
+            if not os.path.exists(source_pathname):
+                raise IOError("{0} does not exist".format(source_pathname))
+            if not os.path.exists(target_pathname):
+                os.symlink(source_pathname, target_pathname)
+                logger.info(
+                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
+
+
+def link_daf(daf_path, library_result_map):
+    if not os.path.exists(daf_path):
+        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
+
+    base_daf = os.path.basename(daf_path)
+
+    for lib_id, result_dir in library_result_map:
+        if not os.path.exists(result_dir):
+            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
+        submission_daf = os.path.join(result_dir, base_daf)
+        if not os.path.exists(submission_daf):
+            if not os.path.exists(daf_path):
+                raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
+            os.link(daf_path, submission_daf)
+
+
+def scan_submission_dirs(view_map, library_result_map):
+    """Look through our submission directories and collect needed information
+    """
+    for lib_id, result_dir in library_result_map:
+        logger.info("Importing %s from %s" % (lib_id, result_dir))
+        try:
+            view_map.import_submission_dir(result_dir, lib_id)
+        except MetadataLookupException, e:
+            logger.error("Skipping %s: %s" % (lib_id, str(e)))
+
+def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
+    dag_fragment = []
+    for lib_id, result_dir in library_result_map:
+        submissionNode = view_map.get_submission_node(result_dir)
+        dag_fragment.extend(
+            make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
+        )
+
+    if make_condor and len(dag_fragment) > 0:
+        dag_filename = 'submission.dagman'
+        if not force and os.path.exists(dag_filename):
+            logger.warn("%s exists, please delete" % (dag_filename,))
+        else:
+            f = open(dag_filename,'w')
+            f.write( os.linesep.join(dag_fragment))
+            f.write( os.linesep )
+            f.close()
+
+
+def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
+    """
+    Make ddf files, and bonus condor file
+    """
+    query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+
+select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
+WHERE {
+  ?file ucscDaf:filename ?files ;
+        ucscDaf:md5sum ?md5sum .
+  ?submitView ucscDaf:has_file ?file ;
+              ucscDaf:view ?dafView ;
+              ucscDaf:submission <%(submission)s> .
+  ?dafView ucscDaf:name ?view .
+  <%(submission)s> submissionOntology:library ?library .
+
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell }
+  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
+  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
+  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:condition ?treatment }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+}
+ORDER BY  ?submitView"""
+    dag_fragments = []
+
+    name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
+    if name is None:
+        logger.error("Need name for %s" % (str(submissionNode)))
+        return []
+
+    ddf_name = name + '.ddf'
+    if outdir is not None:
+        outfile = os.path.join(outdir, ddf_name)
+        output = open(outfile,'w')
+    else:
+        outfile = 'stdout:'
+        output = sys.stdout
+
+    formatted_query = query_template % {'submission': str(submissionNode.uri)}
+
+    query = RDF.SPARQLQuery(formatted_query)
+    results = query.execute(view_map.model)
+
+    # filename goes first
+    variables = view_map.get_daf_variables()
+    # 'controlId',
+    output.write('\t'.join(variables))
+    output.write(os.linesep)
+
+    all_views = {}
+    all_files = []
+    for row in results:
+        viewname = fromTypedNode(row['view'])
+        current = all_views.setdefault(viewname, {})
+        for variable_name in variables:
+            value = fromTypedNode(row[variable_name])
+            if value is None:
+                logger.warn("{0}: {1} was None".format(outfile, variable_name))
+            # str() keeps missing values as 'None' so the columns stay aligned
+            value = str(value)
+            if variable_name in ('files', 'md5sum'):
+                current.setdefault(variable_name, []).append(value)
+            else:
+                current[variable_name] = value
+
+    for view in all_views.keys():
+        line = []
+        for variable_name in variables:
+            if variable_name in ('files', 'md5sum'):
+                line.append(','.join(all_views[view][variable_name]))
+            else:
+                line.append(all_views[view][variable_name])
+        output.write("\t".join(line))
+        output.write(os.linesep)
+        all_files.extend(all_views[view]['files'])
+
+    logger.info(
+        "Examined {0}, found files: {1}".format(
+            str(submissionNode), ", ".join(all_files)))
+
+    all_files.append(daf_name)
+    all_files.append(ddf_name)
+
+    if make_condor:
+        archive_condor = make_condor_archive_script(name, all_files, outdir)
+        upload_condor = make_condor_upload_script(name, outdir)
+
+        dag_fragments.extend(
+            make_dag_fragment(name, archive_condor, upload_condor)
+        )
+
+    return dag_fragments
+
+
+def read_library_result_map(filename):
+    """
+    Read a file that maps library id to result directory.
+    Does not support spaces in filenames.
+
+    For example:
+      10000 result/foo/bar
+    """
+    stream = open(filename,'r')
+
+    results = []
+    for line in stream:
+        line = line.rstrip()
+        if not line.startswith('#') and len(line) > 0:
+            library_id, result_dir = line.split()
+            results.append((library_id, result_dir))
+    return results
+
+
+def make_condor_archive_script(name, files, outdir=None):
+    script = """Universe = vanilla
+
+Executable = /bin/tar
+arguments = czvhf ../%(archivename)s %(filelist)s
+
+Error = compress.out.$(Process).log
+Output = compress.out.$(Process).log
+Log = /tmp/submission-compress-%(user)s.log
+initialdir = %(initialdir)s
+environment="GZIP=-3"
+request_memory = 20
+
+queue
+"""
+    if outdir is None:
+        outdir = os.getcwd()
+    for f in files:
+        pathname = os.path.join(outdir, f)
+        if not os.path.exists(pathname):
+            raise RuntimeError("Missing %s from %s" % (f,outdir))
+
+    context = {'archivename': make_submission_name(name),
+               'filelist': " ".join(files),
+               'initialdir': os.path.abspath(outdir),
+               'user': os.getlogin()}
+
+    condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
+    return condor_script
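+# As a rough sketch, for a submission named 'foo' with files
+# ['foo.bam', 'foo.ddf'] (hypothetical names) the rendered submit
+# description written to foo.archive.condor would contain:
+#
+#   Executable = /bin/tar
+#   arguments = czvhf ../foo.tgz foo.bam foo.ddf
+#   initialdir = /absolute/path/to/outdir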
+
+
+def make_condor_upload_script(name, outdir=None):
+    script = """Universe = vanilla
+
+Executable = /usr/bin/lftp
+arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
+
+Error = upload.out.$(Process).log
+Output = upload.out.$(Process).log
+Log = /tmp/submission-upload-%(user)s.log
+initialdir = %(initialdir)s
+
+queue
+"""
+    if outdir is None:
+        outdir = os.getcwd()
+
+    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
+
+    encodeftp = 'encodeftp.cse.ucsc.edu'
+    ftpuser = auth.hosts[encodeftp][0]
+    ftppassword = auth.hosts[encodeftp][2]
+    context = {'archivename': make_submission_name(name),
+               'initialdir': os.path.abspath(outdir),
+               'user': os.getlogin(),
+               'ftpuser': ftpuser,
+               'ftppassword': ftppassword,
+               'ftphost': encodeftp}
+
+    condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
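+    # owner read/write only, since the ftp password is embedded above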
+    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
+
+    return condor_script
+
+
+def make_dag_fragment(ininame, archive_condor, upload_condor):
+    """
+    Make the DAG fragment lines that compress and then upload the data.
+    """
+    cur_dir = os.getcwd()
+    archive_condor = os.path.join(cur_dir, archive_condor)
+    upload_condor = os.path.join(cur_dir, upload_condor)
+    job_basename = make_base_name(ininame)
+
+    fragments = []
+    fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
+    fragments.append('JOB %s_upload %s' % (job_basename,  upload_condor))
+    fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
+
+    return fragments
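+# Illustrative output, assuming a cwd of /tmp and hypothetical names:
+#   make_dag_fragment('foo.ini', 'foo.archive.condor', 'foo.upload.condor')
+# returns
+#   ['JOB foo_archive /tmp/foo.archive.condor',
+#    'JOB foo_upload /tmp/foo.upload.condor',
+#    'PARENT foo_archive CHILD foo_upload']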
+
+
+def get_library_info(host, apidata, library_id):
+    url = api.library_url(host, library_id)
+    contents = api.retrieve_info(url, apidata)
+    return contents
+
+
+def make_submission_section(line_counter, files, attributes):
+    """
+    Create a section in the submission ini file
+    """
+    inifile = [ "[line%s]" % (line_counter,) ]
+    inifile += ["files=%s" % (",".join(files))]
+
+    for k,v in attributes.items():
+        inifile += ["%s=%s" % (k,v)]
+    return inifile
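+# e.g. (hypothetical values) make_submission_section(1, ['foo.fastq'],
+# {'view': 'Fastq'}) returns ['[line1]', 'files=foo.fastq', 'view=Fastq']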
+
+
+def make_base_name(pathname):
+    base = os.path.basename(pathname)
+    name, ext = os.path.splitext(base)
+    return name
+
+
+def make_submission_name(ininame):
+    name = make_base_name(ininame)
+    return name + ".tgz"
+
+
+def make_ddf_name(pathname):
+    name = make_base_name(pathname)
+    return name + ".ddf"
+
+
+def make_condor_name(pathname, run_type=None):
+    name = make_base_name(pathname)
+    elements = [name]
+    if run_type is not None:
+        elements.append(run_type)
+    elements.append("condor")
+    return ".".join(elements)
+
+
+def parse_filelist(file_string):
+    return file_string.split(",")
+
+
+def validate_filelist(files):
+    """
+    Raise RuntimeError if any file in the list does not exist
+    """
+    for f in files:
+        if not os.path.exists(f):
+            raise RuntimeError("%s does not exist" % (f,))
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/ucsc_encode_submission/README.txt b/extra/ucsc_encode_submission/README.txt
deleted file mode 100644 (file)
index bab7a55..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-I was building a variety of scripts to handle submitting our data to the 
-UCSC ENCODE pipeline, some of them were pulling data out of the htsworkflow
-databases, and since I needed an official place to put the scripts
-
-I decided here.
diff --git a/extra/ucsc_encode_submission/add-treatment-to-library.sparql b/extra/ucsc_encode_submission/add-treatment-to-library.sparql
deleted file mode 100755 (executable)
index c97dce2..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
-
-construct { ?library ddf:treatment ?treatment ;
-                     ddf:protocol ?pcr . }
-WHERE {
-   ?status ucscSubmission:has_file ?file .
-   ?submission ucscSubmission:has_status ?status ;
-               ucscSubmission:library_urn ?library ;
-               ucscSubmission:name ?name .
-   ?file ddf:treatment ?treatment ;
-         ddf:protocol ?pcr .
-}
-
diff --git a/extra/ucsc_encode_submission/dt-overrides.turtle b/extra/ucsc_encode_submission/dt-overrides.turtle
deleted file mode 100644 (file)
index ffe2759..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-##
-## Override submission ID to library URN names for our libraries
-## whose names either lack, or have the wrong library ID string
-## embedded in them.
-##
-
-@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
-@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
-
-# woldlab-hepg2-rnaseq-2009dec
-<http://encodesubmit.ucsc.edu/pipeline/show/805>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part2
-<http://encodesubmit.ucsc.edu/pipeline/show/810>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part3
-<http://encodesubmit.ucsc.edu/pipeline/show/869>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
-<http://encodesubmit.ucsc.edu/pipeline/show/870>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab-hepg2-rnaseq-2010Jan-part4
-<http://encodesubmit.ucsc.edu/pipeline/show/897>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
-<http://encodesubmit.ucsc.edu/pipeline/show/898>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep1-2010Jan6
-<http://encodesubmit.ucsc.edu/pipeline/show/903>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep2-2010jan9
-<http://encodesubmit.ucsc.edu/pipeline/show/904>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab hESC 10886 rep1 2009Jan13
-<http://encodesubmit.ucsc.edu/pipeline/show/1026>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
-
-# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1483>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1626>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
-
-# woldlab jun 18 1x75-Directional-GM12878-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1631>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab jun 18  1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1632>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1633>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1634>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1635>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1636>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1637>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1638>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1639>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1645>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1646>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab June  2x75-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1856>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
-
-#2010 jul 9corrected fastqs
-<http://encodesubmit.ucsc.edu/pipeline/show/1874>
-     ucscSubmission:ignore "1" .
-#    ucscSubmission:library_urn "
-
-# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
-<http://encodesubmit.ucsc.edu/pipeline/show/2926>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2930>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2931>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
-
-# 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2932>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2933>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2934>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2935>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2936>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2937>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2938>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2939>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2940>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2941>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# "3438 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4607>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
-
-# "3439 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4608>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
-
-# "3437 Fastq re-submission"
-<http://encodesubmit.ucsc.edu/pipeline/show/4609>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
-
-# "1x75-Directional-HepG2-rep2-replace 3522"
-<http://encodesubmit.ucsc.edu/pipeline/show/4797>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# "1x75-Directional-HepG2-rep1 replacement of 3521"
-<http://encodesubmit.ucsc.edu/pipeline/show/4798>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
deleted file mode 100644 (file)
index 5082912..0000000
+++ /dev/null
@@ -1,592 +0,0 @@
-#!/usr/bin/env python
-"""
-Gather information about our submissions into a single RDF store
-"""
-
-from datetime import datetime
-import hashlib
-import httplib2
-import keyring
-import logging
-from lxml.html import fromstring
-from operator import attrgetter
-from optparse import OptionParser, OptionGroup
-# python keyring
-import os
-import re
-# redland rdf lib
-import RDF
-import sys
-import urllib
-import urlparse
-
-from htsworkflow.submission import daf
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     dublinCoreNS, \
-     get_model, \
-     get_serializer, \
-     sparql_query, \
-     submissionOntology, \
-     libraryOntology, \
-     load_into_model, \
-     rdfNS, \
-     rdfsNS, \
-     xsdNS
-TYPE_N = rdfNS['type']
-
-# URL mappings
-LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
-
-from htsworkflow.submission.ucsc import \
-     daf_download_url, \
-     ddf_download_url, \
-     submission_view_url, \
-     UCSCEncodePipeline
-
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
-
-DBDIR = os.path.expanduser("~diane/proj/submission")
-
-LOGGER = logging.getLogger("encode_find")
-
-LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
-USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
-
-USERNAME = 'detrout'
-CHARSET = 'utf-8'
-
-
-def main(cmdline=None):
-    """
-    Parse command line arguments
-
-    Takes a list of arguments (assuming arg[0] is the program name) or None
-    If None, it looks at sys.argv
-    """
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if opts.debug:
-        logging.basicConfig(level=logging.DEBUG)
-    elif opts.verbose:
-        logging.basicConfig(level=logging.INFO)
-
-    htsw_authdata = api.make_auth_from_opts(opts, parser)
-    htswapi = api.HtswApi(opts.host, htsw_authdata)
-
-    cookie = None
-    model = get_model(opts.load_model, DBDIR)
-
-    if opts.load_rdf is not None:
-        ns_uri = submissionOntology[''].uri
-        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-
-    if len(args) == 0:
-        limit = None
-    else:
-        limit = args
-
-    if opts.update:
-        cookie = login(cookie=cookie)
-        load_my_submissions(model, limit=limit, cookie=cookie)
-        load_encode_libraries(model, htswapi)
-
-    if opts.sparql is not None:
-        sparql_query(model, opts.sparql)
-
-    if opts.find_submission_with_no_library:
-        find_submissions_with_no_library(model)
-
-    if opts.print_rdf:
-        serializer = get_serializer(name=opts.rdf_parser_name)
-        print serializer.serialize_model_to_string(model)
-
-
-def make_parser():
-    """Construct option parser
-    """
-    parser = OptionParser()
-    commands = OptionGroup(parser, "Commands")
-    commands.add_option('--load-model', default=None,
-      help="Load model database")
-    commands.add_option('--load-rdf', default=None,
-      help="load rdf statements into model")
-    commands.add_option('--print-rdf', action="store_true", default=False,
-      help="print ending model state")
-    commands.add_option('--update', action="store_true", default=False,
-      help="Query remote data sources and update our database")
-    #commands.add_option('--update-ucsc-status', default=None,
-    #  help="download status from ucsc, requires filename for extra rules")
-    #commands.add_option('--update-ddfs', action="store_true", default=False,
-    #  help="download ddf information for known submission")
-    #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, "\
-    #       "requires filename for extra rules")
-    parser.add_option_group(commands)
-
-    queries = OptionGroup(parser, "Queries")
-    queries.add_option('--sparql', default=None,
-      help="execute arbitrary sparql query")
-    queries.add_option('--find-submission-with-no-library', default=False,
-      action="store_true",
-      help="find submissions with no library ID")
-    parser.add_option_group(queries)
-
-    options = OptionGroup(parser, "Options")
-    options.add_option("--rdf-parser-name", default="turtle",
-      help="set rdf file parser type")
-    options.add_option("-v", "--verbose", action="store_true", default=False)
-    options.add_option("--debug", action="store_true", default=False)
-    parser.add_option_group(options)
-
-    api.add_auth_options(parser)
-
-    return parser
-
-
-def load_my_submissions(model, limit=None, cookie=None):
-    """Parse all the submissions from UCSC into model
-    It will look at the global USER_URL to figure out who to scrape
-    cookie contains the session cookie, if none, will attempt to login
-    """
-    if cookie is None:
-        cookie = login()
-
-    tree = get_url_as_tree(USER_URL, 'GET', cookie)
-    table_rows = tree.xpath('//table[@id="projects"]/tr')
-    # first record is header
-    name_n = submissionOntology['name']
-    species_n = submissionOntology['species']
-    library_urn = submissionOntology['library_urn']
-
-    # skip header
-    for row in table_rows[1:]:
-        cell = row.xpath('td')
-        if cell is not None and len(cell) > 1:
-            submission_id = str(cell[0].text_content())
-            if limit is None or submission_id in limit:
-                subUrn = RDF.Uri(submission_view_url(submission_id))
-
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
-
-                name = str(cell[4].text_content())
-                add_stmt(model, subUrn, name_n, name)
-
-                species = str(cell[2].text_content())
-                if species is not None:
-                    add_stmt(model, subUrn, species_n, species)
-
-                library_id = get_library_id(name)
-                if library_id is not None:
-                    add_submission_to_library_urn(model,
-                                                  subUrn,
-                                                  library_urn,
-                                                  library_id)
-                else:
-                    errmsg = 'Unable to find library id in {0} for {1}'
-                    LOGGER.warn(errmsg.format(name, str(subUrn)))
-
-                add_submission_creation_date(model, subUrn, cookie)
-
-                # grab changing atttributes
-                status = str(cell[6].text_content()).strip()
-                last_mod_datetime = get_date_contents(cell[8])
-                last_mod = last_mod_datetime.isoformat()
-
-                update_submission_detail(model, subUrn, status, last_mod,
-                                         cookie=cookie)
-
-                LOGGER.info("Processed {0}".format(subUrn))
-
-
-
-
-def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
-    """Add a link from a UCSC submission to woldlab library if needed
-    """
-    libraryUrn = LIBRARY_NS[library_id + '/']
-    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
-    if not model.contains_statement(query):
-        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
-        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
-        model.add_statement(link)
-    else:
-        LOGGER.debug("Found: {0}".format(str(query)))
-
-
-def find_submissions_with_no_library(model):
-    missing_lib_query_text = """
-PREFIX submissionOntology:<{submissionOntology}>
-
-SELECT
- ?subid ?name
-WHERE {{
-  ?subid submissionOntology:name ?name
-  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
-  FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
-    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
-
-    results = missing_lib_query.execute(model)
-    for row in results:
-        subid = row['subid']
-        name = row['name']
-        print "# {0}".format(name)
-        print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn "\
-              "<http://jumpgate.caltech.edu/library/> ."
-        print ""
-
-
-def add_submission_creation_date(model, subUrn, cookie):
-    # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
-    if len(creation_dates) == 0:
-        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
-    else:
-        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
-
-
-def update_submission_detail(model, subUrn, status, recent_update, cookie):
-    HasStatusN = submissionOntology['has_status']
-    StatusN = submissionOntology['status']
-    LastModifyN = submissionOntology['last_modify_date']
-
-    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
-    status_nodes = list(model.find_statements(status_nodes_query))
-
-    if len(status_nodes) == 0:
-        # has no status node, add one
-        LOGGER.info("Adding status node to {0}".format(subUrn))
-        status_node = create_status_node(subUrn, recent_update)
-        add_stmt(model, subUrn, HasStatusN, status_node)
-        add_stmt(model, status_node, rdfNS['type'], StatusN)
-        add_stmt(model, status_node, StatusN, status)
-        add_stmt(model, status_node, LastModifyN, recent_update)
-        update_ddf(model, subUrn, status_node, cookie=cookie)
-        update_daf(model, subUrn, status_node, cookie=cookie)
-    else:
-        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
-        for status_statement in status_nodes:
-            status_node = status_statement.object
-            last_modified_query = RDF.Statement(status_node,
-                                                LastModifyN,
-                                                None)
-            last_mod_nodes = model.find_statements(last_modified_query)
-            for last_mod_statement in last_mod_nodes:
-                last_mod_date = str(last_mod_statement.object)
-                if recent_update == str(last_mod_date):
-                    update_ddf(model, subUrn, status_node, cookie=cookie)
-                    update_daf(model, subUrn, status_node, cookie=cookie)
-                    break
-
-
-def update_daf(model, submission_url, status_node, cookie):
-    download_daf_uri = str(submission_url).replace('show', 'download_daf')
-    daf_uri = RDF.Uri(download_daf_uri)
-
-    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
-    if not model.contains_statement(status_is_daf):
-        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
-                                                     status_node))
-        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
-        daf_hash = hashlib.md5(daf_text).hexdigest()
-        daf_hash_stmt = RDF.Statement(status_node,
-                                      dafTermOntology['md5sum'],
-                                      daf_hash)
-        model.add_statement(daf_hash_stmt)
-        daf.fromstring_into_model(model, status_node, daf_text)
-
-
-def update_ddf(model, subUrn, statusNode, cookie):
-    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
-    ddfUrn = RDF.Uri(download_ddf_url)
-
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
-    if not model.contains_statement(status_is_ddf):
-        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
-        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
-        add_ddf_statements(model, statusNode, ddf_text)
-        model.add_statement(status_is_ddf)
-
-
-def add_ddf_statements(model, statusNode, ddf_string):
-    """Convert a ddf text file into RDF Statements
-    """
-    ddf_lines = ddf_string.split('\n')
-    # first line is header
-    header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
-
-    for ddf_line in ddf_lines[1:]:
-        ddf_line = ddf_line.strip()
-        if len(ddf_line) == 0:
-            continue
-        if ddf_line.startswith("#"):
-            continue
-
-        ddf_record = ddf_line.split('\t')
-        files = ddf_record[0].split(',')
-        file_attributes = ddf_record[1:]
-
-        for f in files:
-            fileNode = RDF.Node()
-            add_stmt(model,
-                     statusNode,
-                     submissionOntology['has_file'],
-                     fileNode)
-            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-            add_stmt(model, fileNode, DDF_NS['filename'], f)
-
-            for predicate, object in zip(attributes[1:], file_attributes):
-                add_stmt(model, fileNode, predicate, object)
-
-
-def load_encode_libraries(model, htswapi):
-    """Get libraries associated with encode.
-    """
-    encodeFilters = ["/library/?affiliations__id__exact=44",
-                     "/library/?affiliations__id__exact=80",
-                    ]
-
-    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
-    rdfaParser = RDF.Parser(name='rdfa')
-    for encodeUrl in encodeUrls:
-        LOGGER.info("Scanning library url {0}".format(encodeUrl))
-        rdfaParser.parse_into_model(model, encodeUrl)
-        query = RDF.Statement(None, libraryOntology['library_id'], None)
-        libraries = model.find_statements(query)
-        for statement in libraries:
-            libraryUrn = statement.subject
-            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
-            load_library_detail(model, libraryUrn)
-
-
-def load_library_detail(model, libraryUrn):
-    """Grab detail information from library page
-    """
-    rdfaParser = RDF.Parser(name='rdfa')
-    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
-    results = list(model.find_statements(query))
-    log_message = "Found {0} statements for {1}"
-    LOGGER.debug(log_message.format(len(results), libraryUrn))
-    if len(results) == 0:
-        LOGGER.info("Loading {0}".format(str(libraryUrn)))
-        rdfaParser.parse_into_model(model, libraryUrn.uri)
-    elif len(results) == 1:
-        pass  # Assuming that a loaded dataset has one record
-    else:
-        LOGGER.warning("Many dates for {0}".format(libraryUrn))
-
-
-def get_library_id(name):
-    """Guess library ID from library name
-
-    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
-    '11039'
-    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
-    '10150'
-    """
-    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
-    library_id = None
-    if match is not None:
-        library_id = match.group('id')
-    return library_id
-
-
-def get_contents(element):
-    """Return contents or none.
-    """
-    if len(element.contents) == 0:
-        return None
-
-    a = element.find('a')
-    if a is not None:
-        return a.contents[0].encode(CHARSET)
-
-    return element.contents[0].encode(CHARSET)
-
-
-def create_status_node(submission_uri, timestamp):
-    submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
-    return RDF.Node(RDF.Uri(status_uri))
-
-
-def get_date_contents(element):
-    data = element.text_content()
-    if data:
-        return datetime.strptime(data, "%Y-%m-%d %H:%M")
-    else:
-        return None
-
-
-def add_stmt(model, subject, predicate, rdf_object):
-    """Convienence create RDF Statement and add to a model
-    """
-    return model.add_statement(
-        RDF.Statement(subject, predicate, rdf_object))
-
-
-def login(cookie=None):
-    """Login if we don't have a cookie
-    """
-    if cookie is not None:
-        return cookie
-
-    keys = keyring.get_keyring()
-    password = keys.get_password(LOGIN_URL, USERNAME)
-    credentials = {'login': USERNAME,
-                   'password': password}
-    headers = {'Content-type': 'application/x-www-form-urlencoded'}
-    http = httplib2.Http()
-    response, content = http.request(LOGIN_URL,
-                                     'POST',
-                                     headers=headers,
-                                     body=urllib.urlencode(credentials))
-    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
-                                                    response['status']))
-
-    cookie = response.get('set-cookie', None)
-    if cookie is None:
-        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
-    return cookie
-
-
-def get_url_as_tree(url, method, cookie=None):
-    http = httplib2.Http()
-    headers = {}
-    if cookie is not None:
-        headers['Cookie'] = cookie
-    response, content = http.request(url, method, headers=headers)
-    if response['status'] == '200':
-        tree = fromstring(content, base_url=url)
-        return tree
-    else:
-        msg = "error accessing {0}, status {1}"
-        msg = msg.format(url, response['status'])
-        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-
-def get_url_as_text(url, method, cookie=None):
-    http = httplib2.Http()
-    headers = {}
-    if cookie is not None:
-        headers['Cookie'] = cookie
-    response, content = http.request(url, method, headers=headers)
-    if response['status'] == '200':
-        return content
-    else:
-        msg = "error accessing {0}, status {1}"
-        msg = msg.format(url, response['status'])
-        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-################
-#  old stuff
-SUBMISSIONS_LACKING_LIBID = [
-    ('1x75-Directional-HeLa-Rep1',    '11208'),
-    ('1x75-Directional-HeLa-Rep2',    '11207'),
-    ('1x75-Directional-HepG2-Rep1',   '11210'),
-    ('1x75-Directional-HepG2-Rep2',   '11209'),
-    ('1x75-Directional-H1-hESC-Rep1', '10947'),
-    ('1x75-Directional-H1-hESC-Rep2', '11009'),
-    ('1x75-Directional-HUVEC-Rep1',   '11206'),
-    ('1x75-Directional-HUVEC-Rep2',   '11205'),
-    ('1x75-Directional-K562-Rep1',    '11008'),
-    ('1x75-Directional-K562-Rep2',    '11007'),
-    ('1x75-Directional-NHEK-Rep1',    '11204'),
-    ('1x75-Directional-GM12878-Rep1', '11011'),
-    ('1x75-Directional-GM12878-Rep2', '11010'),
-    ]
-
-
-def select_by_library_id(submission_list):
-    subl = [(x.library_id, x) for x in submission_list if x.library_id]
-    libraries = {}
-    for lib_id, subobj in subl:
-        libraries.setdefault(lib_id, []).append(subobj)
-
-    for submission in libraries.values():
-        submission.sort(key=attrgetter('date'), reverse=True)
-
-    return libraries
-
-
-def library_to_freeze(selected_libraries):
-    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
-    lib_ids = sorted(selected_libraries.keys())
-    report = ['<html><table border="1">']
-    report = ["""<html>
-<head>
-<style type="text/css">
- td {border-width:0 0 1px 1px; border-style:solid;}
-</style>
-</head>
-<body>
-<table>
-"""]
-    report.append('<thead>')
-    report.append('<tr><td>Library ID</td><td>Name</td>')
-    for f in freezes:
-        report.append('<td>{0}</td>'.format(f))
-    report.append('</tr>')
-    report.append('</thead>')
-    report.append('<tbody>')
-    for lib_id in lib_ids:
-        report.append('<tr>')
-        lib_url = LIBRARY_NS[lib_id].uri
-        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
-        submissions = selected_libraries[lib_id]
-        report.append('<td>{0}</td>'.format(submissions[0].name))
-        batched = {}
-        for sub in submissions:
-            date = date_to_freeze(sub.date)
-            batched.setdefault(date, []).append(sub)
-        for d in freezes:
-            report.append('<td>')
-            for s in batched.get(d, []):
-                show_url = submission_view_url(s.subid)
-                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
-                report.append("{0}:{1}".format(subid, s.status))
-            report.append('</td>')
-        else:
-            report.append('<td></td>')
-        report.append("</tr>")
-    report.append('</tbody>')
-    report.append("</table></html>")
-    return "\n".join(report)
-
-
-def date_to_freeze(d):
-    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
-               (datetime(2010, 7, 30), '2010-Jul'),
-               (datetime(2011, 1, 30), '2011-Jan'),
-               ]
-    for end, name in freezes:
-        if d < end:
-            return name
-    else:
-        return None
-
-if __name__ == "__main__":
-    main()
diff --git a/extra/ucsc_encode_submission/failed-submissions.sparql b/extra/ucsc_encode_submission/failed-submissions.sparql
deleted file mode 100644 (file)
index af4af4e..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-##
-## Find submissions that are currently "failed"
-##
-
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
-PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#">
-
-#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
-#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
-#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
-
-SELECT 
- ?subid ?subname ?liburn ?status
-WHERE {
-  ?subid submitOnt:name ?subname .
-  ?subid submitOnt:library_urn ?liburn .
-  ?subid submitOnt:has_status ?statusNode .
-  ?statusNode submitOnt:status ?status .
-  ?statusNode submitOnt:last_modify_date ?last_modify .
-  FILTER (regex(?status, "failed", "i"))
-} 
diff --git a/extra/ucsc_encode_submission/find-lib-by-cell.sparql b/extra/ucsc_encode_submission/find-lib-by-cell.sparql
deleted file mode 100755 (executable)
index c4585c5..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-
-SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
-WHERE {
-    ?subid ucscSubmission:name ?name .
-    OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
-                       libraryOntology:date ?submission_date .
-               ?liburn libraryOntology:cell_line ?cell ;
-                       libraryOntology:replicate ?replicate . }
-    #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
-    #filter(!bound(?liburn))
-}
-ORDER BY ?submission_date ?cell ?replicate ?liburn
diff --git a/extra/ucsc_encode_submission/scan_extension.py b/extra/ucsc_encode_submission/scan_extension.py
deleted file mode 100644 (file)
index 39f19c6..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-from optparse import OptionParser
-import os
-import sys
-from pprint import pprint
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    extensions = scan(args)
-    common_extensions = find_common_suffix(extensions)
-
-    if opts.rdf:
-        print_rdf(common_extensions)
-    else:
-        print common_extensions
-        
-def make_parser():
-    parser = OptionParser("%prog: directory [directory...]")
-    parser.add_option('--rdf', action="store_true", default=False,
-                      help="Produce rdf configuration file for ucsc_gather")
-    return parser
-
-def scan(toscan):
-    index = {}
-    for cur_scan_dir in toscan:
-        for path, dirnames, filenames in os.walk(cur_scan_dir):
-            for filename in filenames:
-                base, ext = os.path.splitext(filename)
-                if ext in ('.daf', 'ddf'):
-                    continue
-                next_index = index
-                for c in filename[::-1]:
-                    next_index = next_index.setdefault(c, {})
-    return index
-
-def find_common_suffix(index, tail=[]):
-    if len(tail) > 0 and len(index) > 1:
-        return "".join(tail[::-1])
-
-    results = []
-    for key, choice in index.items():
-        r = find_common_suffix(choice, tail+[key])
-        if r is not None:
-            results.append (r)
-        
-    if len(results) == 0:
-        return None
-    elif len(results) == 1:
-        return results[0]
-    else:
-        return results
-
-def print_rdf(common_extensions):
-    import RDF
-    from htsworkflow.util import rdfhelp
-    model = rdfhelp.get_model()
-
-    viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
-    subView = RDF.NS(viewName)
-    fileReTerm = rdfhelp.dafTermOntology['filename_re']
-
-    count = 1
-    for ext in common_extensions:
-        s = RDF.Statement(subView['VIEW{0}'.format(count)],
-                          fileReTerm,
-                          '.*{0}$'.format(ext.replace('.', '\\.')))
-        model.add_statement(s)
-        count += 1
-        
-    writer = rdfhelp.get_serializer()
-    writer.set_namespace('thisSubmissionView', subView._prefix)
-    print writer.serialize_model_to_string(model)
-
-if __name__ == "__main__":
-    main()
diff --git a/extra/ucsc_encode_submission/test_ucsc_gather.py b/extra/ucsc_encode_submission/test_ucsc_gather.py
deleted file mode 100644 (file)
index 64f712d..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-import unittest
-
-import ucsc_gather
-
-class testUCSCGather(unittest.TestCase):
-    def test_view_attribute_map(self):
-        view_map = ucsc_gather.NameToViewMap()
-        view_map.lib_cache["0"] = {
-            "cell_line": "NLHF",
-            "replicate": "1",
-            "lane_set": {},
-            }
-    
-        a = view_map.find_attributes("foo.ini", "0")    
-        self.failUnless(a["view"] is None)
-    
-        a = view_map.find_attributes("asdf.fdsa", "0")
-        self.failUnless(a is None)
-    
-        a = view_map.find_attributes("foo.fastq", "0")
-        self.failUnlessEqual(a["view"], "Fastq", "0")
-    
-        a = view_map.find_attributes("foo_r1.fastq", "0")
-        self.failUnlessEqual(a["view"], "FastqRd1", "0")
-
-    def test_get_library_info_paired(self):
-        view_map = ucsc_gather.NameToViewMap()
-        view_map.lib_cache["11588"] = {
-            u'antibody_id': None,
-            u'cell_line': u'NHLF',
-            u'cell_line_id': 13,
-            u'experiment_type': u'RNA-seq',
-            u'experiment_type_id': 4,
-            u'gel_cut_size': 300,
-            u'hidden': False,
-            u'id': u'11588',
-            u'insert_size': 200,
-            u'lane_set': [{u'flowcell': u'61PKCAAXX',
-                           u'lane_number': 8,
-                           u'paired_end': True,
-                           u'read_length': 76,
-                           u'status': u'Unknown',
-                           u'status_code': None},
-                          {u'flowcell': u'61PKLAAXX',
-                           u'lane_number': 8,
-                           u'paired_end': True,
-                           u'read_length': 76,
-                           u'status': u'Unknown',
-                           u'status_code': None}],
-            u'library_id': u'11588',
-            u'library_name': u'Paired ends 254 NHLF 31',
-            u'library_species': u'Homo sapiens',
-            u'library_species_id': 8,
-            u'library_type': u'Paired End',
-            u'library_type_id': 2,
-            u'made_by': u'Brian',
-            u'made_for': u'Brian',
-            u'notes': u'300 bp gel fragment, SPRI beads cleanup',
-            u'replicate': 2,
-            u'stopping_point': u'1Aa',
-            u'successful_pM': None,
-            u'undiluted_concentration': u'26.2'}
-
-        a = view_map.find_attributes("foo.bam", "11588")
-        self.failUnlessEqual(a["view"], "Paired")
-        self.failUnlessEqual(a["insertLength"], 200)
-
-
-def suite():
-    return unittest.makeSuite(testUCSCGather,"test")
-
-if __name__ == "__main__":
-    unittest.main(defaultTest="suite")
-
diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py
deleted file mode 100755 (executable)
index fd8db12..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-#!/usr/bin/env python
-from ConfigParser import SafeConfigParser
-import fnmatch
-from glob import glob
-import json
-import logging
-import netrc
-from optparse import OptionParser, OptionGroup
-import os
-from pprint import pprint, pformat
-import shlex
-from StringIO import StringIO
-import stat
-import sys
-import time
-import types
-import urllib
-import urllib2
-import urlparse
-
-import RDF
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     fromTypedNode, \
-     get_model, \
-     get_serializer, \
-     load_into_model, \
-     sparql_query, \
-     submissionOntology
-from htsworkflow.submission.daf import \
-     DAFMapper, \
-     MetadataLookupException, \
-     get_submission_uri
-from htsworkflow.submission.condorfastq import CondorFastqExtract
-
-logger = logging.getLogger('ucsc_gather')
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-    submission_uri = None
-
-    if opts.debug:
-        logging.basicConfig(level = logging.DEBUG )
-    elif opts.verbose:
-        logging.basicConfig(level = logging.INFO )
-    else:
-        logging.basicConfig(level = logging.WARNING )
-
-    apidata = api.make_auth_from_opts(opts, parser)
-
-    model = get_model(opts.load_model)
-    if opts.name:
-        mapper = DAFMapper(opts.name, opts.daf,  model)
-        if opts.library_url is not None:
-            mapper.library_url = opts.library_url
-        submission_uri = get_submission_uri(opts.name)
-
-
-    if opts.load_rdf is not None:
-        if submission_uri is None:
-            parser.error("Please specify the submission name")
-        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
-
-    if opts.make_ddf and opts.daf is None:
-        parser.error("Please specify your daf when making ddf files")
-
-    library_result_map = []
-    for a in args:
-        library_result_map.extend(read_library_result_map(a))
-
-    if opts.make_tree_from is not None:
-        make_tree_from(opts.make_tree_from, library_result_map)
-
-    if opts.link_daf:
-        if opts.daf is None:
-            parser.error("Please specify daf filename with --daf")
-        link_daf(opts.daf, library_result_map)
-
-    if opts.fastq:
-        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
-                                       force=opts.force)
-        extractor.build_fastqs(library_result_map)
-
-    if opts.scan_submission:
-        scan_submission_dirs(mapper, library_result_map)
-
-    if opts.make_ddf:
-        make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)
-
-    if opts.sparql:
-        sparql_query(model, opts.sparql)
-
-    if opts.print_rdf:
-        writer = get_serializer()
-        print writer.serialize_model_to_string(model)
-
-
-def make_parser():
-    parser = OptionParser()
-
-    model = OptionGroup(parser, 'model')
-    model.add_option('--name', help="Set submission name")
-    model.add_option('--load-model', default=None,
-      help="Load model database")
-    model.add_option('--load-rdf', default=None,
-      help="load rdf statements into model")
-    model.add_option('--sparql', default=None, help="execute sparql query")
-    model.add_option('--print-rdf', action="store_true", default=False,
-      help="print ending model state")
-    parser.add_option_group(model)
-    # commands
-    commands = OptionGroup(parser, 'commands')
-    commands.add_option('--make-tree-from',
-                      help="create directories & link data files",
-                      default=None)
-    commands.add_option('--fastq', default=False, action="store_true",
-                        help="generate scripts for making fastq files")
-    commands.add_option('--scan-submission', default=False, action="store_true",
-                      help="Import metadata for submission into our model")
-    commands.add_option('--link-daf', default=False, action="store_true",
-                        help="link daf into submission directories")
-    commands.add_option('--make-ddf', help='make the ddfs', default=False,
-                      action="store_true")
-    parser.add_option_group(commands)
-
-    parser.add_option('--force', default=False, action="store_true",
-                      help="Force regenerating fastqs")
-    parser.add_option('--daf', default=None, help='specify daf name')
-    parser.add_option('--library-url', default=None,
-                      help="specify an alternate source for library information")
-    # debugging
-    parser.add_option('--verbose', default=False, action="store_true",
-                      help='verbose logging')
-    parser.add_option('--debug', default=False, action="store_true",
-                      help='debug logging')
-
-    api.add_auth_options(parser)
-
-    return parser
-
-def make_tree_from(source_path, library_result_map):
-    """Create a tree using data files from source path.
-    """
-    for lib_id, lib_path in library_result_map:
-        if not os.path.exists(lib_path):
-            logger.info("Making dir {0}".format(lib_path))
-            os.mkdir(lib_path)
-        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
-        if os.path.exists(source_lib_dir):
-            pass
-        for filename in os.listdir(source_lib_dir):
-            source_pathname = os.path.join(source_lib_dir, filename)
-            target_pathname = os.path.join(lib_path, filename)
-            if not os.path.exists(source_pathname):
-                raise IOError("{0} does not exist".format(source_pathname))
-            if not os.path.exists(target_pathname):
-                os.symlink(source_pathname, target_pathname)
-                logger.info(
-                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
-
-
-def link_daf(daf_path, library_result_map):
-    if not os.path.exists(daf_path):
-        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
-
-    base_daf = os.path.basename(daf_path)
-
-    for lib_id, result_dir in library_result_map:
-        if not os.path.exists(result_dir):
-            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
-        submission_daf = os.path.join(result_dir, base_daf)
-        if not os.path.exists(submission_daf):
-            if not os.path.exists(daf_path):
-                raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
-            os.link(daf_path, submission_daf)
-
-
-def scan_submission_dirs(view_map, library_result_map):
-    """Look through our submission directories and collect needed information
-    """
-    for lib_id, result_dir in library_result_map:
-        logger.info("Importing %s from %s" % (lib_id, result_dir))
-        try:
-            view_map.import_submission_dir(result_dir, lib_id)
-        except MetadataLookupException, e:
-            logger.error("Skipping %s: %s" % (lib_id, str(e)))
-
-def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
-    dag_fragment = []
-    for lib_id, result_dir in library_result_map:
-        submissionNode = view_map.get_submission_node(result_dir)
-        dag_fragment.extend(
-            make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
-        )
-
-    if make_condor and len(dag_fragment) > 0:
-        dag_filename = 'submission.dagman'
-        if not force and os.path.exists(dag_filename):
-            logger.warn("%s exists, please delete" % (dag_filename,))
-        else:
-            f = open(dag_filename,'w')
-            f.write( os.linesep.join(dag_fragment))
-            f.write( os.linesep )
-            f.close()
-
-
-def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
-    """
-    Make ddf files, and bonus condor file
-    """
-    query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-
-select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
-WHERE {
-  ?file ucscDaf:filename ?files ;
-        ucscDaf:md5sum ?md5sum .
-  ?submitView ucscDaf:has_file ?file ;
-              ucscDaf:view ?dafView ;
-              ucscDaf:submission <%(submission)s> .
-  ?dafView ucscDaf:name ?view .
-  <%(submission)s> submissionOntology:library ?library ;
-
-  OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell }
-  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
-  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
-  OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
-  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:condition ?treatment }
-  OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  OPTIONAL { ?library ucscDaf:readType ?readType }
-  OPTIONAL { ?library ucscDaf:strain ?strain }
-  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-}
-ORDER BY  ?submitView"""
-    dag_fragments = []
-
-    name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
-    if name is None:
-        logger.error("Need name for %s" % (str(submissionNode)))
-        return []
-
-    ddf_name = name + '.ddf'
-    if outdir is not None:
-        outfile = os.path.join(outdir, ddf_name)
-        output = open(outfile,'w')
-    else:
-        outfile = 'stdout:'
-        output = sys.stdout
-
-    formatted_query = query_template % {'submission': str(submissionNode.uri)}
-
-    query = RDF.SPARQLQuery(formatted_query)
-    results = query.execute(view_map.model)
-
-    # filename goes first
-    variables = view_map.get_daf_variables()
-    # 'controlId',
-    output.write('\t'.join(variables))
-    output.write(os.linesep)
-
-    all_views = {}
-    all_files = []
-    for row in results:
-        viewname = fromTypedNode(row['view'])
-        current = all_views.setdefault(viewname, {})
-        for variable_name in variables:
-            value = str(fromTypedNode(row[variable_name]))
-            if value is None or value == 'None':
-                logger.warn("{0}: {1} was None".format(outfile, variable_name))
-            if variable_name in ('files', 'md5sum'):
-                current.setdefault(variable_name,[]).append(value)
-            else:
-                current[variable_name] = value
-
-    for view in all_views.keys():
-        line = []
-        for variable_name in variables:
-            if variable_name in ('files', 'md5sum'):
-                line.append(','.join(all_views[view][variable_name]))
-            else:
-                line.append(all_views[view][variable_name])
-        output.write("\t".join(line))
-        output.write(os.linesep)
-        all_files.extend(all_views[view]['files'])
-
-    logger.info(
-        "Examined {0}, found files: {1}".format(
-            str(submissionNode), ", ".join(all_files)))
-
-    all_files.append(daf_name)
-    all_files.append(ddf_name)
-
-    if make_condor:
-        archive_condor = make_condor_archive_script(name, all_files, outdir)
-        upload_condor = make_condor_upload_script(name, outdir)
-
-        dag_fragments.extend(
-            make_dag_fragment(name, archive_condor, upload_condor)
-        )
-
-    return dag_fragments
-
-
-def read_library_result_map(filename):
-    """
-    Read a file that maps library id to result directory.
-    Does not support spaces in filenames.
-
-    For example:
-      10000 result/foo/bar
-    """
-    stream = open(filename,'r')
-
-    results = []
-    for line in stream:
-        line = line.rstrip()
-        if not line.startswith('#') and len(line) > 0 :
-            library_id, result_dir = line.split()
-            results.append((library_id, result_dir))
-    return results
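
A minimal sketch of using such a mapping file, following the format from the docstring (the filename is hypothetical):

    # library_map.txt contains lines such as:
    #   # comments and blank lines are skipped
    #   10000 result/foo/bar
    results = read_library_result_map('library_map.txt')
    # results == [('10000', 'result/foo/bar')]
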
-
-
-def make_condor_archive_script(name, files, outdir=None):
-    script = """Universe = vanilla
-
-Executable = /bin/tar
-arguments = czvhf ../%(archivename)s %(filelist)s
-
-Error = compress.out.$(Process).log
-Output = compress.out.$(Process).log
-Log = /tmp/submission-compress-%(user)s.log
-initialdir = %(initialdir)s
-environment="GZIP=-3"
-request_memory = 20
-
-queue
-"""
-    if outdir is None:
-        outdir = os.getcwd()
-    for f in files:
-        pathname = os.path.join(outdir, f)
-        if not os.path.exists(pathname):
-            raise RuntimeError("Missing %s from %s" % (f,outdir))
-
-    context = {'archivename': make_submission_name(name),
-               'filelist': " ".join(files),
-               'initialdir': os.path.abspath(outdir),
-               'user': os.getlogin()}
-
-    condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
-    condor_stream = open(condor_script,'w')
-    condor_stream.write(script % context)
-    condor_stream.close()
-    return condor_script
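
Filled in, the generated submit file is a complete Condor job description. For a hypothetical name='mylib' with files=['mylib.bam', 'mylib.ddf'] (user and directory are illustrative), mylib.archive.condor would read:

    Universe = vanilla

    Executable = /bin/tar
    arguments = czvhf ../mylib.tgz mylib.bam mylib.ddf

    Error = compress.out.$(Process).log
    Output = compress.out.$(Process).log
    Log = /tmp/submission-compress-diane.log
    initialdir = /home/diane/submissions/mylib
    environment="GZIP=-3"
    request_memory = 20

    queue

It can be submitted by hand with condor_submit mylib.archive.condor, though normally the DAG fragments below drive it.
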
-
-
-def make_condor_upload_script(name, outdir=None):
-    script = """Universe = vanilla
-
-Executable = /usr/bin/lftp
-arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
-
-Error = upload.out.$(Process).log
-Output = upload.out.$(Process).log
-Log = /tmp/submission-upload-%(user)s.log
-initialdir = %(initialdir)s
-
-queue
-"""
-    if outdir is None:
-        outdir = os.getcwd()
-
-    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
-
-    encodeftp = 'encodeftp.cse.ucsc.edu'
-    ftpuser = auth.hosts[encodeftp][0]
-    ftppassword = auth.hosts[encodeftp][2]
-    context = {'archivename': make_submission_name(name),
-               'initialdir': os.path.abspath(outdir),
-               'user': os.getlogin(),
-               'ftpuser': ftpuser,
-               'ftppassword': ftppassword,
-               'ftphost': encodeftp}
-
-    condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
-    condor_stream = open(condor_script,'w')
-    condor_stream.write(script % context)
-    condor_stream.close()
-    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
-
-    return condor_script
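
The upload job reads its FTP credentials from ~diane/.netrc via the standard netrc module; auth.hosts[host] returns a (login, account, password) tuple, hence the indexes 0 and 2 above. The matching entry would look like this (placeholder credentials):

    machine encodeftp.cse.ucsc.edu
    login myuser
    password mysecret

Because the rendered submit file embeds the password in its arguments line, it is chmod'ed to owner read/write only.
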
-
-
-def make_dag_fragment(ininame, archive_condor, upload_condor):
-    """
-    Make the pair of DAG fragments that compress and then upload the data.
-    """
-    cur_dir = os.getcwd()
-    archive_condor = os.path.join(cur_dir, archive_condor)
-    upload_condor = os.path.join(cur_dir, upload_condor)
-    job_basename = make_base_name(ininame)
-
-    fragments = []
-    fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
-    fragments.append('JOB %s_upload %s' % (job_basename,  upload_condor))
-    fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
-
-    return fragments
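
For a hypothetical ininame='mylib.ini' run from /home/diane/submissions, the returned fragments serialize to the following DAGMan stanza:

    JOB mylib_archive /home/diane/submissions/mylib.archive.condor
    JOB mylib_upload /home/diane/submissions/mylib.upload.condor
    PARENT mylib_archive CHILD mylib_upload
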
-
-
-def get_library_info(host, apidata, library_id):
-    url = api.library_url(host, library_id)
-    contents = api.retrieve_info(url, apidata)
-    return contents
-
-
-def make_submission_section(line_counter, files, attributes):
-    """
-    Create a section in the submission ini file
-    """
-    inifile = [ "[line%s]" % (line_counter,) ]
-    inifile += ["files=%s" % (",".join(files))]
-
-    for k,v in attributes.items():
-        inifile += ["%s=%s" % (k,v)]
-    return inifile
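
A quick illustration with made-up values; note the attribute lines follow the dictionary's iteration order:

    section = make_submission_section(1, ['mylib.bam'], {'cell': 'GM12878'})
    # section == ['[line1]', 'files=mylib.bam', 'cell=GM12878']
    print '\n'.join(section)
    # [line1]
    # files=mylib.bam
    # cell=GM12878
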
-
-
-def make_base_name(pathname):
-    base = os.path.basename(pathname)
-    name, ext = os.path.splitext(base)
-    return name
-
-
-def make_submission_name(ininame):
-    name = make_base_name(ininame)
-    return name + ".tgz"
-
-
-def make_ddf_name(pathname):
-    name = make_base_name(pathname)
-    return name + ".ddf"
-
-
-def make_condor_name(pathname, run_type=None):
-    name = make_base_name(pathname)
-    elements = [name]
-    if run_type is not None:
-        elements.append(run_type)
-    elements.append("condor")
-    return ".".join(elements)
-
-
-def parse_filelist(file_string):
-    return file_string.split(",")
-
-
-def validate_filelist(files):
-    """
-    Raise RuntimeError if any file in the list does not exist
-    """
-    for f in files:
-        if not os.path.exists(f):
-            raise RuntimeError("%s does not exist" % (f,))
-
-if __name__ == "__main__":
-    main()
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
diff --git a/htsworkflow/submission/test/test_ucsc.py b/htsworkflow/submission/test/test_ucsc.py
new file mode 100644 (file)
index 0000000..f05de23
--- /dev/null
@@ -0,0 +1,29 @@
+import unittest
+from StringIO import StringIO
+
+from htsworkflow.submission import ucsc
+
+
+ENCODE_FILES="""wgEncodeGisChiaPetHCT116D000005593.bed.gz      project=wgEncode; grant=Ruan; lab=GIS-Ruan; composite=wgEncodeGisChiaPet; dataType=ChiaPet; view=Interactions; cell=HCT-116; antibody=Pol2; replicate=1; origAssembly=hg19; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001427; dateSubmitted=2011-02-04; dateUnrestricted=2011-11-04; subId=3267; labVersion=CHH524; tableName=wgEncodeGisChiaPetHCT116D000005593; type=bed; md5sum=a3c7420aece4acfb15f80f4dfe9f1fb3; size=924K
+wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep1.fastq.tgz   project=wgEncode; grant=Myers; lab=Caltech; composite=wgEncodeCaltechRnaSeq; dataType=RnaSeq; view=FastqRd2; cell=GM12878; localization=cell; rnaExtract=longPolyA; readType=2x75; insertLength=200; replicate=1; origAssembly=hg18; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000122; dateSubmitted=2010-07-14; dateResubmitted=2010-06-21; dateUnrestricted=2011-04-14; subId=1647; type=fastq; md5sum=51c4d1679b0ad29888bea2b40e26364a; size=4.8G
+"""
+
+
+class TestUCSCInfo(unittest.TestCase):
+    def test_parse_encodedcc_file(self):
+        stream = StringIO(ENCODE_FILES)
+        file_index = ucsc.parse_ucsc_file_index(stream)
+        self.assertEquals(len(file_index), 2)
+
+        for attributes in file_index.values():
+            self.assertIn('subId', attributes)
+            self.assertIn('project', attributes)
+            self.assertEquals(attributes['project'], 'wgEncode')
+
+def suite():
+    suite = unittest.makeSuite(TestUCSCInfo, 'test')
+    return suite
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
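
The module can also be run standalone, which executes the suite via unittest.main(defaultTest='suite'):

    python htsworkflow/submission/test/test_ucsc.py
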
index f80629a9cda462ac6ce8e45f58949b4ffb9037f9..9181830e34983724e586a906a7194d96fc9507eb 100644 (file)
@@ -1,7 +1,11 @@
+"""Utilities for extracting information from the ENCODE DCC
+"""
 import urlparse
+import urllib2
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -11,6 +15,7 @@ def ddf_download_url(submission_id):
     fragment = 'download_ddf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -20,6 +25,7 @@ def daf_download_url(submission_id):
     fragment = 'download_daf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def submission_view_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -28,3 +34,27 @@ def submission_view_url(submission_id):
     """
     fragment = 'show/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_ucsc_file_index(base_url):
+    """Get index of files for a ENCODE collection
+    """
+    if base_url[-1] != '/': base_url += '/'
+    request = urllib2.urlopen(base_url + 'files.txt')
+    file_index = parse_ucsc_file_index(request)
+    return file_index
+
+
+def parse_ucsc_file_index(stream):
+    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
+    """
+    file_index = {}
+    for line in stream:
+        filename, attribute_line = line.split('\t')
+        attributes = {}
+        for assignment in attribute_line.split(';'):
+            name, value = assignment.split('=')
+            attributes[name.strip()] = value.strip()
+
+        file_index[filename] = attributes
+    return file_index
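
A minimal sketch of the parser in action, mirroring the unit test above (the filename and attributes are made up):

    from StringIO import StringIO
    from htsworkflow.submission import ucsc

    line = "example.bed.gz\tproject=wgEncode; lab=Caltech; md5sum=abc123\n"
    index = ucsc.parse_ucsc_file_index(StringIO(line))
    # index == {'example.bed.gz':
    #           {'project': 'wgEncode', 'lab': 'Caltech', 'md5sum': 'abc123'}}

get_ucsc_file_index() does the same thing after fetching files.txt from the given collection URL with urllib2.
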
old mode 100755 (executable)
new mode 100644 (file)
index 9fcd3311ee77215fd8ae4ed732e7a22b287d4a62..8fb1424960571d4ab327ab2c4261b293eb1efb56 100644 (file)
@@ -33,7 +33,7 @@ def sparql_query(model, query_filename):
         output = []
         for k,v in row.items()[::-1]:
             print "{0}: {1}".format(k,v)
-        print 
+        print
 
 
 def blankOrUri(value=None):
@@ -99,11 +99,11 @@ def fromTypedNode(node):
 
     return literal
 
-    
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
-        
+
     if model_name is None:
         storage = RDF.MemoryStorage()
         logger.info("Using RDF Memory model")
@@ -114,12 +114,12 @@ def get_model(model_name=None, directory=None):
         logger.info("Using {0} with options {1}".format(model_name, options))
     model = RDF.Model(storage)
     return model
-        
+
 
 def load_into_model(model, parser_name, filename, ns=None):
     if not os.path.exists(filename):
         raise IOError("Can't find {0}".format(filename))
-    
+
     data = open(filename, 'r').read()
     load_string_into_model(model, parser_name, data, ns)
 
@@ -127,7 +127,7 @@ def load_into_model(model, parser_name, filename, ns=None):
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
         ns = "http://localhost/"
-        
+
     rdf_parser = RDF.Parser(name=parser_name)
     rdf_parser.parse_string_into_model(model, data, ns)
 
@@ -148,3 +148,6 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+
+def dump_model(model):
+    serializer = get_serializer()
+    print serializer.serialize_model_to_string(model)
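
A minimal usage sketch for the new dump_model helper, assuming the Redland RDF bindings used throughout this module are installed (the triple is made up):

    from htsworkflow.util.rdfhelp import get_model, load_string_into_model, dump_model

    model = get_model()  # defaults to an in-memory store
    load_string_into_model(model, 'turtle',
                           '<http://example.org/a> <http://example.org/b> "c" .')
    dump_model(model)  # prints the model serialized as turtle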