Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
authorDiane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 18:53:40 +0000 (10:53 -0800)
committerDiane Trout <diane@caltech.edu>
Tue, 20 Dec 2011 18:53:40 +0000 (10:53 -0800)
32 files changed:
encode_submission/README.txt [new file with mode: 0644]
encode_submission/__init__.py [new file with mode: 0644]
encode_submission/add-treatment-to-library.sparql [new file with mode: 0644]
encode_submission/dt-overrides.turtle [new file with mode: 0644]
encode_submission/encode_find.py [new file with mode: 0644]
encode_submission/failed-submissions.sparql [new file with mode: 0644]
encode_submission/find-lib-by-cell.sparql [new file with mode: 0644]
encode_submission/scan_extension.py [new file with mode: 0644]
encode_submission/test_encode_find.py [new file with mode: 0644]
encode_submission/test_ucsc_gather.py [new file with mode: 0644]
encode_submission/testdata/5136SubDetail.html [new file with mode: 0644]
encode_submission/ucsc_gather.py [new file with mode: 0644]
extra/ucsc_encode_submission/README.txt [deleted file]
extra/ucsc_encode_submission/add-treatment-to-library.sparql [deleted file]
extra/ucsc_encode_submission/dt-overrides.turtle [deleted file]
extra/ucsc_encode_submission/encode_find.py [deleted file]
extra/ucsc_encode_submission/failed-submissions.sparql [deleted file]
extra/ucsc_encode_submission/find-lib-by-cell.sparql [deleted file]
extra/ucsc_encode_submission/scan_extension.py [deleted file]
extra/ucsc_encode_submission/test_ucsc_gather.py [deleted file]
extra/ucsc_encode_submission/ucsc_gather.py [deleted file]
htsworkflow/frontend/experiments/__init__.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/models.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/urls.py [changed mode: 0755->0644]
htsworkflow/frontend/experiments/views.py [changed mode: 0755->0644]
htsworkflow/frontend/reports/reports.py [changed mode: 0755->0644]
htsworkflow/pipelines/qseq2fastq.py [changed mode: 0755->0644]
htsworkflow/pipelines/srf2fastq.py [changed mode: 0755->0644]
htsworkflow/submission/test/test_ucsc.py [new file with mode: 0644]
htsworkflow/submission/ucsc.py
htsworkflow/util/makebed.py [changed mode: 0755->0644]
htsworkflow/util/rdfhelp.py

diff --git a/encode_submission/README.txt b/encode_submission/README.txt
new file mode 100644 (file)
index 0000000..bab7a55
--- /dev/null
@@ -0,0 +1,9 @@
+I was building a variety of scripts to handle submitting our data to the
+UCSC ENCODE pipeline. Some of them pull data out of the htsworkflow
+databases, and I needed an official place to put the scripts.
+
+I decided on this directory.
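+
+Typical usage (each script's --help lists the full set of options), e.g.:
+
+  python encode_find.py -v --update --print-rdf > submissions.turtle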
diff --git a/encode_submission/__init__.py b/encode_submission/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/encode_submission/add-treatment-to-library.sparql b/encode_submission/add-treatment-to-library.sparql
new file mode 100644 (file)
index 0000000..c97dce2
--- /dev/null
@@ -0,0 +1,21 @@
+# Attach treatment and PCR protocol information from submitted files to their libraries
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
+
+construct { ?library ddf:treatment ?treatment ;
+                     ddf:protocol ?pcr . }
+WHERE {
+   ?status ucscSubmission:has_file ?file .
+   ?submission ucscSubmission:has_status ?status ;
+               ucscSubmission:library_urn ?library ;
+               ucscSubmission:name ?name .
+   ?file ddf:treatment ?treatment ;
+         ddf:protocol ?pcr .
+}
+
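+# Example invocation (assuming Redland's rasqal query tool is available):
+#   roqet add-treatment-to-library.sparql -D submissions.turtle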
diff --git a/encode_submission/dt-overrides.turtle b/encode_submission/dt-overrides.turtle
new file mode 100644 (file)
index 0000000..ffe2759
--- /dev/null
@@ -0,0 +1,181 @@
+##
+## Override the submission ID to library URN mapping for submissions
+## whose names either lack a library ID string or have the wrong one
+## embedded in them.
+##
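+## These statements can be loaded into the encode_find.py model
+## with its --load-rdf option.
+##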
+
+@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
+@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
+
+# woldlab-hepg2-rnaseq-2009dec
+<http://encodesubmit.ucsc.edu/pipeline/show/805>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part2
+<http://encodesubmit.ucsc.edu/pipeline/show/810>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-hepg2-rnaseq-2009dec-part3
+<http://encodesubmit.ucsc.edu/pipeline/show/869>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
+<http://encodesubmit.ucsc.edu/pipeline/show/870>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab-hepg2-rnaseq-2010Jan-part4
+<http://encodesubmit.ucsc.edu/pipeline/show/897>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
+
+# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
+<http://encodesubmit.ucsc.edu/pipeline/show/898>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep1-2010Jan6
+<http://encodesubmit.ucsc.edu/pipeline/show/903>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab-K562-directional-rnaseq-rep2-2010jan9
+<http://encodesubmit.ucsc.edu/pipeline/show/904>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab hESC 10886 rep1 2009Jan13
+<http://encodesubmit.ucsc.edu/pipeline/show/1026>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
+
+# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1483>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1626>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
+
+# woldlab jun 18 1x75-Directional-GM12878-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1631>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
+
+# woldlab jun 18  1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1632>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1633>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1634>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# woldlab jun 18 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1635>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1636>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1637>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# woldlab jun 18 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1638>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/1639>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# woldlab jun 18 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1645>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# woldlab jun 18 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1646>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# woldlab June  2x75-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/1856>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
+
+# 2010 jul 9 corrected fastqs
+<http://encodesubmit.ucsc.edu/pipeline/show/1874>
+     ucscSubmission:ignore "1" .
+#    ucscSubmission:library_urn "
+
+# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
+<http://encodesubmit.ucsc.edu/pipeline/show/2926>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-GM12878-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2930>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
+
+# 1x75-Directional-H1-hESC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2931>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
+
+# 1x75-Directional-H1-hESC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2932>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HUVEC-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2933>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
+
+# 1x75-Directional-HUVEC-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2934>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
+
+# 1x75-Directional-HeLa-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2935>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
+
+# 1x75-Directional-HeLa-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2936>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
+
+# 1x75-Directional-HepG2-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2937>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
+# 1x75-Directional-HepG2-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2938>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# 1x75-Directional-K562-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2939>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
+
+# 1x75-Directional-K562-Rep2
+<http://encodesubmit.ucsc.edu/pipeline/show/2940>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
+
+# 1x75-Directional-NHEK-Rep1
+<http://encodesubmit.ucsc.edu/pipeline/show/2941>
+    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
+
+# "3438 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4607>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
+
+# "3439 fastq resubmit"
+<http://encodesubmit.ucsc.edu/pipeline/show/4608>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
+
+# "3437 Fastq re-submission"
+<http://encodesubmit.ucsc.edu/pipeline/show/4609>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
+
+# "1x75-Directional-HepG2-rep2-replace 3522"
+<http://encodesubmit.ucsc.edu/pipeline/show/4797>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
+
+# "1x75-Directional-HepG2-rep1 replacement of 3521"
+<http://encodesubmit.ucsc.edu/pipeline/show/4798>
+  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
+
diff --git a/encode_submission/encode_find.py b/encode_submission/encode_find.py
new file mode 100644 (file)
index 0000000..6608a05
--- /dev/null
@@ -0,0 +1,631 @@
+#!/usr/bin/env python
+"""
+Gather information about our submissions into a single RDF store
+"""
+
+from datetime import datetime
+import hashlib
+import httplib2
+# python keyring
+import keyring
+import logging
+from lxml.html import fromstring
+from operator import attrgetter
+from optparse import OptionParser, OptionGroup
+import os
+import re
+# redland rdf lib
+import RDF
+import sys
+import urllib
+import urlparse
+
+from htsworkflow.submission import daf, ucsc
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     dublinCoreNS, \
+     get_model, \
+     get_serializer, \
+     sparql_query, \
+     submissionOntology, \
+     libraryOntology, \
+     load_into_model, \
+     rdfNS, \
+     rdfsNS, \
+     xsdNS
+TYPE_N = rdfNS['type']
+CREATION_DATE = libraryOntology['date']
+
+# URL mappings
+LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
+
+from htsworkflow.submission.ucsc import \
+     daf_download_url, \
+     ddf_download_url, \
+     get_ucsc_file_index, \
+     submission_view_url, \
+     UCSCEncodePipeline
+
+DCC_NS = RDF.NS(UCSCEncodePipeline + 'download_ddf#')
+
+DBDIR = os.path.expanduser("~diane/proj/submission")
+
+LOGGER = logging.getLogger("encode_find")
+
+LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
+USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
+
+USERNAME = 'detrout'
+CHARSET = 'utf-8'
+
+GOLDEN_PATH_TEST = "http://hgdownload-test.cse.ucsc.edu/goldenPath/"\
+                   "{genome}/encodeDCC/{composite}/"
+def main(cmdline=None):
+    """
+    Parse command line arguments
+
+    Takes a list of arguments (assuming arg[0] is the program name) or None
+    If None, it looks at sys.argv
+    """
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+
+    htsw_authdata = api.make_auth_from_opts(opts, parser)
+    htswapi = api.HtswApi(opts.host, htsw_authdata)
+
+    cookie = None
+    model = get_model(opts.load_model, DBDIR)
+
+    if opts.load_rdf is not None:
+        ns_uri = submissionOntology[''].uri
+        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
+
+    if len(args) == 0:
+        limit = None
+    else:
+        limit = args
+
+    if opts.update:
+        cookie = login(cookie=cookie)
+        load_my_submissions(model, limit=limit, cookie=cookie)
+        load_encode_libraries(model, htswapi)
+        our_tracks = [
+            {'genome':'hg19', 'composite':'wgEncodeCaltechRnaSeq'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHist'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechHistone'},
+            {'genome':'mm9',  'composite':'wgEncodeCaltechTfbs'}
+        ]
+        for track_info in our_tracks:
+            load_encodedcc_files(model, GOLDEN_PATH_TEST.format(**track_info))
+
+
+    if opts.sparql is not None:
+        sparql_query(model, opts.sparql)
+
+    if opts.find_submission_with_no_library:
+        find_submissions_with_no_library(model)
+
+    if opts.print_rdf:
+        serializer = get_serializer(name=opts.rdf_parser_name)
+        print serializer.serialize_model_to_string(model)
+
+
+def make_parser():
+    """Construct option parser
+    """
+    parser = OptionParser()
+    commands = OptionGroup(parser, "Commands")
+    commands.add_option('--load-model', default=None,
+      help="Load model database")
+    commands.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    commands.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    commands.add_option('--update', action="store_true", default=False,
+      help="Query remote data sources and update our database")
+    #commands.add_option('--update-ucsc-status', default=None,
+    #  help="download status from ucsc, requires filename for extra rules")
+    #commands.add_option('--update-ddfs', action="store_true", default=False,
+    #  help="download ddf information for known submission")
+    #commands.add_option('--update-library', default=None,
+    #  help="download library info from htsw, "\
+    #       "requires filename for extra rules")
+    parser.add_option_group(commands)
+
+    queries = OptionGroup(parser, "Queries")
+    queries.add_option('--sparql', default=None,
+      help="execute arbitrary sparql query")
+    queries.add_option('--find-submission-with-no-library', default=False,
+      action="store_true",
+      help="find submissions with no library ID")
+    parser.add_option_group(queries)
+
+    options = OptionGroup(parser, "Options")
+    options.add_option("--rdf-parser-name", default="turtle",
+      help="set rdf file parser type")
+    options.add_option("-v", "--verbose", action="store_true", default=False)
+    options.add_option("--debug", action="store_true", default=False)
+    parser.add_option_group(options)
+
+    api.add_auth_options(parser)
+
+    return parser
+
+
+def load_my_submissions(model, limit=None, cookie=None):
+    """Parse all the submissions from UCSC into model
+    It will look at the global USER_URL to figure out who to scrape
+    cookie contains the session cookie, if none, will attempt to login
+    """
+    if cookie is None:
+        cookie = login()
+
+    tree = get_url_as_tree(USER_URL, 'GET', cookie)
+    table_rows = tree.xpath('//table[@id="projects"]/tr')
+    # first record is header
+    name_n = submissionOntology['name']
+    species_n = submissionOntology['species']
+    library_urn = submissionOntology['library_urn']
+
+    # skip header
+    for row in table_rows[1:]:
+        cell = row.xpath('td')
+        if cell is not None and len(cell) > 1:
+            submission_id = str(cell[0].text_content())
+            if limit is None or submission_id in limit:
+                subUrn = RDF.Uri(submission_view_url(submission_id))
+
+                add_stmt(model,
+                         subUrn,
+                         TYPE_N,
+                         submissionOntology['Submission'])
+                add_stmt(model,
+                         subUrn,
+                         DCC_NS['subId'],
+                         RDF.Node(submission_id))
+
+                name = str(cell[4].text_content())
+                add_stmt(model, subUrn, name_n, name)
+
+                species = str(cell[2].text_content())
+                if species is not None:
+                    add_stmt(model, subUrn, species_n, species)
+
+                library_id = get_library_id(name)
+                if library_id is not None:
+                    add_submission_to_library_urn(model,
+                                                  subUrn,
+                                                  library_urn,
+                                                  library_id)
+                else:
+                    errmsg = 'Unable to find library id in {0} for {1}'
+                    LOGGER.warn(errmsg.format(name, str(subUrn)))
+
+                add_submission_creation_date(model, subUrn, cookie)
+
+                # grab changing attributes
+                status = str(cell[6].text_content()).strip()
+                last_mod_datetime = get_date_contents(cell[8])
+                last_mod = last_mod_datetime.isoformat()
+
+                update_submission_detail(model, subUrn, status, last_mod,
+                                         cookie=cookie)
+
+                LOGGER.info("Processed {0}".format(subUrn))
+
+
+def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
+    """Add a link from a UCSC submission to woldlab library if needed
+    """
+    libraryUrn = LIBRARY_NS[library_id + '/']
+    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
+    if not model.contains_statement(query):
+        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
+        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
+        model.add_statement(link)
+    else:
+        LOGGER.debug("Found: {0}".format(str(query)))
+
+
+def find_submissions_with_no_library(model):
+    missing_lib_query_text = """
+PREFIX submissionOntology:<{submissionOntology}>
+
+SELECT
+ ?subid ?name
+WHERE {{
+  ?subid submissionOntology:name ?name
+  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
+  FILTER  (!bound(?libid))
+}}""".format(submissionOntology=submissionOntology[''].uri)
+    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
+
+    results = missing_lib_query.execute(model)
+    for row in results:
+        subid = row['subid']
+        name = row['name']
+        print "# {0}".format(name)
+        print "<{0}>".format(subid.uri)
+        print "  encodeSubmit:library_urn "\
+              "<http://jumpgate.caltech.edu/library/> ."
+        print ""
+
+
+def add_submission_creation_date(model, subUrn, cookie):
+    # in theory the submission page might have more information on it.
+    creation_dates = get_creation_dates(model, subUrn)
+    if len(creation_dates) == 0:
+        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
+        submissionTree = get_url_as_tree(str(subUrn), 'GET', cookie)
+        parse_submission_page(model, submissionTree, subUrn)
+    else:
+        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
+
+def get_creation_dates(model, subUrn):
+    query = RDF.Statement(subUrn, CREATION_DATE, None)
+    creation_dates = list(model.find_statements(query))
+    return creation_dates
+
+def parse_submission_page(model, submissionTree, subUrn):
+    cells = submissionTree.findall('.//td')
+    dateTimeType = xsdNS['dateTime']
+    created_label = [x for x in cells
+                     if x.text_content().startswith('Created')]
+    if len(created_label) == 1:
+        created_date = get_date_contents(created_label[0].getnext())
+        created_date_node = RDF.Node(literal=created_date.isoformat(),
+                                     datatype=dateTimeType.uri)
+        add_stmt(model, subUrn, CREATION_DATE, created_date_node)
+    else:
+        msg = 'Unable to find creation date for {0}'.format(str(subUrn))
+        LOGGER.warn(msg)
+        raise Warning(msg)
+
+
+def update_submission_detail(model, subUrn, status, recent_update, cookie):
+    HasStatusN = submissionOntology['has_status']
+    StatusN = submissionOntology['status']
+    LastModifyN = submissionOntology['last_modify_date']
+
+    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
+    status_nodes = list(model.find_statements(status_nodes_query))
+
+    if len(status_nodes) == 0:
+        # has no status node, add one
+        LOGGER.info("Adding status node to {0}".format(subUrn))
+        status_node = create_status_node(subUrn, recent_update)
+        add_stmt(model, subUrn, HasStatusN, status_node)
+        add_stmt(model, status_node, rdfNS['type'], StatusN)
+        add_stmt(model, status_node, StatusN, status)
+        add_stmt(model, status_node, LastModifyN, recent_update)
+        update_ddf(model, subUrn, status_node, cookie=cookie)
+        update_daf(model, subUrn, status_node, cookie=cookie)
+    else:
+        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
+        for status_statement in status_nodes:
+            status_node = status_statement.object
+            last_modified_query = RDF.Statement(status_node,
+                                                LastModifyN,
+                                                None)
+            last_mod_nodes = model.find_statements(last_modified_query)
+            for last_mod_statement in last_mod_nodes:
+                last_mod_date = str(last_mod_statement.object)
+                if recent_update == str(last_mod_date):
+                    update_ddf(model, subUrn, status_node, cookie=cookie)
+                    update_daf(model, subUrn, status_node, cookie=cookie)
+                    break
+
+
+def update_daf(model, submission_url, status_node, cookie):
+    download_daf_uri = str(submission_url).replace('show', 'download_daf')
+    daf_uri = RDF.Uri(download_daf_uri)
+
+    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
+    if not model.contains_statement(status_is_daf):
+        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
+                                                     status_node))
+        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
+        daf_hash = hashlib.md5(daf_text).hexdigest()
+        daf_hash_stmt = RDF.Statement(status_node,
+                                      dafTermOntology['md5sum'],
+                                      daf_hash)
+        model.add_statement(daf_hash_stmt)
+        daf.fromstring_into_model(model, status_node, daf_text)
+
+
+def update_ddf(model, subUrn, statusNode, cookie):
+    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
+    ddfUrn = RDF.Uri(download_ddf_url)
+
+    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DCC_NS[''])
+    if not model.contains_statement(status_is_ddf):
+        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
+        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
+        add_ddf_statements(model, statusNode, ddf_text)
+        model.add_statement(status_is_ddf)
+
+
+def add_ddf_statements(model, statusNode, ddf_string):
+    """Convert a ddf text file into RDF Statements
+    """
+    ddf_lines = ddf_string.split('\n')
+    # first line is header
+    header = ddf_lines[0].split()
+    attributes = [DCC_NS[x] for x in header]
+
+    for ddf_line in ddf_lines[1:]:
+        ddf_line = ddf_line.strip()
+        if len(ddf_line) == 0:
+            continue
+        if ddf_line.startswith("#"):
+            continue
+
+        ddf_record = ddf_line.split('\t')
+        files = ddf_record[0].split(',')
+        file_attributes = ddf_record[1:]
+
+        for f in files:
+            fileNode = RDF.Node()
+            add_stmt(model,
+                     statusNode,
+                     submissionOntology['has_file'],
+                     fileNode)
+            add_stmt(model, fileNode, rdfNS['type'], DCC_NS['file'])
+            add_stmt(model, fileNode, DCC_NS['filename'], f)
+
+            for predicate, object in zip(attributes[1:], file_attributes):
+                add_stmt(model, fileNode, predicate, object)
+
+
+def load_encode_libraries(model, htswapi):
+    """Get libraries associated with encode.
+    """
+    encodeFilters = ["/library/?affiliations__id__exact=44",
+                     "/library/?affiliations__id__exact=80",
+                    ]
+
+    encodeUrls = [htswapi.root_url + u for u in encodeFilters]
+    rdfaParser = RDF.Parser(name='rdfa')
+    for encodeUrl in encodeUrls:
+        LOGGER.info("Scanning library url {0}".format(encodeUrl))
+        rdfaParser.parse_into_model(model, encodeUrl)
+        query = RDF.Statement(None, libraryOntology['library_id'], None)
+        libraries = model.find_statements(query)
+        for statement in libraries:
+            libraryUrn = statement.subject
+            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
+            load_library_detail(model, libraryUrn)
+
+
+def load_encodedcc_files(model, base_url):
+    if base_url[-1] != '/':
+        base_url += '/'
+
+    file_index = ucsc.get_ucsc_file_index(base_url)
+    for filename, attributes in file_index.items():
+        s = RDF.Node(RDF.Uri(base_url + filename))
+        for name, value in attributes.items():
+            p = RDF.Node(DCC_NS[name])
+            o = RDF.Node(value)
+            model.add_statement(RDF.Statement(s,p,o))
+
+def load_library_detail(model, libraryUrn):
+    """Grab detail information from library page
+    """
+    rdfaParser = RDF.Parser(name='rdfa')
+    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
+    results = list(model.find_statements(query))
+    log_message = "Found {0} statements for {1}"
+    LOGGER.debug(log_message.format(len(results), libraryUrn))
+    if len(results) == 0:
+        LOGGER.info("Loading {0}".format(str(libraryUrn)))
+        rdfaParser.parse_into_model(model, libraryUrn.uri)
+    elif len(results) == 1:
+        pass  # Assuming that a loaded dataset has one record
+    else:
+        LOGGER.warning("Many dates for {0}".format(libraryUrn))
+
+
+def get_library_id(name):
+    """Guess library ID from library name
+
+    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
+    '11039'
+    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
+    '10150'
+    """
+    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
+    library_id = None
+    if match is not None:
+        library_id = match.group('id')
+    return library_id
+
+
+def get_contents(element):
+    """Return contents or none.
+    """
+    if len(element.contents) == 0:
+        return None
+
+    a = element.find('a')
+    if a is not None:
+        return a.contents[0].encode(CHARSET)
+
+    return element.contents[0].encode(CHARSET)
+
+
+def create_status_node(submission_uri, timestamp):
+    submission_uri = daf.submission_uri_to_string(submission_uri)
+    if submission_uri[-1] != '/':
+        submission_uri += '/'
+    status_uri = submission_uri + timestamp
+    return RDF.Node(RDF.Uri(status_uri))
+
+
+def get_date_contents(element):
+    data = element.text_content()
+    if data:
+        return datetime.strptime(data, "%Y-%m-%d %H:%M")
+    else:
+        return None
+
+
+def add_stmt(model, subject, predicate, rdf_object):
+    """Convienence create RDF Statement and add to a model
+    """
+    return model.add_statement(
+        RDF.Statement(subject, predicate, rdf_object))
+
+
+def login(cookie=None):
+    """Login if we don't have a cookie
+    """
+    if cookie is not None:
+        return cookie
+
+    keys = keyring.get_keyring()
+    password = keys.get_password(LOGIN_URL, USERNAME)
+    credentials = {'login': USERNAME,
+                   'password': password}
+    headers = {'Content-type': 'application/x-www-form-urlencoded'}
+    http = httplib2.Http()
+    response, content = http.request(LOGIN_URL,
+                                     'POST',
+                                     headers=headers,
+                                     body=urllib.urlencode(credentials))
+    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
+                                                    response['status']))
+
+    cookie = response.get('set-cookie', None)
+    if cookie is None:
+        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
+    return cookie
+
+
+def get_url_as_tree(url, method, cookie=None):
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        tree = fromstring(content, base_url=url)
+        return tree
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+
+def get_url_as_text(url, method, cookie=None):
+    http = httplib2.Http()
+    headers = {}
+    if cookie is not None:
+        headers['Cookie'] = cookie
+    response, content = http.request(url, method, headers=headers)
+    if response['status'] == '200':
+        return content
+    else:
+        msg = "error accessing {0}, status {1}"
+        msg = msg.format(url, response['status'])
+        raise httplib2.HttpLib2ErrorWithResponse(msg, response, content)
+
+################
+#  old stuff
+SUBMISSIONS_LACKING_LIBID = [
+    ('1x75-Directional-HeLa-Rep1',    '11208'),
+    ('1x75-Directional-HeLa-Rep2',    '11207'),
+    ('1x75-Directional-HepG2-Rep1',   '11210'),
+    ('1x75-Directional-HepG2-Rep2',   '11209'),
+    ('1x75-Directional-H1-hESC-Rep1', '10947'),
+    ('1x75-Directional-H1-hESC-Rep2', '11009'),
+    ('1x75-Directional-HUVEC-Rep1',   '11206'),
+    ('1x75-Directional-HUVEC-Rep2',   '11205'),
+    ('1x75-Directional-K562-Rep1',    '11008'),
+    ('1x75-Directional-K562-Rep2',    '11007'),
+    ('1x75-Directional-NHEK-Rep1',    '11204'),
+    ('1x75-Directional-GM12878-Rep1', '11011'),
+    ('1x75-Directional-GM12878-Rep2', '11010'),
+    ]
+
+
+def select_by_library_id(submission_list):
+    subl = [(x.library_id, x) for x in submission_list if x.library_id]
+    libraries = {}
+    for lib_id, subobj in subl:
+        libraries.setdefault(lib_id, []).append(subobj)
+
+    for submission in libraries.values():
+        submission.sort(key=attrgetter('date'), reverse=True)
+
+    return libraries
+
+
+def library_to_freeze(selected_libraries):
+    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
+    lib_ids = sorted(selected_libraries.keys())
+    report = ["""<html>
+<head>
+<style type="text/css">
+ td {border-width:0 0 1px 1px; border-style:solid;}
+</style>
+</head>
+<body>
+<table>
+"""]
+    report.append('<thead>')
+    report.append('<tr><td>Library ID</td><td>Name</td>')
+    for f in freezes:
+        report.append('<td>{0}</td>'.format(f))
+    report.append('</tr>')
+    report.append('</thead>')
+    report.append('<tbody>')
+    for lib_id in lib_ids:
+        report.append('<tr>')
+        lib_url = LIBRARY_NS[lib_id].uri
+        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
+        submissions = selected_libraries[lib_id]
+        report.append('<td>{0}</td>'.format(submissions[0].name))
+        batched = {}
+        for sub in submissions:
+            date = date_to_freeze(sub.date)
+            batched.setdefault(date, []).append(sub)
+        for d in freezes:
+            report.append('<td>')
+            for s in batched.get(d, []):
+                show_url = submission_view_url(s.subid)
+                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
+                report.append("{0}:{1}".format(subid, s.status))
+            report.append('</td>')
+        report.append("</tr>")
+    report.append('</tbody>')
+    report.append("</table></html>")
+    return "\n".join(report)
+
+
+def date_to_freeze(d):
+    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
+               (datetime(2010, 7, 30), '2010-Jul'),
+               (datetime(2011, 1, 30), '2011-Jan'),
+               ]
+    for end, name in freezes:
+        if d < end:
+            return name
+    else:
+        return None
+
+if __name__ == "__main__":
+    main()
diff --git a/encode_submission/failed-submissions.sparql b/encode_submission/failed-submissions.sparql
new file mode 100644 (file)
index 0000000..af4af4e
--- /dev/null
@@ -0,0 +1,22 @@
+##
+## Find submissions that are currently "failed"
+##
+
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
+PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
+#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
+#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
+
+SELECT 
+ ?subid ?subname ?liburn ?status
+WHERE {
+  ?subid submitOnt:name ?subname .
+  ?subid submitOnt:library_urn ?liburn .
+  ?subid submitOnt:has_status ?statusNode .
+  ?statusNode submitOnt:status ?status .
+  ?statusNode submitOnt:last_modify_date ?last_modify .
+  FILTER (regex(?status, "failed", "i"))
+} 
diff --git a/encode_submission/find-lib-by-cell.sparql b/encode_submission/find-lib-by-cell.sparql
new file mode 100644 (file)
index 0000000..c4585c5
--- /dev/null
@@ -0,0 +1,17 @@
+# Produce list of submissions associated with a cell/replicate
+PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+
+SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
+WHERE {
+    ?subid ucscSubmission:name ?name .
+    OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
+                       libraryOntology:date ?submission_date .
+               ?liburn libraryOntology:cell_line ?cell ;
+                       libraryOntology:replicate ?replicate . }
+    #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
+    #filter(!bound(?liburn))
+}
+ORDER BY ?submission_date ?cell ?replicate ?liburn
diff --git a/encode_submission/scan_extension.py b/encode_submission/scan_extension.py
new file mode 100644 (file)
index 0000000..39f19c6
--- /dev/null
@@ -0,0 +1,78 @@
+from optparse import OptionParser
+import os
+import sys
+from pprint import pprint
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    extensions = scan(args)
+    common_extensions = find_common_suffix(extensions)
+
+    if opts.rdf:
+        print_rdf(common_extensions)
+    else:
+        print common_extensions
+        
+def make_parser():
+    parser = OptionParser("%prog: directory [directory...]")
+    parser.add_option('--rdf', action="store_true", default=False,
+                      help="Produce rdf configuration file for ucsc_gather")
+    return parser
+
+def scan(toscan):
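+    # Build a trie keyed on reversed filename characters; find_common_suffix
+    # walks it to recover the file extensions shared by the scanned trees.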
+    index = {}
+    for cur_scan_dir in toscan:
+        for path, dirnames, filenames in os.walk(cur_scan_dir):
+            for filename in filenames:
+                base, ext = os.path.splitext(filename)
+                if ext in ('.daf', '.ddf'):
+                    continue
+                next_index = index
+                for c in filename[::-1]:
+                    next_index = next_index.setdefault(c, {})
+    return index
+
+def find_common_suffix(index, tail=[]):
+    if len(tail) > 0 and len(index) > 1:
+        return "".join(tail[::-1])
+
+    results = []
+    for key, choice in index.items():
+        r = find_common_suffix(choice, tail+[key])
+        if r is not None:
+            results.append(r)
+        
+    if len(results) == 0:
+        return None
+    elif len(results) == 1:
+        return results[0]
+    else:
+        return results
+
+def print_rdf(common_extensions):
+    import RDF
+    from htsworkflow.util import rdfhelp
+    model = rdfhelp.get_model()
+
+    viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
+    subView = RDF.NS(viewName)
+    fileReTerm = rdfhelp.dafTermOntology['filename_re']
+
+    count = 1
+    for ext in common_extensions:
+        s = RDF.Statement(subView['VIEW{0}'.format(count)],
+                          fileReTerm,
+                          '.*{0}$'.format(ext.replace('.', '\\.')))
+        model.add_statement(s)
+        count += 1
+        
+    writer = rdfhelp.get_serializer()
+    writer.set_namespace('thisSubmissionView', subView._prefix)
+    print writer.serialize_model_to_string(model)
+
+if __name__ == "__main__":
+    main()
diff --git a/encode_submission/test_encode_find.py b/encode_submission/test_encode_find.py
new file mode 100644 (file)
index 0000000..98bdb46
--- /dev/null
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+import os
+import unittest
+
+import RDF
+
+import encode_find
+from htsworkflow.submission.ucsc import submission_view_url
+from htsworkflow.util.rdfhelp import dump_model, get_model
+
+SOURCE_PATH = os.path.split(os.path.abspath(__file__))[0]
+
+class TestEncodeFind(unittest.TestCase):
+    def test_create_status_node_with_uri(self):
+        subURL = submission_view_url('5136')
+        submissionUri = RDF.Uri(subURL)
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(submissionUri, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_create_status_node_with_str(self):
+        subURL = submission_view_url('5136')
+        timestamp = '2011-12-19T12:42:53.048956'
+        manualUri = subURL + '/' + timestamp
+        nodeUri = encode_find.create_status_node(subURL, timestamp)
+        self.assertEqual(str(nodeUri.uri), manualUri)
+
+    def test_parse_submission_page(self):
+        timestamp = '2011-12-19T12:42:53.048956'
+        subURL = submission_view_url('5136')
+        subNode = encode_find.create_status_node(subURL, timestamp)
+        test_file = os.path.join(SOURCE_PATH, 'testdata', '5136SubDetail.html')
+        from lxml.html import parse
+        tree = parse(test_file)
+        model = get_model()
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 0)
+        encode_find.parse_submission_page(model, tree, subNode)
+        dates = encode_find.get_creation_dates(model, subNode)
+        self.assertEqual(len(dates), 1)
+        self.assertEqual(str(dates[0].object), '2011-12-07T15:23:00')
+
+def suite():
+    return unittest.makeSuite(TestEncodeFind, "test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/encode_submission/test_ucsc_gather.py b/encode_submission/test_ucsc_gather.py
new file mode 100644 (file)
index 0000000..d7d54e6
--- /dev/null
@@ -0,0 +1,12 @@
+import unittest
+
+import ucsc_gather
+
+class testUCSCGather(unittest.TestCase):
+    pass
+
+def suite():
+    return unittest.makeSuite(testUCSCGather,"test")
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/encode_submission/testdata/5136SubDetail.html b/encode_submission/testdata/5136SubDetail.html
new file mode 100644 (file)
index 0000000..1daeb58
--- /dev/null
@@ -0,0 +1,245 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1" />
+  <meta http-equiv="Content-Language" content="en-us" />
+    <title>
+    ENCODE DCC Data Submission Production
+    </title>
+<link href="/stylesheets/encode.css?1278401455" media="screen" rel="Stylesheet" type="text/css" />
+</head>
+<body id="encode-pipeline">
+
+<div id="container">
+  <img align="left" height=50 width=100  src="/images/encode_logo.png">
+
+    <div id="header">
+        ENCODE DCC Data Submission
+        <font size=-1 color="gray"> <em>
+        Production
+        </em></font>
+    </div>
+  <div id="user"> Logged In: <strong> detrout </strong>
+
+  </div>
+
+  <div id="nav">
+    <!-- <table width="100%"><tr>-->
+    <table cellpadding=0 cellspacing=0 width="100%"><tr>
+    <td align="left">
+    <a href="/pipeline/new">New Submission</a>
+     |
+    <a href="/pipeline/list">All Submissions</a>
+
+     |
+    <a href="/pipeline/show_active">Active Submissions</a>
+
+     |
+    <a href="/pipeline/show_user">My Submissions</a>
+
+    </td>
+    <td align="right">
+    <a href="/account/logout"> Log Out</a>
+     |
+    <a href="/account/change_profile">Change Profile</a>
+     |
+    <a href="/pipeline/show_tools">Tools</a>
+    </td>
+
+    </tr></table>
+  </div>
+
+  <div id="message">
+
+
+
+  </div>
+  <div id="content">
+      <p>
+
+
+
+<table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+
+<tr>
+<td>Submission: </td><td>wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2 resubmit</td><td>&nbsp;&nbsp;&nbsp;</td>
+<td>Created: </td><td>2011-12-07 15:23</td>
+  <td>&nbsp;&nbsp;</td>
+  <td>11 days ago</td>
+</tr>
+
+<tr>
+<td>DB: </td><td>hg19</td><td>&nbsp;&nbsp;&nbsp;</td>
+
+<td>Updated: </td><td>2011-12-08 14:54</td>
+  <td>&nbsp;&nbsp;</td>
+  <td>10 days ago</td>
+</tr>
+
+<tr>
+<td>Status: </td><td>approved</td>
+</tr>
+
+<tr>
+
+  <td>&nbsp;</td>
+</tr>
+
+
+</table>
+
+
+
+
+
+
+  <table style="margin-top:10px;" cellpadding=1 cellspacing=1>
+    <tr>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      <td> [
+      <a href="/pipeline/show_daf/5136">Show DAF</a>       ]  </td>
+
+
+
+      <td> [
+      <a href="/pipeline/show_ddf/5136">Show DDF</a>       ]  </td>
+
+
+    </tr>
+
+  </table>
+
+
+
+<p>
+
+<table cellspacing=2 cellpadding=2>
+<tr><td></td>
+<td align="left"><b>File</b></td>
+<td align="left"><b>Size</b></td>
+<td align="left"><b>Updated</b></td>
+
+<td></td></tr>
+
+
+
+
+
+  <tr style="margin: 10;">
+  <td><b>Archive</b></td><td>002_CaltechRnaSeq_Fastq_DAF.tar.gz</td>
+  <td align="right">1397</td>
+  <td>2011-12-08 14:08</td>
+
+
+
+
+  </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeq.daf</td>
+      <td align=right>4187</td>
+      <td>2011-12-08 14:08</td>
+    </tr>
+
+
+
+
+
+
+
+  <tr style="margin: 10;">
+
+  <td><b>Archive</b></td><td>001_wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2.fastq.tgz</td>
+  <td align="right">8833154623</td>
+  <td>2011-12-07 15:23</td>
+
+
+
+
+  </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l7_r2.fastq</td>
+      <td align=right>1629293100</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_30DY0AAXX_c151_l8_r2.fastq</td>
+
+      <td align=right>1628417888</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_616L7AAXX_c152_l1_r2.fastq</td>
+      <td align=right>5152104576</td>
+
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l1_r2.fastq</td>
+      <td align=right>6094749091</td>
+      <td>2011-12-07 15:34</td>
+
+    </tr>
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_61PKHAAXX_c152_l2_r2.fastq</td>
+      <td align=right>7483882081</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+
+    <tr>
+      <td>&nbsp;</td>
+      <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep2/10515_ilmn200901_c202_l4_r2.fastq</td>
+      <td align=right>5282142818</td>
+      <td>2011-12-07 15:34</td>
+    </tr>
+
+
+
+</table>
+
+<p>
+
+
+
+
+  </div>
+  <div id="footer">
+    <A HREF="/help.html">Help</A>
+     |
+    <A HREF="/contact">Contact Us</A>
+  </div>
+
+</div>
+</body>
+</html>
diff --git a/encode_submission/ucsc_gather.py b/encode_submission/ucsc_gather.py
new file mode 100644 (file)
index 0000000..fd8db12
--- /dev/null
@@ -0,0 +1,487 @@
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+
+import RDF
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     fromTypedNode, \
+     get_model, \
+     get_serializer, \
+     load_into_model, \
+     sparql_query, \
+     submissionOntology
+from htsworkflow.submission.daf import \
+     DAFMapper, \
+     MetadataLookupException, \
+     get_submission_uri
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger('ucsc_gather')
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+    mapper = None
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.load_model)
+    if opts.name:
+        mapper = DAFMapper(opts.name, opts.daf, model)
+        if opts.library_url is not None:
+            mapper.library_url = opts.library_url
+        submission_uri = get_submission_uri(opts.name)
+
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    if opts.make_ddf and opts.daf is None:
+        parser.error("Please specify your daf when making ddf files")
+
+    library_result_map = []
+    for a in args:
+        library_result_map.extend(read_library_result_map(a))
+
+    if opts.make_tree_from is not None:
+        make_tree_from(opts.make_tree_from, library_result_map)
+
+    if opts.link_daf:
+        if opts.daf is None:
+            parser.error("Please specify daf filename with --daf")
+        link_daf(opts.daf, library_result_map)
+
+    if opts.fastq:
+        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+                                       force=opts.force)
+        extractor.build_fastqs(library_result_map)
+
+    if opts.scan_submission:
+        if mapper is None:
+            parser.error("Please specify the submission name with --name")
+        scan_submission_dirs(mapper, library_result_map)
+
+    if opts.make_ddf:
+        if mapper is None:
+            parser.error("Please specify the submission name with --name")
+        make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--load-model', default=None,
+      help="Load model database")
+    model.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                      help="create directories & link data files",
+                      default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                      help="Import metadata for submission into our model")
+    commands.add_option('--link-daf', default=False, action="store_true",
+                        help="link daf into submission directories")
+    commands.add_option('--make-ddf', help='make the ddfs', default=False,
+                      action="store_true")
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+def make_tree_from(source_path, library_result_map):
+    """Create a tree using data files from source path.
+    """
+    for lib_id, lib_path in library_result_map:
+        if not os.path.exists(lib_path):
+            logger.info("Making dir {0}".format(lib_path))
+            os.mkdir(lib_path)
+        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
+        if not os.path.exists(source_lib_dir):
+            logger.warn("Missing source dir {0}".format(source_lib_dir))
+            continue
+        for filename in os.listdir(source_lib_dir):
+            source_pathname = os.path.join(source_lib_dir, filename)
+            target_pathname = os.path.join(lib_path, filename)
+            if not os.path.exists(source_pathname):
+                raise IOError("{0} does not exist".format(source_pathname))
+            if not os.path.exists(target_pathname):
+                os.symlink(source_pathname, target_pathname)
+                logger.info(
+                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
+
+
+def link_daf(daf_path, library_result_map):
+    if not os.path.exists(daf_path):
+        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
+
+    base_daf = os.path.basename(daf_path)
+
+    for lib_id, result_dir in library_result_map:
+        if not os.path.exists(result_dir):
+            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
+        submission_daf = os.path.join(result_dir, base_daf)
+        if not os.path.exists(submission_daf):
+            if not os.path.exists(daf_path):
+                raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
+            os.link(daf_path, submission_daf)
+
+
+def scan_submission_dirs(view_map, library_result_map):
+    """Look through our submission directories and collect needed information
+    """
+    for lib_id, result_dir in library_result_map:
+        logger.info("Importing %s from %s" % (lib_id, result_dir))
+        try:
+            view_map.import_submission_dir(result_dir, lib_id)
+        except MetadataLookupException, e:
+            logger.error("Skipping %s: %s" % (lib_id, str(e)))
+
+def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
+    dag_fragment = []
+    for lib_id, result_dir in library_result_map:
+        submissionNode = view_map.get_submission_node(result_dir)
+        dag_fragment.extend(
+            make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
+        )
+
+    if make_condor and len(dag_fragment) > 0:
+        dag_filename = 'submission.dagman'
+        if not force and os.path.exists(dag_filename):
+            logger.warn("%s exists, please delete" % (dag_filename,))
+        else:
+            f = open(dag_filename,'w')
+            f.write( os.linesep.join(dag_fragment))
+            f.write( os.linesep )
+            f.close()
+
+
+def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
+    """
+    Make ddf files, and bonus condor file
+    """
+    query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+
+select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
+WHERE {
+  ?file ucscDaf:filename ?files ;
+        ucscDaf:md5sum ?md5sum .
+  ?submitView ucscDaf:has_file ?file ;
+              ucscDaf:view ?dafView ;
+              ucscDaf:submission <%(submission)s> .
+  ?dafView ucscDaf:name ?view .
+  <%(submission)s> submissionOntology:library ?library .
+
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell }
+  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
+  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
+  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:condition ?treatment }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+}
+ORDER BY  ?submitView"""
+    dag_fragments = []
+
+    name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
+    if name is None:
+        logger.error("Need name for %s" % (str(submissionNode)))
+        return []
+
+    ddf_name = name + '.ddf'
+    if outdir is not None:
+        outfile = os.path.join(outdir, ddf_name)
+        output = open(outfile,'w')
+    else:
+        outfile = 'stdout:'
+        output = sys.stdout
+
+    formatted_query = query_template % {'submission': str(submissionNode.uri)}
+
+    query = RDF.SPARQLQuery(formatted_query)
+    results = query.execute(view_map.model)
+
+    # filename goes first
+    variables = view_map.get_daf_variables()
+    # 'controlId',
+    output.write('\t'.join(variables))
+    output.write(os.linesep)
+
+    all_views = {}
+    all_files = []
+    for row in results:
+        viewname = fromTypedNode(row['view'])
+        current = all_views.setdefault(viewname, {})
+        for variable_name in variables:
+            value = fromTypedNode(row[variable_name])
+            if value is None:
+                logger.warn("{0}: {1} was None".format(outfile, variable_name))
+            # str() keeps missing values as 'None' so the columns stay aligned
+            value = str(value)
+            if variable_name in ('files', 'md5sum'):
+                current.setdefault(variable_name, []).append(value)
+            else:
+                current[variable_name] = value
+
+    for view in all_views.keys():
+        line = []
+        for variable_name in variables:
+            if variable_name in ('files', 'md5sum'):
+                line.append(','.join(all_views[view][variable_name]))
+            else:
+                line.append(all_views[view][variable_name])
+        output.write("\t".join(line))
+        output.write(os.linesep)
+        all_files.extend(all_views[view]['files'])
+
+    logger.info(
+        "Examined {0}, found files: {1}".format(
+            str(submissionNode), ", ".join(all_files)))
+
+    all_files.append(daf_name)
+    all_files.append(ddf_name)
+
+    if make_condor:
+        archive_condor = make_condor_archive_script(name, all_files, outdir)
+        upload_condor = make_condor_upload_script(name, outdir)
+
+        dag_fragments.extend(
+            make_dag_fragment(name, archive_condor, upload_condor)
+        )
+
+    return dag_fragments
+
+
+def read_library_result_map(filename):
+    """
+    Read a file that maps library id to result directory.
+    Does not support spaces in filenames.
+
+    For example:
+      10000 result/foo/bar
+    """
+    stream = open(filename,'r')
+
+    results = []
+    for line in stream:
+        line = line.rstrip()
+        if not line.startswith('#') and len(line) > 0:
+            library_id, result_dir = line.split()
+            results.append((library_id, result_dir))
+    return results
+
+
+def make_condor_archive_script(name, files, outdir=None):
+    script = """Universe = vanilla
+
+Executable = /bin/tar
+arguments = czvhf ../%(archivename)s %(filelist)s
+
+Error = compress.out.$(Process).log
+Output = compress.out.$(Process).log
+Log = /tmp/submission-compress-%(user)s.log
+initialdir = %(initialdir)s
+environment="GZIP=-3"
+request_memory = 20
+
+queue
+"""
+    if outdir is None:
+        outdir = os.getcwd()
+    for f in files:
+        pathname = os.path.join(outdir, f)
+        if not os.path.exists(pathname):
+            raise RuntimeError("Missing %s from %s" % (f,outdir))
+
+    context = {'archivename': make_submission_name(name),
+               'filelist': " ".join(files),
+               'initialdir': os.path.abspath(outdir),
+               'user': os.getlogin()}
+
+    condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
+    return condor_script
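+# As a rough sketch, for a submission named 'foo' with files
+# ['foo.bam', 'foo.ddf'] (hypothetical names) the rendered submit
+# description written to foo.archive.condor would contain:
+#
+#   Executable = /bin/tar
+#   arguments = czvhf ../foo.tgz foo.bam foo.ddf
+#   initialdir = /absolute/path/to/outdir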
+
+
+def make_condor_upload_script(name, outdir=None):
+    script = """Universe = vanilla
+
+Executable = /usr/bin/lftp
+arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
+
+Error = upload.out.$(Process).log
+Output = upload.out.$(Process).log
+Log = /tmp/submission-upload-%(user)s.log
+initialdir = %(initialdir)s
+
+queue
+"""
+    if outdir is None:
+        outdir = os.getcwd()
+
+    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
+
+    encodeftp = 'encodeftp.cse.ucsc.edu'
+    ftpuser = auth.hosts[encodeftp][0]
+    ftppassword = auth.hosts[encodeftp][2]
+    context = {'archivename': make_submission_name(name),
+               'initialdir': os.path.abspath(outdir),
+               'user': os.getlogin(),
+               'ftpuser': ftpuser,
+               'ftppassword': ftppassword,
+               'ftphost': encodeftp}
+
+    condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
+    condor_stream = open(condor_script,'w')
+    condor_stream.write(script % context)
+    condor_stream.close()
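+    # owner read/write only, since the ftp password is embedded above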
+    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
+
+    return condor_script
+
+
+def make_dag_fragment(ininame, archive_condor, upload_condor):
+    """
+    Make the DAG fragment lines that compress and then upload the data.
+    """
+    cur_dir = os.getcwd()
+    archive_condor = os.path.join(cur_dir, archive_condor)
+    upload_condor = os.path.join(cur_dir, upload_condor)
+    job_basename = make_base_name(ininame)
+
+    fragments = []
+    fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
+    fragments.append('JOB %s_upload %s' % (job_basename,  upload_condor))
+    fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
+
+    return fragments
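+# Illustrative output, assuming a cwd of /tmp and hypothetical names:
+#   make_dag_fragment('foo.ini', 'foo.archive.condor', 'foo.upload.condor')
+# returns
+#   ['JOB foo_archive /tmp/foo.archive.condor',
+#    'JOB foo_upload /tmp/foo.upload.condor',
+#    'PARENT foo_archive CHILD foo_upload']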
+
+
+def get_library_info(host, apidata, library_id):
+    url = api.library_url(host, library_id)
+    contents = api.retrieve_info(url, apidata)
+    return contents
+
+
+def make_submission_section(line_counter, files, attributes):
+    """
+    Create a section in the submission ini file
+    """
+    inifile = [ "[line%s]" % (line_counter,) ]
+    inifile += ["files=%s" % (",".join(files))]
+
+    for k,v in attributes.items():
+        inifile += ["%s=%s" % (k,v)]
+    return inifile
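+# e.g. (hypothetical values) make_submission_section(1, ['foo.fastq'],
+# {'view': 'Fastq'}) returns ['[line1]', 'files=foo.fastq', 'view=Fastq']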
+
+
+def make_base_name(pathname):
+    base = os.path.basename(pathname)
+    name, ext = os.path.splitext(base)
+    return name
+
+
+def make_submission_name(ininame):
+    name = make_base_name(ininame)
+    return name + ".tgz"
+
+
+def make_ddf_name(pathname):
+    name = make_base_name(pathname)
+    return name + ".ddf"
+
+
+def make_condor_name(pathname, run_type=None):
+    name = make_base_name(pathname)
+    elements = [name]
+    if run_type is not None:
+        elements.append(run_type)
+    elements.append("condor")
+    return ".".join(elements)
+
+
+def parse_filelist(file_string):
+    return file_string.split(",")
+
+
+def validate_filelist(files):
+    """
+    Raise RuntimeError if any file in the list does not exist
+    """
+    for f in files:
+        if not os.path.exists(f):
+            raise RuntimeError("%s does not exist" % (f,))
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/ucsc_encode_submission/README.txt b/extra/ucsc_encode_submission/README.txt
deleted file mode 100644 (file)
index bab7a55..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-I was building a variety of scripts to handle submitting our data to the 
-UCSC ENCODE pipeline, some of them were pulling data out of the htsworkflow
-databases, and since I needed an official place to put the scripts
-
-I decided here.
diff --git a/extra/ucsc_encode_submission/add-treatment-to-library.sparql b/extra/ucsc_encode_submission/add-treatment-to-library.sparql
deleted file mode 100755 (executable)
index c97dce2..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX daf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ddf: <http://encodesubmit.ucsc.edu/pipeline/download_ddf#>
-
-construct { ?library ddf:treatment ?treatment ;
-                     ddf:protocol ?pcr . }
-WHERE {
-   ?status ucscSubmission:has_file ?file .
-   ?submission ucscSubmission:has_status ?status ;
-               ucscSubmission:library_urn ?library ;
-               ucscSubmission:name ?name .
-   ?file ddf:treatment ?treatment ;
-         ddf:protocol ?pcr .
-}
-
diff --git a/extra/ucsc_encode_submission/dt-overrides.turtle b/extra/ucsc_encode_submission/dt-overrides.turtle
deleted file mode 100644 (file)
index ffe2759..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-##
-## Override submission ID to library URN names for our libraries
-## whose names either lack, or have the wrong library ID string
-## embedded in them.
-##
-
-@base <file:///home/diane/proj/solexa/htsworkflow/extra/ucsc_encode_submission/no-lib.sparql> .
-@prefix ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#> .
-
-# woldlab-hepg2-rnaseq-2009dec
-<http://encodesubmit.ucsc.edu/pipeline/show/805>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part2
-<http://encodesubmit.ucsc.edu/pipeline/show/810>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-hepg2-rnaseq-2009dec-part3
-<http://encodesubmit.ucsc.edu/pipeline/show/869>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-rnaseq-GM12878-rep1-stranded-2010Jan15
-<http://encodesubmit.ucsc.edu/pipeline/show/870>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab-hepg2-rnaseq-2010Jan-part4
-<http://encodesubmit.ucsc.edu/pipeline/show/897>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10879/> .
-
-# woldlab-gm12878-directional-rep2-rnaseq-2010Jan06
-<http://encodesubmit.ucsc.edu/pipeline/show/898>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep1-2010Jan6
-<http://encodesubmit.ucsc.edu/pipeline/show/903>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab-K562-directional-rnaseq-rep2-2010jan9
-<http://encodesubmit.ucsc.edu/pipeline/show/904>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab hESC 10886 rep1 2009Jan13
-<http://encodesubmit.ucsc.edu/pipeline/show/1026>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11286/> .
-
-# woldlab 2010Jun15 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1483>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# woldlab Jun18 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1626>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11009/> .
-
-# woldlab jun 18 1x75-Directional-GM12878-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1631>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11011/> .
-
-# woldlab jun 18  1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1632>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# woldlab jun 18 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1633>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1634>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# woldlab jun 18 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1635>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1636>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1637>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# woldlab jun 18 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1638>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/1639>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# woldlab jun 18 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1645>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# woldlab jun 18 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1646>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# woldlab June  2x75-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/1856>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10515/> .
-
-#2010 jul 9corrected fastqs
-<http://encodesubmit.ucsc.edu/pipeline/show/1874>
-     ucscSubmission:ignore "1" .
-#    ucscSubmission:library_urn "
-
-# 2010-11-05 Correction 1x75-Directional-GM12878-Rep1.tgz
-<http://encodesubmit.ucsc.edu/pipeline/show/2926>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-GM12878-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2930>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11010/> .
-
-# 1x75-Directional-H1-hESC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2931>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/10947/> .
-
-# 1x75-Directional-H1-hESC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2932>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HUVEC-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2933>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11206/> .
-
-# 1x75-Directional-HUVEC-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2934>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11205/> .
-
-# 1x75-Directional-HeLa-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2935>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11208/> .
-
-# 1x75-Directional-HeLa-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2936>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11207/> .
-
-# 1x75-Directional-HepG2-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2937>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
-# 1x75-Directional-HepG2-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2938>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# 1x75-Directional-K562-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2939>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11008/> .
-
-# 1x75-Directional-K562-Rep2
-<http://encodesubmit.ucsc.edu/pipeline/show/2940>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11007/> .
-
-# 1x75-Directional-NHEK-Rep1
-<http://encodesubmit.ucsc.edu/pipeline/show/2941>
-    ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11204/> .
-
-# "3438 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4607>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02970/> .
-
-# "3439 fastq resubmit"
-<http://encodesubmit.ucsc.edu/pipeline/show/4608>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02973/> .
-
-# "3437 Fastq re-submission"
-<http://encodesubmit.ucsc.edu/pipeline/show/4609>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/02971/> .
-
-# "1x75-Directional-HepG2-rep2-replace 3522"
-<http://encodesubmit.ucsc.edu/pipeline/show/4797>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11209/> .
-
-# "1x75-Directional-HepG2-rep1 replacement of 3521"
-<http://encodesubmit.ucsc.edu/pipeline/show/4798>
-  ucscSubmission:library_urn <http://jumpgate.caltech.edu/library/11210/> .
-
diff --git a/extra/ucsc_encode_submission/encode_find.py b/extra/ucsc_encode_submission/encode_find.py
deleted file mode 100644 (file)
index 5082912..0000000
+++ /dev/null
@@ -1,592 +0,0 @@
-#!/usr/bin/env python
-"""
-Gather information about our submissions into a single RDF store
-"""
-
-from datetime import datetime
-import hashlib
-import httplib2
-import keyring
-import logging
-from lxml.html import fromstring
-from operator import attrgetter
-from optparse import OptionParser, OptionGroup
-# python keyring
-import os
-import re
-# redland rdf lib
-import RDF
-import sys
-import urllib
-import urlparse
-
-from htsworkflow.submission import daf
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     dublinCoreNS, \
-     get_model, \
-     get_serializer, \
-     sparql_query, \
-     submissionOntology, \
-     libraryOntology, \
-     load_into_model, \
-     rdfNS, \
-     rdfsNS, \
-     xsdNS
-TYPE_N = rdfNS['type']
-
-# URL mappings
-LIBRARY_NS = RDF.NS("http://jumpgate.caltech.edu/library/")
-
-from htsworkflow.submission.ucsc import \
-     daf_download_url, \
-     ddf_download_url, \
-     submission_view_url, \
-     UCSCEncodePipeline
-
-DOWNLOAD_DDF = UCSCEncodePipeline + "download_ddf#"
-DDF_NS = RDF.NS(DOWNLOAD_DDF)
-
-DBDIR = os.path.expanduser("~diane/proj/submission")
-
-LOGGER = logging.getLogger("encode_find")
-
-LOGIN_URL = 'http://encodesubmit.ucsc.edu/account/login'
-USER_URL = 'http://encodesubmit.ucsc.edu/pipeline/show_user'
-
-USERNAME = 'detrout'
-CHARSET = 'utf-8'
-
-
-def main(cmdline=None):
-    """
-    Parse command line arguments
-
-    Takes a list of arguments (assuming arg[0] is the program name) or None
-    If None, it looks at sys.argv
-    """
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if opts.debug:
-        logging.basicConfig(level=logging.DEBUG)
-    elif opts.verbose:
-        logging.basicConfig(level=logging.INFO)
-
-    htsw_authdata = api.make_auth_from_opts(opts, parser)
-    htswapi = api.HtswApi(opts.host, htsw_authdata)
-
-    cookie = None
-    model = get_model(opts.load_model, DBDIR)
-
-    if opts.load_rdf is not None:
-        ns_uri = submissionOntology[''].uri
-        load_into_model(model, opts.rdf_parser_name, opts.load_rdf, ns_uri)
-
-    if len(args) == 0:
-        limit = None
-    else:
-        limit = args
-
-    if opts.update:
-        cookie = login(cookie=cookie)
-        load_my_submissions(model, limit=limit, cookie=cookie)
-        load_encode_libraries(model, htswapi)
-
-    if opts.sparql is not None:
-        sparql_query(model, opts.sparql)
-
-    if opts.find_submission_with_no_library:
-        find_submissions_with_no_library(model)
-
-    if opts.print_rdf:
-        serializer = get_serializer(name=opts.rdf_parser_name)
-        print serializer.serialize_model_to_string(model)
-
-
-def make_parser():
-    """Construct option parser
-    """
-    parser = OptionParser()
-    commands = OptionGroup(parser, "Commands")
-    commands.add_option('--load-model', default=None,
-      help="Load model database")
-    commands.add_option('--load-rdf', default=None,
-      help="load rdf statements into model")
-    commands.add_option('--print-rdf', action="store_true", default=False,
-      help="print ending model state")
-    commands.add_option('--update', action="store_true", default=False,
-      help="Query remote data sources and update our database")
-    #commands.add_option('--update-ucsc-status', default=None,
-    #  help="download status from ucsc, requires filename for extra rules")
-    #commands.add_option('--update-ddfs', action="store_true", default=False,
-    #  help="download ddf information for known submission")
-    #commands.add_option('--update-library', default=None,
-    #  help="download library info from htsw, "\
-    #       "requires filename for extra rules")
-    parser.add_option_group(commands)
-
-    queries = OptionGroup(parser, "Queries")
-    queries.add_option('--sparql', default=None,
-      help="execute arbitrary sparql query")
-    queries.add_option('--find-submission-with-no-library', default=False,
-      action="store_true",
-      help="find submissions with no library ID")
-    parser.add_option_group(queries)
-
-    options = OptionGroup(parser, "Options")
-    options.add_option("--rdf-parser-name", default="turtle",
-      help="set rdf file parser type")
-    options.add_option("-v", "--verbose", action="store_true", default=False)
-    options.add_option("--debug", action="store_true", default=False)
-    parser.add_option_group(options)
-
-    api.add_auth_options(parser)
-
-    return parser
-
-
-def load_my_submissions(model, limit=None, cookie=None):
-    """Parse all the submissions from UCSC into model
-    It will look at the global USER_URL to figure out who to scrape
-    cookie contains the session cookie, if none, will attempt to login
-    """
-    if cookie is None:
-        cookie = login()
-
-    tree = get_url_as_tree(USER_URL, 'GET', cookie)
-    table_rows = tree.xpath('//table[@id="projects"]/tr')
-    # first record is header
-    name_n = submissionOntology['name']
-    species_n = submissionOntology['species']
-    library_urn = submissionOntology['library_urn']
-
-    # skip header
-    for row in table_rows[1:]:
-        cell = row.xpath('td')
-        if cell is not None and len(cell) > 1:
-            submission_id = str(cell[0].text_content())
-            if limit is None or submission_id in limit:
-                subUrn = RDF.Uri(submission_view_url(submission_id))
-
-                add_stmt(model, subUrn, TYPE_N, submissionOntology['Submission'])
-
-                name = str(cell[4].text_content())
-                add_stmt(model, subUrn, name_n, name)
-
-                species = str(cell[2].text_content())
-                if species is not None:
-                    add_stmt(model, subUrn, species_n, species)
-
-                library_id = get_library_id(name)
-                if library_id is not None:
-                    add_submission_to_library_urn(model,
-                                                  subUrn,
-                                                  library_urn,
-                                                  library_id)
-                else:
-                    errmsg = 'Unable to find library id in {0} for {1}'
-                    LOGGER.warn(errmsg.format(name, str(subUrn)))
-
-                add_submission_creation_date(model, subUrn, cookie)
-
-                # grab changing atttributes
-                status = str(cell[6].text_content()).strip()
-                last_mod_datetime = get_date_contents(cell[8])
-                last_mod = last_mod_datetime.isoformat()
-
-                update_submission_detail(model, subUrn, status, last_mod,
-                                         cookie=cookie)
-
-                LOGGER.info("Processed {0}".format(subUrn))
-
-
-
-
-def add_submission_to_library_urn(model, submissionUrn, predicate, library_id):
-    """Add a link from a UCSC submission to woldlab library if needed
-    """
-    libraryUrn = LIBRARY_NS[library_id + '/']
-    query = RDF.Statement(submissionUrn, predicate, libraryUrn)
-    if not model.contains_statement(query):
-        link = RDF.Statement(submissionUrn, predicate, libraryUrn)
-        LOGGER.info("Adding Sub -> Lib link: {0}".format(link))
-        model.add_statement(link)
-    else:
-        LOGGER.debug("Found: {0}".format(str(query)))
-
-
-def find_submissions_with_no_library(model):
-    missing_lib_query_text = """
-PREFIX submissionOntology:<{submissionOntology}>
-
-SELECT
- ?subid ?name
-WHERE {{
-  ?subid submissionOntology:name ?name
-  OPTIONAL {{ ?subid submissionOntology:library_urn ?libid }}
-  FILTER  (!bound(?libid))
-}}""".format(submissionOntology=submissionOntology[''].uri)
-    missing_lib_query = RDF.SPARQLQuery(missing_lib_query_text)
-
-    results = missing_lib_query.execute(model)
-    for row in results:
-        subid = row['subid']
-        name = row['name']
-        print "# {0}".format(name)
-        print "<{0}>".format(subid.uri)
-        print "  encodeSubmit:library_urn "\
-              "<http://jumpgate.caltech.edu/library/> ."
-        print ""
-
-
-def add_submission_creation_date(model, subUrn, cookie):
-    # in theory the submission page might have more information on it.
-    creationDateN = libraryOntology['date']
-    dateTimeType = xsdNS['dateTime']
-    query = RDF.Statement(subUrn, creationDateN, None)
-    creation_dates = list(model.find_statements(query))
-    if len(creation_dates) == 0:
-        LOGGER.info("Getting creation date for: {0}".format(str(subUrn)))
-        tree = get_url_as_tree(str(subUrn), 'GET', cookie)
-        cells = tree.findall('.//td')
-        created_label = [x for x in cells
-                         if x.text_content().startswith('Created')]
-        if len(created_label) == 1:
-            created_date = get_date_contents(created_label[0].getnext())
-            created_date_node = RDF.Node(literal=created_date.isoformat(),
-                                         datatype=dateTimeType.uri)
-            add_stmt(model, subUrn, creationDateN, created_date_node)
-        else:
-            msg = 'Unable to find creation date for {0}'.format(str(subUrn))
-            LOGGER.warn(msg)
-            raise Warning(msg)
-    else:
-        LOGGER.debug("Found creation date for: {0}".format(str(subUrn)))
-
-
-def update_submission_detail(model, subUrn, status, recent_update, cookie):
-    HasStatusN = submissionOntology['has_status']
-    StatusN = submissionOntology['status']
-    LastModifyN = submissionOntology['last_modify_date']
-
-    status_nodes_query = RDF.Statement(subUrn, HasStatusN, None)
-    status_nodes = list(model.find_statements(status_nodes_query))
-
-    if len(status_nodes) == 0:
-        # has no status node, add one
-        LOGGER.info("Adding status node to {0}".format(subUrn))
-        status_node = create_status_node(subUrn, recent_update)
-        add_stmt(model, subUrn, HasStatusN, status_node)
-        add_stmt(model, status_node, rdfNS['type'], StatusN)
-        add_stmt(model, status_node, StatusN, status)
-        add_stmt(model, status_node, LastModifyN, recent_update)
-        update_ddf(model, subUrn, status_node, cookie=cookie)
-        update_daf(model, subUrn, status_node, cookie=cookie)
-    else:
-        LOGGER.info("Found {0} status blanks".format(len(status_nodes)))
-        for status_statement in status_nodes:
-            status_node = status_statement.object
-            last_modified_query = RDF.Statement(status_node,
-                                                LastModifyN,
-                                                None)
-            last_mod_nodes = model.find_statements(last_modified_query)
-            for last_mod_statement in last_mod_nodes:
-                last_mod_date = str(last_mod_statement.object)
-                if recent_update == str(last_mod_date):
-                    update_ddf(model, subUrn, status_node, cookie=cookie)
-                    update_daf(model, subUrn, status_node, cookie=cookie)
-                    break
-
-
-def update_daf(model, submission_url, status_node, cookie):
-    download_daf_uri = str(submission_url).replace('show', 'download_daf')
-    daf_uri = RDF.Uri(download_daf_uri)
-
-    status_is_daf = RDF.Statement(status_node, TYPE_N, dafTermOntology[''])
-    if not model.contains_statement(status_is_daf):
-        LOGGER.info('Adding daf to {0}, {1}'.format(submission_url,
-                                                     status_node))
-        daf_text = get_url_as_text(download_daf_uri, 'GET', cookie)
-        daf_hash = hashlib.md5(daf_text).hexdigest()
-        daf_hash_stmt = RDF.Statement(status_node,
-                                      dafTermOntology['md5sum'],
-                                      daf_hash)
-        model.add_statement(daf_hash_stmt)
-        daf.fromstring_into_model(model, status_node, daf_text)
-
-
-def update_ddf(model, subUrn, statusNode, cookie):
-    download_ddf_url = str(subUrn).replace('show', 'download_ddf')
-    ddfUrn = RDF.Uri(download_ddf_url)
-
-    status_is_ddf = RDF.Statement(statusNode, TYPE_N, DDF_NS[''])
-    if not model.contains_statement(status_is_ddf):
-        LOGGER.info('Adding ddf to {0}, {1}'.format(subUrn, statusNode))
-        ddf_text = get_url_as_text(download_ddf_url, 'GET', cookie)
-        add_ddf_statements(model, statusNode, ddf_text)
-        model.add_statement(status_is_ddf)
-
-
-def add_ddf_statements(model, statusNode, ddf_string):
-    """Convert a ddf text file into RDF Statements
-    """
-    ddf_lines = ddf_string.split('\n')
-    # first line is header
-    header = ddf_lines[0].split()
-    attributes = [DDF_NS[x] for x in header]
-
-    for ddf_line in ddf_lines[1:]:
-        ddf_line = ddf_line.strip()
-        if len(ddf_line) == 0:
-            continue
-        if ddf_line.startswith("#"):
-            continue
-
-        ddf_record = ddf_line.split('\t')
-        files = ddf_record[0].split(',')
-        file_attributes = ddf_record[1:]
-
-        for f in files:
-            fileNode = RDF.Node()
-            add_stmt(model,
-                     statusNode,
-                     submissionOntology['has_file'],
-                     fileNode)
-            add_stmt(model, fileNode, rdfNS['type'], DDF_NS['file'])
-            add_stmt(model, fileNode, DDF_NS['filename'], f)
-
-            for predicate, object in zip(attributes[1:], file_attributes):
-                add_stmt(model, fileNode, predicate, object)
-
-
-def load_encode_libraries(model, htswapi):
-    """Get libraries associated with encode.
-    """
-    encodeFilters = ["/library/?affiliations__id__exact=44",
-                     "/library/?affiliations__id__exact=80",
-                    ]
-
-    encodeUrls = [os.path.join(htswapi.root_url + u) for u in encodeFilters]
-    rdfaParser = RDF.Parser(name='rdfa')
-    for encodeUrl in encodeUrls:
-        LOGGER.info("Scanning library url {0}".format(encodeUrl))
-        rdfaParser.parse_into_model(model, encodeUrl)
-        query = RDF.Statement(None, libraryOntology['library_id'], None)
-        libraries = model.find_statements(query)
-        for statement in libraries:
-            libraryUrn = statement.subject
-            LOGGER.info("Scanning {0}".format(str(libraryUrn)))
-            load_library_detail(model, libraryUrn)
-
-
-def load_library_detail(model, libraryUrn):
-    """Grab detail information from library page
-    """
-    rdfaParser = RDF.Parser(name='rdfa')
-    query = RDF.Statement(libraryUrn, libraryOntology['date'], None)
-    results = list(model.find_statements(query))
-    log_message = "Found {0} statements for {1}"
-    LOGGER.debug(log_message.format(len(results), libraryUrn))
-    if len(results) == 0:
-        LOGGER.info("Loading {0}".format(str(libraryUrn)))
-        rdfaParser.parse_into_model(model, libraryUrn.uri)
-    elif len(results) == 1:
-        pass  # Assuming that a loaded dataset has one record
-    else:
-        LOGGER.warning("Many dates for {0}".format(libraryUrn))
-
-
-def get_library_id(name):
-    """Guess library ID from library name
-
-    >>> get_library_id('2x75-GM12892-rep1-11039 20110217 elements')
-    '11039'
-    >>> get_library_id('10150 C2C12-24h-myogenin-2PCR-Rep1.32mers')
-    '10150'
-    """
-    match = re.search(r"([ -]|^)(?P<id>([\d]{5})|(SL[\d]{4}))", name)
-    library_id = None
-    if match is not None:
-        library_id = match.group('id')
-    return library_id
-
-
-def get_contents(element):
-    """Return contents or none.
-    """
-    if len(element.contents) == 0:
-        return None
-
-    a = element.find('a')
-    if a is not None:
-        return a.contents[0].encode(CHARSET)
-
-    return element.contents[0].encode(CHARSET)
-
-
-def create_status_node(submission_uri, timestamp):
-    submission_uri = daf.submission_uri_to_string(submission_uri)
-    status_uri = urlparse.urljoin(submission_uri, timestamp)
-    return RDF.Node(RDF.Uri(status_uri))
-
-
-def get_date_contents(element):
-    data = element.text_content()
-    if data:
-        return datetime.strptime(data, "%Y-%m-%d %H:%M")
-    else:
-        return None
-
-
-def add_stmt(model, subject, predicate, rdf_object):
-    """Convienence create RDF Statement and add to a model
-    """
-    return model.add_statement(
-        RDF.Statement(subject, predicate, rdf_object))
-
-
-def login(cookie=None):
-    """Login if we don't have a cookie
-    """
-    if cookie is not None:
-        return cookie
-
-    keys = keyring.get_keyring()
-    password = keys.get_password(LOGIN_URL, USERNAME)
-    credentials = {'login': USERNAME,
-                   'password': password}
-    headers = {'Content-type': 'application/x-www-form-urlencoded'}
-    http = httplib2.Http()
-    response, content = http.request(LOGIN_URL,
-                                     'POST',
-                                     headers=headers,
-                                     body=urllib.urlencode(credentials))
-    LOGGER.debug("Login to {0}, status {1}".format(LOGIN_URL,
-                                                    response['status']))
-
-    cookie = response.get('set-cookie', None)
-    if cookie is None:
-        raise RuntimeError("Wasn't able to log into: {0}".format(LOGIN_URL))
-    return cookie
-
-
-def get_url_as_tree(url, method, cookie=None):
-    http = httplib2.Http()
-    headers = {}
-    if cookie is not None:
-        headers['Cookie'] = cookie
-    response, content = http.request(url, method, headers=headers)
-    if response['status'] == '200':
-        tree = fromstring(content, base_url=url)
-        return tree
-    else:
-        msg = "error accessing {0}, status {1}"
-        msg = msg.format(url, response['status'])
-        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-
-def get_url_as_text(url, method, cookie=None):
-    http = httplib2.Http()
-    headers = {}
-    if cookie is not None:
-        headers['Cookie'] = cookie
-    response, content = http.request(url, method, headers=headers)
-    if response['status'] == '200':
-        return content
-    else:
-        msg = "error accessing {0}, status {1}"
-        msg = msg.format(url, response['status'])
-        e = httplib2.HttpLib2ErrorWithResponse(msg, response, content)
-
-################
-#  old stuff
-SUBMISSIONS_LACKING_LIBID = [
-    ('1x75-Directional-HeLa-Rep1',    '11208'),
-    ('1x75-Directional-HeLa-Rep2',    '11207'),
-    ('1x75-Directional-HepG2-Rep1',   '11210'),
-    ('1x75-Directional-HepG2-Rep2',   '11209'),
-    ('1x75-Directional-H1-hESC-Rep1', '10947'),
-    ('1x75-Directional-H1-hESC-Rep2', '11009'),
-    ('1x75-Directional-HUVEC-Rep1',   '11206'),
-    ('1x75-Directional-HUVEC-Rep2',   '11205'),
-    ('1x75-Directional-K562-Rep1',    '11008'),
-    ('1x75-Directional-K562-Rep2',    '11007'),
-    ('1x75-Directional-NHEK-Rep1',    '11204'),
-    ('1x75-Directional-GM12878-Rep1', '11011'),
-    ('1x75-Directional-GM12878-Rep2', '11010'),
-    ]
-
-
-def select_by_library_id(submission_list):
-    subl = [(x.library_id, x) for x in submission_list if x.library_id]
-    libraries = {}
-    for lib_id, subobj in subl:
-        libraries.setdefault(lib_id, []).append(subobj)
-
-    for submission in libraries.values():
-        submission.sort(key=attrgetter('date'), reverse=True)
-
-    return libraries
-
-
-def library_to_freeze(selected_libraries):
-    freezes = ['2010-Jan', '2010-Jul', '2011-Jan']
-    lib_ids = sorted(selected_libraries.keys())
-    report = ['<html><table border="1">']
-    report = ["""<html>
-<head>
-<style type="text/css">
- td {border-width:0 0 1px 1px; border-style:solid;}
-</style>
-</head>
-<body>
-<table>
-"""]
-    report.append('<thead>')
-    report.append('<tr><td>Library ID</td><td>Name</td>')
-    for f in freezes:
-        report.append('<td>{0}</td>'.format(f))
-    report.append('</tr>')
-    report.append('</thead>')
-    report.append('<tbody>')
-    for lib_id in lib_ids:
-        report.append('<tr>')
-        lib_url = LIBRARY_NS[lib_id].uri
-        report.append('<td><a href="{0}">{1}</a></td>'.format(lib_url, lib_id))
-        submissions = selected_libraries[lib_id]
-        report.append('<td>{0}</td>'.format(submissions[0].name))
-        batched = {}
-        for sub in submissions:
-            date = date_to_freeze(sub.date)
-            batched.setdefault(date, []).append(sub)
-        for d in freezes:
-            report.append('<td>')
-            for s in batched.get(d, []):
-                show_url = submission_view_url(s.subid)
-                subid = '<a href="{0}">{1}</a>'.format(show_url, s.subid)
-                report.append("{0}:{1}".format(subid, s.status))
-            report.append('</td>')
-        else:
-            report.append('<td></td>')
-        report.append("</tr>")
-    report.append('</tbody>')
-    report.append("</table></html>")
-    return "\n".join(report)
-
-
-def date_to_freeze(d):
-    freezes = [(datetime(2010, 1, 30), '2010-Jan'),
-               (datetime(2010, 7, 30), '2010-Jul'),
-               (datetime(2011, 1, 30), '2011-Jan'),
-               ]
-    for end, name in freezes:
-        if d < end:
-            return name
-    else:
-        return None
-
-if __name__ == "__main__":
-    main()
diff --git a/extra/ucsc_encode_submission/failed-submissions.sparql b/extra/ucsc_encode_submission/failed-submissions.sparql
deleted file mode 100644 (file)
index af4af4e..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-##
-## Find submissions that are currently "failed"
-##
-
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX submitOnt:<http://jumpgate.caltech.edu/wiki/UCSCSubmissionOntology#>
-PREFIX libOntNS:<http://jumpgate.caltech.edu/wiki/LibraryOntology#">
-
-#libraryNS = RDF.NS("http://jumpgate.caltech.edu/library/")
-#submissionNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/show/")
-#ddfNS = RDF.NS("http://encodesubmit.ucsc.edu/pipeline/download_ddf#")
-
-SELECT 
- ?subid ?subname ?liburn ?status
-WHERE {
-  ?subid submitOnt:name ?subname .
-  ?subid submitOnt:library_urn ?liburn .
-  ?subid submitOnt:has_status ?statusNode .
-  ?statusNode submitOnt:status ?status .
-  ?statusNode submitOnt:last_modify_date ?last_modify .
-  FILTER (regex(?status, "failed", "i"))
-} 
diff --git a/extra/ucsc_encode_submission/find-lib-by-cell.sparql b/extra/ucsc_encode_submission/find-lib-by-cell.sparql
deleted file mode 100755 (executable)
index c4585c5..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-# Produce list of submissions associated with a cell/replicate
-PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
-PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
-PREFIX ucscSubmission:<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX libraryOntology:<http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-
-SELECT distinct ?liburn ?cell ?replicate ?subid ?name ?submission_date
-WHERE {
-    ?subid ucscSubmission:name ?name .
-    OPTIONAL { ?subid ucscSubmission:library_urn ?liburn ;
-                       libraryOntology:date ?submission_date .
-               ?liburn libraryOntology:cell_line ?cell ;
-                       libraryOntology:replicate ?replicate . }
-    #filter(?submission_date > "2011-04-01T00:00:00Z"^^xsd:dateTime)
-    #filter(!bound(?liburn))
-}
-ORDER BY ?submission_date ?cell ?replicate ?liburn
diff --git a/extra/ucsc_encode_submission/scan_extension.py b/extra/ucsc_encode_submission/scan_extension.py
deleted file mode 100644 (file)
index 39f19c6..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-from optparse import OptionParser
-import os
-import sys
-from pprint import pprint
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    extensions = scan(args)
-    common_extensions = find_common_suffix(extensions)
-
-    if opts.rdf:
-        print_rdf(common_extensions)
-    else:
-        print common_extensions
-        
-def make_parser():
-    parser = OptionParser("%prog: directory [directory...]")
-    parser.add_option('--rdf', action="store_true", default=False,
-                      help="Produce rdf configuration file for ucsc_gather")
-    return parser
-
-def scan(toscan):
-    index = {}
-    for cur_scan_dir in toscan:
-        for path, dirnames, filenames in os.walk(cur_scan_dir):
-            for filename in filenames:
-                base, ext = os.path.splitext(filename)
-                if ext in ('.daf', 'ddf'):
-                    continue
-                next_index = index
-                for c in filename[::-1]:
-                    next_index = next_index.setdefault(c, {})
-    return index
-
-def find_common_suffix(index, tail=[]):
-    if len(tail) > 0 and len(index) > 1:
-        return "".join(tail[::-1])
-
-    results = []
-    for key, choice in index.items():
-        r = find_common_suffix(choice, tail+[key])
-        if r is not None:
-            results.append (r)
-        
-    if len(results) == 0:
-        return None
-    elif len(results) == 1:
-        return results[0]
-    else:
-        return results
-
-def print_rdf(common_extensions):
-    import RDF
-    from htsworkflow.util import rdfhelp
-    model = rdfhelp.get_model()
-
-    viewName = 'http://jumpgate.caltech.edu/wiki/SubmissionsLog/NAME/view/'
-    subView = RDF.NS(viewName)
-    fileReTerm = rdfhelp.dafTermOntology['filename_re']
-
-    count = 1
-    for ext in common_extensions:
-        s = RDF.Statement(subView['VIEW{0}'.format(count)],
-                          fileReTerm,
-                          '.*{0}$'.format(ext.replace('.', '\\.')))
-        model.add_statement(s)
-        count += 1
-        
-    writer = rdfhelp.get_serializer()
-    writer.set_namespace('thisSubmissionView', subView._prefix)
-    print writer.serialize_model_to_string(model)
-
-if __name__ == "__main__":
-    main()
diff --git a/extra/ucsc_encode_submission/test_ucsc_gather.py b/extra/ucsc_encode_submission/test_ucsc_gather.py
deleted file mode 100644 (file)
index 64f712d..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-import unittest
-
-import ucsc_gather
-
-class testUCSCGather(unittest.TestCase):
-    def test_view_attribute_map(self):
-        view_map = ucsc_gather.NameToViewMap()
-        view_map.lib_cache["0"] = {
-            "cell_line": "NLHF",
-            "replicate": "1",
-            "lane_set": {},
-            }
-    
-        a = view_map.find_attributes("foo.ini", "0")    
-        self.failUnless(a["view"] is None)
-    
-        a = view_map.find_attributes("asdf.fdsa", "0")
-        self.failUnless(a is None)
-    
-        a = view_map.find_attributes("foo.fastq", "0")
-        self.failUnlessEqual(a["view"], "Fastq", "0")
-    
-        a = view_map.find_attributes("foo_r1.fastq", "0")
-        self.failUnlessEqual(a["view"], "FastqRd1", "0")
-
-    def test_get_library_info_paired(self):
-        view_map = ucsc_gather.NameToViewMap()
-        view_map.lib_cache["11588"] = {
-            u'antibody_id': None,
-            u'cell_line': u'NHLF',
-            u'cell_line_id': 13,
-            u'experiment_type': u'RNA-seq',
-            u'experiment_type_id': 4,
-            u'gel_cut_size': 300,
-            u'hidden': False,
-            u'id': u'11588',
-            u'insert_size': 200,
-            u'lane_set': [{u'flowcell': u'61PKCAAXX',
-                           u'lane_number': 8,
-                           u'paired_end': True,
-                           u'read_length': 76,
-                           u'status': u'Unknown',
-                           u'status_code': None},
-                          {u'flowcell': u'61PKLAAXX',
-                           u'lane_number': 8,
-                           u'paired_end': True,
-                           u'read_length': 76,
-                           u'status': u'Unknown',
-                           u'status_code': None}],
-            u'library_id': u'11588',
-            u'library_name': u'Paired ends 254 NHLF 31',
-            u'library_species': u'Homo sapiens',
-            u'library_species_id': 8,
-            u'library_type': u'Paired End',
-            u'library_type_id': 2,
-            u'made_by': u'Brian',
-            u'made_for': u'Brian',
-            u'notes': u'300 bp gel fragment, SPRI beads cleanup',
-            u'replicate': 2,
-            u'stopping_point': u'1Aa',
-            u'successful_pM': None,
-            u'undiluted_concentration': u'26.2'}
-
-        a = view_map.find_attributes("foo.bam", "11588")
-        self.failUnlessEqual(a["view"], "Paired")
-        self.failUnlessEqual(a["insertLength"], 200)
-
-
-def suite():
-    return unittest.makeSuite(testUCSCGather,"test")
-
-if __name__ == "__main__":
-    unittest.main(defaultTest="suite")
-
diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py
deleted file mode 100755 (executable)
index fd8db12..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-#!/usr/bin/env python
-from ConfigParser import SafeConfigParser
-import fnmatch
-from glob import glob
-import json
-import logging
-import netrc
-from optparse import OptionParser, OptionGroup
-import os
-from pprint import pprint, pformat
-import shlex
-from StringIO import StringIO
-import stat
-import sys
-import time
-import types
-import urllib
-import urllib2
-import urlparse
-
-import RDF
-
-from htsworkflow.util import api
-from htsworkflow.util.rdfhelp import \
-     dafTermOntology, \
-     fromTypedNode, \
-     get_model, \
-     get_serializer, \
-     load_into_model, \
-     sparql_query, \
-     submissionOntology
-from htsworkflow.submission.daf import \
-     DAFMapper, \
-     MetadataLookupException, \
-     get_submission_uri
-from htsworkflow.submission.condorfastq import CondorFastqExtract
-
-logger = logging.getLogger('ucsc_gather')
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-    submission_uri = None
-
-    if opts.debug:
-        logging.basicConfig(level = logging.DEBUG )
-    elif opts.verbose:
-        logging.basicConfig(level = logging.INFO )
-    else:
-        logging.basicConfig(level = logging.WARNING )
-
-    apidata = api.make_auth_from_opts(opts, parser)
-
-    model = get_model(opts.load_model)
-    if opts.name:
-        mapper = DAFMapper(opts.name, opts.daf,  model)
-        if opts.library_url is not None:
-            mapper.library_url = opts.library_url
-        submission_uri = get_submission_uri(opts.name)
-
-
-    if opts.load_rdf is not None:
-        if submission_uri is None:
-            parser.error("Please specify the submission name")
-        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
-
-    if opts.make_ddf and opts.daf is None:
-        parser.error("Please specify your daf when making ddf files")
-
-    library_result_map = []
-    for a in args:
-        library_result_map.extend(read_library_result_map(a))
-
-    if opts.make_tree_from is not None:
-        make_tree_from(opts.make_tree_from, library_result_map)
-
-    if opts.link_daf:
-        if opts.daf is None:
-            parser.error("Please specify daf filename with --daf")
-        link_daf(opts.daf, library_result_map)
-
-    if opts.fastq:
-        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
-                                       force=opts.force)
-        extractor.build_fastqs(library_result_map)
-
-    if opts.scan_submission:
-        scan_submission_dirs(mapper, library_result_map)
-
-    if opts.make_ddf:
-        make_all_ddfs(mapper, library_result_map, opts.daf, force=opts.force)
-
-    if opts.sparql:
-        sparql_query(model, opts.sparql)
-
-    if opts.print_rdf:
-        writer = get_serializer()
-        print writer.serialize_model_to_string(model)
-
-
-def make_parser():
-    parser = OptionParser()
-
-    model = OptionGroup(parser, 'model')
-    model.add_option('--name', help="Set submission name")
-    model.add_option('--load-model', default=None,
-      help="Load model database")
-    model.add_option('--load-rdf', default=None,
-      help="load rdf statements into model")
-    model.add_option('--sparql', default=None, help="execute sparql query")
-    model.add_option('--print-rdf', action="store_true", default=False,
-      help="print ending model state")
-    parser.add_option_group(model)
-    # commands
-    commands = OptionGroup(parser, 'commands')
-    commands.add_option('--make-tree-from',
-                      help="create directories & link data files",
-                      default=None)
-    commands.add_option('--fastq', default=False, action="store_true",
-                        help="generate scripts for making fastq files")
-    commands.add_option('--scan-submission', default=False, action="store_true",
-                      help="Import metadata for submission into our model")
-    commands.add_option('--link-daf', default=False, action="store_true",
-                        help="link daf into submission directories")
-    commands.add_option('--make-ddf', help='make the ddfs', default=False,
-                      action="store_true")
-    parser.add_option_group(commands)
-
-    parser.add_option('--force', default=False, action="store_true",
-                      help="Force regenerating fastqs")
-    parser.add_option('--daf', default=None, help='specify daf name')
-    parser.add_option('--library-url', default=None,
-                      help="specify an alternate source for library information")
-    # debugging
-    parser.add_option('--verbose', default=False, action="store_true",
-                      help='verbose logging')
-    parser.add_option('--debug', default=False, action="store_true",
-                      help='debug logging')
-
-    api.add_auth_options(parser)
-
-    return parser
-
-def make_tree_from(source_path, library_result_map):
-    """Create a tree using data files from source path.
-    """
-    for lib_id, lib_path in library_result_map:
-        if not os.path.exists(lib_path):
-            logger.info("Making dir {0}".format(lib_path))
-            os.mkdir(lib_path)
-        source_lib_dir = os.path.abspath(os.path.join(source_path, lib_path))
-        if os.path.exists(source_lib_dir):
-            pass
-        for filename in os.listdir(source_lib_dir):
-            source_pathname = os.path.join(source_lib_dir, filename)
-            target_pathname = os.path.join(lib_path, filename)
-            if not os.path.exists(source_pathname):
-                raise IOError("{0} does not exist".format(source_pathname))
-            if not os.path.exists(target_pathname):
-                os.symlink(source_pathname, target_pathname)
-                logger.info(
-                    'LINK {0} to {1}'.format(source_pathname, target_pathname))
-
-
-def link_daf(daf_path, library_result_map):
-    if not os.path.exists(daf_path):
-        raise RuntimeError("%s does not exist, how can I link to it?" % (daf_path,))
-
-    base_daf = os.path.basename(daf_path)
-
-    for lib_id, result_dir in library_result_map:
-        if not os.path.exists(result_dir):
-            raise RuntimeError("Couldn't find target directory %s" %(result_dir,))
-        submission_daf = os.path.join(result_dir, base_daf)
-        if not os.path.exists(submission_daf):
-            if not os.path.exists(daf_path):
-                raise RuntimeError("Couldn't find daf: %s" %(daf_path,))
-            os.link(daf_path, submission_daf)
-
-
-def scan_submission_dirs(view_map, library_result_map):
-    """Look through our submission directories and collect needed information
-    """
-    for lib_id, result_dir in library_result_map:
-        logger.info("Importing %s from %s" % (lib_id, result_dir))
-        try:
-            view_map.import_submission_dir(result_dir, lib_id)
-        except MetadataLookupException, e:
-            logger.error("Skipping %s: %s" % (lib_id, str(e)))
-
-def make_all_ddfs(view_map, library_result_map, daf_name, make_condor=True, force=False):
-    dag_fragment = []
-    for lib_id, result_dir in library_result_map:
-        submissionNode = view_map.get_submission_node(result_dir)
-        dag_fragment.extend(
-            make_ddf(view_map, submissionNode, daf_name, make_condor, result_dir)
-        )
-
-    if make_condor and len(dag_fragment) > 0:
-        dag_filename = 'submission.dagman'
-        if not force and os.path.exists(dag_filename):
-            logger.warn("%s exists, please delete" % (dag_filename,))
-        else:
-            f = open(dag_filename,'w')
-            f.write( os.linesep.join(dag_fragment))
-            f.write( os.linesep )
-            f.close()
-
-
-def make_ddf(view_map, submissionNode, daf_name, make_condor=False, outdir=None):
-    """
-    Make ddf files, and bonus condor file
-    """
-    query_template = """PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-
-select ?submitView  ?files ?md5sum ?view ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm
-WHERE {
-  ?file ucscDaf:filename ?files ;
-        ucscDaf:md5sum ?md5sum .
-  ?submitView ucscDaf:has_file ?file ;
-              ucscDaf:view ?dafView ;
-              ucscDaf:submission <%(submission)s> .
-  ?dafView ucscDaf:name ?view .
-  <%(submission)s> submissionOntology:library ?library ;
-
-  OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell }
-  OPTIONAL { <%(submission)s> ucscDaf:control ?control }
-  OPTIONAL { <%(submission)s> ucscDaf:controlId ?controlId }
-  OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
-  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:condition ?treatment }
-  OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  OPTIONAL { ?library ucscDaf:readType ?readType }
-  OPTIONAL { ?library ucscDaf:strain ?strain }
-  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-}
-ORDER BY  ?submitView"""
-    dag_fragments = []
-
-    name = fromTypedNode(view_map.model.get_target(submissionNode, submissionOntology['name']))
-    if name is None:
-        logger.error("Need name for %s" % (str(submissionNode)))
-        return []
-
-    ddf_name = name + '.ddf'
-    if outdir is not None:
-        outfile = os.path.join(outdir, ddf_name)
-        output = open(outfile,'w')
-    else:
-        outfile = 'stdout:'
-        output = sys.stdout
-
-    formatted_query = query_template % {'submission': str(submissionNode.uri)}
-
-    query = RDF.SPARQLQuery(formatted_query)
-    results = query.execute(view_map.model)
-
-    # filename goes first
-    variables = view_map.get_daf_variables()
-    # 'controlId',
-    output.write('\t'.join(variables))
-    output.write(os.linesep)
-
-    all_views = {}
-    all_files = []
-    for row in results:
-        viewname = fromTypedNode(row['view'])
-        current = all_views.setdefault(viewname, {})
-        for variable_name in variables:
-            value = str(fromTypedNode(row[variable_name]))
-            if value is None or value == 'None':
-                logger.warn("{0}: {1} was None".format(outfile, variable_name))
-            if variable_name in ('files', 'md5sum'):
-                current.setdefault(variable_name,[]).append(value)
-            else:
-                current[variable_name] = value
-
-    for view in all_views.keys():
-        line = []
-        for variable_name in variables:
-            if variable_name in ('files', 'md5sum'):
-                line.append(','.join(all_views[view][variable_name]))
-            else:
-                line.append(all_views[view][variable_name])
-        output.write("\t".join(line))
-        output.write(os.linesep)
-        all_files.extend(all_views[view]['files'])
-
-    logger.info(
-        "Examined {0}, found files: {1}".format(
-            str(submissionNode), ", ".join(all_files)))
-
-    all_files.append(daf_name)
-    all_files.append(ddf_name)
-
-    if make_condor:
-        archive_condor = make_condor_archive_script(name, all_files, outdir)
-        upload_condor = make_condor_upload_script(name, outdir)
-
-        dag_fragments.extend(
-            make_dag_fragment(name, archive_condor, upload_condor)
-        )
-
-    return dag_fragments
-
-
-def read_library_result_map(filename):
-    """
-    Read a file that maps library id to result directory.
-    Does not support spaces in filenames.
-
-    For example:
-      10000 result/foo/bar
-    """
-    stream = open(filename,'r')
-
-    results = []
-    for line in stream:
-        line = line.rstrip()
-        if not line.startswith('#') and len(line) > 0 :
-            library_id, result_dir = line.split()
-            results.append((library_id, result_dir))
-    return results
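
A minimal sketch of using such a mapping file, following the format from the docstring (the filename is hypothetical):

    # library_map.txt contains lines such as:
    #   # comments and blank lines are skipped
    #   10000 result/foo/bar
    results = read_library_result_map('library_map.txt')
    # results == [('10000', 'result/foo/bar')]
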
-
-
-def make_condor_archive_script(name, files, outdir=None):
-    script = """Universe = vanilla
-
-Executable = /bin/tar
-arguments = czvhf ../%(archivename)s %(filelist)s
-
-Error = compress.out.$(Process).log
-Output = compress.out.$(Process).log
-Log = /tmp/submission-compress-%(user)s.log
-initialdir = %(initialdir)s
-environment="GZIP=-3"
-request_memory = 20
-
-queue
-"""
-    if outdir is None:
-        outdir = os.getcwd()
-    for f in files:
-        pathname = os.path.join(outdir, f)
-        if not os.path.exists(pathname):
-            raise RuntimeError("Missing %s from %s" % (f,outdir))
-
-    context = {'archivename': make_submission_name(name),
-               'filelist': " ".join(files),
-               'initialdir': os.path.abspath(outdir),
-               'user': os.getlogin()}
-
-    condor_script = os.path.join(outdir, make_condor_name(name, 'archive'))
-    condor_stream = open(condor_script,'w')
-    condor_stream.write(script % context)
-    condor_stream.close()
-    return condor_script
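
Filled in, the generated submit file is a complete Condor job description. For a hypothetical name='mylib' with files=['mylib.bam', 'mylib.ddf'] (user and directory are illustrative), mylib.archive.condor would read:

    Universe = vanilla

    Executable = /bin/tar
    arguments = czvhf ../mylib.tgz mylib.bam mylib.ddf

    Error = compress.out.$(Process).log
    Output = compress.out.$(Process).log
    Log = /tmp/submission-compress-diane.log
    initialdir = /home/diane/submissions/mylib
    environment="GZIP=-3"
    request_memory = 20

    queue

It can be submitted by hand with condor_submit mylib.archive.condor, though normally the DAG fragments below drive it.
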
-
-
-def make_condor_upload_script(name, outdir=None):
-    script = """Universe = vanilla
-
-Executable = /usr/bin/lftp
-arguments = -c put ../%(archivename)s -o ftp://%(ftpuser)s:%(ftppassword)s@%(ftphost)s/%(archivename)s
-
-Error = upload.out.$(Process).log
-Output = upload.out.$(Process).log
-Log = /tmp/submission-upload-%(user)s.log
-initialdir = %(initialdir)s
-
-queue
-"""
-    if outdir is None:
-        outdir = os.getcwd()
-
-    auth = netrc.netrc(os.path.expanduser("~diane/.netrc"))
-
-    encodeftp = 'encodeftp.cse.ucsc.edu'
-    ftpuser = auth.hosts[encodeftp][0]
-    ftppassword = auth.hosts[encodeftp][2]
-    context = {'archivename': make_submission_name(name),
-               'initialdir': os.path.abspath(outdir),
-               'user': os.getlogin(),
-               'ftpuser': ftpuser,
-               'ftppassword': ftppassword,
-               'ftphost': encodeftp}
-
-    condor_script = os.path.join(outdir, make_condor_name(name, 'upload'))
-    condor_stream = open(condor_script,'w')
-    condor_stream.write(script % context)
-    condor_stream.close()
-    os.chmod(condor_script, stat.S_IREAD|stat.S_IWRITE)
-
-    return condor_script
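
The upload job reads its FTP credentials from ~diane/.netrc via the standard netrc module; auth.hosts[host] returns a (login, account, password) tuple, hence the indexes 0 and 2 above. The matching entry would look like this (placeholder credentials):

    machine encodeftp.cse.ucsc.edu
    login myuser
    password mysecret

Because the rendered submit file embeds the password in its arguments line, it is chmod'ed to owner read/write only.
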
-
-
-def make_dag_fragment(ininame, archive_condor, upload_condor):
-    """
-    Make the pair of DAG fragments that compress and then upload the data.
-    """
-    cur_dir = os.getcwd()
-    archive_condor = os.path.join(cur_dir, archive_condor)
-    upload_condor = os.path.join(cur_dir, upload_condor)
-    job_basename = make_base_name(ininame)
-
-    fragments = []
-    fragments.append('JOB %s_archive %s' % (job_basename, archive_condor))
-    fragments.append('JOB %s_upload %s' % (job_basename,  upload_condor))
-    fragments.append('PARENT %s_archive CHILD %s_upload' % (job_basename, job_basename))
-
-    return fragments
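
For a hypothetical ininame='mylib.ini' run from /home/diane/submissions, the returned fragments serialize to the following DAGMan stanza:

    JOB mylib_archive /home/diane/submissions/mylib.archive.condor
    JOB mylib_upload /home/diane/submissions/mylib.upload.condor
    PARENT mylib_archive CHILD mylib_upload
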
-
-
-def get_library_info(host, apidata, library_id):
-    url = api.library_url(host, library_id)
-    contents = api.retrieve_info(url, apidata)
-    return contents
-
-
-def make_submission_section(line_counter, files, attributes):
-    """
-    Create a section in the submission ini file
-    """
-    inifile = [ "[line%s]" % (line_counter,) ]
-    inifile += ["files=%s" % (",".join(files))]
-
-    for k,v in attributes.items():
-        inifile += ["%s=%s" % (k,v)]
-    return inifile
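
A quick illustration with made-up values; note the attribute lines follow the dictionary's iteration order:

    section = make_submission_section(1, ['mylib.bam'], {'cell': 'GM12878'})
    # section == ['[line1]', 'files=mylib.bam', 'cell=GM12878']
    print '\n'.join(section)
    # [line1]
    # files=mylib.bam
    # cell=GM12878
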
-
-
-def make_base_name(pathname):
-    base = os.path.basename(pathname)
-    name, ext = os.path.splitext(base)
-    return name
-
-
-def make_submission_name(ininame):
-    name = make_base_name(ininame)
-    return name + ".tgz"
-
-
-def make_ddf_name(pathname):
-    name = make_base_name(pathname)
-    return name + ".ddf"
-
-
-def make_condor_name(pathname, run_type=None):
-    name = make_base_name(pathname)
-    elements = [name]
-    if run_type is not None:
-        elements.append(run_type)
-    elements.append("condor")
-    return ".".join(elements)
-
-
-def parse_filelist(file_string):
-    return file_string.split(",")
-
-
-def validate_filelist(files):
-    """
-    Raise RuntimeError if any file in the list does not exist
-    """
-    for f in files:
-        if not os.path.exists(f):
-            raise RuntimeError("%s does not exist" % (f,))
-
-if __name__ == "__main__":
-    main()
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
old mode 100755 (executable)
new mode 100644 (file)
diff --git a/htsworkflow/submission/test/test_ucsc.py b/htsworkflow/submission/test/test_ucsc.py
new file mode 100644 (file)
index 0000000..f05de23
--- /dev/null
@@ -0,0 +1,29 @@
+import unittest
+from StringIO import StringIO
+
+from htsworkflow.submission import ucsc
+
+
+ENCODE_FILES="""wgEncodeGisChiaPetHCT116D000005593.bed.gz      project=wgEncode; grant=Ruan; lab=GIS-Ruan; composite=wgEncodeGisChiaPet; dataType=ChiaPet; view=Interactions; cell=HCT-116; antibody=Pol2; replicate=1; origAssembly=hg19; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH001427; dateSubmitted=2011-02-04; dateUnrestricted=2011-11-04; subId=3267; labVersion=CHH524; tableName=wgEncodeGisChiaPetHCT116D000005593; type=bed; md5sum=a3c7420aece4acfb15f80f4dfe9f1fb3; size=924K
+wgEncodeCaltechRnaSeqGm12878R2x75Il200FastqRd2Rep1.fastq.tgz   project=wgEncode; grant=Myers; lab=Caltech; composite=wgEncodeCaltechRnaSeq; dataType=RnaSeq; view=FastqRd2; cell=GM12878; localization=cell; rnaExtract=longPolyA; readType=2x75; insertLength=200; replicate=1; origAssembly=hg18; dataVersion=ENCODE Jan 2011 Freeze; dccAccession=wgEncodeEH000122; dateSubmitted=2010-07-14; dateResubmitted=2010-06-21; dateUnrestricted=2011-04-14; subId=1647; type=fastq; md5sum=51c4d1679b0ad29888bea2b40e26364a; size=4.8G
+"""
+
+
+class TestUCSCInfo(unittest.TestCase):
+    def test_parse_encodedcc_file(self):
+        stream = StringIO(ENCODE_FILES)
+        file_index = ucsc.parse_ucsc_file_index(stream)
+        self.assertEquals(len(file_index), 2)
+
+        for attributes in file_index.values():
+            self.assertIn('subId', attributes)
+            self.assertIn('project', attributes)
+            self.assertEquals(attributes['project'], 'wgEncode')
+
+def suite():
+    suite = unittest.makeSuite(TestUCSCInfo, 'test')
+    return suite
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
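
The module can also be run standalone, which executes the suite via unittest.main(defaultTest='suite'):

    python htsworkflow/submission/test/test_ucsc.py
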
index f80629a9cda462ac6ce8e45f58949b4ffb9037f9..9181830e34983724e586a906a7194d96fc9507eb 100644 (file)
@@ -1,7 +1,11 @@
+"""Utilities for extracting information from the ENCODE DCC
+"""
 import urlparse
+import urllib2
 
 UCSCEncodePipeline = "http://encodesubmit.ucsc.edu/pipeline/"
 
+
 def ddf_download_url(submission_id):
     """Return url to download a DDF for a submission
 
@@ -11,6 +15,7 @@ def ddf_download_url(submission_id):
     fragment = 'download_ddf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def daf_download_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -20,6 +25,7 @@ def daf_download_url(submission_id):
     fragment = 'download_daf/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
 
+
 def submission_view_url(submission_id):
     """Return url to download a DAF for a submission
 
@@ -28,3 +34,27 @@ def submission_view_url(submission_id):
     """
     fragment = 'show/%s' % (submission_id,)
     return urlparse.urljoin(UCSCEncodePipeline, fragment)
+
+
+def get_ucsc_file_index(base_url):
+    """Get index of files for a ENCODE collection
+    """
+    if base_url[-1] != '/': base_url += '/'
+    request = urllib2.urlopen(base_url + 'files.txt')
+    file_index = parse_ucsc_file_index(request)
+    return file_index
+
+
+def parse_ucsc_file_index(stream):
+    """Turn a UCSC DCC files.txt index into a dictionary of name-value pairs
+    """
+    file_index = {}
+    for line in stream:
+        filename, attribute_line = line.split('\t')
+        attributes = {}
+        for assignment in attribute_line.split(';'):
+            name, value = assignment.split('=')
+            attributes[name.strip()] = value.strip()
+
+        file_index[filename] = attributes
+    return file_index
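
A minimal sketch of the parser in action, mirroring the unit test above (the filename and attributes are made up):

    from StringIO import StringIO
    from htsworkflow.submission import ucsc

    line = "example.bed.gz\tproject=wgEncode; lab=Caltech; md5sum=abc123\n"
    index = ucsc.parse_ucsc_file_index(StringIO(line))
    # index == {'example.bed.gz':
    #           {'project': 'wgEncode', 'lab': 'Caltech', 'md5sum': 'abc123'}}

get_ucsc_file_index() does the same thing after fetching files.txt from the given collection URL with urllib2.
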
old mode 100755 (executable)
new mode 100644 (file)
index 9fcd3311ee77215fd8ae4ed732e7a22b287d4a62..8fb1424960571d4ab327ab2c4261b293eb1efb56 100644 (file)
@@ -33,7 +33,7 @@ def sparql_query(model, query_filename):
         output = []
         for k,v in row.items()[::-1]:
             print "{0}: {1}".format(k,v)
-        print 
+        print
 
 
 def blankOrUri(value=None):
@@ -99,11 +99,11 @@ def fromTypedNode(node):
 
     return literal
 
-    
+
 def get_model(model_name=None, directory=None):
     if directory is None:
         directory = os.getcwd()
-        
+
     if model_name is None:
         storage = RDF.MemoryStorage()
         logger.info("Using RDF Memory model")
@@ -114,12 +114,12 @@ def get_model(model_name=None, directory=None):
         logger.info("Using {0} with options {1}".format(model_name, options))
     model = RDF.Model(storage)
     return model
-        
+
 
 def load_into_model(model, parser_name, filename, ns=None):
     if not os.path.exists(filename):
         raise IOError("Can't find {0}".format(filename))
-    
+
     data = open(filename, 'r').read()
     load_string_into_model(model, parser_name, data, ns)
 
@@ -127,7 +127,7 @@ def load_into_model(model, parser_name, filename, ns=None):
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
         ns = "http://localhost/"
-        
+
     rdf_parser = RDF.Parser(name=parser_name)
     rdf_parser.parse_string_into_model(model, data, ns)
 
@@ -148,3 +148,6 @@ def get_serializer(name='turtle'):
     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
     return writer
 
+
+def dump_model(model):
+    serializer = get_serializer()
+    print serializer.serialize_model_to_string(model)
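
A minimal usage sketch for the new dump_model helper, assuming the Redland RDF bindings used throughout this module are installed (the triple is made up):

    from htsworkflow.util.rdfhelp import get_model, load_string_into_model, dump_model

    model = get_model()  # defaults to an in-memory store
    load_string_into_model(model, 'turtle',
                           '<http://example.org/a> <http://example.org/b> "c" .')
    dump_model(model)  # prints the model serialized as turtle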