From 0c35e559ffdbd7e9fe7d6ec544320989b9e126fc Mon Sep 17 00:00:00 2001
From: Diane Trout
Date: Wed, 11 Apr 2012 15:24:46 -0700
Subject: [PATCH] Start developing GEO SOFT submission tool.

At this point it will generate a partial SOFT file. I'm still missing
several parts of the file, such as the platform description, but it is
a proof of concept for the template-based generation of the SPARQL
query and of the SOFT text file describing the submission.

---
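Reviewer notes (everything between the cut line above and the diffstat
is dropped by git am):

Both the SPARQL query and the SOFT output are ordinary Django
templates: geo.py renders geo_submission.sparql with the submission URI
substituted in, executes the rendered text against the Redland model,
and feeds the result rows to geo_submission.soft. The fragment below is
a minimal sketch of that rendering step, assuming a standalone Django
setup; the inline template and the example URI are hypothetical
stand-ins for the real template file:

    # render a SPARQL query from a Django template, the way geo.py's
    # get_sample_metadata does; the inline template and example URI are
    # hypothetical stand-ins for geo_submission.sparql
    from django.conf import settings
    from django.template import Context, Template

    settings.configure()  # minimal standalone setup, no settings module

    sparql_template = Template(
        "SELECT ?name WHERE { <{{ submission }}> "
        "<http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#name> "
        "?name }")
    context = Context({'submission': 'http://example.org/submission/mysub'})
    print sparql_template.render(context)

Keeping the query in a template means adding a new sample field only
touches geo_submission.sparql and geo_submission.soft, not the Python
code.
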
 encode_submission/geo_gather.py             | 144 ++++++++++
 htsworkflow/submission/geo.py               |  58 +++++
 htsworkflow/submission/submission.py        | 257 ++++++++++++++++++++
 htsworkflow/templates/geo_submission.soft   |  11 +
 htsworkflow/templates/geo_submission.sparql |  33 +++
 5 files changed, 503 insertions(+)
 create mode 100644 encode_submission/geo_gather.py
 create mode 100644 htsworkflow/submission/geo.py
 create mode 100644 htsworkflow/submission/submission.py
 create mode 100644 htsworkflow/templates/geo_submission.soft
 create mode 100644 htsworkflow/templates/geo_submission.sparql

diff --git a/encode_submission/geo_gather.py b/encode_submission/geo_gather.py
new file mode 100644
index 0000000..9a1f51e
--- /dev/null
+++ b/encode_submission/geo_gather.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+from zipfile import ZipFile
+
+import RDF
+
+if 'DJANGO_SETTINGS_MODULE' not in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     fromTypedNode, \
+     get_model, \
+     get_serializer, \
+     load_into_model, \
+     sparql_query, \
+     submissionOntology
+from htsworkflow.submission.daf import get_submission_uri
+from htsworkflow.submission.results import ResultMap
+from htsworkflow.submission.geo import GEOSubmission
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger(__name__)
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.model, opts.db_path)
+    mapper = None
+    if opts.name:
+        mapper = GEOSubmission(opts.name, model)
+        if opts.library_url is not None:
+            mapper.library_url = opts.library_url
+        submission_uri = get_submission_uri(opts.name)
+
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    results = ResultMap()
+    for a in args:
+        results.add_results_from_file(a)
+
+    if opts.make_tree_from is not None:
+        results.make_tree_from(opts.make_tree_from)
+
+    if opts.fastq:
+        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+                                       force=opts.force)
+        extractor.create_scripts(results)
+
+    if opts.scan_submission:
+        mapper.scan_submission_dirs(results)
+
+    if opts.make_soft:
+        mapper.make_soft(results)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--db-path', default=None,
+                     help="set rdf database path")
+    model.add_option('--model', default=None,
+                     help="Load model database")
+    model.add_option('--load-rdf', default=None,
+                     help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+                     help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                        help="create directories & link data files",
+                        default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                        help="Import metadata for submission into our model")
+    commands.add_option('--make-soft', help='make the soft file', default=False,
+                        action="store_true")
+
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
new file mode 100644
index 0000000..737b1bb
--- /dev/null
+++ b/htsworkflow/submission/geo.py
@@ -0,0 +1,58 @@
+import logging
+
+import RDF
+
+from htsworkflow.submission.submission import Submission
+
+from htsworkflow.util.rdfhelp import \
+     fromTypedNode, \
+     submissionOntology
+
+from django.conf import settings
+from django.template import Context, loader
+
+LOGGER = logging.getLogger(__name__)
+
+class GEOSubmission(Submission):
+    def __init__(self, name, model):
+        super(GEOSubmission, self).__init__(name, model)
+
+    def make_soft(self, result_map):
+        samples = []
+        for lib_id, result_dir in result_map.items():
+            an_analysis = self.get_submission_node(result_dir)
+            samples.append(self.get_sample_metadata(an_analysis))
+
+        soft_template = loader.get_template('geo_submission.soft')
+        context = Context({
+            'samples': samples
+        })
+        print str(soft_template.render(context))
+
+    def check_for_name(self, analysis_node):
+        name = fromTypedNode(
+            self.model.get_target(analysis_node,
+                                  submissionOntology['name']))
+        if name is None:
+            LOGGER.error("Need name for %s" % (str(analysis_node)))
+            return False
+        else:
+            return True
+
+    def get_sample_metadata(self, analysis_node):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_submission.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            })
+
+        formatted_query = query_template.render(context)
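+        # run the rendered SPARQL text against our Redland model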
+        query = RDF.SPARQLQuery(str(formatted_query))
+        rdfstream = query.execute(self.model)
+        results = []
+        for r in rdfstream:
+            results.append(r)
+        return results
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py
new file mode 100644
index 0000000..98c25d5
--- /dev/null
+++ b/htsworkflow/submission/submission.py
@@ -0,0 +1,257 @@
+"""Common submission elements
+"""
+import logging
+import os
+import re
+
+import RDF
+
+from htsworkflow.util.rdfhelp import \
+     blankOrUri, \
+     dafTermOntology, \
+     dump_model, \
+     get_model, \
+     libraryOntology, \
+     owlNS, \
+     rdfNS, \
+     submissionLog, \
+     submissionOntology, \
+     toTypedNode, \
+     fromTypedNode
+from htsworkflow.util.hashfile import make_md5sum
+
+from htsworkflow.submission.daf import \
+     MetadataLookupException, \
+     ModelException, \
+     get_submission_uri
+
+logger = logging.getLogger(__name__)
+
+class Submission(object):
+    def __init__(self, name, model):
+        self.name = name
+        self.model = model
+
+        self.submissionSet = get_submission_uri(self.name)
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+        self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
+
+        self.__view_map = None
+
+    def scan_submission_dirs(self, result_map):
+        """Examine files in our result directory
+        """
+        for lib_id, result_dir in result_map.items():
+            logger.info("Importing %s from %s" % (lib_id, result_dir))
+            try:
+                self.import_analysis_dir(result_dir, lib_id)
+            except MetadataLookupException, e:
+                logger.error("Skipping %s: %s" % (lib_id, str(e)))
+
+    def import_analysis_dir(self, analysis_dir, library_id):
+        """Import a submission directory and update our model as needed
+        """
+        #attributes = get_filename_attribute_map(paired)
+        libNode = self.libraryNS[library_id + "/"]
+
+        self._add_library_details_to_model(libNode)
+
+        submission_files = os.listdir(analysis_dir)
+        for filename in submission_files:
+            self.construct_file_attributes(analysis_dir, libNode, filename)
+
+    def construct_file_attributes(self, analysis_dir, libNode, pathname):
+        """Look for the best filename extension match
+        The 'best' match is the longest one
+
+        :Args:
+          pathname (str): the path whose extension we are about to examine
+        """
+        path, filename = os.path.split(pathname)
+
+        logger.debug("Searching for view")
+        file_classification = self.find_best_match(filename)
+        if file_classification is None:
+            logger.warn("Unrecognized file: {0}".format(pathname))
+            return None
+        if str(file_classification) == str(libraryOntology['ignore']):
+            return None
+
+        an_analysis_name = self.make_submission_name(analysis_dir)
+        an_analysis = self.get_submission_node(analysis_dir)
+        an_analysis_uri = str(an_analysis.uri)
+
+        self.model.add_statement(RDF.Statement(an_analysis,
+                                               submissionOntology['name'],
+                                               toTypedNode(an_analysis_name)))
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          rdfNS['type'],
+                          submissionOntology['submission']))
+        self.model.add_statement(RDF.Statement(an_analysis,
+                                               submissionOntology['library'],
+                                               libNode))
+
+        logger.debug("Adding statements to {0}".format(str(an_analysis)))
+        # add track specific information
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          dafTermOntology['paired'],
+                          toTypedNode(self._is_paired(libNode))))
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          dafTermOntology['submission'],
+                          an_analysis))
+
+        # add file specific information
+        fileNode = self.link_file_to_classes(filename,
+                                             an_analysis,
+                                             an_analysis_uri,
+                                             analysis_dir)
+        self.add_md5s(filename, fileNode, analysis_dir)
+
+        logger.debug("Done.")
+
+    def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
+        # add file specific information
+        fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
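+        # describe the file in RDF: attach it to the submission node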
+        self.model.add_statement(
+            RDF.Statement(submissionNode,
+                          dafTermOntology['has_file'],
+                          fileNode))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          dafTermOntology['filename'],
+                          filename))
+        return fileNode
+
+    def add_md5s(self, filename, fileNode, analysis_dir):
+        logger.debug("Updating file md5sum")
+        submission_pathname = os.path.join(analysis_dir, filename)
+        md5 = make_md5sum(submission_pathname)
+        if md5 is None:
+            errmsg = "Unable to produce md5sum for {0}"
+            logger.warning(errmsg.format(submission_pathname))
+        else:
+            self.model.add_statement(
+                RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
+
+    def _add_library_details_to_model(self, libNode):
+        parser = RDF.Parser(name='rdfa')
+        new_statements = parser.parse_as_stream(libNode.uri)
+        for s in new_statements:
+            # don't override things we already have in the model
+            targets = list(self.model.get_targets(s.subject, s.predicate))
+            if len(targets) == 0:
+                self.model.append(s)
+
+
+    def find_best_match(self, filename):
+        """Search through potential filename matching patterns
+        """
+        if self.__view_map is None:
+            self.__view_map = self._get_filename_view_map()
+
+        results = []
+        for pattern, view in self.__view_map.items():
+            if re.match(pattern, filename):
+                results.append(view)
+
+        if len(results) > 1:
+            msg = "%s matched multiple views %s" % (
+                filename,
+                [str(x) for x in results])
+            raise ModelException(msg)
+        elif len(results) == 1:
+            return results[0]
+        else:
+            return None
+
+    def _get_filename_view_map(self):
+        """Query our model for filename patterns
+
+        return a dictionary mapping filename regular expressions to view names
+        """
+        filename_query = RDF.Statement(
+            None, dafTermOntology['filename_re'], None)
+
+        patterns = {}
+        for s in self.model.find_statements(filename_query):
+            view_name = s.subject
+            literal_re = s.object.literal_value['string']
+            logger.debug("Found: %s" % (literal_re,))
+            try:
+                re.compile(literal_re)
+                patterns[literal_re] = view_name
+            except re.error, e:
+                logger.error("Unable to compile: %s" % (literal_re,))
+        return patterns
+
+    def make_submission_name(self, analysis_dir):
+        analysis_dir = os.path.normpath(analysis_dir)
+        analysis_dir_name = os.path.split(analysis_dir)[1]
+        if len(analysis_dir_name) == 0:
+            raise RuntimeError(
+                "Submission dir name too short: {0}".format(analysis_dir))
+        return analysis_dir_name
+
+    def get_submission_node(self, analysis_dir):
+        """Convert a submission directory name to a submission node
+        """
+        submission_name = self.make_submission_name(analysis_dir)
+        return self.submissionSetNS[submission_name]
+
+    def _get_library_attribute(self, libNode, attribute):
+        if not isinstance(attribute, RDF.Node):
+            attribute = libraryOntology[attribute]
+
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
+
+        #targets = self._search_same_as(libNode, attribute)
+        #if targets is not None:
+        #    return self._format_library_attribute(targets)
+
+        # we don't know anything about this attribute
+        self._add_library_details_to_model(libNode)
+
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
+
+        return None
+
+    def _format_library_attribute(self, targets):
+        if len(targets) == 0:
+            return None
+        elif len(targets) == 1:
+            return fromTypedNode(targets[0])
+        elif len(targets) > 1:
+            return [fromTypedNode(t) for t in targets]
+
+    def _is_paired(self, libNode):
+        """Determine if a library is paired end"""
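+        # map the library's free-text library_type onto a boolean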
+        library_type = self._get_library_attribute(libNode, 'library_type')
+        if library_type is None:
+            errmsg = "%s doesn't have a library type"
+            raise ModelException(errmsg % (str(libNode),))
+
+        single = ['CSHL (lacking last nt)',
+                  'Single End (non-multiplexed)',
+                  'Small RNA (non-multiplexed)',]
+        paired = ['Barcoded Illumina',
+                  'Multiplexing',
+                  'Nextera',
+                  'Paired End (non-multiplexed)',]
+        if library_type in single:
+            return False
+        elif library_type in paired:
+            return True
+        else:
+            raise MetadataLookupException(
+                "Unrecognized library type %s for %s" % \
+                (library_type, str(libNode)))
+
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
new file mode 100644
index 0000000..ae2ac57
--- /dev/null
+++ b/htsworkflow/templates/geo_submission.soft
@@ -0,0 +1,11 @@
+Soft template
+!Platform_title = Illumina Genome Analyzer (Homo sapiens)
+!Platform_geo_accession = GPL9052
+{% for sample in samples %}{% for row in sample %}{% if forloop.first %}
+^SAMPLE={{row.name}}
+!Sample_title={{row.name}}
+!Sample_organism_ch1 = {{ row.species_name }}
+!Sample_taxid_ch1 = {{ row.taxon_id }}
+{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endif %}{% endspaceless %}{% endif %}
+!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql
new file mode 100644
index 0000000..1d7cbb1
--- /dev/null
+++ b/htsworkflow/templates/geo_submission.sparql
@@ -0,0 +1,33 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/taxonomy/?term=>
+
+select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id
+WHERE {
+  <{{submission}}> a submissionOntology:submission .
+
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  #OPTIONAL { ?submission ucscDaf:controlId ?controlId }
+  #OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell } .
+  #OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
+  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name }
+
+  #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  #OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  #OPTIONAL { ?library ucscDaf:readType ?readType }
+  #OPTIONAL { ?library ucscDaf:strain ?strain }
+  #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+  <{{submission}}> submissionOntology:library ?library ;
+                   ucscDaf:has_file ?file ;
+                   submissionOntology:name ?name .
+  ?species libraryOntology:species ?species_name ;
+           libraryOntology:taxon_id ?taxon_id .
+  ?file ucscDaf:filename ?filename .
+}
-- 
2.30.2