Start developing GEO SOFT submission tool.
author     Diane Trout <diane@caltech.edu>
           Wed, 11 Apr 2012 22:24:46 +0000 (15:24 -0700)
committer  Diane Trout <diane@caltech.edu>
           Wed, 11 Apr 2012 22:24:46 +0000 (15:24 -0700)
At this point it will generate a partial SOFT file. I'm still
missing several parts of the file, such as the platform description,
but it is a proof of concept for the template-based generation
of the SPARQL query and the SOFT text file describing the submission.
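
The core of the template-driven flow, reduced to a sketch (the helper
name here is illustrative; it mirrors GEOSubmission.get_sample_metadata()
in htsworkflow/submission/geo.py and assumes a configured Django template
loader and a populated Redland model):

    import RDF
    from django.template import Context, loader

    def render_sample_query(model, submission_uri):
        # Fill the SPARQL template with the submission's URI ...
        template = loader.get_template('geo_submission.sparql')
        query_text = template.render(Context({'submission': submission_uri}))
        # ... then run the rendered query against the RDF model.
        query = RDF.SPARQLQuery(str(query_text))
        return [row for row in query.execute(model)]

A typical invocation might look like "geo_gather.py --name mysub
--model mysub.db --scan-submission --make-soft result_map.txt", where
the positional arguments are result-map files (file names hypothetical).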

encode_submission/geo_gather.py [new file with mode: 0644]
htsworkflow/submission/geo.py [new file with mode: 0644]
htsworkflow/submission/submission.py [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft [new file with mode: 0644]
htsworkflow/templates/geo_submission.sparql [new file with mode: 0644]

diff --git a/encode_submission/geo_gather.py b/encode_submission/geo_gather.py
new file mode 100644 (file)
index 0000000..9a1f51e
--- /dev/null
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+from ConfigParser import SafeConfigParser
+import fnmatch
+from glob import glob
+import json
+import logging
+import netrc
+from optparse import OptionParser, OptionGroup
+import os
+from pprint import pprint, pformat
+import shlex
+from StringIO import StringIO
+import stat
+import sys
+import time
+import types
+import urllib
+import urllib2
+import urlparse
+from zipfile import ZipFile
+
+import RDF
+
+if 'DJANGO_SETTINGS_MODULE' not in os.environ:
+    os.environ['DJANGO_SETTINGS_MODULE'] = 'htsworkflow.settings'
+
+
+from htsworkflow.util import api
+from htsworkflow.util.rdfhelp import \
+     dafTermOntology, \
+     fromTypedNode, \
+     get_model, \
+     get_serializer, \
+     load_into_model, \
+     sparql_query, \
+     submissionOntology
+from htsworkflow.submission.daf import get_submission_uri
+from htsworkflow.submission.results import ResultMap
+from htsworkflow.submission.geo import GEOSubmission
+from htsworkflow.submission.condorfastq import CondorFastqExtract
+
+logger = logging.getLogger(__name__)
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+    submission_uri = None
+
+    if opts.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    elif opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARNING)
+
+    apidata = api.make_auth_from_opts(opts, parser)
+
+    model = get_model(opts.model, opts.db_path)
+    mapper = None
+    if opts.name:
+        mapper = GEOSubmission(opts.name, model)
+        if opts.library_url is not None:
+            mapper.library_url = opts.library_url
+        submission_uri = get_submission_uri(opts.name)
+
+
+    if opts.load_rdf is not None:
+        if submission_uri is None:
+            parser.error("Please specify the submission name")
+        load_into_model(model, 'turtle', opts.load_rdf, submission_uri)
+
+    results = ResultMap()
+    for a in args:
+        results.add_results_from_file(a)
+
+    if opts.make_tree_from is not None:
+        results.make_tree_from(opts.make_tree_from)
+
+    if opts.fastq:
+        extractor = CondorFastqExtract(opts.host, apidata, opts.sequence,
+                                       force=opts.force)
+        extractor.create_scripts(results)
+
+    if opts.scan_submission:
+        if mapper is None:
+            parser.error("Please specify the submission name")
+        mapper.scan_submission_dirs(results)
+
+    if opts.make_soft:
+        if mapper is None:
+            parser.error("Please specify the submission name")
+        mapper.make_soft(results)
+
+    if opts.sparql:
+        sparql_query(model, opts.sparql)
+
+    if opts.print_rdf:
+        writer = get_serializer()
+        print writer.serialize_model_to_string(model)
+
+
+def make_parser():
+    parser = OptionParser()
+
+    model = OptionGroup(parser, 'model')
+    model.add_option('--name', help="Set submission name")
+    model.add_option('--db-path', default=None,
+                     help="set rdf database path")
+    model.add_option('--model', default=None,
+      help="Load model database")
+    model.add_option('--load-rdf', default=None,
+      help="load rdf statements into model")
+    model.add_option('--sparql', default=None, help="execute sparql query")
+    model.add_option('--print-rdf', action="store_true", default=False,
+      help="print ending model state")
+    parser.add_option_group(model)
+    # commands
+    commands = OptionGroup(parser, 'commands')
+    commands.add_option('--make-tree-from',
+                      help="create directories & link data files",
+                      default=None)
+    commands.add_option('--fastq', default=False, action="store_true",
+                        help="generate scripts for making fastq files")
+    commands.add_option('--scan-submission', default=False, action="store_true",
+                      help="Import metadata for submission into our model")
+    commands.add_option('--make-soft', help='make the soft file', default=False,
+                      action="store_true")
+
+    parser.add_option_group(commands)
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="Force regenerating fastqs")
+    parser.add_option('--daf', default=None, help='specify daf name')
+    parser.add_option('--library-url', default=None,
+                      help="specify an alternate source for library information")
+    # debugging
+    parser.add_option('--verbose', default=False, action="store_true",
+                      help='verbose logging')
+    parser.add_option('--debug', default=False, action="store_true",
+                      help='debug logging')
+
+    api.add_auth_options(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
new file mode 100644 (file)
index 0000000..737b1bb
--- /dev/null
@@ -0,0 +1,57 @@
+import logging
+
+import RDF
+
+from htsworkflow.submission.submission import Submission
+
+from htsworkflow.util.rdfhelp import \
+     fromTypedNode, \
+     submissionOntology
+
+from django.conf import settings
+from django.template import Context, loader
+
+LOGGER = logging.getLogger(__name__)
+
+class GEOSubmission(Submission):
+    def __init__(self, name, model):
+        super(GEOSubmission, self).__init__(name, model)
+
+    def make_soft(self, result_map):
+        samples = []
+        for lib_id, result_dir in result_map.items():
+            an_analysis = self.get_submission_node(result_dir)
+            samples.append(self.get_sample_metadata(an_analysis))
+
+        soft_template = loader.get_template('geo_submission.soft')
+        context = Context({
+            'samples': samples
+        })
+        print str(soft_template.render(context))
+
+    def check_for_name(self, analysis_node):
+        name = fromTypedNode(
+            self.model.get_target(analysis_node,
+                                  submissionOntology['name']))
+        if name is None:
+            LOGGER.error("Need name for %s" % (str(analysis_node),))
+            return False
+        else:
+            return True
+
+    def get_sample_metadata(self, analysis_node):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_submission.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            })
+
+        formatted_query = query_template.render(context)
+        query = RDF.SPARQLQuery(str(formatted_query))
+        rdfstream = query.execute(self.model)
+        results = []
+        for r in rdfstream:
+            results.append(r)
+        return results
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py
new file mode 100644 (file)
index 0000000..98c25d5
--- /dev/null
@@ -0,0 +1,260 @@
+"""Common submission elements
+"""
+import logging
+import os
+import re
+
+import RDF
+
+from htsworkflow.util.rdfhelp import \
+     blankOrUri, \
+     dafTermOntology, \
+     dump_model, \
+     get_model, \
+     libraryOntology, \
+     owlNS, \
+     rdfNS, \
+     submissionLog, \
+     submissionOntology, \
+     toTypedNode, \
+     fromTypedNode
+from htsworkflow.util.hashfile import make_md5sum
+
+from htsworkflow.submission.daf import \
+     MetadataLookupException, \
+     ModelException, \
+     get_submission_uri
+
+logger = logging.getLogger(__name__)
+
+class Submission(object):
+    def __init__(self, name, model):
+        self.name = name
+        self.model = model
+
+        self.submissionSet = get_submission_uri(self.name)
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+        self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
+
+        self.__view_map = None
+
+    def scan_submission_dirs(self, result_map):
+        """Examine files in our result directory
+        """
+        for lib_id, result_dir in result_map.items():
+            logger.info("Importing %s from %s" % (lib_id, result_dir))
+            try:
+                self.import_analysis_dir(result_dir, lib_id)
+            except MetadataLookupException, e:
+                logger.error("Skipping %s: %s" % (lib_id, str(e)))
+
+    def import_analysis_dir(self, analysis_dir, library_id):
+        """Import a submission directories and update our model as needed
+        """
+        #attributes = get_filename_attribute_map(paired)
+        libNode = self.libraryNS[library_id + "/"]
+
+        self._add_library_details_to_model(libNode)
+
+        submission_files = os.listdir(analysis_dir)
+        for filename in submission_files:
+            self.construct_file_attributes(analysis_dir, libNode, filename)
+
+    def construct_file_attributes(self, analysis_dir, libNode, pathname):
+        """Looking for the best extension
+        The 'best' is the longest match
+
+        :Args:
+        filename (str): the filename whose extention we are about to examine
+        """
+        path, filename = os.path.split(pathname)
+
+        logger.debug("Searching for view")
+        file_classification = self.find_best_match(filename)
+        if file_classification is None:
+            logger.warn("Unrecognized file: {0}".format(pathname))
+            return None
+        if str(file_classification) == str(libraryOntology['ignore']):
+            return None
+
+        an_analysis_name = self.make_submission_name(analysis_dir)
+        an_analysis = self.get_submission_node(analysis_dir)
+        an_analysis_uri = str(an_analysis.uri)
+
+        self.model.add_statement(RDF.Statement(an_analysis,
+                                               submissionOntology['name'],
+                                               toTypedNode(an_analysis_name)))
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          rdfNS['type'],
+                          submissionOntology['submission']))
+        self.model.add_statement(RDF.Statement(an_analysis,
+                                               submissionOntology['library'],
+                                               libNode))
+
+        logger.debug("Adding statements to {0}".format(str(an_analysis)))
+        # add track specific information
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          dafTermOntology['paired'],
+                          toTypedNode(self._is_paired(libNode))))
+        self.model.add_statement(
+            RDF.Statement(an_analysis,
+                          dafTermOntology['submission'],
+                          an_analysis))
+
+        # add file specific information
+        fileNode = self.link_file_to_classes(filename,
+                                             an_analysis,
+                                             an_analysis_uri,
+                                             analysis_dir)
+        self.add_md5s(filename, fileNode, analysis_dir)
+
+        logger.debug("Done.")
+
+    def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
+        # add file specific information
+        fileNode = RDF.Node(RDF.Uri(submission_uri + '/' + filename))
+        self.model.add_statement(
+            RDF.Statement(submissionNode,
+                          dafTermOntology['has_file'],
+                          fileNode))
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          dafTermOntology['filename'],
+                          filename))
+        return fileNode
+
+    def add_md5s(self, filename, fileNode, analysis_dir):
+        logger.debug("Updating file md5sum")
+        submission_pathname = os.path.join(analysis_dir, filename)
+        md5 = make_md5sum(submission_pathname)
+        if md5 is None:
+            errmsg = "Unable to produce md5sum for {0}"
+            logger.warning(errmsg.format(submission_pathname))
+        else:
+            self.model.add_statement(
+                RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
+
+    def _add_library_details_to_model(self, libNode):
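+        # The library page at libNode's URI embeds its metadata as RDFa,
+        # which Redland's 'rdfa' parser can read directly from the web.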
+        parser = RDF.Parser(name='rdfa')
+        new_statements = parser.parse_as_stream(libNode.uri)
+        for s in new_statements:
+            # don't override things we already have in the model
+            targets = list(self.model.get_targets(s.subject, s.predicate))
+            if len(targets) == 0:
+                self.model.append(s)
+
+
+    def find_best_match(self, filename):
+        """Search through potential filename matching patterns
+        """
+        if self.__view_map is None:
+            self.__view_map = self._get_filename_view_map()
+
+        results = []
+        for pattern, view in self.__view_map.items():
+            if re.match(pattern, filename):
+                results.append(view)
+
+        if len(results) > 1:
+            msg = "%s matched multiple views %s" % (
+                filename,
+                [str(x) for x in results])
+            raise ModelException(msg)
+        elif len(results) == 1:
+            return results[0]
+        else:
+            return None
+
+    def _get_filename_view_map(self):
+        """Query our model for filename patterns
+
+        return a dictionary mapping filename-pattern strings to view names
+        """
+        filename_query = RDF.Statement(
+            None, dafTermOntology['filename_re'], None)
+
+        patterns = {}
+        for s in self.model.find_statements(filename_query):
+            view_name = s.subject
+            literal_re = s.object.literal_value['string']
+            logger.debug("Found: %s" % (literal_re,))
+            try:
+                re.compile(literal_re)  # just validate the pattern
+            except re.error:
+                logger.error("Unable to compile: %s" % (literal_re,))
+                continue
+            patterns[literal_re] = view_name
+        return patterns
+
+    def make_submission_name(self, analysis_dir):
+        analysis_dir = os.path.normpath(analysis_dir)
+        analysis_dir_name = os.path.split(analysis_dir)[1]
+        if len(analysis_dir_name) == 0:
+            raise RuntimeError(
+                "Submission dir name too short: {0}".format(analysis_dir))
+        return analysis_dir_name
+
+    def get_submission_node(self, analysis_dir):
+        """Convert a submission directory name to a submission node
+        """
+        submission_name = self.make_submission_name(analysis_dir)
+        return self.submissionSetNS[submission_name]
+
+    def _get_library_attribute(self, libNode, attribute):
+        if not isinstance(attribute, RDF.Node):
+            attribute = libraryOntology[attribute]
+
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
+
+        #targets = self._search_same_as(libNode, attribute)
+        #if targets is not None:
+        #    return self._format_library_attribute(targets)
+
+        # we don't know anything about this attribute
+        self._add_library_details_to_model(libNode)
+
+        targets = list(self.model.get_targets(libNode, attribute))
+        if len(targets) > 0:
+            return self._format_library_attribute(targets)
+
+        return None
+
+    def _format_library_attribute(self, targets):
+        if len(targets) == 0:
+            return None
+        elif len(targets) == 1:
+            return fromTypedNode(targets[0])
+        elif len(targets) > 1:
+            return [fromTypedNode(t) for t in targets]
+
+    def _is_paired(self, libNode):
+        """Determine if a library is paired end"""
+        library_type = self._get_library_attribute(libNode, 'library_type')
+        if library_type is None:
+            errmsg = "%s doesn't have a library type"
+            raise ModelException(errmsg % (str(libNode),))
+
+        single = ['CSHL (lacking last nt)',
+                  'Single End (non-multiplexed)',
+                  'Small RNA (non-multiplexed)',]
+        paired = ['Barcoded Illumina',
+                  'Multiplexing',
+                  'Nextera',
+                  'Paired End (non-multiplexed)',]
+        if library_type in single:
+            return False
+        elif library_type in paired:
+            return True
+        else:
+            raise MetadataLookupException(
+                "Unrecognized library type %s for %s" % \
+                (library_type, str(libNode)))
+
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
new file mode 100644 (file)
index 0000000..ae2ac57
--- /dev/null
@@ -0,0 +1,11 @@
+Soft template
+!Platform_title = Illumina Genome Analyzer (Homo sapiens)
+!Platform_geo_accession = GPL9052
+{% for sample in samples %}{% for row in sample %}{% if forloop.first %}
+^SAMPLE={{row.name}}
+!Sample_title={{row.name}}
+!Sample_organism_ch1 = {{ row.species_name }}
+!Sample_taxid_ch1 = {{ row.taxon_id }}
+{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endif %}{% endspaceless %}{% endif %}
+!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql
new file mode 100644 (file)
index 0000000..1d7cbb1
--- /dev/null
@@ -0,0 +1,35 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+
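+# The submission placeholder below is filled in by Django's template
+# engine with the submission node's URI; see get_sample_metadata().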
+select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate ?mapAlgorithm ?species_name ?taxon_id
+WHERE {
+  <{{submission}}> a submissionOntology:submission .
+
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  #OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  #OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell } .
+  #OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
+  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name }
+  
+  #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  #OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  #OPTIONAL { ?library ucscDaf:readType ?readType }
+  #OPTIONAL { ?library ucscDaf:strain ?strain }
+  #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+  <{{submission}}> submissionOntology:library ?library ;
+                   ucscDaf:has_file ?file ;
+                   submissionOntology:name ?name .
+  ?species libraryOntology:species ?species_name ;
+           libraryOntology:taxon_id ?taxon_id .
+  ?file ucscDaf:filename ?filename .
+}