import logging
+import os
import RDF
from htsworkflow.util.rdfhelp import \
fromTypedNode, \
+ geoSoftNS, \
+ simplifyUri, \
submissionOntology
from django.conf import settings
def make_soft(self, result_map):
+ # Render the GEO SOFT submission document: series/platform metadata
+ # from the RDF model plus one sample block per analysis directory.
samples = []
+ platform = self.get_platform_metadata()
+ platform_attribs = dict(platform)
+ # '^platform' is the SOFT heading pair emitted by
+ # query_to_soft_dictionary; KeyError here means no platform node.
+ platform_id = platform_attribs['^platform']
+ series = self.get_series_metadata()
+ series_attribs = dict(series)
+ series_id = series_attribs['^series']
for lib_id, result_dir in result_map.items():
an_analysis = self.get_submission_node(result_dir)
- samples.append(self.get_sample_metadata(an_analysis))
+ metadata = self.get_sample_metadata(an_analysis)
+ if len(metadata) > 1:
+ errmsg = 'Confused there are more than one samples for %s'
+ # NOTE(review): this signals a data problem but is logged at
+ # debug level — consider LOGGER.warning instead.
+ LOGGER.debug(errmsg % (str(an_analysis,)))
+ # Only the first sample row is kept when several are returned.
+ metadata = metadata[0]
+ metadata['raw'] = self.get_sample_files(an_analysis,
+ geoSoftNS['raw'])
+ # 'supplimental' (sic) deliberately matches the template key
+ # row.supplimental in geo_submission.soft — keep them in sync.
+ metadata['supplimental'] = self.get_sample_files(
+ an_analysis,
+ geoSoftNS['supplemental'])
+ samples.append(metadata)
soft_template = loader.get_template('geo_submission.soft')
context = Context({
- 'samples': samples
+ 'platform': platform,
+ 'series': series,
+ 'samples': samples,
+ 'platform_id': platform_id,
+ 'series_id': series_id,
+ })
print str(soft_template.render(context))
else:
return True
+ def get_platform_metadata(self):
+ """Gather information for filling out the platform section of a SOFT file.
+
+ Renders geo_platform.sparql against this submission set's URI and
+ folds the name/value rows into ('^platform'/'!...', value)
+ attribute pairs via query_to_soft_dictionary.
+ """
+ query_template = loader.get_template('geo_platform.sparql')
+ # The query is parameterized by the submission-set URI.
+ submission = str(self.submissionSetNS[''].uri)
+ context = Context({
+ 'submission': submission,
+ })
+
+ results = self.execute_query(query_template, context)
+ return self.query_to_soft_dictionary(results, 'platform')
+
+ def get_series_metadata(self):
+ """Gather information for filling out the series section of a SOFT file.
+
+ Renders geo_series.sparql against this submission set's URI and
+ folds the name/value rows into ('^series'/'!...', value)
+ attribute pairs via query_to_soft_dictionary.
+ """
+ query_template = loader.get_template('geo_series.sparql')
+ # The query is parameterized by the submission-set URI.
+ submission = str(self.submissionSetNS[''].uri)
+ context = Context({
+ 'submission': submission,
+ })
+
+ results = self.execute_query(query_template, context)
+ return self.query_to_soft_dictionary(results, 'series')
+
def get_sample_metadata(self, analysis_node):
"""Gather information for filling out sample section of a SOFT file
"""
- query_template = loader.get_template('geo_submission.sparql')
+ # Sample attributes now come from geo_samples.sparql.
+ query_template = loader.get_template('geo_samples.sparql')
context = Context({
'submission': str(analysis_node.uri),
+ # submissionSet is needed for the set-wide data_processing lookup.
+ 'submissionSet': str(self.submissionSetNS[''].uri),
})
- formatted_query = query_template.render(context)
- query = RDF.SPARQLQuery(str(formatted_query))
- rdfstream = query.execute(self.model)
- results = []
- for r in rdfstream:
- results.append(r)
+ # Query rendering/execution is shared via execute_query().
+ results = self.execute_query(query_template, context)
for r in results:
+
+ # Flatten multi-line protocol text so each SOFT attribute stays
+ # on a single line.
+ # NOTE(review): dataProtocol is OPTIONAL in the query — confirm a
+ # missing binding doesn't end up as the literal string 'None'.
+ r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
return results
+
+ def get_sample_files(self, analysis_node, file_class):
+ """Gather files attached to a submission, filtered by file class.
+
+ file_class is a geoSoft class node (e.g. geoSoftNS['raw'] or
+ geoSoftNS['supplemental']); geo_files.sparql returns filename,
+ md5sum, file_type and file_type_label rows for matching files.
+ """
+ query_template = loader.get_template('geo_files.sparql')
+
+ context = Context({
+ 'submission': str(analysis_node.uri),
+ 'file_class': str(file_class)
+ })
+
+ return self.execute_query(query_template, context)
+
+ def query_to_soft_dictionary(self, results, heading):
+ """Convert name/value query rows into SOFT attribute pairs.
+
+ Returns a list of (name, value) tuples where the section heading
+ (e.g. 'platform') is prefixed '^' and every other attribute '!',
+ matching SOFT line syntax.
+ """
+ attributes = []
+ for r in results:
+ # Strip the geoSoft namespace; names outside it yield None
+ # and are skipped.
+ name = simplifyUri(geoSoftNS, r['name'])
+ if name is not None:
+ if name.lower() == heading.lower():
+ name = '^' + name
+ else:
+ name = '!' + name
+ # Emit one attribute per non-empty line of the value.
+ # NOTE(review): splitting on os.linesep is platform-dependent;
+ # literals containing plain '\n' won't split on Windows — confirm.
+ for v in fromTypedNode(r['value']).split(os.linesep):
+ v = v.strip()
+ if len(v) > 0:
+ attributes.append((name, v))
+ return attributes
MetadataLookupException, \
get_submission_uri
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
class Submission(object):
def __init__(self, name, model):
self.model = model
self.submissionSet = get_submission_uri(self.name)
- self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+ self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
self.__view_map = None
"""Examine files in our result directory
"""
for lib_id, result_dir in result_map.items():
- logger.info("Importing %s from %s" % (lib_id, result_dir))
+ LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
try:
self.import_analysis_dir(result_dir, lib_id)
except MetadataLookupException, e:
- logger.error("Skipping %s: %s" % (lib_id, str(e)))
+ LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
def import_analysis_dir(self, analysis_dir, library_id):
"""Import a submission directories and update our model as needed
"""
path, filename = os.path.split(pathname)
- logger.debug("Searching for view")
- file_classification = self.find_best_match(filename)
- if file_classification is None:
- logger.warn("Unrecognized file: {0}".format(pathname))
+ LOGGER.debug("Searching for view")
+ file_type = self.find_best_match(filename)
+ if file_type is None:
+ LOGGER.warn("Unrecognized file: {0}".format(pathname))
return None
- if str(file_classification) == str(libraryOntology['ignore']):
+ if str(file_type) == str(libraryOntology['ignore']):
return None
an_analysis_name = self.make_submission_name(analysis_dir)
an_analysis = self.get_submission_node(analysis_dir)
an_analysis_uri = str(an_analysis.uri)
+ # Resolve the class (e.g. raw/supplemental) of the matched file
+ # type so files can later be grouped by class.
+ file_classification = self.model.get_target(file_type,
+ rdfNS['type'])
+ if file_classification is None:
+ errmsg = 'Could not find class for {0}'
+ # NOTE(review): 'logger' is undefined — the module logger was
+ # renamed to LOGGER; this line raises NameError when reached.
+ logger.warning(errmsg.format(str(file_type)))
+ return
+ # Attach this analysis to the submission set.
+ self.model.add_statement(
+ RDF.Statement(self.submissionSetNS[''],
+ submissionOntology['has_submission'],
+ an_analysis))
self.model.add_statement(RDF.Statement(an_analysis,
submissionOntology['name'],
toTypedNode(an_analysis_name)))
submissionOntology['library'],
libNode))
- logger.debug("Adding statements to {0}".format(str(an_analysis)))
+ LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
# add track specific information
self.model.add_statement(
RDF.Statement(an_analysis,
an_analysis_uri,
analysis_dir)
self.add_md5s(filename, fileNode, analysis_dir)
-
- logger.debug("Done.")
+ self.model.add_statement(
+ RDF.Statement(fileNode,
+ rdfNS['type'],
+ file_type))
+ LOGGER.debug("Done.")
def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
# add file specific information
return fileNode
def add_md5s(self, filename, fileNode, analysis_dir):
+ """Attach an md5sum statement to fileNode for the named file.
+
+ Best effort: if the checksum cannot be computed a warning is
+ logged and no statement is added.
+ """
- logger.debug("Updating file md5sum")
+ LOGGER.debug("Updating file md5sum")
submission_pathname = os.path.join(analysis_dir, filename)
md5 = make_md5sum(submission_pathname)
if md5 is None:
errmsg = "Unable to produce md5sum for {0}"
- logger.warning(errmsg.format(submission_pathname))
+ LOGGER.warning(errmsg.format(submission_pathname))
else:
self.model.add_statement(
RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
for s in self.model.find_statements(filename_query):
view_name = s.subject
literal_re = s.object.literal_value['string']
- logger.debug("Found: %s" % (literal_re,))
+ LOGGER.debug("Found: %s" % (literal_re,))
try:
filename_re = re.compile(literal_re)
except re.error, e:
- logger.error("Unable to compile: %s" % (literal_re,))
+ LOGGER.error("Unable to compile: %s" % (literal_re,))
patterns[literal_re] = view_name
return patterns
"Unrecognized library type %s for %s" % \
(library_type, str(libNode)))
+ def execute_query(self, template, context):
+ """Render a SPARQL Django template and run it against our model.
+
+ template is a Django template producing SPARQL text; context
+ supplies its variables. Returns the result bindings as a list.
+ """
+ formatted_query = template.render(context)
+ LOGGER.debug(formatted_query)
+ query = RDF.SPARQLQuery(str(formatted_query))
+ rdfstream = query.execute(self.model)
+ # Materialize the stream so callers can iterate more than once.
+ results = []
+ for r in rdfstream:
+ results.append(r)
+ return results
--- /dev/null
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+WHERE {
+ <{{submission}}> ucscDaf:has_file ?file ;
+ a submissionOntology:submission .
+
+ ?file ucscDaf:filename ?filename ;
+ ucscDaf:md5sum ?md5sum ;
+ a ?file_type .
+ ?file_type a <{{file_class}}> ;
+ geoSoft:fileTypeLabel ?file_type_label .
+
+}
\ No newline at end of file
--- /dev/null
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+ <{{submission}}> submissionOntology:has_platform ?platform .
+ ?platform a geoSoft:platform .
+
+ ?platform ?name ?value .
+}
--- /dev/null
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
+WHERE {
+ <{{submission}}> a submissionOntology:submission .
+
+ OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+ OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+ OPTIONAL { ?library libraryOntology:antibody ?antibody }
+ OPTIONAL { ?library libraryOntology:cell_line ?cell .
+ ?cell_line cells:cell ?cell ;
+ cells:documents ?growthProtocol . }
+ OPTIONAL { ?library ucscDaf:sex ?sex }
+ OPTIONAL { ?library libraryOntology:library_id ?library_id }
+ OPTIONAL { ?library libraryOntology:replicate ?replicate }
+ OPTIONAL { ?library libraryOntology:species ?species_name }
+ OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+ OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+ OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+ OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+ OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+ OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+ OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+ OPTIONAL { ?library ucscDaf:protocol ?protocol }
+ OPTIONAL { ?library ucscDaf:readType ?readType }
+ OPTIONAL { ?library ucscDaf:strain ?strain }
+ OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+ OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+ <{{submission}}> submissionOntology:library ?library ;
+ submissionOntology:name ?name .
+ ?species libraryOntology:species ?species_name ;
+ libraryOntology:taxon_id ?taxon_id .
+
+
+}
\ No newline at end of file
--- /dev/null
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+ <{{submission}}> submissionOntology:has_series ?series.
+ ?series a geoSoft:series .
+
+ ?series ?name ?value .
+}
-Soft template
-!Platform_title = Illumina Genome Analyzer (Homo sapiens)
-!Platform_geo_accession = GPL9052
-{% for sample in samples %}{% for row in sample %}{%if forloop.first %}
+{% for name, value in series %}{{name}} = {{value}}
+{% endfor %}!Series_platform_id = {{ platform_id }}
+{% for row in samples %}
^SAMPLE={{row.name}}
+!Sample_type=SRA
!Sample_title={{row.name}}
+!Sample_series_id = {{ series_id }}
+!Sample_instrument_model = Illumina Genome Analyzer
+!Sample_instrument_model = Illumina Genome Analyzer II
+!Sample_instrument_model = Illumina Genome Analyzer IIx
+!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_channel_count = 1
!Sample_organism_ch1 = {{ row.species_name }}
!Sample_taxid_ch1 = {{ row.taxon_id }}
-{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
-{% endif %}{% endspaceless %}{% endif %}
-!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
+!Sample_platform_id = {{ platform_id }}
+!Sample_source_name_ch1={{row.cell}}
+!Sample_library_strategy={{ row.experiment_type }}
+!Sample_library_source={{row.library_source}}
+!Sample_library_selection={{ row.library_selection }}
+!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
+!Sample_extract_protocol={{ row.extractProtocol|safe }}
+!Sample_data_processing={{ row.dataProtocol|safe }}
+!Sample_molecule_ch1 = {{ row.extractMolecule }}
+!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+{% if row.cell %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endspaceless %}{% endif %}
+{% if row.readType %}{% spaceless %}
+!Sample_characteristics_ch1 = readType: {{ row.readType }}
+{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
+!Sample_characteristics_ch1 = antibody: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
+{% endfor %}{% endfor %}
\ No newline at end of file
+++ /dev/null
-PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
-
-select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id
-WHERE {
- <{{submission}}> a submissionOntology:submission .
-
- OPTIONAL { ?submission ucscDaf:control ?control }
- #OPTIONAL { ?submission ucscDaf:controlId ?controlId }
- #OPTIONAL { ?library libraryOntology:antibody ?antibody }
- OPTIONAL { ?library libraryOntology:cell_line ?cell } .
- #OPTIONAL { ?library ucscDaf:sex ?sex }
- OPTIONAL { ?library libraryOntology:library_id ?labExpId }
- OPTIONAL { ?library libraryOntology:library_id ?labVersion }
- OPTIONAL { ?library libraryOntology:replicate ?replicate }
- OPTIONAL { ?library libraryOntology:species ?species_name }
-
- #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
- #OPTIONAL { ?library ucscDaf:protocol ?protocol }
- #OPTIONAL { ?library ucscDaf:readType ?readType }
- #OPTIONAL { ?library ucscDaf:strain ?strain }
- #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
- #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-
- <{{submission}}> submissionOntology:library ?library ;
- ucscDaf:has_file ?file ;
- submissionOntology:name ?name .
- ?species libraryOntology:species ?species_name ;
- libraryOntology:taxon_id ?taxon_id .
- ?file ucscDaf:filename ?filename .
-}