From: Diane Trout Date: Tue, 25 Sep 2012 23:18:42 +0000 (-0700) Subject: This might actually generate a soft file with raw & supplemental data. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=93f0db5c7e4b67e2fb5dd82a9ce7749343ef81a4 This might actually generate a soft file with raw & supplemental data. To make working with the development server easier, I changed the submission class to take a host, which it will use to generate the base library URL. When constructing URLs for files, I'm now using the actual path names instead of synthesizing something based on the submission name. This is to limit the amount of knowledge that needs to be shared with the fastq generation code. For fastq files, it looks at the source file to find the flowcell information. For supplemental files, it looks at the submission class for that analysis directory and grabs the library ID from there. --- diff --git a/encode_submission/geo_gather.py b/encode_submission/geo_gather.py index 9db286d..f9b07ab 100644 --- a/encode_submission/geo_gather.py +++ b/encode_submission/geo_gather.py @@ -58,9 +58,7 @@ def main(cmdline=None): model = get_model(opts.model, opts.db_path) mapper = None if opts.name: - mapper = GEOSubmission(opts.name, model) - if opts.library_url is not None: - mapper.library_url = opts.library_url + mapper = GEOSubmission(opts.name, model, host=opts.host) submission_uri = get_submission_uri(opts.name) diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py index 01fe6c5..f78a964 100644 --- a/htsworkflow/submission/condorfastq.py +++ b/htsworkflow/submission/condorfastq.py @@ -195,7 +195,6 @@ WHERE { imported = False a_lane = self.model.get_target(r['flowcell'], libraryOntology['has_lane']) - print a_lane if a_lane is None: imported = True # we lack information about which lanes were on this flowcell diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py index 8594715..097bea1 100644 ---
a/htsworkflow/submission/geo.py +++ b/htsworkflow/submission/geo.py @@ -17,8 +17,8 @@ from django.template import Context, loader LOGGER = logging.getLogger(__name__) class GEOSubmission(Submission): - def __init__(self, name, model): - super(GEOSubmission, self).__init__(name, model) + def __init__(self, name, model, host): + super(GEOSubmission, self).__init__(name, model, host) def make_soft(self, result_map): samples = [] @@ -36,7 +36,7 @@ class GEOSubmission(Submission): LOGGER.error(errmsg.format(str(an_analysis),)) continue elif len(metadata) > 1: - errmsg = 'Confused there are more than one samples for %s' + errmsg = 'Confused there are more than one sample for %s' LOGGER.debug(errmsg % (str(an_analysis),)) metadata = metadata[0] metadata['raw'] = self.get_raw_files(an_analysis) diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py index 2b04ff4..c944b76 100644 --- a/htsworkflow/submission/submission.py +++ b/htsworkflow/submission/submission.py @@ -27,13 +27,13 @@ from htsworkflow.submission.daf import \ LOGGER = logging.getLogger(__name__) class Submission(object): - def __init__(self, name, model): + def __init__(self, name, model, host): self.name = name self.model = model self.submissionSet = get_submission_uri(self.name) self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#') - self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/') + self.libraryNS = RDF.NS('{0}/library/'.format(host)) self.__view_map = None @@ -57,7 +57,8 @@ class Submission(object): submission_files = os.listdir(analysis_dir) for filename in submission_files: - self.construct_file_attributes(analysis_dir, libNode, filename) + pathname = os.path.abspath(os.path.join(analysis_dir, filename)) + self.construct_file_attributes(analysis_dir, libNode, pathname) def construct_file_attributes(self, analysis_dir, libNode, pathname): """Looking for the best extension @@ -113,7 +114,7 @@ class Submission(object): an_analysis)) # add file specific 
information - fileNode = self.link_file_to_classes(filename, + fileNode = self.link_file_to_classes(pathname, an_analysis, an_analysis_uri, analysis_dir) @@ -124,9 +125,10 @@ class Submission(object): file_type)) LOGGER.debug("Done.") - def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir): + def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir): # add file specific information - fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(filename))) + path, filename = os.path.split(pathname) + fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname))) self.model.add_statement( RDF.Statement(submissionNode, dafTermOntology['has_file'], diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql index de9097b..428cef7 100644 --- a/htsworkflow/templates/geo_fastqs.sparql +++ b/htsworkflow/templates/geo_fastqs.sparql @@ -3,21 +3,23 @@ PREFIX submissionOntology: PREFIX ncbiTaxon: PREFIX geoSoft: +PREFIX dc: select distinct ?lane ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model WHERE { - <{{submission}}> submissionOntology:library ?library ; + <{{submission}}> ucscDaf:has_file ?file ; + submissionOntology:library ?library ; a submissionOntology:submission . ?file ucscDaf:filename ?filename ; ucscDaf:md5sum ?md5sum ; - libraryOntology:library ?library ; + dc:source ?source ; a ?file_type . ?file_type a <{{file_class}}> ; geoSoft:fileTypeLabel ?file_type_label . - ?library libraryOntology:has_lane ?lane . - ?lane libraryOntology:flowcell ?flowcell . + ?source libraryOntology:flowcell ?flowcell . 
+ ?flowcell libraryOntology:flowcell_id ?flowcell_id ; libraryOntology:read_length ?read_length ; libraryOntology:flowcell_type ?flowcell_type ; diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql index e3fcb9d..6fd7cac 100644 --- a/htsworkflow/templates/geo_files.sparql +++ b/htsworkflow/templates/geo_files.sparql @@ -3,20 +3,22 @@ PREFIX submissionOntology: PREFIX ncbiTaxon: PREFIX geoSoft: +PREFIX dc: select distinct ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model WHERE { <{{submission}}> ucscDaf:has_file ?file ; + submissionOntology:library ?library ; a submissionOntology:submission . ?file ucscDaf:filename ?filename ; ucscDaf:md5sum ?md5sum ; - libraryOntology:has_lane ?lane ; a ?file_type . ?file_type a <{{file_class}}> ; geoSoft:fileTypeLabel ?file_type_label . - OPTIONAL { ?lane libraryOntology:flowcell ?flowcell . + OPTIONAL { ?file dc:source ?source_file . + ?source_file libraryOntology:flowcell ?flowcell . ?flowcell libraryOntology:flowcell_id ?flowcell_id ; libraryOntology:read_length ?read_length ; libraryOntology:flowcell_type ?flowcell_type ;