This might actually generate a SOFT file with raw & supplemental data.
author Diane Trout <diane@caltech.edu>
Tue, 25 Sep 2012 23:18:42 +0000 (16:18 -0700)
committer Diane Trout <diane@caltech.edu>
Tue, 25 Sep 2012 23:18:42 +0000 (16:18 -0700)
To make working with the development server easier, I changed
the submission class to take a host which it will use to generate
the base library url.

When constructing URLs for files, I'm now using the actual path names
instead of synthesizing something based on the submission name.
This is to limit the amount of knowledge that needs to be passed
to and from the fastq generation code.

For fastq files it looks at the source file to find the flowcell
information. For supplemental files it looks at the submission
class for that analysis directory and grabs the library id
from there.

encode_submission/geo_gather.py
htsworkflow/submission/condorfastq.py
htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_files.sparql

index 9db286d37b6326e1b576e1045a3c04b5e92beabe..f9b07ab775a4491226db9c9b97215ea82f7cc54e 100644 (file)
@@ -58,9 +58,7 @@ def main(cmdline=None):
     model = get_model(opts.model, opts.db_path)
     mapper = None
     if opts.name:
-        mapper = GEOSubmission(opts.name,  model)
-        if opts.library_url is not None:
-            mapper.library_url = opts.library_url
+        mapper = GEOSubmission(opts.name,  model, host=opts.host)
         submission_uri = get_submission_uri(opts.name)
 
 
index 01fe6c5a19274869b22e41821011135ee8ccb6ae..f78a964c978da660d88f5ac8f263cc7f54f15149 100644 (file)
@@ -195,7 +195,6 @@ WHERE {
             imported = False
             a_lane = self.model.get_target(r['flowcell'],
                                            libraryOntology['has_lane'])
-            print a_lane
             if a_lane is None:
                 imported = True
                 # we lack information about which lanes were on this flowcell
index 85947158390f699afa8a76cd719a0b1fadf26ebf..097bea1083af8c1ffc39b3edea9a6e94e0a63fcd 100644 (file)
@@ -17,8 +17,8 @@ from django.template import Context, loader
 LOGGER = logging.getLogger(__name__)
 
 class GEOSubmission(Submission):
-    def __init__(self, name, model):
-        super(GEOSubmission, self).__init__(name, model)
+    def __init__(self, name, model, host):
+        super(GEOSubmission, self).__init__(name, model, host)
 
     def make_soft(self, result_map):
         samples = []
@@ -36,7 +36,7 @@ class GEOSubmission(Submission):
                 LOGGER.error(errmsg.format(str(an_analysis),))
                 continue
             elif len(metadata) > 1:
-                errmsg = 'Confused there are more than one samples for %s'
+                errmsg = 'Confused there are more than one sample for %s'
                 LOGGER.debug(errmsg % (str(an_analysis),))
             metadata = metadata[0]
             metadata['raw'] = self.get_raw_files(an_analysis)
index 2b04ff43a05ce6e70d6ccb7b20f7f17d54165699..c944b7612d127266ce62c973cda4a54356b0129d 100644 (file)
@@ -27,13 +27,13 @@ from htsworkflow.submission.daf import \
 LOGGER = logging.getLogger(__name__)
 
 class Submission(object):
-    def __init__(self, name, model):
+    def __init__(self, name, model, host):
         self.name = name
         self.model = model
 
         self.submissionSet = get_submission_uri(self.name)
         self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
-        self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
+        self.libraryNS = RDF.NS('{0}/library/'.format(host))
 
         self.__view_map = None
 
@@ -57,7 +57,8 @@ class Submission(object):
 
         submission_files = os.listdir(analysis_dir)
         for filename in submission_files:
-            self.construct_file_attributes(analysis_dir, libNode, filename)
+            pathname = os.path.abspath(os.path.join(analysis_dir, filename))
+            self.construct_file_attributes(analysis_dir, libNode, pathname)
 
     def construct_file_attributes(self, analysis_dir, libNode, pathname):
         """Looking for the best extension
@@ -113,7 +114,7 @@ class Submission(object):
                           an_analysis))
 
         # add file specific information
-        fileNode = self.link_file_to_classes(filename,
+        fileNode = self.link_file_to_classes(pathname,
                                              an_analysis,
                                              an_analysis_uri,
                                              analysis_dir)
@@ -124,9 +125,10 @@ class Submission(object):
                           file_type))
         LOGGER.debug("Done.")
 
-    def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
+    def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir):
         # add file specific information
-        fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(filename)))
+        path, filename = os.path.split(pathname)
+        fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
         self.model.add_statement(
             RDF.Statement(submissionNode,
                           dafTermOntology['has_file'],
index de9097ba9962849dde661bbd9ebd38521573f781..428cef7933fb196b1b69760125ecead4b511fdcb 100644 (file)
@@ -3,21 +3,23 @@ PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntol
 PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
 PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX dc: <http://purl.org/dc/elements/1.1/>
 
 select distinct ?lane ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
 WHERE {
-  <{{submission}}> submissionOntology:library ?library ;
+  <{{submission}}> ucscDaf:has_file ?file ;
+                   submissionOntology:library ?library ;
                    a submissionOntology:submission .
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        libraryOntology:library ?library ;
+        dc:source ?source ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
-  ?library libraryOntology:has_lane ?lane .
-  ?lane libraryOntology:flowcell ?flowcell .
+  ?source libraryOntology:flowcell ?flowcell .
+
   ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
             libraryOntology:read_length ?read_length ;
             libraryOntology:flowcell_type ?flowcell_type ;
index e3fcb9d8d4028c57f752e254019227f6e61b85bf..6fd7cac6ad92a92517b38240c0cf1b7ccbc57d70 100644 (file)
@@ -3,20 +3,22 @@ PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntol
 PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
 PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX dc: <http://purl.org/dc/elements/1.1/>
 
 select distinct ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
 WHERE {
   <{{submission}}> ucscDaf:has_file ?file ;
+                   submissionOntology:library ?library ;
                    a submissionOntology:submission .
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        libraryOntology:has_lane ?lane ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
-  OPTIONAL { ?lane libraryOntology:flowcell ?flowcell .
+  OPTIONAL { ?file dc:source ?source_file .
+             ?source_file libraryOntology:flowcell ?flowcell .
              ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
                        libraryOntology:read_length ?read_length ;
                        libraryOntology:flowcell_type ?flowcell_type ;