Get actual list of sequencers used for a library.
authorDiane Trout <diane@caltech.edu>
Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
committerDiane Trout <diane@caltech.edu>
Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
I do have pipeline version numbers available, but with the current
version of the query it would end up duplicating the sequencer
model number. (There are a lot more version combinations than
there are sequencers.)

htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_run_details.sparql [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft

index 8ff349fe9ae7444c5cc82f20c15e8458d6cc0599..ad6642251931df7a49db3ec1c23a34d4c30ab4c0 100644 (file)
@@ -40,6 +40,7 @@ class GEOSubmission(Submission):
             metadata['supplimental'] = self.get_sample_files(
                 an_analysis,
                 geoSoftNS['supplemental'])
+            metadata['run'] = self.get_run_details(an_analysis)
             samples.append(metadata)
 
         soft_template = loader.get_template('geo_submission.soft')
@@ -98,8 +99,8 @@ class GEOSubmission(Submission):
 
         results = self.execute_query(query_template, context)
         for r in results:
-
             r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
+
         return results
 
     def get_sample_files(self, analysis_node, file_class):
@@ -114,6 +115,17 @@ class GEOSubmission(Submission):
 
         return self.execute_query(query_template, context)
 
+    def get_run_details(self, analysis_node):
+        """Run the geo_run_details.sparql query for one analysis node.
+        """
+        run_query = loader.get_template('geo_run_details.sparql')
+
+        # the template substitutes {{submission}} with this URI string
+        submission_uri = str(analysis_node.uri)
+        run_context = Context({'submission': submission_uri})
+
+        return self.execute_query(run_query, run_context)
+
     def query_to_soft_dictionary(self, results, heading):
         attributes = []
         for r in results:
index e4ce90c73b073287913ec2f76f5f1d5f0d9bd887..6dd630aeda90fe5ad08a96da8be610f628642b04 100644 (file)
@@ -149,13 +149,44 @@ class Submission(object):
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
     def _add_library_details_to_model(self, libNode):
+        # predicates that are allowed to have several values per subject
+        set_attributes = set((libraryOntology['has_lane'],
+                              libraryOntology['has_mappings'],
+                              dafTermOntology['has_file']))
         parser = RDF.Parser(name='rdfa')
         new_statements = parser.parse_as_stream(libNode.uri)
+        toadd = []
         for s in new_statements:
+            # multi-valued predicates skip the "already in model" check below
+            if s.predicate in set_attributes:
+                toadd.append(s)
+                continue
             # don't override things we already have in the model
             targets = list(self.model.get_targets(s.subject, s.predicate))
             if len(targets) == 0:
-                self.model.append(s)
+                toadd.append(s)
+
+        for s in toadd:
+            self.model.append(s)
+
+        self._add_lane_details(libNode)
+
+    def _add_lane_details(self, libNode):
+        """Parse each has_lane URI of libNode into self.model as RDFa.
+        """
+        query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
+        # snapshot lanes first; presumably avoids changing the model mid-query
+        lanes = [s.object for s in self.model.find_statements(query)]
+
+        parser = RDF.Parser(name='rdfa')
+        for lane in lanes:
+            LOGGER.debug("Importing %s", lane.uri)
+            try:
+                parser.parse_into_model(self.model, lane.uri)
+            except RDF.RedlandError:
+                # log, then re-raise; bare raise keeps the original traceback
+                LOGGER.error("Error accessing %s", lane.uri)
+                raise
 
 
     def find_best_match(self, filename):
diff --git a/htsworkflow/templates/geo_run_details.sparql b/htsworkflow/templates/geo_run_details.sparql
new file mode 100644 (file)
index 0000000..aa7e438
--- /dev/null
@@ -0,0 +1,27 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+# Only the sequencer model is returned for now. The flowcell/software variables
+# matched below are not returned unless the commented-out select is used instead.
+#select distinct ?flowcell ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+select distinct ?sequencer_model
+where {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?library libraryOntology:library_id ?library_id ;
+           libraryOntology:has_lane ?lane ;
+           a libraryOntology:library .
+  OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
+             ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
+}
index 00c0b4dc77818789bfea8950507fbd1dba0c13fb..3541e56aeed896237ac35dcea76fcf4cf94bc085 100644 (file)
@@ -4,11 +4,8 @@
 ^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}
-!Sample_instrument_model = Illumina Genome Analyzer
-!Sample_instrument_model = Illumina Genome Analyzer II
-!Sample_instrument_model = Illumina Genome Analyzer IIx
-!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_series_id = {{ series_id }}{% for run in row.run %}
+!Sample_instrument_model = {{ run.sequencer_model }}{% endfor %}
 !Sample_channel_count = 1
 !Sample_organism_ch1 = {{ row.species_name }}
 !Sample_taxid_ch1 = {{ row.taxon_id }}