Get actual list of sequencers used for a library.

author Diane Trout <diane@caltech.edu>

Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)

committer Diane Trout <diane@caltech.edu>

Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
author Diane Trout <diane@caltech.edu>
Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
committer Diane Trout <diane@caltech.edu>
Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py

index 8ff349fe9ae7444c5cc82f20c15e8458d6cc0599..ad6642251931df7a49db3ec1c23a34d4c30ab4c0 100644 (file)
--- a/htsworkflow/submission/geo.py
+++ b/htsworkflow/submission/geo.py
@@ -40,6 +40,7 @@ class GEOSubmission(Submission):
              metadata['supplimental'] = self.get_sample_files(
                  an_analysis,
                  geoSoftNS['supplemental'])
+            metadata['run'] = self.get_run_details(an_analysis)
              samples.append(metadata)
  
          soft_template = loader.get_template('geo_submission.soft')
@@ -98,8 +99,8 @@ class GEOSubmission(Submission):
  
          results = self.execute_query(query_template, context)
          for r in results:
-
              r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
+
          return results
  
      def get_sample_files(self, analysis_node, file_class):
@@ -114,6 +115,17 @@ class GEOSubmission(Submission):
  
          return self.execute_query(query_template, context)
  
+    def get_run_details(self, analysis_node):
+        """Run the geo_run_details.sparql query for analysis_node's URI and return the results.
+        """
+        query_template = loader.get_template('geo_run_details.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            })
+
+        return self.execute_query(query_template, context)
+
      def query_to_soft_dictionary(self, results, heading):
          attributes = []
          for r in results:
diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py

index e4ce90c73b073287913ec2f76f5f1d5f0d9bd887..6dd630aeda90fe5ad08a96da8be610f628642b04 100644 (file)
--- a/htsworkflow/submission/submission.py
+++ b/htsworkflow/submission/submission.py
@@ -149,13 +149,44 @@ class Submission(object):
                  RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
  
      def _add_library_details_to_model(self, libNode):
+        # attributes that can have multiple values
+        set_attributes = set((libraryOntology['has_lane'],
+                              libraryOntology['has_mappings'],
+                              dafTermOntology['has_file']))
          parser = RDF.Parser(name='rdfa')
          new_statements = parser.parse_as_stream(libNode.uri)
+        toadd = []
          for s in new_statements:
+            # always add "collections"
+            if s.predicate in set_attributes:
+                toadd.append(s)
+                continue
              # don't override things we already have in the model
              targets = list(self.model.get_targets(s.subject, s.predicate))
              if len(targets) == 0:
-                self.model.append(s)
+                toadd.append(s)
+
+        for s in toadd:
+            self.model.append(s)
+
+        self._add_lane_details(libNode)
+
+    def _add_lane_details(self, libNode):
+        """Parse the RDFa of every has_lane target of libNode into the model.
+        """
+        query = RDF.Statement(libNode, libraryOntology['has_lane'], None)
+        lanes = []
+        for lane_stmt in self.model.find_statements(query):
+            lanes.append(lane_stmt.object)
+
+        parser = RDF.Parser(name='rdfa')
+        for lane in lanes:
+            LOGGER.debug("Importing %s" % (lane.uri,))
+            try:
+                parser.parse_into_model(self.model, lane.uri)
+            except RDF.RedlandError, e:
+                LOGGER.error("Error accessing %s" % (lane.uri,))
+                raise e
  
  
      def find_best_match(self, filename):
diff --git a/htsworkflow/templates/geo_run_details.sparql b/htsworkflow/templates/geo_run_details.sparql

new file mode 100644 (file)

index 0000000..aa7e438
--- /dev/null
+++ b/htsworkflow/templates/geo_run_details.sparql
@@ -0,0 +1,27 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+# Right now we only select the sequencer model.
+# It might make sense to also report each flowcell and its image/basecall software.
+#select distinct ?flowcell ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+select distinct ?sequencer_model
+where {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?library libraryOntology:library_id ?library_id ;
+           libraryOntology:has_lane ?lane ;
+           a libraryOntology:library .
+  OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
+             ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
+}
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft

index 00c0b4dc77818789bfea8950507fbd1dba0c13fb..3541e56aeed896237ac35dcea76fcf4cf94bc085 100644 (file)
--- a/htsworkflow/templates/geo_submission.soft
+++ b/htsworkflow/templates/geo_submission.soft
@@ -4,11 +4,8 @@
  ^SAMPLE={{row.name}}
  !Sample_type=SRA
  !Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}
-!Sample_instrument_model = Illumina Genome Analyzer
-!Sample_instrument_model = Illumina Genome Analyzer II
-!Sample_instrument_model = Illumina Genome Analyzer IIx
-!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_series_id = {{ series_id }}{% for run in row.run %}
+!Sample_instrument_model = {{ run.sequencer_model }}{% endfor %}
  !Sample_channel_count = 1
  !Sample_organism_ch1 = {{ row.species_name }}
  !Sample_taxid_ch1 = {{ row.taxon_id }}
author	Diane Trout <diane@caltech.edu>
	Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
committer	Diane Trout <diane@caltech.edu>
	Tue, 24 Jul 2012 19:39:34 +0000 (12:39 -0700)
htsworkflow/submission/geo.py		patch \| blob \| history
htsworkflow/submission/submission.py		patch \| blob \| history
htsworkflow/templates/geo_run_details.sparql	[new file with mode: 0644]	patch \| blob
htsworkflow/templates/geo_submission.soft		patch \| blob \| history