htsworkflow/submission/geo.py

   1 from __future__ import print_function
   2
   3 import logging
   4 import os
   5
   6 import RDF
   7
   8 from htsworkflow.submission.submission import Submission
   9
  10 from htsworkflow.util.rdfhelp import \
  11      fromTypedNode, \
  12      geoSoftNS, \
  13      strip_namespace, \
  14      submissionOntology
  15
  16 from django.conf import settings
  17 from django.template import Context, loader
  18
  19 LOGGER = logging.getLogger(__name__)
  20
  21 class GEOSubmission(Submission):
  22     def __init__(self, name, model, host):
  23         super(GEOSubmission, self).__init__(name, model, host)
  24
  25     def make_soft(self, result_map):
  26         samples = []
  27         platform = self.get_platform_metadata()
  28         platform_attribs = dict(platform)
  29         platform_id = platform_attribs['^platform']
  30         series = self.get_series_metadata()
  31         series_attribs = dict(series)
  32         series_id = series_attribs['^series']
  33         for lib_id, result_dir in result_map.items():
  34             an_analysis = self.get_submission_node(result_dir)
  35             metadata = self.get_sample_metadata(an_analysis)
  36             if len(metadata) == 0:
  37                 errmsg = 'No metadata found for {0}'
  38                 LOGGER.error(errmsg.format(str(an_analysis),))
  39                 continue
  40             elif len(metadata) > 1:
  41                 errmsg = 'Confused there are more than one sample for %s'
  42                 LOGGER.debug(errmsg % (str(an_analysis),))
  43             metadata = metadata[0]
  44             metadata['raw'] = self.get_raw_files(an_analysis)
  45             metadata['supplimental'] = self.get_sample_files(an_analysis)
  46             metadata['run'] = self.get_run_details(an_analysis)
  47             samples.append(metadata)
  48
  49         soft_template = loader.get_template('geo_submission.soft')
  50         context = Context({
  51             'platform': platform,
  52             'series': series,
  53             'samples': samples,
  54             'platform_id': platform_id,
  55             'series_id': series_id,
  56         })
  57         print(str(soft_template.render(context)))
  58
  59     def check_for_name(self, analysis_node):
  60         name = fromTypedNode(
  61             self.model.get_target(analysis_node,
  62                                   submissionOntology['name']))
  63         if name is None:
  64             logger.error("Need name for %s" % (str(analysis_node)))
  65             return False
  66         else:
  67             return True
  68
  69     def get_platform_metadata(self):
  70         """Gather information for filling out sample section of a SOFT file
  71         """
  72         query_template = loader.get_template('geo_platform.sparql')
  73         submission = str(self.submissionSetNS[''].uri)
  74         context = Context({
  75             'submission': submission,
  76             })
  77
  78         results = self.execute_query(query_template, context)
  79         return self.query_to_soft_dictionary(results, 'platform')
  80
  81     def get_series_metadata(self):
  82         """Gather information for filling out sample section of a SOFT file
  83         """
  84         query_template = loader.get_template('geo_series.sparql')
  85         submission = str(self.submissionSetNS[''].uri)
  86         context = Context({
  87             'submission': submission,
  88             })
  89
  90         results = self.execute_query(query_template, context)
  91         return self.query_to_soft_dictionary(results, 'series')
  92
  93     def get_sample_metadata(self, analysis_node):
  94         """Gather information for filling out sample section of a SOFT file
  95         """
  96         query_template = loader.get_template('geo_samples.sparql')
  97
  98         context = Context({
  99             'submission': str(analysis_node.uri),
 100             'submissionSet': str(self.submissionSetNS[''].uri),
 101             })
 102
 103         results = self.execute_query(query_template, context)
 104         for r in results:
 105             r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
 106
 107         return results
 108
 109     def get_sample_files(self, analysis_node):
 110         """Gather derived files
 111         """
 112         query_template = loader.get_template('geo_files.sparql')
 113
 114         context = Context({
 115             'submission': str(analysis_node.uri),
 116             'file_class': str(geoSoftNS['supplemental'].uri)
 117             })
 118
 119         return self.execute_query(query_template, context)
 120
 121     def get_raw_files(self, analysis_node):
 122         """Gather raw data e.g. fastq files.
 123         """
 124         query_template = loader.get_template('geo_fastqs.sparql')
 125
 126         context = Context({
 127             'submission': str(analysis_node.uri),
 128             'file_class': str(geoSoftNS['raw'].uri),
 129             })
 130
 131         lanes = {}
 132         for row in self.execute_query(query_template, context):
 133             data = {}
 134             for k, v in row.items():
 135                 data[k] = v
 136             library = str(data['library'])
 137             lanes.setdefault(library, []).append(data)
 138         result = []
 139         for library, files in lanes.items():
 140             if len(files) > 2:
 141                 errmsg = "Don't know what to do with more than 2 raw files"
 142                 raise ValueError(errmsg)
 143             elif len(files) == 2:
 144                 is_paired = True
 145             elif len(files) == 1:
 146                 is_paired = False
 147             elif len(files) == 0:
 148                 raise RuntimeError("Empty library list discovered")
 149             files = self._format_filename(files, is_paired)
 150             files = self._format_flowcell_type(files, is_paired)
 151             files = self._format_read_length(files, is_paired)
 152             result.append(files[0])
 153         return result
 154
 155     def _format_flowcell_type(self, files, is_paired):
 156         """Used by get_raw_files to format value for single_or_paired-end
 157         """
 158         for f in files:
 159             if 'flowcell_type' in f:
 160                 flowcell_type = fromTypedNode(f['flowcell_type'])
 161                 if flowcell_type is None:
 162                     pass
 163                 elif flowcell_type.lower() == "paired":
 164                     f['flowcell_type'] = 'paired-end'
 165                 else:
 166                     f['flowcell_type'] = 'single'
 167
 168         return files
 169
 170     def _format_read_length(self, files, is_paired):
 171         """Format
 172         """
 173         read_count = 2 if is_paired else 1
 174         for f in files:
 175             if 'read_length' in f:
 176                 read_length = str(fromTypedNode(f['read_length']))
 177                 f['read_length'] = ",".join([read_length] * read_count)
 178         return files
 179
 180     def _format_filename(self, files, is_paired):
 181         """Format file name for get_raw_files, also report if paired
 182         """
 183         if len(files) == 2:
 184             # should be paired
 185             f0 = files[0]
 186             f1 = files[1]
 187             f0['filename'] = "%s, %s" % (str(f0['filename']),
 188                                          str(f1['filename']))
 189             f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
 190                                        str(f1['md5sum']))
 191             del files[1]
 192         else:
 193             files[0]['filename'] = str(files[0]['filename'])
 194             files[0]['md5sum'] = str(files[0]['md5sum'])
 195         return files
 196
 197
 198     def get_run_details(self, analysis_node):
 199         """Get information about runs
 200         """
 201         query_template = loader.get_template('geo_run_details.sparql')
 202
 203         context = Context({
 204             'submission': str(analysis_node.uri),
 205             })
 206
 207         return self.execute_query(query_template, context)
 208
 209     def query_to_soft_dictionary(self, results, heading):
 210         attributes = []
 211         for r in results:
 212             name = strip_namespace(geoSoftNS, r['name'])
 213             if name is not None:
 214                 if name.lower() == heading.lower():
 215                     name = '^' + name
 216                 else:
 217                     name = '!' + name
 218                 for v in fromTypedNode(r['value']).split(os.linesep):
 219                     v = v.strip()
 220                     if len(v) > 0:
 221                         attributes.append((name, v))
 222         return attributes