htsworkflow/submission/geo.py

   1 import logging
   2 import os
   3
   4 import RDF
   5
   6 from htsworkflow.submission.submission import Submission
   7
   8 from htsworkflow.util.rdfhelp import \
   9      fromTypedNode, \
  10      geoSoftNS, \
  11      stripNamespace, \
  12      submissionOntology
  13
  14 from django.conf import settings
  15 from django.template import Context, loader
  16
  17 LOGGER = logging.getLogger(__name__)
  18
  19 class GEOSubmission(Submission):
  20     def __init__(self, name, model):
  21         super(GEOSubmission, self).__init__(name, model)
  22
  23     def make_soft(self, result_map):
  24         samples = []
  25         platform = self.get_platform_metadata()
  26         platform_attribs = dict(platform)
  27         platform_id = platform_attribs['^platform']
  28         series = self.get_series_metadata()
  29         series_attribs = dict(series)
  30         series_id = series_attribs['^series']
  31         for lib_id, result_dir in result_map.items():
  32             an_analysis = self.get_submission_node(result_dir)
  33             metadata = self.get_sample_metadata(an_analysis)
  34             if len(metadata) > 1:
  35                 errmsg = 'Confused there are more than one samples for %s'
  36                 LOGGER.debug(errmsg % (str(an_analysis,)))
  37             metadata = metadata[0]
  38             metadata['raw'] = self.get_raw_files(an_analysis)
  39             metadata['supplimental'] = self.get_sample_files(an_analysis)
  40             metadata['run'] = self.get_run_details(an_analysis)
  41             samples.append(metadata)
  42
  43         soft_template = loader.get_template('geo_submission.soft')
  44         context = Context({
  45             'platform': platform,
  46             'series': series,
  47             'samples': samples,
  48             'platform_id': platform_id,
  49             'series_id': series_id,
  50         })
  51         print str(soft_template.render(context))
  52
  53     def check_for_name(self, analysis_node):
  54         name = fromTypedNode(
  55             self.model.get_target(analysis_node,
  56                                   submissionOntology['name']))
  57         if name is None:
  58             logger.error("Need name for %s" % (str(analysis_node)))
  59             return False
  60         else:
  61             return True
  62
  63     def get_platform_metadata(self):
  64         """Gather information for filling out sample section of a SOFT file
  65         """
  66         query_template = loader.get_template('geo_platform.sparql')
  67         submission = str(self.submissionSetNS[''].uri)
  68         context = Context({
  69             'submission': submission,
  70             })
  71
  72         results = self.execute_query(query_template, context)
  73         return self.query_to_soft_dictionary(results, 'platform')
  74
  75     def get_series_metadata(self):
  76         """Gather information for filling out sample section of a SOFT file
  77         """
  78         query_template = loader.get_template('geo_series.sparql')
  79         submission = str(self.submissionSetNS[''].uri)
  80         context = Context({
  81             'submission': submission,
  82             })
  83
  84         results = self.execute_query(query_template, context)
  85         return self.query_to_soft_dictionary(results, 'series')
  86
  87     def get_sample_metadata(self, analysis_node):
  88         """Gather information for filling out sample section of a SOFT file
  89         """
  90         query_template = loader.get_template('geo_samples.sparql')
  91
  92         context = Context({
  93             'submission': str(analysis_node.uri),
  94             'submissionSet': str(self.submissionSetNS[''].uri),
  95             })
  96
  97         results = self.execute_query(query_template, context)
  98         for r in results:
  99             r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
 100
 101         return results
 102
 103     def get_sample_files(self, analysis_node):
 104         """Gather derived files
 105         """
 106         query_template = loader.get_template('geo_files.sparql')
 107
 108         context = Context({
 109             'submission': str(analysis_node.uri),
 110             'file_class': str(geoSoftNS['supplemental'])
 111             })
 112
 113         return self.execute_query(query_template, context)
 114
 115     def get_raw_files(self, analysis_node):
 116         """Gather raw data e.g. fastq files.
 117         """
 118         query_template = loader.get_template('geo_fastqs.sparql')
 119
 120         context = Context({
 121             'submission': str(analysis_node.uri),
 122             'file_class': str(geoSoftNS['raw']),
 123             })
 124
 125         lanes = {}
 126         for row in self.execute_query(query_template, context):
 127             data = {}
 128             for k, v in row.items():
 129                 data[k] = v
 130             lane = str(data['lane'])
 131             lanes.setdefault(lane, []).append(data)
 132         result = []
 133         for lane, files in lanes.items():
 134             if len(files) > 2:
 135                 errmsg = "Don't know what to do with more than 2 raw files"
 136                 raise ValueError(errmsg)
 137             elif len(files) == 2:
 138                 is_paired = True
 139             elif len(files) == 1:
 140                 is_paired = False
 141             elif len(files) == 0:
 142                 raise RuntimeError("Empty lane list discovered")
 143             files = self._format_filename(files, is_paired)
 144             files = self._format_flowcell_type(files, is_paired)
 145             files = self._format_read_length(files, is_paired)
 146             result.append(files[0])
 147         return result
 148
 149     def _format_flowcell_type(self, files, is_paired):
 150         """Used by get_raw_files to format value for single_or_paired-end
 151         """
 152         for f in files:
 153             if 'flowcell_type' in f:
 154                 flowcell_type = fromTypedNode(f['flowcell_type'])
 155                 if flowcell_type is None:
 156                     pass
 157                 elif flowcell_type.lower() == "paired":
 158                     f['flowcell_type'] = 'paired-end'
 159                 else:
 160                     f['flowcell_type'] = 'single'
 161
 162         return files
 163
 164     def _format_read_length(self, files, is_paired):
 165         """Format
 166         """
 167         read_count = 2 if is_paired else 1
 168         for f in files:
 169             if 'read_length' in f:
 170                 read_length = str(fromTypedNode(f['read_length']))
 171                 f['read_length'] = ",".join([read_length] * read_count)
 172         return files
 173
 174     def _format_filename(self, files, is_paired):
 175         """Format file name for get_raw_files, also report if paired
 176         """
 177         if len(files) == 2:
 178             # should be paired
 179             f0 = files[0]
 180             f1 = files[1]
 181             f0['filename'] = "%s, %s" % (str(f0['filename']),
 182                                          str(f1['filename']))
 183             f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
 184                                        str(f1['md5sum']))
 185             del files[1]
 186         else:
 187             files[0]['filename'] = str(files[0]['filename'])
 188             files[0]['md5sum'] = str(files[0]['md5sum'])
 189         return files
 190
 191
 192     def get_run_details(self, analysis_node):
 193         """Get information about runs
 194         """
 195         query_template = loader.get_template('geo_run_details.sparql')
 196
 197         context = Context({
 198             'submission': str(analysis_node.uri),
 199             })
 200
 201         return self.execute_query(query_template, context)
 202
 203     def query_to_soft_dictionary(self, results, heading):
 204         attributes = []
 205         for r in results:
 206             name = stripNamespace(geoSoftNS, r['name'])
 207             if name is not None:
 208                 if name.lower() == heading.lower():
 209                     name = '^' + name
 210                 else:
 211                     name = '!' + name
 212                 for v in fromTypedNode(r['value']).split(os.linesep):
 213                     v = v.strip()
 214                     if len(v) > 0:
 215                         attributes.append((name, v))
 216         return attributes