Directly generate compressed fastq files from HiSeq split fastqs.
author    Diane Trout <diane@caltech.edu>
          Mon, 12 May 2014 23:16:00 +0000 (16:16 -0700)
committer Diane Trout <diane@ghic.org>
          Tue, 10 Jun 2014 23:30:17 +0000 (16:30 -0700)
desplit_fastq is the only fastq builder that has the necessary
compression argument.

encode_submission/encode3.py
htsworkflow/submission/condorfastq.py
htsworkflow/submission/fastqname.py
htsworkflow/templates/split_fastq.condor

encode_submission/encode3.py
index a77151e1ead7d2a91091a58a4451a01bcd569d72..53984fc245f460b79a1743fc37eda89187366e6f 100644 (file)
@@ -115,6 +115,7 @@ def main(cmdline=None):
         flowcells = os.path.join(opts.sequence, 'flowcells')
         extractor = CondorFastqExtract(opts.host, flowcells,
                                        model=opts.model,
+                                       compression=opts.compression,
                                        force=opts.force)
         extractor.create_scripts(results)
 
@@ -188,6 +189,9 @@ def make_parser():
 
     parser.add_option('--force', default=False, action="store_true",
                       help="Force regenerating fastqs")
+    parser.add_option('--compression', default=None, type='choice',
+                      choices=['gzip'],
+                      help='select compression type for fastq files')
     parser.add_option('--daf', default=None, help='specify daf name')
     parser.add_option('--library-url', default=None,
                       help="specify an alternate source for library information")
htsworkflow/submission/condorfastq.py
index b6c22348f581f5148eeaa33cacfafe65f6ebc963..37d60edf9572ff8c51e6328b88ecc53467496ae8 100644 (file)
@@ -27,11 +27,16 @@ import RDF
 
 LOGGER = logging.getLogger(__name__)
 
+COMPRESSION_EXTENSIONS = {
+    None: '',
+    'gzip': '.gz'
+}
 
 class CondorFastqExtract(object):
     def __init__(self, host, sequences_path,
                  log_path='log',
                  model=None,
+                 compression=None,
                  force=False):
         """Extract fastqs from results archive
 
@@ -40,16 +45,19 @@ class CondorFastqExtract(object):
           apidata (dict): id & key to post to the server
           sequences_path (str): root of the directory tree to scan for files
           log_path (str): where to put condor log files
+          compression (str): compression to apply to generated fastqs; currently only 'gzip' is supported
           force (bool): do we force overwriting current files?
         """
         self.host = host
         self.model = get_model(model)
         self.sequences_path = sequences_path
         self.log_path = log_path
+        self.compression = compression
         self.force = force
         LOGGER.info("CondorFastq host={0}".format(self.host))
         LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
         LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
+        LOGGER.info("Compression {0}".format(self.compression))
 
     def create_scripts(self, result_map ):
         """
@@ -231,6 +239,7 @@ WHERE {
                 'lane': seq.lane_number,
                 'read': seq.read,
                 'cycle': seq.cycle,
+                'compression_extension': COMPRESSION_EXTENSIONS[self.compression],
                 'is_paired': seq.ispaired
             }
 
@@ -291,9 +300,14 @@ WHERE {
         for source in sources:
             paths.append(source.path)
         paths.sort()
+        compression_argument = ''
+        if self.compression:
+            compression_argument = '--'+self.compression
+
         return {
             'pyscript': desplit_fastq.__file__,
             'target': target_pathname,
+            'compression': compression_argument,
             'sources': paths,
             'ispaired': sources[0].ispaired,
         }
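
Taken together, the two additions in this file derive both halves of the change from the single compression setting: COMPRESSION_EXTENSIONS supplies the filename suffix handed to the naming code, and '--' + compression becomes the extra flag passed to desplit_fastq. A small standalone sketch of that mapping (values illustrative):

    COMPRESSION_EXTENSIONS = {None: '', 'gzip': '.gz'}

    def compression_parts(compression):
        """Return (filename suffix, desplit_fastq flag) for a compression choice."""
        extension = COMPRESSION_EXTENSIONS[compression]
        argument = '--' + compression if compression else ''
        return extension, argument

    print(compression_parts('gzip'))   # ('.gz', '--gzip')
    print(compression_parts(None))     # ('', '')
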
htsworkflow/submission/fastqname.py
index 9dd52a0319fb9e9efa230e68344ef53156645be7..ac3d5fc4ed59a4ff944588c015ae402fba1b0e53 100644 (file)
@@ -2,8 +2,8 @@
 """
 import collections
 import re
-PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
-SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}'
 
 FASTQ_RE = re.compile(
     '(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'\
@@ -17,7 +17,7 @@ class FastqName(collections.Mapping):
 
         Takes filename or common attributes like flowcell, lib_id, lane, read, cycle
         """
-        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle', 'compression_extension')
         self._is_paired = is_paired
 
         if len(kwargs) == 0:
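
With the extra field, the templates keep producing the old names when no compression is requested (the extension formats to an empty string) and append '.gz' when gzip is selected. A quick illustration with made-up library and flowcell ids:

    PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}'
    SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}'

    print(SINGLE_TEMPLATE.format(lib_id='11011', flowcell='C1234ACXX',
                                 cycle=101, lane=3, compression_extension='.gz'))
    # 11011_C1234ACXX_c101_l3.fastq.gz

    print(PAIRED_TEMPLATE.format(lib_id='11011', flowcell='C1234ACXX',
                                 cycle=101, lane=3, read=1, compression_extension=''))
    # 11011_C1234ACXX_c101_l3_r1.fastq
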
htsworkflow/templates/split_fastq.condor
index 127a9c0122edd4351728d62c4b85d129c617bc26..faca698ba64909e81cf77d35321e06e8968f4131 100644 (file)
@@ -6,6 +6,6 @@ log={{ logdir }}/fastq.log
 {% if env %}environment="{{env}}"{% endif %}
 
 {% for arg in args %}
-arguments="{{ arg.pyscript }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
+arguments="{{ arg.pyscript }} {{ arg.compression }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
 queue
 {% endfor %}
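
Rendered with a job dict of the shape condor_desplit_arg now returns, the new {{ arg.compression }} slot simply injects the flag (or nothing) ahead of the output option. A rough sketch of the rendered arguments line, using hypothetical paths and the string-formatting equivalent of the template:

    arg = {
        'pyscript': '/opt/htsworkflow/pipelines/desplit_fastq.py',   # hypothetical install path
        'compression': '--gzip',
        'target': '11011_C1234ACXX_c101_l3.fastq.gz',
        'sources': ['s_3_1_0001.fastq', 's_3_1_0002.fastq'],
    }

    line = 'arguments="{pyscript} {compression} -o {target} {sources}"'.format(
        pyscript=arg['pyscript'], compression=arg['compression'],
        target=arg['target'], sources=' '.join(arg['sources']))
    print(line)
    # arguments="/opt/htsworkflow/pipelines/desplit_fastq.py --gzip -o 11011_C1234ACXX_c101_l3.fastq.gz s_3_1_0001.fastq s_3_1_0002.fastq"
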