Directly generate compressed fastq files from HiSeq split fastqs.
author    Diane Trout <diane@caltech.edu>
          Mon, 12 May 2014 23:16:00 +0000 (16:16 -0700)
committer Diane Trout <diane@ghic.org>
          Tue, 10 Jun 2014 23:30:17 +0000 (16:30 -0700)
desplit_fastq is the only fastq builder that has the necessary
compression argument.

encode_submission/encode3.py
htsworkflow/submission/condorfastq.py
htsworkflow/submission/fastqname.py
htsworkflow/templates/split_fastq.condor

encode_submission/encode3.py
index a77151e1ead7d2a91091a58a4451a01bcd569d72..53984fc245f460b79a1743fc37eda89187366e6f 100644 (file)
@@ -115,6 +115,7 @@ def main(cmdline=None):
         flowcells = os.path.join(opts.sequence, 'flowcells')
         extractor = CondorFastqExtract(opts.host, flowcells,
                                        model=opts.model,
+                                       compression=opts.compression,
                                        force=opts.force)
         extractor.create_scripts(results)
 
@@ -188,6 +189,9 @@ def make_parser():
 
     parser.add_option('--force', default=False, action="store_true",
                       help="Force regenerating fastqs")
+    parser.add_option('--compression', default=None, type='choice',
+                      choices=['gzip'],
+                      help='select compression type for fastq files')
     parser.add_option('--daf', default=None, help='specify daf name')
     parser.add_option('--library-url', default=None,
                       help="specify an alternate source for library information")
htsworkflow/submission/condorfastq.py
index b6c22348f581f5148eeaa33cacfafe65f6ebc963..37d60edf9572ff8c51e6328b88ecc53467496ae8 100644 (file)
@@ -27,11 +27,16 @@ import RDF
 
 LOGGER = logging.getLogger(__name__)
 
+COMPRESSION_EXTENSIONS = {
+    None: '',
+    'gzip': '.gz'
+}
 
 class CondorFastqExtract(object):
     def __init__(self, host, sequences_path,
                  log_path='log',
                  model=None,
+                 compression=None,
                  force=False):
         """Extract fastqs from results archive
 
@@ -40,16 +45,19 @@ class CondorFastqExtract(object):
           apidata (dict): id & key to post to the server
           sequences_path (str): root of the directory tree to scan for files
           log_path (str): where to put condor log files
+          compression (str): compression to apply to generated fastqs; currently only 'gzip' is supported
           force (bool): do we force overwriting current files?
         """
         self.host = host
         self.model = get_model(model)
         self.sequences_path = sequences_path
         self.log_path = log_path
+        self.compression = compression
         self.force = force
         LOGGER.info("CondorFastq host={0}".format(self.host))
         LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
         LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
+        LOGGER.info("Compression {0}".format(self.compression))
 
     def create_scripts(self, result_map ):
         """
@@ -231,6 +239,7 @@ WHERE {
                 'lane': seq.lane_number,
                 'read': seq.read,
                 'cycle': seq.cycle,
+                'compression_extension': COMPRESSION_EXTENSIONS[self.compression],
                 'is_paired': seq.ispaired
             }
 
@@ -291,9 +300,14 @@ WHERE {
         for source in sources:
             paths.append(source.path)
         paths.sort()
+        compression_argument = ''
+        if self.compression:
+            compression_argument = '--'+self.compression
+
         return {
             'pyscript': desplit_fastq.__file__,
             'target': target_pathname,
+            'compression': compression_argument,
             'sources': paths,
             'ispaired': sources[0].ispaired,
         }
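
Taken together, the two additions in this file derive both halves of the change from the single compression setting: COMPRESSION_EXTENSIONS supplies the filename suffix handed to the naming code, and '--' + compression becomes the extra flag passed to desplit_fastq. A small standalone sketch of that mapping (values illustrative):

    COMPRESSION_EXTENSIONS = {None: '', 'gzip': '.gz'}

    def compression_parts(compression):
        """Return (filename suffix, desplit_fastq flag) for a compression choice."""
        extension = COMPRESSION_EXTENSIONS[compression]
        argument = '--' + compression if compression else ''
        return extension, argument

    print(compression_parts('gzip'))   # ('.gz', '--gzip')
    print(compression_parts(None))     # ('', '')
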
htsworkflow/submission/fastqname.py
index 9dd52a0319fb9e9efa230e68344ef53156645be7..ac3d5fc4ed59a4ff944588c015ae402fba1b0e53 100644 (file)
@@ -2,8 +2,8 @@
 """
 import collections
 import re
-PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
-SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}'
 
 FASTQ_RE = re.compile(
     '(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'\
@@ -17,7 +17,7 @@ class FastqName(collections.Mapping):
 
         Takes filename or common attributes like flowcell, lib_id, lane, read, cycle
         """
-        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle', 'compression_extension')
         self._is_paired = is_paired
 
         if len(kwargs) == 0:
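
With the extra field, the templates keep producing the old names when no compression is requested (the extension formats to an empty string) and append '.gz' when gzip is selected. A quick illustration with made-up library and flowcell ids:

    PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}'
    SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}'

    print(SINGLE_TEMPLATE.format(lib_id='11011', flowcell='C1234ACXX',
                                 cycle=101, lane=3, compression_extension='.gz'))
    # 11011_C1234ACXX_c101_l3.fastq.gz

    print(PAIRED_TEMPLATE.format(lib_id='11011', flowcell='C1234ACXX',
                                 cycle=101, lane=3, read=1, compression_extension=''))
    # 11011_C1234ACXX_c101_l3_r1.fastq
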
htsworkflow/templates/split_fastq.condor
index 127a9c0122edd4351728d62c4b85d129c617bc26..faca698ba64909e81cf77d35321e06e8968f4131 100644 (file)
@@ -6,6 +6,6 @@ log={{ logdir }}/fastq.log
 {% if env %}environment="{{env}}"{% endif %}
 
 {% for arg in args %}
-arguments="{{ arg.pyscript }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
+arguments="{{ arg.pyscript }} {{ arg.compression }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
 queue
 {% endfor %}
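
Rendered with a job dict of the shape condor_desplit_arg now returns, the new {{ arg.compression }} slot simply injects the flag (or nothing) ahead of the output option. A rough sketch of the rendered arguments line, using hypothetical paths and the string-formatting equivalent of the template:

    arg = {
        'pyscript': '/opt/htsworkflow/pipelines/desplit_fastq.py',   # hypothetical install path
        'compression': '--gzip',
        'target': '11011_C1234ACXX_c101_l3.fastq.gz',
        'sources': ['s_3_1_0001.fastq', 's_3_1_0002.fastq'],
    }

    line = 'arguments="{pyscript} {compression} -o {target} {sources}"'.format(
        pyscript=arg['pyscript'], compression=arg['compression'],
        target=arg['target'], sources=' '.join(arg['sources']))
    print(line)
    # arguments="/opt/htsworkflow/pipelines/desplit_fastq.py --gzip -o 11011_C1234ACXX_c101_l3.fastq.gz s_3_1_0001.fastq s_3_1_0002.fastq"
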