From: Diane Trout Date: Mon, 12 May 2014 23:16:00 +0000 (-0700) Subject: Directly generate compressed fastq files from HiSeq split fastqs. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=4db6d9f86b96b7b289dc9e6a15b1379418c865ea Directly generate compressed fastq files from HiSeq split fastqs. desplit_fastq is the only fastq builder that has the necessary compression argument. --- diff --git a/encode_submission/encode3.py b/encode_submission/encode3.py index a77151e..53984fc 100644 --- a/encode_submission/encode3.py +++ b/encode_submission/encode3.py @@ -115,6 +115,7 @@ def main(cmdline=None): flowcells = os.path.join(opts.sequence, 'flowcells') extractor = CondorFastqExtract(opts.host, flowcells, model=opts.model, + compression=opts.compression, force=opts.force) extractor.create_scripts(results) @@ -188,6 +189,9 @@ def make_parser(): parser.add_option('--force', default=False, action="store_true", help="Force regenerating fastqs") + parser.add_option('--compression', default=None, type='choice', + choices=['gzip'], + help='select compression type for fastq files') parser.add_option('--daf', default=None, help='specify daf name') parser.add_option('--library-url', default=None, help="specify an alternate source for library information") diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py index b6c2234..37d60ed 100644 --- a/htsworkflow/submission/condorfastq.py +++ b/htsworkflow/submission/condorfastq.py @@ -27,11 +27,16 @@ import RDF LOGGER = logging.getLogger(__name__) +COMPRESSION_EXTENSIONS = { + None: '', + 'gzip': '.gz' +} class CondorFastqExtract(object): def __init__(self, host, sequences_path, log_path='log', model=None, + compression=None, force=False): """Extract fastqs from results archive @@ -40,16 +45,19 @@ class CondorFastqExtract(object): apidata (dict): id & key to post to the server sequences_path (str): root of the directory tree to scan for files log_path (str): where to 
put condor log files + compression (str): one of 'gzip', 'bzip2' force (bool): do we force overwriting current files? """ self.host = host self.model = get_model(model) self.sequences_path = sequences_path self.log_path = log_path + self.compression=compression self.force = force LOGGER.info("CondorFastq host={0}".format(self.host)) LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path)) LOGGER.info("CondorFastq log_path={0}".format(self.log_path)) + LOGGER.info("Compression {0}".format(self.compression)) def create_scripts(self, result_map ): """ @@ -231,6 +239,7 @@ WHERE { 'lane': seq.lane_number, 'read': seq.read, 'cycle': seq.cycle, + 'compression_extension': COMPRESSION_EXTENSIONS[self.compression], 'is_paired': seq.ispaired } @@ -291,9 +300,14 @@ WHERE { for source in sources: paths.append(source.path) paths.sort() + compression_argument = '' + if self.compression: + compression_argument = '--'+self.compression + return { 'pyscript': desplit_fastq.__file__, 'target': target_pathname, + 'compression': compression_argument, 'sources': paths, 'ispaired': sources[0].ispaired, } diff --git a/htsworkflow/submission/fastqname.py b/htsworkflow/submission/fastqname.py index 9dd52a0..ac3d5fc 100644 --- a/htsworkflow/submission/fastqname.py +++ b/htsworkflow/submission/fastqname.py @@ -2,8 +2,8 @@ """ import collections import re -PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq' -SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq' +PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}' +SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}' FASTQ_RE = re.compile( '(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'\ @@ -17,7 +17,7 @@ class FastqName(collections.Mapping): Takes filename or common attributes like flowcell, lib_id, lane, read, cycle """ - self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle') + self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 
'cycle', 'compression_extension') self._is_paired = is_paired if len(kwargs) == 0: diff --git a/htsworkflow/templates/split_fastq.condor b/htsworkflow/templates/split_fastq.condor index 127a9c0..faca698 100644 --- a/htsworkflow/templates/split_fastq.condor +++ b/htsworkflow/templates/split_fastq.condor @@ -6,6 +6,6 @@ log={{ logdir }}/fastq.log {% if env %}environment="{{env}}"{% endif %} {% for arg in args %} -arguments="{{ arg.pyscript }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}" +arguments="{{ arg.pyscript }} {{ arg.compression }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}" queue {% endfor %}