flowcells = os.path.join(opts.sequence, 'flowcells')
extractor = CondorFastqExtract(opts.host, flowcells,
model=opts.model,
+ compression=opts.compression,
force=opts.force)
extractor.create_scripts(results)
parser.add_option('--force', default=False, action="store_true",
help="Force regenerating fastqs")
+ parser.add_option('--compression', default=None, type='choice',
+ choices=['gzip'],
+ help='select compression type for fastq files')
parser.add_option('--daf', default=None, help='specify daf name')
parser.add_option('--library-url', default=None,
help="specify an alternate source for library information")
LOGGER = logging.getLogger(__name__)
+COMPRESSION_EXTENSIONS = {
+ None: '',
+ 'gzip': '.gz'
+}
class CondorFastqExtract(object):
def __init__(self, host, sequences_path,
log_path='log',
model=None,
+ compression=None,
force=False):
"""Extract fastqs from results archive
apidata (dict): id & key to post to the server
sequences_path (str): root of the directory tree to scan for files
log_path (str): where to put condor log files
+ compression (str): currently only 'gzip' is supported (see COMPRESSION_EXTENSIONS)
force (bool): do we force overwriting current files?
"""
self.host = host
self.model = get_model(model)
self.sequences_path = sequences_path
self.log_path = log_path
+ self.compression = compression
self.force = force
LOGGER.info("CondorFastq host={0}".format(self.host))
LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
+ LOGGER.info("Compression {0}".format(self.compression))
def create_scripts(self, result_map ):
"""
'lane': seq.lane_number,
'read': seq.read,
'cycle': seq.cycle,
+ 'compression_extension': COMPRESSION_EXTENSIONS[self.compression],
'is_paired': seq.ispaired
}
for source in sources:
paths.append(source.path)
paths.sort()
+ compression_argument = ''
+ if self.compression:
+ compression_argument = '--'+self.compression
+
return {
'pyscript': desplit_fastq.__file__,
'target': target_pathname,
+ 'compression': compression_argument,
'sources': paths,
'ispaired': sources[0].ispaired,
}
"""
import collections
import re
-PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
-SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq{compression_extension}'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq{compression_extension}'
FASTQ_RE = re.compile(
'(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'\
Takes filename or common attributes like flowcell, lib_id, lane, read, cycle
"""
- self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+ self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle', 'compression_extension')
self._is_paired = is_paired
if len(kwargs) == 0:
{% if env %}environment="{{env}}"{% endif %}
{% for arg in args %}
-arguments="{{ arg.pyscript }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
+arguments="{{ arg.pyscript }} {{ arg.compression }} -o {{ arg.target }} {% for s in arg.sources %}{{ s }} {% endfor %}"
queue
{% endfor %}