From: Diane Trout Date: Fri, 22 Oct 2010 23:40:13 +0000 (-0700) Subject: Report version number derived from git tag. X-Git-Tag: 0.5.0~21 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=5d586b4a5d4af6bf52929409c96c491e24e86ad5 Report version number derived from git tag. This patch includes the necessary infrastructure to support that feature, and it has been added to qseq2fastq and srf2fastq. Additionally, to improve the testability of qseq2fastq and srf2fastq, the original standalone modules were moved into htsworkflow.pipelines and small stub modules were placed in scripts. --- diff --git a/.gitignore b/.gitignore index 3f57fc7..921a57b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.py[co~] .coverage *.egg-info +dist +RELEASE-VERSION diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c4e95cd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include RELEASE-VERSION +version.py diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py index c692405..191281c 100755 --- a/extra/ucsc_encode_submission/ucsc_gather.py +++ b/extra/ucsc_encode_submission/ucsc_gather.py @@ -132,7 +132,7 @@ log=log/qseq2fastq.log qseq_condor_entries = [] srf_condor_header = """ Universe=vanilla -executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py +executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq output=log/srf_pair_fastq.out.$(process).log error=log/srf_pair_fastq.err.$(process).log log=log/srf_pair_fastq.log diff --git a/htsworkflow/pipelines/qseq2fastq.py b/htsworkflow/pipelines/qseq2fastq.py new file mode 100755 index 0000000..286663b --- /dev/null +++ b/htsworkflow/pipelines/qseq2fastq.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +from glob import glob +import os +from optparse import OptionParser +import numpy +import sys +import tarfile + +from htsworkflow.version import version + +def main(cmdline=None): + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + if opts.version: + print version() + return 0 + + if opts.infile is not None: + qseq_generator = tarfile_generator(opts.infile) + elif len(args) > 0: + qseq_generator = file_generator(args) + else: + qseq_generator = [sys.stdin] + + if opts.output is not None: + output = open(opts.output, 'w') + else: + output = sys.stdout + + if opts.nopass_output is not None: + nopass_output = open(opts.nopass_output, 'w') + else: + nopass_output = None + + qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output) + qseq_parser.fastq = not opts.fasta + qseq_parser.flowcell_id = opts.flowcell + qseq_parser.trim = parse_slice(opts.slice) + qseq_parser.reportFilter = opts.pf + + qseq_parser.run() + + +def make_parser(): + usage = "%prog: [options] *_qseq.txt" + parser = OptionParser(usage) + parser.add_option("-a", "--fasta", default=False, action="store_true", + help="produce fasta files instead of fastq files") + parser.add_option("-f", "--flowcell", default=None, + help="Set flowcell ID for output file") + parser.add_option("-i", "--infile", default=None, + help='source tar file (if reading from an archive instead of a directory)') + parser.add_option("-o", "--output", default=None, + help="output fastq file") + parser.add_option("-n", "--nopass-output", default=None, + help="if provided send files that failed illumina filter "\ + "to a differentfile") + parser.add_option("-s", "--slice", + help="specify python slice, e.g.
0:75, 0:-1", + default=None) + parser.add_option("--pf", help="report pass filter flag", default=False, + action="store_true") + parser.add_option("--version", default=False, action="store_true", + help="report software version") + + return parser + + +def file_generator(pattern_list): + for pattern in pattern_list: + for filename in glob(pattern): + yield open(filename,"r") + + +def tarfile_generator(tarfilename): + archive = tarfile.open(tarfilename,'r|*') + for tarinfo in archive: + yield archive.extractfile(tarinfo) + + +def parse_slice(slice_text): + if slice_text is None or len(slice_text) == 0: + return slice(None) + + slice_data = [] + for element in slice_text.split(':'): + if len(element) == 0: + element = None + else: + element = int(element) + slice_data.append(element) + + return slice(*slice_data) + + +def file_generator(pattern_list): + for pattern in pattern_list: + for filename in glob(pattern): + yield open(filename,"r") + + +def tarfile_generator(tarfilename): + archive = tarfile.open(tarfilename,'r|*') + for tarinfo in archive: + yield archive.extractfile(tarinfo) + + +def parse_slice(slice_text): + if slice_text is None or len(slice_text) == 0: + return slice(None) + + slice_data = [] + for element in slice_text.split(':'): + if len(element) == 0: + element = None + else: + element = int(element) + slice_data.append(element) + + return slice(*slice_data) + + +class Qseq2Fastq(object): + """ + Convert qseq files to fastq (or fasta) files. + """ + def __init__(self, sources, pass_destination, nopass_destination=None): + self.sources = sources + self.pass_destination = pass_destination + if nopass_destination is not None: + self.nopass_destination = nopass_destination + else: + self.nopass_destination = pass_destination + + self.fastq = True + self.flowcell_id = None + self.trim = slice(None) + self.reportFilter = False + + def _format_flowcell_id(self): + """ + Return formatted flowcell ID + """ + if self.flowcell_id is not None: + return self.flowcell_id+"_" + else: + return "" + + def _convert_illumina_quality(self, illumina_quality): + """ + Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score. + """ + # Illumina scores are Phred + 64 + # Fastq scores are Phread + 33 + # the following code grabs the string, converts to short ints and + # subtracts 31 (64-33) to convert between the two score formats. + # The numpy solution is twice as fast as some of my other + # ideas for the conversion. 
+ # sorry about the uglyness in changing from character, to 8-bit int + # and back to a character array + quality = numpy.asarray(illumina_quality,'c') + quality.dtype = numpy.uint8 + quality -= 31 + quality.dtype = '|S1' # I'd like to know what the real numpy char type is + return quality + + def run(self): + if self.fastq: + header_template = '@' + else: + # fasta case + header_template = '>' + header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s' + + for qstream in self.sources: + for line in qstream: + # parse line + record = line.rstrip().split('\t') + machine_name = record[0] + run_number = record[1] + lane_number = record[2] + tile = record[3] + x = record[4] + y = record[5] + index = record[6] + read = record[7] + sequence = record[8].replace('.','N') + quality = self._convert_illumina_quality(record[9]) + + # add pass qc filter if we want it + pass_qc = int(record[10]) + if self.reportFilter: + pass_qc_msg = " pf=%s" % (pass_qc) + else: + pass_qc_msg = "" + + header = header_template % ( \ + machine_name, + run_number, + lane_number, + tile, + x, + y, + read, + pass_qc_msg, + os.linesep) + + + # if we passed the filter write to the "good" file + if pass_qc: + destination = self.pass_destination + else: + destination = self.nopass_destination + + destination.write(header) + destination.write(sequence[self.trim]) + destination.write(os.linesep) + if self.fastq: + destination.write('+') + destination.write(os.linesep) + destination.write(quality[self.trim].tostring()) + destination.write(os.linesep) + +if __name__ == "__main__": + main() diff --git a/htsworkflow/pipelines/srf2fastq.py b/htsworkflow/pipelines/srf2fastq.py new file mode 100755 index 0000000..adaa1b2 --- /dev/null +++ b/htsworkflow/pipelines/srf2fastq.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python +import logging +import mmap +from optparse import OptionParser +import os +from subprocess import Popen, PIPE +import sys + +from htsworkflow.util.opener import autoopen +from htsworkflow.version import version + +# constants for our fastq finite state machine +FASTQ_HEADER = 0 +FASTQ_SEQUENCE = 1 +FASTQ_SEQUENCE_HEADER = 2 +FASTQ_QUALITY = 3 + +def main(cmdline=None): + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + if opts.verbose: + logging.basicConfig(level=logging.INFO) + else: + logging.basicConfig(level=logging.WARN) + + if opts.version: + print version() + return 0 + + if len(args) != 1: + parser.error("Requires one argument, got: %s" % (str(args))) + + if opts.flowcell is not None: + header = "%s_" % (opts.flowcell,) + else: + header = '' + + if opts.single: + left = open_write(opts.single, opts.force) + else: + left = open_write(opts.left, opts.force) + right = open_write(opts.right, opts.force) + + # open the srf, fastq, or compressed fastq + if is_srf(args[0]): + source = srf_open(args[0]) + else: + source = autoopen(args[0]) + + if opts.single: + convert_single_to_fastq(source, left, header) + else: + convert_single_to_two_fastq(source, left, right, opts.mid, header) + + return 0 + +def make_parser(): + parser = OptionParser("""%prog: [options] file + +file can be either a fastq file or a srf file. 
+You can also force the flowcell ID to be added to the header.""") + + parser.add_option('--force', default=False, action="store_true", + help="overwrite existing files.") + parser.add_option('--flowcell', default=None, + help="add flowcell id header to sequence") + parser.add_option('-l','--left', default="r1.fastq", + help='left side filename') + parser.add_option('-m','--mid', default=None, + help='actual sequence mid point') + parser.add_option('-r','--right', default="r2.fastq", + help='right side filename') + parser.add_option('-s','--single', default=None, + help="single fastq target name") + parser.add_option('-v', '--verbose', default=False, action="store_true", + help="show information about what we're doing.") + parser.add_option('--version', default=False, action="store_true", + help="Report software version") + return parser + + +def srf_open(filename, cnf1=False): + """ + Make a stream from srf file using srf2fastq + """ + cmd = ['srf2fastq'] + if is_cnf1(filename): + cmd.append('-c') + cmd.append(filename) + + logging.info('srf command: %s' % (" ".join(cmd),)) + p = Popen(cmd, stdout=PIPE) + return p.stdout + + +def convert_single_to_fastq(instream, target1, header=''): + + state = FASTQ_HEADER + for line in instream: + line = line.strip() + # sequence header + if state == FASTQ_HEADER: + write_header(target1, header, line) + state = FASTQ_SEQUENCE + # sequence + elif state == FASTQ_SEQUENCE: + write_sequence(target1, line) + state = FASTQ_SEQUENCE_HEADER + # quality header + elif state == FASTQ_SEQUENCE_HEADER: + # the sequence header isn't really sequence, but + # we're just passing it through + write_sequence(target1, line) + state = FASTQ_QUALITY + # sequence or quality data + elif state == FASTQ_QUALITY: + write_sequence(target1, line) + state = FASTQ_HEADER + else: + raise RuntimeError("Unrecognized STATE in fastq split") + + + +def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''): + """ + read a fastq file where two paired ends have been run together into + two halves. 
+ + instream is the source stream + target1 and target2 are the destination streams + """ + if mid is not None: + mid = int(mid) + + state = FASTQ_HEADER + for line in instream: + line = line.strip() + # sequence header + if state == FASTQ_HEADER: + write_header(target1, header, line, "/1") + write_header(target2, header, line, "/2") + state = FASTQ_SEQUENCE + # sequence + elif state == FASTQ_SEQUENCE: + if mid is None: + mid = len(line)/2 + write_split_sequence(target1, target2, line, mid) + state = FASTQ_SEQUENCE_HEADER + # quality header + elif state == FASTQ_SEQUENCE_HEADER: + # the sequence header isn't really sequence, but + # we're just passing it through + write_sequence(target1, line) + write_sequence(target2, line) + + state = FASTQ_QUALITY + # sequence or quality data + elif state == FASTQ_QUALITY: + write_split_sequence(target1, target2, line, mid) + state = FASTQ_HEADER + else: + raise RuntimeError("Unrecognized STATE in fastq split") + +def write_header(target, prefix, line, suffix=''): + target.write('@') + target.write(prefix) + target.write(line[1:]) + target.write(suffix) + target.write(os.linesep) + +def write_sequence(target, line): + target.write(line) + target.write(os.linesep) + +def write_split_sequence(target1, target2, line, mid): + target1.write(line[:mid]) + target1.write(os.linesep) + + target2.write(line[mid:]) + target2.write(os.linesep) + +def is_srf(filename): + """ + Check filename to see if it is likely to be a SRF file + """ + f = open(filename, 'r') + header = f.read(4) + f.close() + return header == "SSRF" + +def is_cnf1(filename): + """ + Brute force detection if a SRF file is using CNF1/CNF4 records + """ + max_header = 1024 ** 2 + PROGRAM_ID = 'PROGRAM_ID\000' + cnf4_apps = set(("solexa2srf v1.4", + "illumina2srf v1.11.5.Illumina.1.3")) + + if not is_srf(filename): + raise ValueError("%s must be a srf file" % (filename,)) + + fd = os.open(filename, os.O_RDONLY) + f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ) + # alas the max search length requires python 2.6+ + program_id_location = f.find(PROGRAM_ID, 0) #, max_header) + program_header_start = program_id_location+len(PROGRAM_ID) + next_null = f.find('\000', program_header_start) #, max_header) + program_id_header = f[program_header_start:next_null] + f.close() + os.close(fd) + + if program_id_header in cnf4_apps: + return False + else: + return True + +def open_write(filename, force=False): + """ + Open a file, but throw an exception if it already exists + """ + if not force: + if os.path.exists(filename): + raise RuntimeError("%s exists" % (filename,)) + + return open(filename, 'w') + +def foo(): + path, name = os.path.split(filename) + base, ext = os.path.splitext(name) + + target1_name = base + '_r1.fastq' + target2_name = base + '_r2.fastq' + + for target_name in [target1_name, target2_name]: + print 'target name', target_name + if os.path.exists(target_name): + raise RuntimeError("%s exists" % (target_name,)) + + instream = open(filename,'r') + target1 = open(target1_name,'w') + target2 = open(target2_name,'w') + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/htsworkflow/version.py b/htsworkflow/version.py new file mode 100644 index 0000000..ca7390d --- /dev/null +++ b/htsworkflow/version.py @@ -0,0 +1,19 @@ +import logging + +def version(): + """Return version number + """ + version = None + try: + import pkg_resources + except ImportError, e: + logging.error("Can't find version number, please install setuptools") + raise e + + try: + version = 
pkg_resources.get_distribution("htsworkflow") + except pkg_resources.DistributionNotFound, e: + logging.error("Package not installed") + + return version + diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq index 03970dd..555e668 100755 --- a/scripts/qseq2fastq +++ b/scripts/qseq2fastq @@ -1,188 +1,6 @@ -#!/usr/bin/env python - -from glob import glob -import os -from optparse import OptionParser -import numpy +#!/usr/bin/python import sys -import tarfile - -class Qseq2Fastq(object): - """ - Convert qseq files to fastq (or fasta) files. - """ - def __init__(self, sources, pass_destination, nopass_destination=None): - self.sources = sources - self.pass_destination = pass_destination - if nopass_destination is not None: - self.nopass_destination = nopass_destination - else: - self.nopass_destination = pass_destination - - self.fastq = True - self.flowcell_id = None - self.trim = slice(None) - self.reportFilter = False - - def _format_flowcell_id(self): - """ - Return formatted flowcell ID - """ - if self.flowcell_id is not None: - return self.flowcell_id+"_" - else: - return "" - - def _convert_illumina_quality(self, illumina_quality): - """ - Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score. - """ - # Illumina scores are Phred + 64 - # Fastq scores are Phread + 33 - # the following code grabs the string, converts to short ints and - # subtracts 31 (64-33) to convert between the two score formats. - # The numpy solution is twice as fast as some of my other - # ideas for the conversion. - # sorry about the uglyness in changing from character, to 8-bit int - # and back to a character array - quality = numpy.asarray(illumina_quality,'c') - quality.dtype = numpy.uint8 - quality -= 31 - quality.dtype = '|S1' # I'd like to know what the real numpy char type is - return quality - - def run(self): - if self.fastq: - header_template = '@' - else: - # fasta case - header_template = '>' - header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s' - - for qstream in self.sources: - for line in qstream: - # parse line - record = line.rstrip().split('\t') - machine_name = record[0] - run_number = record[1] - lane_number = record[2] - tile = record[3] - x = record[4] - y = record[5] - index = record[6] - read = record[7] - sequence = record[8].replace('.','N') - quality = self._convert_illumina_quality(record[9]) - - # add pass qc filter if we want it - pass_qc = int(record[10]) - if self.reportFilter: - pass_qc_msg = " pf=%s" % (pass_qc) - else: - pass_qc_msg = "" - - header = header_template % ( \ - machine_name, - run_number, - lane_number, - tile, - x, - y, - read, - pass_qc_msg, - os.linesep) - - - # if we passed the filter write to the "good" file - if pass_qc: - destination = self.pass_destination - else: - destination = self.nopass_destination - - destination.write(header) - destination.write(sequence[self.trim]) - destination.write(os.linesep) - if self.fastq: - destination.write('+') - destination.write(os.linesep) - destination.write(quality[self.trim].tostring()) - destination.write(os.linesep) - -def file_generator(pattern_list): - for pattern in pattern_list: - for filename in glob(pattern): - yield open(filename,'r') - -def tarfile_generator(tarfilename): - archive = tarfile.open(tarfilename,'r|*') - for tarinfo in archive: - yield archive.extractfile(tarinfo) - - -def parse_slice(slice_text): - if slice_text is None or len(slice_text) == 0: - return slice(None) - - slice_data = [] - for element in slice_text.split(':'): - if len(element) == 0: 
- element = None - else: - element = int(element) - slice_data.append(element) - - return slice(*slice_data) - -def make_parser(): - usage = "%prog: [options] *_qseq.txt" - parser = OptionParser(usage) - parser.add_option('-a', '--fasta', default=False, action="store_true", - help="produce fasta files instead of fastq files") - parser.add_option('-f', '--flowcell', default=None, - help="Set flowcell ID for output file") - parser.add_option('-i', '--infile', default=None, - help='source tar file (if reading from an archive instead of a directory)') - parser.add_option('-o', '--output', default=None, - help='output fastq file') - parser.add_option('-n', '--nopass-output', default=None, - help='if provided send files that failed illumina filter '\ - 'to a differentfile') - parser.add_option('-s', '--slice', - help='specify python slice, e.g. 0:75, 0:-1', - default=None) - parser.add_option('--pf', help="report pass filter flag", default=False, - action="store_true") - return parser - -def main(cmdline=None): - parser = make_parser() - opts, args = parser.parse_args(cmdline) - - if opts.infile is not None: - qseq_generator = tarfile_generator(opts.infile) - elif len(args) > 0: - qseq_generator = file_generator(args) - else: - qseq_generator = [sys.stdin] - - - if opts.output is not None: - output = open(opts.output, 'w') - else: - output = sys.stdout - - if opts.nopass_output is not None: - nopass_output = open(opts.nopass_output, 'w') - else: - nopass_output = None - - qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output) - qseq_parser.fastq = not opts.fasta - qseq_parser.flowcell_id = opts.flowcell - qseq_parser.trim = parse_slice(opts.slice) - qseq_parser.reportFilter = opts.pf - - qseq_parser.run() +from htsworkflow.pipelines.qseq2fastq import main -if __name__ == "__main__": - main() +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/srf2fastq b/scripts/srf2fastq new file mode 100755 index 0000000..0361e43 --- /dev/null +++ b/scripts/srf2fastq @@ -0,0 +1,6 @@ +#!/usr/bin/python +import sys +from htsworkflow.pipelines.srf2fastq import main + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/srf2named_fastq.py b/scripts/srf2named_fastq.py deleted file mode 100755 index d0403f6..0000000 --- a/scripts/srf2named_fastq.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python -import logging -import mmap -from optparse import OptionParser -import os -from subprocess import Popen, PIPE -import sys - -from htsworkflow.util.opener import autoopen - -# constants for our fastq finite state machine -FASTQ_HEADER = 0 -FASTQ_SEQUENCE = 1 -FASTQ_SEQUENCE_HEADER = 2 -FASTQ_QUALITY = 3 - - -def main(cmdline=None): - parser = make_parser() - opts, args = parser.parse_args(cmdline) - - if len(args) != 1: - parser.error("Requires one argument, got: %s" % (str(args))) - - if opts.verbose: - logging.basicConfig(level=logging.INFO) - else: - logging.basicConfig(level=logging.WARN) - - if opts.flowcell is not None: - header = "%s_" % (opts.flowcell,) - else: - header = '' - - if opts.single: - left = open_write(opts.single, opts.force) - else: - left = open_write(opts.left, opts.force) - right = open_write(opts.right, opts.force) - - # open the srf, fastq, or compressed fastq - if is_srf(args[0]): - source = srf_open(args[0]) - else: - source = autoopen(args[0]) - - if opts.single: - convert_single_to_fastq(source, left, header) - else: - convert_single_to_two_fastq(source, left, right, opts.mid, header) - - return 0 - -def make_parser(): - 
parser = OptionParser("""%prog: [options] file - -file can be either a fastq file or a srf file. -You can also force the flowcell ID to be added to the header.""") - - parser.add_option('--force', default=False, action="store_true", - help="overwrite existing files.") - parser.add_option('--flowcell', default=None, - help="add flowcell id header to sequence") - parser.add_option('-l','--left', default="r1.fastq", - help='left side filename') - parser.add_option('-m','--mid', default=None, - help='actual sequence mid point') - parser.add_option('-r','--right', default="r2.fastq", - help='right side filename') - parser.add_option('-s','--single', default=None, - help="single fastq target name") - parser.add_option('-v', '--verbose', default=False, action="store_true", - help="show information about what we're doing.") - return parser - - -def srf_open(filename, cnf1=False): - """ - Make a stream from srf file using srf2fastq - """ - - cmd = ['srf2fastq'] - if is_cnf1(filename): - cmd.append('-c') - cmd.append(filename) - - logging.info('srf command: %s' % (" ".join(cmd),)) - p = Popen(cmd, stdout=PIPE) - return p.stdout - - -def convert_single_to_fastq(instream, target1, header=''): - - state = FASTQ_HEADER - for line in instream: - line = line.strip() - # sequence header - if state == FASTQ_HEADER: - write_header(target1, header, line) - state = FASTQ_SEQUENCE - # sequence - elif state == FASTQ_SEQUENCE: - write_sequence(target1, line) - state = FASTQ_SEQUENCE_HEADER - # quality header - elif state == FASTQ_SEQUENCE_HEADER: - # the sequence header isn't really sequence, but - # we're just passing it through - write_sequence(target1, line) - state = FASTQ_QUALITY - # sequence or quality data - elif state == FASTQ_QUALITY: - write_sequence(target1, line) - state = FASTQ_HEADER - else: - raise RuntimeError("Unrecognized STATE in fastq split") - - - -def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''): - """ - read a fastq file where two paired ends have been run together into - two halves. 
- - instream is the source stream - target1 and target2 are the destination streams - """ - if mid is not None: - mid = int(mid) - - state = FASTQ_HEADER - for line in instream: - line = line.strip() - # sequence header - if state == FASTQ_HEADER: - write_header(target1, header, line, "/1") - write_header(target2, header, line, "/2") - state = FASTQ_SEQUENCE - # sequence - elif state == FASTQ_SEQUENCE: - if mid is None: - mid = len(line)/2 - write_split_sequence(target1, target2, line, mid) - state = FASTQ_SEQUENCE_HEADER - # quality header - elif state == FASTQ_SEQUENCE_HEADER: - # the sequence header isn't really sequence, but - # we're just passing it through - write_sequence(target1, line) - write_sequence(target2, line) - - state = FASTQ_QUALITY - # sequence or quality data - elif state == FASTQ_QUALITY: - write_split_sequence(target1, target2, line, mid) - state = FASTQ_HEADER - else: - raise RuntimeError("Unrecognized STATE in fastq split") - -def write_header(target, prefix, line, suffix=''): - target.write('@') - target.write(prefix) - target.write(line[1:]) - target.write(suffix) - target.write(os.linesep) - -def write_sequence(target, line): - target.write(line) - target.write(os.linesep) - -def write_split_sequence(target1, target2, line, mid): - target1.write(line[:mid]) - target1.write(os.linesep) - - target2.write(line[mid:]) - target2.write(os.linesep) - -def is_srf(filename): - """ - Check filename to see if it is likely to be a SRF file - """ - f = open(filename, 'r') - header = f.read(4) - f.close() - return header == "SSRF" - -def is_cnf1(filename): - """ - Brute force detection if a SRF file is using CNF1/CNF4 records - """ - max_header = 1024 ** 2 - PROGRAM_ID = 'PROGRAM_ID\000' - cnf4_apps = set(("solexa2srf v1.4", - "illumina2srf v1.11.5.Illumina.1.3")) - - if not is_srf(filename): - raise ValueError("%s must be a srf file" % (filename,)) - - fd = os.open(filename, os.O_RDONLY) - f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ) - # alas the max search length requires python 2.6+ - program_id_location = f.find(PROGRAM_ID, 0) #, max_header) - program_header_start = program_id_location+len(PROGRAM_ID) - next_null = f.find('\000', program_header_start) #, max_header) - program_id_header = f[program_header_start:next_null] - f.close() - os.close(fd) - - if program_id_header in cnf4_apps: - return False - else: - return True - -def open_write(filename, force=False): - """ - Open a file, but throw an exception if it already exists - """ - if not force: - if os.path.exists(filename): - raise RuntimeError("%s exists" % (filename,)) - - return open(filename, 'w') - -def foo(): - path, name = os.path.split(filename) - base, ext = os.path.splitext(name) - - target1_name = base + '_r1.fastq' - target2_name = base + '_r2.fastq' - - for target_name in [target1_name, target2_name]: - print 'target name', target_name - if os.path.exists(target_name): - raise RuntimeError("%s exists" % (target_name,)) - - instream = open(filename,'r') - target1 = open(target1_name,'w') - target2 = open(target2_name,'w') - - - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) diff --git a/setup.py b/setup.py index 40ee4eb..a226a4f 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,37 @@ from setuptools import setup +from version import get_git_version setup( - name="htsworkflow", - description="some bots and other utilities to help deal with data from an illumina sequencer", - author="Diane Trout & Brandon King", - author_email="diane@caltech.edu", - packages=["htsworkflow", - 
"htsworkflow.pipelines", - "htsworkflow.frontend", - "htsworkflow.frontend.analysis", - "htsworkflow.frontend.eland_config", - "htsworkflow.frontend.experiments", - "htsworkflow.frontend.inventory", - "htsworkflow.frontend.reports", - "htsworkflow.frontend.samples", - "htsworkflow.automation", - "htsworkflow.util" - ], - scripts=[ - 'scripts/configure_pipeline', + name="htsworkflow", + version=get_git_version(), + description="Utilities to help manage high-through-put sequencing", + author="Diane Trout, Brandon King", + author_email="diane@caltech.edu", + packages=["htsworkflow", + "htsworkflow.automation", + "htsworkflow.pipelines", + "htsworkflow.util", + # django site + "htsworkflow.frontend", + "htsworkflow.frontend.analysis", + "htsworkflow.frontend.eland_config", + "htsworkflow.frontend.experiments", + "htsworkflow.frontend.inventory", + "htsworkflow.frontend.reports", + "htsworkflow.frontend.samples", + ], + scripts=[ 'scripts/copier', - 'scripts/gerald2bed.py', 'scripts/library.py', 'scripts/makebed', - 'scripts/make-library-tree', + 'scripts/make-library-tree', 'scripts/mark_archived_data', 'scripts/qseq2fastq', - 'scripts/rerun_eland.py', 'scripts/retrieve_config', 'scripts/runfolder', 'scripts/runner', 'scripts/spoolwatcher', 'scripts/srf', - 'scripts/srf2named_fastq.py' + 'scripts/srf2fastq' ], -) + ) diff --git a/test/test_srf2fastq.py b/test/test_srf2fastq.py index 83412c4..eee4152 100644 --- a/test/test_srf2fastq.py +++ b/test/test_srf2fastq.py @@ -6,7 +6,7 @@ import unittest _module_path, _module_name = os.path.split(__file__) sys.path.append(os.path.join(_module_path, '..', 'scripts')) -import srf2named_fastq +from htsworkflow.pipelines import srf2fastq class testSrf2Fastq(unittest.TestCase): def test_split_good(self): @@ -18,7 +18,7 @@ IIIIB+++ target1 = StringIO() target2 = StringIO() - srf2named_fastq.convert_single_to_two_fastq(source, target1, target2) + srf2fastq.convert_single_to_two_fastq(source, target1, target2) target1.seek(0) lines1 = target1.readlines() @@ -49,7 +49,7 @@ IIIIB+++ target1 = StringIO() target2 = StringIO() - srf2named_fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_") + srf2fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_") target1.seek(0) lines1 = target1.readlines() @@ -79,7 +79,7 @@ IIIIB+++ """) target1 = StringIO() - srf2named_fastq.convert_single_to_fastq(source, target1) + srf2fastq.convert_single_to_fastq(source, target1) target1.seek(0) lines1 = target1.readlines() @@ -101,7 +101,7 @@ IIIIB+++ """) target1 = StringIO() - srf2named_fastq.convert_single_to_fastq(source, target1, header="foo_") + srf2fastq.convert_single_to_fastq(source, target1, header="foo_") target1.seek(0) lines1 = target1.readlines() @@ -117,7 +117,7 @@ IIIIB+++ cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' cnf1_path = os.path.join(_module_path, cnf1_srf) - is_srf = srf2named_fastq.is_srf + is_srf = srf2fastq.is_srf self.failUnlessEqual(is_srf(__file__), False) self.failUnlessEqual(is_srf(cnf4_path), True) self.failUnlessEqual(is_srf(cnf1_path), True) @@ -128,7 +128,7 @@ IIIIB+++ cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' cnf1_path = os.path.join(_module_path, cnf1_srf) - is_cnf1 = srf2named_fastq.is_cnf1 + is_cnf1 = srf2fastq.is_cnf1 self.failUnlessRaises(ValueError, is_cnf1, __file__) self.failUnlessEqual(is_cnf1(cnf4_path), False) self.failUnlessEqual(is_cnf1(cnf1_path), True) diff --git a/version.py b/version.py new file mode 100644 index 0000000..b850b4a --- /dev/null +++ b/version.py 
@@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +# Author: Douglas Creager +# This file is placed into the public domain. + +# Calculates the current version number. If possible, this is the +# output of “git describe”, modified to conform to the versioning +# scheme that setuptools uses. If “git describe” returns an error +# (most likely because we're in an unpacked copy of a release tarball, +# rather than in a git working copy), then we fall back on reading the +# contents of the RELEASE-VERSION file. +# +# To use this script, simply import it your setup.py file, and use the +# results of get_git_version() as your package version: +# +# from version import * +# +# setup( +# version=get_git_version(), +# . +# . +# . +# ) +# +# This will automatically update the RELEASE-VERSION file, if +# necessary. Note that the RELEASE-VERSION file should *not* be +# checked into git; please add it to your top-level .gitignore file. +# +# You'll probably want to distribute the RELEASE-VERSION file in your +# sdist tarballs; to do this, just create a MANIFEST.in file that +# contains the following line: +# +# include RELEASE-VERSION + +__all__ = ("get_git_version") + +from subprocess import Popen, PIPE + + +def call_git_describe(abbrev=4): + try: + p = Popen(['git', 'describe', '--abbrev=%d' % abbrev], + stdout=PIPE, stderr=PIPE) + p.stderr.close() + line = p.stdout.readlines()[0] + return line.strip() + + except: + return None + + +def read_release_version(): + try: + f = open("RELEASE-VERSION", "r") + + try: + version = f.readlines()[0] + return version.strip() + + finally: + f.close() + + except: + return None + + +def write_release_version(version): + f = open("RELEASE-VERSION", "w") + f.write("%s\n" % version) + f.close() + + +def get_git_version(abbrev=4): + # Read in the version that's currently in RELEASE-VERSION. + + release_version = read_release_version() + + # First try to get the current version using “git describe”. + + version = call_git_describe(abbrev) + + # If that doesn't work, fall back on the value that's in + # RELEASE-VERSION. + + if version is None: + version = release_version + + # If we still don't have anything, that's an error. + + if version is None: + raise ValueError("Cannot find the version number!") + + # If the current version is different from what's in the + # RELEASE-VERSION file, update the file to be current. + + if version != release_version: + write_release_version(version) + + # Finally, return the current version. + + return version + + +if __name__ == "__main__": + print get_git_version()
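
A note on how the relocated entry points are meant to be used: because main() in htsworkflow.pipelines.qseq2fastq takes an optional argument list rather than always reading sys.argv, the thin scripts/qseq2fastq stub, the unit tests, and interactive callers can all drive the same code. A minimal sketch, assuming the package is installed; the flowcell ID and qseq file name below are made-up examples:

    # Sketch: driving the relocated qseq2fastq entry point directly.
    # "42A3BAAXX" and "s_1_1_0001_qseq.txt" are hypothetical example values.
    from htsworkflow.pipelines.qseq2fastq import main

    # equivalent to running: qseq2fastq -f 42A3BAAXX -o lane1.fastq s_1_1_0001_qseq.txt
    main(["-f", "42A3BAAXX", "-o", "lane1.fastq", "s_1_1_0001_qseq.txt"])

    # report the git-tag derived version added by this patch
    main(["--version"])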
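
The --slice option is handled by parse_slice(), which turns a colon-separated string into a Python slice object that Qseq2Fastq then applies to both the sequence and the quality string. Its behaviour, as defined in the patch, can be exercised like this:

    # parse_slice() turns "start:stop" text into the slice used for trimming reads.
    from htsworkflow.pipelines.qseq2fastq import parse_slice

    print("ACGTACGTAA"[parse_slice("0:4")])    # ACGT
    print("ACGTACGTAA"[parse_slice("0:-1")])   # ACGTACGTA
    print("ACGTACGTAA"[parse_slice("")])       # ACGTACGTAA  (empty text means slice(None), no trimming)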
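
The quality handling in Qseq2Fastq._convert_illumina_quality rests on the fact that qseq files store Phred+64 encoded scores while Sanger-style fastq expects Phred+33, so every character is shifted down by 64 - 33 = 31; the patch does this in one vectorised numpy operation. A pure-Python equivalent, shown only to make the arithmetic explicit (it is slower than the numpy version):

    # Pure-Python equivalent of the numpy-based quality conversion in the patch.
    # qseq qualities are Phred+64, fastq (Sanger) qualities are Phred+33,
    # so each character moves down by 31.
    def convert_illumina_quality(illumina_quality):
        return ''.join(chr(ord(c) - 31) for c in illumina_quality)

    print(convert_illumina_quality("hhhhTB"))   # IIII5#  (Q40,Q40,Q40,Q40,Q20,Q2)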
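
srf2fastq walks its input with a small four-state machine (FASTQ_HEADER, FASTQ_SEQUENCE, FASTQ_SEQUENCE_HEADER, FASTQ_QUALITY), and convert_single_to_two_fastq uses it to cut a run in which both paired ends were written into a single read back into two fastq streams, splitting at --mid or at the midpoint of the first sequence line. A toy demonstration in the style of test_srf2fastq.py; the read name, bases, and qualities are invented:

    # Toy demonstration of splitting one combined paired-end record in two.
    # All read data below is invented example input.
    from StringIO import StringIO   # io.StringIO under Python 3
    from htsworkflow.pipelines.srf2fastq import convert_single_to_two_fastq

    source = StringIO("@read_28302_195\nAGCCAATGGACGTT\n+\nIIIIIIIIBB+III\n")
    left = StringIO()
    right = StringIO()
    convert_single_to_two_fastq(source, left, right, header="42A3BAAXX_")

    print(left.getvalue())    # @42A3BAAXX_read_28302_195/1 plus the first 7 bases and qualities
    print(right.getvalue())   # @42A3BAAXX_read_28302_195/2 plus the last 7 bases and qualities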
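
The new htsworkflow/version.py is what qseq2fastq and srf2fastq call for --version: it asks setuptools' pkg_resources for the installed htsworkflow distribution, so the reported number is whatever version the package was installed with (which setup.py, in turn, derives from git). Note that get_distribution() returns a Distribution object, so printing it yields a string of the form "htsworkflow <version>"; the bare version string sits on its version attribute. A small sketch of that lookup, assuming the package has been installed:

    # Sketch: looking up the installed version through pkg_resources.
    import logging

    try:
        import pkg_resources
    except ImportError:
        logging.error("Can't find version number, please install setuptools")
        raise

    try:
        dist = pkg_resources.get_distribution("htsworkflow")
        print(dist.version)   # bare version string, e.g. "0.5.0"
        print(dist)           # "htsworkflow 0.5.0", roughly what --version prints
    except pkg_resources.DistributionNotFound:
        logging.error("Package not installed")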
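
Finally, the top-level version.py (Douglas Creager's public-domain recipe) ties the version number to git tags: setup.py calls get_git_version(), which prefers the output of "git describe" and falls back to the RELEASE-VERSION file that MANIFEST.in ships in source tarballs and .gitignore keeps out of the repository. A short sketch of that flow; the concrete version strings are illustrative, since "git describe" output depends on the repository's tags:

    # Sketch: how the git-derived version flows into the package metadata.
    from version import get_git_version

    v = get_git_version()
    print(v)
    # On a tagged commit v is the tag itself, e.g. "0.5.0"; between tags it is
    # "<tag>-<commits since tag>-g<abbreviated sha>", as produced by git describe.

    # setup.py consumes the same value:
    #   setup(name="htsworkflow", version=get_git_version(), ...)
    # and get_git_version() rewrites RELEASE-VERSION as a side effect, so an sdist
    # tarball (which includes the file via MANIFEST.in) can still report its
    # version without the git history.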