From 5d586b4a5d4af6bf52929409c96c491e24e86ad5 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 22 Oct 2010 16:40:13 -0700 Subject: [PATCH] Report version number derived from git tag. This patch includes the necessary infrastructure to support that feature and its been added to qseq2fastq and srf2fastq. Additionally to improve testability of qseq2fastq and srf2fastq, the original standalone module was moved into htsworkflow.pipelines and a small stub module was placed in scripts. --- .gitignore | 2 + MANIFEST.in | 2 + extra/ucsc_encode_submission/ucsc_gather.py | 2 +- htsworkflow/pipelines/qseq2fastq.py | 226 ++++++++++++++++++ .../pipelines/srf2fastq.py | 16 +- htsworkflow/version.py | 19 ++ scripts/qseq2fastq | 190 +-------------- scripts/srf2fastq | 6 + setup.py | 46 ++-- test/test_srf2fastq.py | 14 +- version.py | 104 ++++++++ 11 files changed, 404 insertions(+), 223 deletions(-) create mode 100644 MANIFEST.in create mode 100755 htsworkflow/pipelines/qseq2fastq.py rename scripts/srf2named_fastq.py => htsworkflow/pipelines/srf2fastq.py (96%) create mode 100644 htsworkflow/version.py create mode 100755 scripts/srf2fastq create mode 100644 version.py diff --git a/.gitignore b/.gitignore index 3f57fc7..921a57b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.py[co~] .coverage *.egg-info +dist +RELEASE-VERSION diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c4e95cd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include RELEASE-VERSION +version.py diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py index c692405..191281c 100755 --- a/extra/ucsc_encode_submission/ucsc_gather.py +++ b/extra/ucsc_encode_submission/ucsc_gather.py @@ -132,7 +132,7 @@ log=log/qseq2fastq.log qseq_condor_entries = [] srf_condor_header = """ Universe=vanilla -executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py +executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq output=log/srf_pair_fastq.out.$(process).log error=log/srf_pair_fastq.err.$(process).log log=log/srf_pair_fastq.log diff --git a/htsworkflow/pipelines/qseq2fastq.py b/htsworkflow/pipelines/qseq2fastq.py new file mode 100755 index 0000000..286663b --- /dev/null +++ b/htsworkflow/pipelines/qseq2fastq.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +from glob import glob +import os +from optparse import OptionParser +import numpy +import sys +import tarfile + +from htsworkflow.version import version + +def main(cmdline=None): + parser = make_parser() + opts, args = parser.parse_args(cmdline) + + if opts.version: + print version() + return 0 + + if opts.infile is not None: + qseq_generator = tarfile_generator(opts.infile) + elif len(args) > 0: + qseq_generator = file_generator(args) + else: + qseq_generator = [sys.stdin] + + if opts.output is not None: + output = open(opts.output, 'w') + else: + output = sys.stdout + + if opts.nopass_output is not None: + nopass_output = open(opts.nopass_output, 'w') + else: + nopass_output = None + + qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output) + qseq_parser.fastq = not opts.fasta + qseq_parser.flowcell_id = opts.flowcell + qseq_parser.trim = parse_slice(opts.slice) + qseq_parser.reportFilter = opts.pf + + qseq_parser.run() + + +def make_parser(): + usage = "%prog: [options] *_qseq.txt" + parser = OptionParser(usage) + parser.add_option("-a", "--fasta", default=False, action="store_true", + help="produce fasta files instead of fastq files") + parser.add_option("-f", "--flowcell", default=None, + help="Set flowcell ID for output file") + parser.add_option("-i", "--infile", default=None, + help='source tar file (if reading from an archive instead of a directory)') + parser.add_option("-o", "--output", default=None, + help="output fastq file") + parser.add_option("-n", "--nopass-output", default=None, + help="if provided send files that failed illumina filter "\ + "to a differentfile") + parser.add_option("-s", "--slice", + help="specify python slice, e.g. 0:75, 0:-1", + default=None) + parser.add_option("--pf", help="report pass filter flag", default=False, + action="store_true") + parser.add_option("--version", default=False, action="store_true", + help="report software version") + + return parser + + +def file_generator(pattern_list): + for pattern in pattern_list: + for filename in glob(pattern): + yield open(filename,"r") + + +def tarfile_generator(tarfilename): + archive = tarfile.open(tarfilename,'r|*') + for tarinfo in archive: + yield archive.extractfile(tarinfo) + + +def parse_slice(slice_text): + if slice_text is None or len(slice_text) == 0: + return slice(None) + + slice_data = [] + for element in slice_text.split(':'): + if len(element) == 0: + element = None + else: + element = int(element) + slice_data.append(element) + + return slice(*slice_data) + + +def file_generator(pattern_list): + for pattern in pattern_list: + for filename in glob(pattern): + yield open(filename,"r") + + +def tarfile_generator(tarfilename): + archive = tarfile.open(tarfilename,'r|*') + for tarinfo in archive: + yield archive.extractfile(tarinfo) + + +def parse_slice(slice_text): + if slice_text is None or len(slice_text) == 0: + return slice(None) + + slice_data = [] + for element in slice_text.split(':'): + if len(element) == 0: + element = None + else: + element = int(element) + slice_data.append(element) + + return slice(*slice_data) + + +class Qseq2Fastq(object): + """ + Convert qseq files to fastq (or fasta) files. + """ + def __init__(self, sources, pass_destination, nopass_destination=None): + self.sources = sources + self.pass_destination = pass_destination + if nopass_destination is not None: + self.nopass_destination = nopass_destination + else: + self.nopass_destination = pass_destination + + self.fastq = True + self.flowcell_id = None + self.trim = slice(None) + self.reportFilter = False + + def _format_flowcell_id(self): + """ + Return formatted flowcell ID + """ + if self.flowcell_id is not None: + return self.flowcell_id+"_" + else: + return "" + + def _convert_illumina_quality(self, illumina_quality): + """ + Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score. + """ + # Illumina scores are Phred + 64 + # Fastq scores are Phread + 33 + # the following code grabs the string, converts to short ints and + # subtracts 31 (64-33) to convert between the two score formats. + # The numpy solution is twice as fast as some of my other + # ideas for the conversion. + # sorry about the uglyness in changing from character, to 8-bit int + # and back to a character array + quality = numpy.asarray(illumina_quality,'c') + quality.dtype = numpy.uint8 + quality -= 31 + quality.dtype = '|S1' # I'd like to know what the real numpy char type is + return quality + + def run(self): + if self.fastq: + header_template = '@' + else: + # fasta case + header_template = '>' + header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s' + + for qstream in self.sources: + for line in qstream: + # parse line + record = line.rstrip().split('\t') + machine_name = record[0] + run_number = record[1] + lane_number = record[2] + tile = record[3] + x = record[4] + y = record[5] + index = record[6] + read = record[7] + sequence = record[8].replace('.','N') + quality = self._convert_illumina_quality(record[9]) + + # add pass qc filter if we want it + pass_qc = int(record[10]) + if self.reportFilter: + pass_qc_msg = " pf=%s" % (pass_qc) + else: + pass_qc_msg = "" + + header = header_template % ( \ + machine_name, + run_number, + lane_number, + tile, + x, + y, + read, + pass_qc_msg, + os.linesep) + + + # if we passed the filter write to the "good" file + if pass_qc: + destination = self.pass_destination + else: + destination = self.nopass_destination + + destination.write(header) + destination.write(sequence[self.trim]) + destination.write(os.linesep) + if self.fastq: + destination.write('+') + destination.write(os.linesep) + destination.write(quality[self.trim].tostring()) + destination.write(os.linesep) + +if __name__ == "__main__": + main() diff --git a/scripts/srf2named_fastq.py b/htsworkflow/pipelines/srf2fastq.py similarity index 96% rename from scripts/srf2named_fastq.py rename to htsworkflow/pipelines/srf2fastq.py index d0403f6..adaa1b2 100755 --- a/scripts/srf2named_fastq.py +++ b/htsworkflow/pipelines/srf2fastq.py @@ -7,6 +7,7 @@ from subprocess import Popen, PIPE import sys from htsworkflow.util.opener import autoopen +from htsworkflow.version import version # constants for our fastq finite state machine FASTQ_HEADER = 0 @@ -14,19 +15,22 @@ FASTQ_SEQUENCE = 1 FASTQ_SEQUENCE_HEADER = 2 FASTQ_QUALITY = 3 - def main(cmdline=None): parser = make_parser() opts, args = parser.parse_args(cmdline) - if len(args) != 1: - parser.error("Requires one argument, got: %s" % (str(args))) - if opts.verbose: logging.basicConfig(level=logging.INFO) else: logging.basicConfig(level=logging.WARN) + if opts.version: + print version() + return 0 + + if len(args) != 1: + parser.error("Requires one argument, got: %s" % (str(args))) + if opts.flowcell is not None: header = "%s_" % (opts.flowcell,) else: @@ -71,6 +75,8 @@ You can also force the flowcell ID to be added to the header.""") help="single fastq target name") parser.add_option('-v', '--verbose', default=False, action="store_true", help="show information about what we're doing.") + parser.add_option('--version', default=False, action="store_true", + help="Report software version") return parser @@ -78,7 +84,6 @@ def srf_open(filename, cnf1=False): """ Make a stream from srf file using srf2fastq """ - cmd = ['srf2fastq'] if is_cnf1(filename): cmd.append('-c') @@ -238,6 +243,5 @@ def foo(): target2 = open(target2_name,'w') - if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/htsworkflow/version.py b/htsworkflow/version.py new file mode 100644 index 0000000..ca7390d --- /dev/null +++ b/htsworkflow/version.py @@ -0,0 +1,19 @@ +import logging + +def version(): + """Return version number + """ + version = None + try: + import pkg_resources + except ImportError, e: + logging.error("Can't find version number, please install setuptools") + raise e + + try: + version = pkg_resources.get_distribution("htsworkflow") + except pkg_resources.DistributionNotFound, e: + logging.error("Package not installed") + + return version + diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq index 03970dd..555e668 100755 --- a/scripts/qseq2fastq +++ b/scripts/qseq2fastq @@ -1,188 +1,6 @@ -#!/usr/bin/env python - -from glob import glob -import os -from optparse import OptionParser -import numpy +#!/usr/bin/python import sys -import tarfile - -class Qseq2Fastq(object): - """ - Convert qseq files to fastq (or fasta) files. - """ - def __init__(self, sources, pass_destination, nopass_destination=None): - self.sources = sources - self.pass_destination = pass_destination - if nopass_destination is not None: - self.nopass_destination = nopass_destination - else: - self.nopass_destination = pass_destination - - self.fastq = True - self.flowcell_id = None - self.trim = slice(None) - self.reportFilter = False - - def _format_flowcell_id(self): - """ - Return formatted flowcell ID - """ - if self.flowcell_id is not None: - return self.flowcell_id+"_" - else: - return "" - - def _convert_illumina_quality(self, illumina_quality): - """ - Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score. - """ - # Illumina scores are Phred + 64 - # Fastq scores are Phread + 33 - # the following code grabs the string, converts to short ints and - # subtracts 31 (64-33) to convert between the two score formats. - # The numpy solution is twice as fast as some of my other - # ideas for the conversion. - # sorry about the uglyness in changing from character, to 8-bit int - # and back to a character array - quality = numpy.asarray(illumina_quality,'c') - quality.dtype = numpy.uint8 - quality -= 31 - quality.dtype = '|S1' # I'd like to know what the real numpy char type is - return quality - - def run(self): - if self.fastq: - header_template = '@' - else: - # fasta case - header_template = '>' - header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s' - - for qstream in self.sources: - for line in qstream: - # parse line - record = line.rstrip().split('\t') - machine_name = record[0] - run_number = record[1] - lane_number = record[2] - tile = record[3] - x = record[4] - y = record[5] - index = record[6] - read = record[7] - sequence = record[8].replace('.','N') - quality = self._convert_illumina_quality(record[9]) - - # add pass qc filter if we want it - pass_qc = int(record[10]) - if self.reportFilter: - pass_qc_msg = " pf=%s" % (pass_qc) - else: - pass_qc_msg = "" - - header = header_template % ( \ - machine_name, - run_number, - lane_number, - tile, - x, - y, - read, - pass_qc_msg, - os.linesep) - - - # if we passed the filter write to the "good" file - if pass_qc: - destination = self.pass_destination - else: - destination = self.nopass_destination - - destination.write(header) - destination.write(sequence[self.trim]) - destination.write(os.linesep) - if self.fastq: - destination.write('+') - destination.write(os.linesep) - destination.write(quality[self.trim].tostring()) - destination.write(os.linesep) - -def file_generator(pattern_list): - for pattern in pattern_list: - for filename in glob(pattern): - yield open(filename,'r') - -def tarfile_generator(tarfilename): - archive = tarfile.open(tarfilename,'r|*') - for tarinfo in archive: - yield archive.extractfile(tarinfo) - - -def parse_slice(slice_text): - if slice_text is None or len(slice_text) == 0: - return slice(None) - - slice_data = [] - for element in slice_text.split(':'): - if len(element) == 0: - element = None - else: - element = int(element) - slice_data.append(element) - - return slice(*slice_data) - -def make_parser(): - usage = "%prog: [options] *_qseq.txt" - parser = OptionParser(usage) - parser.add_option('-a', '--fasta', default=False, action="store_true", - help="produce fasta files instead of fastq files") - parser.add_option('-f', '--flowcell', default=None, - help="Set flowcell ID for output file") - parser.add_option('-i', '--infile', default=None, - help='source tar file (if reading from an archive instead of a directory)') - parser.add_option('-o', '--output', default=None, - help='output fastq file') - parser.add_option('-n', '--nopass-output', default=None, - help='if provided send files that failed illumina filter '\ - 'to a differentfile') - parser.add_option('-s', '--slice', - help='specify python slice, e.g. 0:75, 0:-1', - default=None) - parser.add_option('--pf', help="report pass filter flag", default=False, - action="store_true") - return parser - -def main(cmdline=None): - parser = make_parser() - opts, args = parser.parse_args(cmdline) - - if opts.infile is not None: - qseq_generator = tarfile_generator(opts.infile) - elif len(args) > 0: - qseq_generator = file_generator(args) - else: - qseq_generator = [sys.stdin] - - - if opts.output is not None: - output = open(opts.output, 'w') - else: - output = sys.stdout - - if opts.nopass_output is not None: - nopass_output = open(opts.nopass_output, 'w') - else: - nopass_output = None - - qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output) - qseq_parser.fastq = not opts.fasta - qseq_parser.flowcell_id = opts.flowcell - qseq_parser.trim = parse_slice(opts.slice) - qseq_parser.reportFilter = opts.pf - - qseq_parser.run() +from htsworkflow.pipelines.qseq2fastq import main -if __name__ == "__main__": - main() +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/srf2fastq b/scripts/srf2fastq new file mode 100755 index 0000000..0361e43 --- /dev/null +++ b/scripts/srf2fastq @@ -0,0 +1,6 @@ +#!/usr/bin/python +import sys +from htsworkflow.pipelines.srf2fastq import main + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/setup.py b/setup.py index 40ee4eb..a226a4f 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,37 @@ from setuptools import setup +from version import get_git_version setup( - name="htsworkflow", - description="some bots and other utilities to help deal with data from an illumina sequencer", - author="Diane Trout & Brandon King", - author_email="diane@caltech.edu", - packages=["htsworkflow", - "htsworkflow.pipelines", - "htsworkflow.frontend", - "htsworkflow.frontend.analysis", - "htsworkflow.frontend.eland_config", - "htsworkflow.frontend.experiments", - "htsworkflow.frontend.inventory", - "htsworkflow.frontend.reports", - "htsworkflow.frontend.samples", - "htsworkflow.automation", - "htsworkflow.util" - ], - scripts=[ - 'scripts/configure_pipeline', + name="htsworkflow", + version=get_git_version(), + description="Utilities to help manage high-through-put sequencing", + author="Diane Trout, Brandon King", + author_email="diane@caltech.edu", + packages=["htsworkflow", + "htsworkflow.automation", + "htsworkflow.pipelines", + "htsworkflow.util", + # django site + "htsworkflow.frontend", + "htsworkflow.frontend.analysis", + "htsworkflow.frontend.eland_config", + "htsworkflow.frontend.experiments", + "htsworkflow.frontend.inventory", + "htsworkflow.frontend.reports", + "htsworkflow.frontend.samples", + ], + scripts=[ 'scripts/copier', - 'scripts/gerald2bed.py', 'scripts/library.py', 'scripts/makebed', - 'scripts/make-library-tree', + 'scripts/make-library-tree', 'scripts/mark_archived_data', 'scripts/qseq2fastq', - 'scripts/rerun_eland.py', 'scripts/retrieve_config', 'scripts/runfolder', 'scripts/runner', 'scripts/spoolwatcher', 'scripts/srf', - 'scripts/srf2named_fastq.py' + 'scripts/srf2fastq' ], -) + ) diff --git a/test/test_srf2fastq.py b/test/test_srf2fastq.py index 83412c4..eee4152 100644 --- a/test/test_srf2fastq.py +++ b/test/test_srf2fastq.py @@ -6,7 +6,7 @@ import unittest _module_path, _module_name = os.path.split(__file__) sys.path.append(os.path.join(_module_path, '..', 'scripts')) -import srf2named_fastq +from htsworkflow.pipelines import srf2fastq class testSrf2Fastq(unittest.TestCase): def test_split_good(self): @@ -18,7 +18,7 @@ IIIIB+++ target1 = StringIO() target2 = StringIO() - srf2named_fastq.convert_single_to_two_fastq(source, target1, target2) + srf2fastq.convert_single_to_two_fastq(source, target1, target2) target1.seek(0) lines1 = target1.readlines() @@ -49,7 +49,7 @@ IIIIB+++ target1 = StringIO() target2 = StringIO() - srf2named_fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_") + srf2fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_") target1.seek(0) lines1 = target1.readlines() @@ -79,7 +79,7 @@ IIIIB+++ """) target1 = StringIO() - srf2named_fastq.convert_single_to_fastq(source, target1) + srf2fastq.convert_single_to_fastq(source, target1) target1.seek(0) lines1 = target1.readlines() @@ -101,7 +101,7 @@ IIIIB+++ """) target1 = StringIO() - srf2named_fastq.convert_single_to_fastq(source, target1, header="foo_") + srf2fastq.convert_single_to_fastq(source, target1, header="foo_") target1.seek(0) lines1 = target1.readlines() @@ -117,7 +117,7 @@ IIIIB+++ cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' cnf1_path = os.path.join(_module_path, cnf1_srf) - is_srf = srf2named_fastq.is_srf + is_srf = srf2fastq.is_srf self.failUnlessEqual(is_srf(__file__), False) self.failUnlessEqual(is_srf(cnf4_path), True) self.failUnlessEqual(is_srf(cnf1_path), True) @@ -128,7 +128,7 @@ IIIIB+++ cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf' cnf1_path = os.path.join(_module_path, cnf1_srf) - is_cnf1 = srf2named_fastq.is_cnf1 + is_cnf1 = srf2fastq.is_cnf1 self.failUnlessRaises(ValueError, is_cnf1, __file__) self.failUnlessEqual(is_cnf1(cnf4_path), False) self.failUnlessEqual(is_cnf1(cnf1_path), True) diff --git a/version.py b/version.py new file mode 100644 index 0000000..b850b4a --- /dev/null +++ b/version.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +# Author: Douglas Creager +# This file is placed into the public domain. + +# Calculates the current version number. If possible, this is the +# output of “git describe”, modified to conform to the versioning +# scheme that setuptools uses. If “git describe” returns an error +# (most likely because we're in an unpacked copy of a release tarball, +# rather than in a git working copy), then we fall back on reading the +# contents of the RELEASE-VERSION file. +# +# To use this script, simply import it your setup.py file, and use the +# results of get_git_version() as your package version: +# +# from version import * +# +# setup( +# version=get_git_version(), +# . +# . +# . +# ) +# +# This will automatically update the RELEASE-VERSION file, if +# necessary. Note that the RELEASE-VERSION file should *not* be +# checked into git; please add it to your top-level .gitignore file. +# +# You'll probably want to distribute the RELEASE-VERSION file in your +# sdist tarballs; to do this, just create a MANIFEST.in file that +# contains the following line: +# +# include RELEASE-VERSION + +__all__ = ("get_git_version") + +from subprocess import Popen, PIPE + + +def call_git_describe(abbrev=4): + try: + p = Popen(['git', 'describe', '--abbrev=%d' % abbrev], + stdout=PIPE, stderr=PIPE) + p.stderr.close() + line = p.stdout.readlines()[0] + return line.strip() + + except: + return None + + +def read_release_version(): + try: + f = open("RELEASE-VERSION", "r") + + try: + version = f.readlines()[0] + return version.strip() + + finally: + f.close() + + except: + return None + + +def write_release_version(version): + f = open("RELEASE-VERSION", "w") + f.write("%s\n" % version) + f.close() + + +def get_git_version(abbrev=4): + # Read in the version that's currently in RELEASE-VERSION. + + release_version = read_release_version() + + # First try to get the current version using “git describe”. + + version = call_git_describe(abbrev) + + # If that doesn't work, fall back on the value that's in + # RELEASE-VERSION. + + if version is None: + version = release_version + + # If we still don't have anything, that's an error. + + if version is None: + raise ValueError("Cannot find the version number!") + + # If the current version is different from what's in the + # RELEASE-VERSION file, update the file to be current. + + if version != release_version: + write_release_version(version) + + # Finally, return the current version. + + return version + + +if __name__ == "__main__": + print get_git_version() -- 2.30.2