Report version number derived from git tag.
author    Diane Trout <diane@caltech.edu>
          Fri, 22 Oct 2010 23:40:13 +0000 (16:40 -0700)
committer Diane Trout <diane@caltech.edu>
          Fri, 22 Oct 2010 23:40:13 +0000 (16:40 -0700)
This patch includes the infrastructure needed to support that feature,
and version reporting has been added to qseq2fastq and srf2fastq.
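
A minimal sketch of how that wiring looks inside each tool (the option
name and the version() helper below mirror what this patch adds):

    from optparse import OptionParser
    from htsworkflow.version import version

    def make_parser():
        parser = OptionParser()
        # --version short-circuits normal processing and just prints the
        # version that pkg_resources reports for the htsworkflow package
        parser.add_option("--version", default=False, action="store_true",
                          help="report software version")
        return parser

    def main(cmdline=None):
        opts, args = make_parser().parse_args(cmdline)
        if opts.version:
            print version()
            return 0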

Additionally, to improve the testability of qseq2fastq and srf2fastq,
the original standalone scripts were moved into htsworkflow.pipelines as
modules, and small stub scripts were placed in scripts.
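
Each stub now just re-exports main() from the matching pipelines module
and forwards the command line to it; for example, the new
scripts/srf2fastq reduces to:

    #!/usr/bin/python
    import sys
    # the conversion code now lives in the package where the test suite
    # can import it; the script only forwards the command line
    from htsworkflow.pipelines.srf2fastq import main

    if __name__ == "__main__":
        sys.exit(main(sys.argv[1:]))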

12 files changed:
.gitignore
MANIFEST.in [new file with mode: 0644]
extra/ucsc_encode_submission/ucsc_gather.py
htsworkflow/pipelines/qseq2fastq.py [new file with mode: 0755]
htsworkflow/pipelines/srf2fastq.py [new file with mode: 0755]
htsworkflow/version.py [new file with mode: 0644]
scripts/qseq2fastq
scripts/srf2fastq [new file with mode: 0755]
scripts/srf2named_fastq.py [deleted file]
setup.py
test/test_srf2fastq.py
version.py [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
index 3f57fc752af39b4aad0676b326c3bf77033ebde8..921a57b3b720e8cb8b78e3edfca3fb695fcf073b 100644 (file)
@@ -1,3 +1,5 @@
 *.py[co~]
 .coverage
 *.egg-info
+dist
+RELEASE-VERSION
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644 (file)
index 0000000..c4e95cd
--- /dev/null
@@ -0,0 +1,2 @@
+include RELEASE-VERSION
+include version.py
diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py
index c69240548b5a5d0e5588f5d2d911416eab017876..191281c8580c308e5ff71f5088cb37e04b4af587 100755 (executable)
@@ -132,7 +132,7 @@ log=log/qseq2fastq.log
     qseq_condor_entries = []
     srf_condor_header = """
 Universe=vanilla
-executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py
+executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq
 output=log/srf_pair_fastq.out.$(process).log
 error=log/srf_pair_fastq.err.$(process).log
 log=log/srf_pair_fastq.log
diff --git a/htsworkflow/pipelines/qseq2fastq.py b/htsworkflow/pipelines/qseq2fastq.py
new file mode 100755 (executable)
index 0000000..286663b
--- /dev/null
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+from glob import glob
+import os
+from optparse import OptionParser
+import numpy
+import sys
+import tarfile
+
+from htsworkflow.version import version
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.version:
+        print version()
+        return 0
+    
+    if opts.infile is not None:
+        qseq_generator = tarfile_generator(opts.infile)
+    elif len(args) > 0:
+        qseq_generator = file_generator(args)
+    else:
+        qseq_generator = [sys.stdin]
+            
+    if opts.output is not None:
+        output = open(opts.output, 'w')
+    else:
+        output = sys.stdout
+
+    if opts.nopass_output is not None:
+        nopass_output = open(opts.nopass_output, 'w')
+    else:
+        nopass_output = None
+
+    qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
+    qseq_parser.fastq = not opts.fasta
+    qseq_parser.flowcell_id = opts.flowcell
+    qseq_parser.trim = parse_slice(opts.slice)
+    qseq_parser.reportFilter = opts.pf
+
+    qseq_parser.run()
+
+
+def make_parser():
+    usage = "%prog: [options] *_qseq.txt"
+    parser = OptionParser(usage)
+    parser.add_option("-a", "--fasta", default=False, action="store_true",
+                      help="produce fasta files instead of fastq files")
+    parser.add_option("-f", "--flowcell", default=None, 
+                      help="Set flowcell ID for output file")
+    parser.add_option("-i", "--infile", default=None,
+      help='source tar file (if reading from an archive instead of a directory)')
+    parser.add_option("-o", "--output", default=None,
+                      help="output fastq file")
+    parser.add_option("-n", "--nopass-output", default=None,
+                      help="if provided send files that failed illumina filter "\
+                           "to a differentfile")
+    parser.add_option("-s", "--slice",
+                      help="specify python slice, e.g. 0:75, 0:-1",
+                      default=None)
+    parser.add_option("--pf", help="report pass filter flag", default=False,
+                      action="store_true")
+    parser.add_option("--version", default=False, action="store_true",
+                      help="report software version")
+    
+    return parser
+
+            
+def file_generator(pattern_list):
+    for pattern in pattern_list:
+        for filename in glob(pattern):
+            yield open(filename,"r")
+
+
+def tarfile_generator(tarfilename):
+    archive = tarfile.open(tarfilename,'r|*')
+    for tarinfo in archive:
+        yield archive.extractfile(tarinfo)
+
+
+def parse_slice(slice_text):
+    if slice_text is None or len(slice_text) == 0:
+        return slice(None)
+        
+    slice_data = []
+    for element in slice_text.split(':'):
+        if len(element) == 0:
+            element = None
+        else:
+            element = int(element)
+        slice_data.append(element)
+
+    return slice(*slice_data)
+
+
+class Qseq2Fastq(object):
+    """
+    Convert qseq files to fastq (or fasta) files.
+    """
+    def __init__(self, sources, pass_destination, nopass_destination=None):
+        self.sources = sources
+        self.pass_destination = pass_destination
+        if nopass_destination is not None:
+            self.nopass_destination = nopass_destination
+        else:
+            self.nopass_destination = pass_destination
+        
+        self.fastq = True
+        self.flowcell_id = None
+        self.trim = slice(None)
+        self.reportFilter = False
+
+    def _format_flowcell_id(self):
+        """
+        Return formatted flowcell ID
+        """
+        if self.flowcell_id is not None:
+            return self.flowcell_id+"_"
+        else:
+            return ""
+    
+    def _convert_illumina_quality(self, illumina_quality):
+        """
+        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
+        """
+        # Illumina scores are Phred + 64
+        # Fastq scores are Phred + 33
+        # the following code grabs the string, converts to short ints and
+        # subtracts 31 (64-33) to convert between the two score formats.
+        # The numpy solution is twice as fast as some of my other
+        # ideas for the conversion.
+        # sorry about the ugliness in changing from character to 8-bit int
+        # and back to a character array
+        quality = numpy.asarray(illumina_quality,'c')
+        quality.dtype = numpy.uint8
+        quality -= 31
+        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+        return quality
+        
+    def run(self):
+        if self.fastq:
+            header_template = '@' 
+        else:
+            # fasta case
+            header_template = '>'
+        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
+        
+        for qstream in self.sources:
+            for line in qstream:
+                # parse line
+                record = line.rstrip().split('\t')
+                machine_name = record[0]
+                run_number = record[1]
+                lane_number = record[2]
+                tile = record[3]
+                x = record[4]
+                y = record[5]
+                index = record[6]
+                read = record[7]
+                sequence = record[8].replace('.','N')
+                quality = self._convert_illumina_quality(record[9])
+
+                # add pass qc filter if we want it
+                pass_qc = int(record[10])
+                if self.reportFilter:
+                    pass_qc_msg = " pf=%s" % (pass_qc)
+                else:
+                    pass_qc_msg = ""
+
+                header = header_template  % ( \
+                    machine_name,
+                    run_number,
+                    lane_number,
+                    tile,
+                    x,
+                    y,
+                    read,
+                    pass_qc_msg,
+                    os.linesep)
+
+
+                # if we passed the filter write to the "good" file
+                if pass_qc:
+                    destination = self.pass_destination
+                else:
+                    destination = self.nopass_destination
+                
+                destination.write(header)
+                destination.write(sequence[self.trim])
+                destination.write(os.linesep)
+                if self.fastq:
+                    destination.write('+')
+                    destination.write(os.linesep)
+                    destination.write(quality[self.trim].tostring())
+                    destination.write(os.linesep)
+
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/pipelines/srf2fastq.py b/htsworkflow/pipelines/srf2fastq.py
new file mode 100755 (executable)
index 0000000..adaa1b2
--- /dev/null
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+import logging
+import mmap
+from optparse import OptionParser
+import os
+from subprocess import Popen, PIPE
+import sys
+
+from htsworkflow.util.opener import autoopen
+from htsworkflow.version import version
+
+# constants for our fastq finite state machine
+FASTQ_HEADER = 0
+FASTQ_SEQUENCE = 1
+FASTQ_SEQUENCE_HEADER = 2
+FASTQ_QUALITY = 3
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARN)
+
+    if opts.version:
+        print version()
+        return 0
+
+    if len(args) != 1:
+        parser.error("Requires one argument, got: %s" % (str(args)))
+
+    if opts.flowcell is not None:
+        header = "%s_" % (opts.flowcell,)
+    else:
+        header = ''
+
+    if opts.single:
+        left = open_write(opts.single, opts.force)
+    else:
+        left = open_write(opts.left, opts.force)
+        right = open_write(opts.right, opts.force)
+    
+    # open the srf, fastq, or compressed fastq
+    if is_srf(args[0]):
+        source = srf_open(args[0])
+    else:
+        source = autoopen(args[0])
+
+    if opts.single:
+        convert_single_to_fastq(source, left, header)
+    else:
+        convert_single_to_two_fastq(source, left, right, opts.mid, header)
+   
+    return 0
+
+def make_parser():
+    parser = OptionParser("""%prog: [options] file
+
+file can be either a fastq file or a srf file.
+You can also force the flowcell ID to be added to the header.""")
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="overwrite existing files.")
+    parser.add_option('--flowcell', default=None,
+                      help="add flowcell id header to sequence")
+    parser.add_option('-l','--left', default="r1.fastq",
+                      help='left side filename')
+    parser.add_option('-m','--mid', default=None, 
+                      help='actual sequence mid point')
+    parser.add_option('-r','--right', default="r2.fastq",
+                      help='right side filename')
+    parser.add_option('-s','--single', default=None,
+                      help="single fastq target name")
+    parser.add_option('-v', '--verbose', default=False, action="store_true",
+                      help="show information about what we're doing.")
+    parser.add_option('--version', default=False, action="store_true",
+                      help="Report software version")
+    return parser
+
+
+def srf_open(filename, cnf1=False):
+    """
+    Make a stream from srf file using srf2fastq
+    """
+    cmd = ['srf2fastq']
+    if is_cnf1(filename):
+        cmd.append('-c')
+    cmd.append(filename)
+      
+    logging.info('srf command: %s' % (" ".join(cmd),))
+    p = Popen(cmd, stdout=PIPE)
+    return p.stdout
+    
+
+def convert_single_to_fastq(instream, target1, header=''):
+
+    state = FASTQ_HEADER
+    for line in instream:
+        line = line.strip()
+        # sequence header
+        if state == FASTQ_HEADER:
+            write_header(target1, header, line)
+            state = FASTQ_SEQUENCE
+        # sequence
+        elif state == FASTQ_SEQUENCE:
+            write_sequence(target1, line)
+            state = FASTQ_SEQUENCE_HEADER
+        # quality header
+        elif state == FASTQ_SEQUENCE_HEADER:
+            # the sequence header isn't really sequence, but 
+            # we're just passing it through
+            write_sequence(target1, line)
+            state = FASTQ_QUALITY
+        # sequence or quality data
+        elif state == FASTQ_QUALITY:
+            write_sequence(target1, line)
+            state = FASTQ_HEADER
+        else:
+            raise RuntimeError("Unrecognized STATE in fastq split")
+
+
+        
+def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
+    """
+    read a fastq file where two paired ends have been run together into 
+    two halves.
+
+    instream is the source stream
+    target1 and target2 are the destination streams
+    """
+    if mid is not None:
+        mid = int(mid)
+
+    state = FASTQ_HEADER
+    for line in instream:
+        line = line.strip()
+        # sequence header
+        if state == FASTQ_HEADER:
+            write_header(target1, header, line, "/1")
+            write_header(target2, header, line, "/2")
+            state = FASTQ_SEQUENCE
+        # sequence
+        elif state == FASTQ_SEQUENCE:
+            if mid is None:
+                mid = len(line)/2
+            write_split_sequence(target1, target2, line, mid)
+            state = FASTQ_SEQUENCE_HEADER
+        # quality header
+        elif state == FASTQ_SEQUENCE_HEADER:
+            # the sequence header isn't really sequence, but 
+            # we're just passing it through
+            write_sequence(target1, line)
+            write_sequence(target2, line)
+
+            state = FASTQ_QUALITY
+        # sequence or quality data
+        elif state == FASTQ_QUALITY:
+            write_split_sequence(target1, target2, line, mid)
+            state = FASTQ_HEADER
+        else:
+            raise RuntimeError("Unrecognized STATE in fastq split")
+
+def write_header(target, prefix, line, suffix=''):
+    target.write('@')
+    target.write(prefix)
+    target.write(line[1:])
+    target.write(suffix)
+    target.write(os.linesep)
+
+def write_sequence(target, line):
+    target.write(line)
+    target.write(os.linesep)
+
+def write_split_sequence(target1, target2, line, mid):
+    target1.write(line[:mid])
+    target1.write(os.linesep)
+
+    target2.write(line[mid:])
+    target2.write(os.linesep)
+
+def is_srf(filename):
+    """
+    Check filename to see if it is likely to be a SRF file
+    """
+    f = open(filename, 'r')
+    header = f.read(4)
+    f.close()
+    return header == "SSRF"
+
+def is_cnf1(filename):
+    """
+    Brute force detection if a SRF file is using CNF1/CNF4 records
+    """
+    max_header = 1024 ** 2
+    PROGRAM_ID = 'PROGRAM_ID\000'
+    cnf4_apps = set(("solexa2srf v1.4", 
+                    "illumina2srf v1.11.5.Illumina.1.3"))
+
+    if not is_srf(filename):
+        raise ValueError("%s must be a srf file" % (filename,))
+
+    fd = os.open(filename, os.O_RDONLY)
+    f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+    # alas the max search length requires python 2.6+
+    program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
+    program_header_start = program_id_location+len(PROGRAM_ID)
+    next_null = f.find('\000', program_header_start) #, max_header)
+    program_id_header = f[program_header_start:next_null]
+    f.close()
+    os.close(fd)
+
+    if program_id_header in cnf4_apps:
+        return False
+    else:
+        return True
+
+def open_write(filename, force=False):
+    """
+    Open a file, but throw an exception if it already exists
+    """
+    if not force:
+        if os.path.exists(filename):
+            raise RuntimeError("%s exists" % (filename,))
+
+    return open(filename, 'w')
+
+def foo():
+    path, name = os.path.split(filename)
+    base, ext = os.path.splitext(name)
+
+    target1_name = base + '_r1.fastq'
+    target2_name = base + '_r2.fastq'
+
+    for target_name in [target1_name, target2_name]:
+        print 'target name', target_name
+        if os.path.exists(target_name):
+            raise RuntimeError("%s exists" % (target_name,))
+
+    instream = open(filename,'r')
+    target1 = open(target1_name,'w')
+    target2 = open(target2_name,'w')
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/htsworkflow/version.py b/htsworkflow/version.py
new file mode 100644 (file)
index 0000000..ca7390d
--- /dev/null
@@ -0,0 +1,19 @@
+import logging
+
+def version():
+    """Return version number    
+    """
+    version = None
+    try:
+        import pkg_resources
+    except ImportError, e:
+        logging.error("Can't find version number, please install setuptools")
+        raise e
+
+    try:
+        version = pkg_resources.get_distribution("htsworkflow")
+    except pkg_resources.DistributionNotFound, e:
+        logging.error("Package not installed")
+
+    return version
+        
diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq
index 03970dd687ed0e364540095cbdc07a253666fa34..555e668ff1c8ff9914c389f7f85e6e2f14ac5057 100755 (executable)
@@ -1,188 +1,6 @@
-#!/usr/bin/env python
-
-from glob import glob
-import os
-from optparse import OptionParser
-import numpy
+#!/usr/bin/python
 import sys
-import tarfile
-
-class Qseq2Fastq(object):
-    """
-    Convert qseq files to fastq (or fasta) files.
-    """
-    def __init__(self, sources, pass_destination, nopass_destination=None):
-        self.sources = sources
-        self.pass_destination = pass_destination
-        if nopass_destination is not None:
-            self.nopass_destination = nopass_destination
-        else:
-            self.nopass_destination = pass_destination
-        
-        self.fastq = True
-        self.flowcell_id = None
-        self.trim = slice(None)
-        self.reportFilter = False
-
-    def _format_flowcell_id(self):
-        """
-        Return formatted flowcell ID
-        """
-        if self.flowcell_id is not None:
-            return self.flowcell_id+"_"
-        else:
-            return ""
-    
-    def _convert_illumina_quality(self, illumina_quality):
-        """
-        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
-        """
-        # Illumina scores are Phred + 64
-        # Fastq scores are Phread + 33
-        # the following code grabs the string, converts to short ints and
-        # subtracts 31 (64-33) to convert between the two score formats.
-        # The numpy solution is twice as fast as some of my other
-        # ideas for the conversion.
-        # sorry about the uglyness in changing from character, to 8-bit int
-        # and back to a character array
-        quality = numpy.asarray(illumina_quality,'c')
-        quality.dtype = numpy.uint8
-        quality -= 31
-        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
-        return quality
-        
-    def run(self):
-        if self.fastq:
-            header_template = '@' 
-        else:
-            # fasta case
-            header_template = '>'
-        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
-        
-        for qstream in self.sources:
-            for line in qstream:
-                # parse line
-                record = line.rstrip().split('\t')
-                machine_name = record[0]
-                run_number = record[1]
-                lane_number = record[2]
-                tile = record[3]
-                x = record[4]
-                y = record[5]
-                index = record[6]
-                read = record[7]
-                sequence = record[8].replace('.','N')
-                quality = self._convert_illumina_quality(record[9])
-
-                # add pass qc filter if we want it
-                pass_qc = int(record[10])
-                if self.reportFilter:
-                    pass_qc_msg = " pf=%s" % (pass_qc)
-                else:
-                    pass_qc_msg = ""
-
-                header = header_template  % ( \
-                    machine_name,
-                    run_number,
-                    lane_number,
-                    tile,
-                    x,
-                    y,
-                    read,
-                    pass_qc_msg,
-                    os.linesep)
-
-
-                # if we passed the filter write to the "good" file
-                if pass_qc:
-                    destination = self.pass_destination
-                else:
-                    destination = self.nopass_destination
-                
-                destination.write(header)
-                destination.write(sequence[self.trim])
-                destination.write(os.linesep)
-                if self.fastq:
-                    destination.write('+')
-                    destination.write(os.linesep)
-                    destination.write(quality[self.trim].tostring())
-                    destination.write(os.linesep)
-            
-def file_generator(pattern_list):
-    for pattern in pattern_list:
-        for filename in glob(pattern):
-            yield open(filename,'r')
-
-def tarfile_generator(tarfilename):
-    archive = tarfile.open(tarfilename,'r|*')
-    for tarinfo in archive:
-        yield archive.extractfile(tarinfo)
-    
-
-def parse_slice(slice_text):
-    if slice_text is None or len(slice_text) == 0:
-        return slice(None)
-        
-    slice_data = []
-    for element in slice_text.split(':'):
-        if len(element) == 0:
-            element = None
-        else:
-            element = int(element)
-        slice_data.append(element)
-
-    return slice(*slice_data)
-        
-def make_parser():
-    usage = "%prog: [options] *_qseq.txt"
-    parser = OptionParser(usage)
-    parser.add_option('-a', '--fasta', default=False, action="store_true",
-                      help="produce fasta files instead of fastq files")
-    parser.add_option('-f', '--flowcell', default=None, 
-                      help="Set flowcell ID for output file")
-    parser.add_option('-i', '--infile', default=None,
-      help='source tar file (if reading from an archive instead of a directory)')
-    parser.add_option('-o', '--output', default=None,
-                      help='output fastq file')
-    parser.add_option('-n', '--nopass-output', default=None,
-                      help='if provided send files that failed illumina filter '\
-                           'to a differentfile')
-    parser.add_option('-s', '--slice',
-                      help='specify python slice, e.g. 0:75, 0:-1',
-                      default=None)
-    parser.add_option('--pf', help="report pass filter flag", default=False,
-                      action="store_true")
-    return parser
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if opts.infile is not None:
-        qseq_generator = tarfile_generator(opts.infile)
-    elif len(args) > 0:
-        qseq_generator = file_generator(args)
-    else:
-        qseq_generator = [sys.stdin]
-    
-        
-    if opts.output is not None:
-        output = open(opts.output, 'w')
-    else:
-        output = sys.stdout
-
-    if opts.nopass_output is not None:
-        nopass_output = open(opts.nopass_output, 'w')
-    else:
-        nopass_output = None
-
-    qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
-    qseq_parser.fastq = not opts.fasta
-    qseq_parser.flowcell_id = opts.flowcell
-    qseq_parser.trim = parse_slice(opts.slice)
-    qseq_parser.reportFilter = opts.pf
-
-    qseq_parser.run()
+from htsworkflow.pipelines.qseq2fastq import main
 
-if __name__ == "__main__":
-    main()
+if __name__ == "__main__":    
+    sys.exit(main(sys.argv[1:]))
diff --git a/scripts/srf2fastq b/scripts/srf2fastq
new file mode 100755 (executable)
index 0000000..0361e43
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+import sys
+from htsworkflow.pipelines.srf2fastq import main
+
+if __name__ == "__main__":    
+    sys.exit(main(sys.argv[1:]))
diff --git a/scripts/srf2named_fastq.py b/scripts/srf2named_fastq.py
deleted file mode 100755 (executable)
index d0403f6..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python
-import logging
-import mmap
-from optparse import OptionParser
-import os
-from subprocess import Popen, PIPE
-import sys
-
-from htsworkflow.util.opener import autoopen
-
-# constants for our fastq finite state machine
-FASTQ_HEADER = 0
-FASTQ_SEQUENCE = 1
-FASTQ_SEQUENCE_HEADER = 2
-FASTQ_QUALITY = 3
-
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if len(args) != 1:
-        parser.error("Requires one argument, got: %s" % (str(args)))
-
-    if opts.verbose:
-        logging.basicConfig(level=logging.INFO)
-    else:
-        logging.basicConfig(level=logging.WARN)
-
-    if opts.flowcell is not None:
-        header = "%s_" % (opts.flowcell,)
-    else:
-        header = ''
-
-    if opts.single:
-        left = open_write(opts.single, opts.force)
-    else:
-        left = open_write(opts.left, opts.force)
-        right = open_write(opts.right, opts.force)
-    
-    # open the srf, fastq, or compressed fastq
-    if is_srf(args[0]):
-        source = srf_open(args[0])
-    else:
-        source = autoopen(args[0])
-
-    if opts.single:
-        convert_single_to_fastq(source, left, header)
-    else:
-        convert_single_to_two_fastq(source, left, right, opts.mid, header)
-   
-    return 0
-
-def make_parser():
-    parser = OptionParser("""%prog: [options] file
-
-file can be either a fastq file or a srf file.
-You can also force the flowcell ID to be added to the header.""")
-
-    parser.add_option('--force', default=False, action="store_true",
-                      help="overwrite existing files.")
-    parser.add_option('--flowcell', default=None,
-                      help="add flowcell id header to sequence")
-    parser.add_option('-l','--left', default="r1.fastq",
-                      help='left side filename')
-    parser.add_option('-m','--mid', default=None, 
-                      help='actual sequence mid point')
-    parser.add_option('-r','--right', default="r2.fastq",
-                      help='right side filename')
-    parser.add_option('-s','--single', default=None,
-                      help="single fastq target name")
-    parser.add_option('-v', '--verbose', default=False, action="store_true",
-                      help="show information about what we're doing.")
-    return parser
-
-
-def srf_open(filename, cnf1=False):
-    """
-    Make a stream from srf file using srf2fastq
-    """
-    
-    cmd = ['srf2fastq']
-    if is_cnf1(filename):
-        cmd.append('-c')
-    cmd.append(filename)
-      
-    logging.info('srf command: %s' % (" ".join(cmd),))
-    p = Popen(cmd, stdout=PIPE)
-    return p.stdout
-    
-
-def convert_single_to_fastq(instream, target1, header=''):
-
-    state = FASTQ_HEADER
-    for line in instream:
-        line = line.strip()
-        # sequence header
-        if state == FASTQ_HEADER:
-            write_header(target1, header, line)
-            state = FASTQ_SEQUENCE
-        # sequence
-        elif state == FASTQ_SEQUENCE:
-            write_sequence(target1, line)
-            state = FASTQ_SEQUENCE_HEADER
-        # quality header
-        elif state == FASTQ_SEQUENCE_HEADER:
-            # the sequence header isn't really sequence, but 
-            # we're just passing it through
-            write_sequence(target1, line)
-            state = FASTQ_QUALITY
-        # sequence or quality data
-        elif state == FASTQ_QUALITY:
-            write_sequence(target1, line)
-            state = FASTQ_HEADER
-        else:
-            raise RuntimeError("Unrecognized STATE in fastq split")
-
-
-        
-def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
-    """
-    read a fastq file where two paired ends have been run together into 
-    two halves.
-
-    instream is the source stream
-    target1 and target2 are the destination streams
-    """
-    if mid is not None:
-        mid = int(mid)
-
-    state = FASTQ_HEADER
-    for line in instream:
-        line = line.strip()
-        # sequence header
-        if state == FASTQ_HEADER:
-            write_header(target1, header, line, "/1")
-            write_header(target2, header, line, "/2")
-            state = FASTQ_SEQUENCE
-        # sequence
-        elif state == FASTQ_SEQUENCE:
-            if mid is None:
-                mid = len(line)/2
-            write_split_sequence(target1, target2, line, mid)
-            state = FASTQ_SEQUENCE_HEADER
-        # quality header
-        elif state == FASTQ_SEQUENCE_HEADER:
-            # the sequence header isn't really sequence, but 
-            # we're just passing it through
-            write_sequence(target1, line)
-            write_sequence(target2, line)
-
-            state = FASTQ_QUALITY
-        # sequence or quality data
-        elif state == FASTQ_QUALITY:
-            write_split_sequence(target1, target2, line, mid)
-            state = FASTQ_HEADER
-        else:
-            raise RuntimeError("Unrecognized STATE in fastq split")
-
-def write_header(target, prefix, line, suffix=''):
-    target.write('@')
-    target.write(prefix)
-    target.write(line[1:])
-    target.write(suffix)
-    target.write(os.linesep)
-
-def write_sequence(target, line):
-    target.write(line)
-    target.write(os.linesep)
-
-def write_split_sequence(target1, target2, line, mid):
-    target1.write(line[:mid])
-    target1.write(os.linesep)
-
-    target2.write(line[mid:])
-    target2.write(os.linesep)
-
-def is_srf(filename):
-    """
-    Check filename to see if it is likely to be a SRF file
-    """
-    f = open(filename, 'r')
-    header = f.read(4)
-    f.close()
-    return header == "SSRF"
-
-def is_cnf1(filename):
-    """
-    Brute force detection if a SRF file is using CNF1/CNF4 records
-    """
-    max_header = 1024 ** 2
-    PROGRAM_ID = 'PROGRAM_ID\000'
-    cnf4_apps = set(("solexa2srf v1.4", 
-                    "illumina2srf v1.11.5.Illumina.1.3"))
-
-    if not is_srf(filename):
-        raise ValueError("%s must be a srf file" % (filename,))
-
-    fd = os.open(filename, os.O_RDONLY)
-    f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
-    # alas the max search length requires python 2.6+
-    program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
-    program_header_start = program_id_location+len(PROGRAM_ID)
-    next_null = f.find('\000', program_header_start) #, max_header)
-    program_id_header = f[program_header_start:next_null]
-    f.close()
-    os.close(fd)
-
-    if program_id_header in cnf4_apps:
-        return False
-    else:
-        return True
-
-def open_write(filename, force=False):
-    """
-    Open a file, but throw an exception if it already exists
-    """
-    if not force:
-        if os.path.exists(filename):
-            raise RuntimeError("%s exists" % (filename,))
-
-    return open(filename, 'w')
-
-def foo():
-    path, name = os.path.split(filename)
-    base, ext = os.path.splitext(name)
-
-    target1_name = base + '_r1.fastq'
-    target2_name = base + '_r2.fastq'
-
-    for target_name in [target1_name, target2_name]:
-        print 'target name', target_name
-        if os.path.exists(target_name):
-            raise RuntimeError("%s exists" % (target_name,))
-
-    instream = open(filename,'r')
-    target1 = open(target1_name,'w')
-    target2 = open(target2_name,'w')
-
-
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/setup.py b/setup.py
index 40ee4eb6e1dd42e208c2d0694327262abd162068..a226a4fb886904251880b9364243d3450c9cb270 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -1,37 +1,37 @@
 from setuptools import setup
+from version import get_git_version
 
 setup(
-  name="htsworkflow",
-  description="some bots and other utilities to help deal with data from an illumina sequencer",
-  author="Diane Trout & Brandon King",
-  author_email="diane@caltech.edu",
-  packages=["htsworkflow", 
-            "htsworkflow.pipelines",
-            "htsworkflow.frontend",
-            "htsworkflow.frontend.analysis",
-            "htsworkflow.frontend.eland_config",
-            "htsworkflow.frontend.experiments",
-            "htsworkflow.frontend.inventory",
-            "htsworkflow.frontend.reports",
-            "htsworkflow.frontend.samples",
-            "htsworkflow.automation",
-            "htsworkflow.util"
-             ],
-  scripts=[
-        'scripts/configure_pipeline',
+    name="htsworkflow",
+    version=get_git_version(),
+    description="Utilities to help manage high-through-put sequencing",
+    author="Diane Trout, Brandon King",
+    author_email="diane@caltech.edu",
+    packages=["htsworkflow", 
+              "htsworkflow.automation",
+              "htsworkflow.pipelines",
+              "htsworkflow.util",
+              # django site
+              "htsworkflow.frontend",
+              "htsworkflow.frontend.analysis",
+              "htsworkflow.frontend.eland_config",
+              "htsworkflow.frontend.experiments",
+              "htsworkflow.frontend.inventory",
+              "htsworkflow.frontend.reports",
+              "htsworkflow.frontend.samples",
+              ],
+    scripts=[
         'scripts/copier',
-        'scripts/gerald2bed.py',
         'scripts/library.py',
         'scripts/makebed',
-       'scripts/make-library-tree',
+        'scripts/make-library-tree',
         'scripts/mark_archived_data',
         'scripts/qseq2fastq',
-        'scripts/rerun_eland.py',
         'scripts/retrieve_config',
         'scripts/runfolder',
         'scripts/runner',
         'scripts/spoolwatcher', 
         'scripts/srf',
-       'scripts/srf2named_fastq.py'
+        'scripts/srf2fastq'
         ],
-)
+    )
diff --git a/test/test_srf2fastq.py b/test/test_srf2fastq.py
index 83412c4b766e4a3a96688263d4b3a23fa72155f0..eee4152f65082cc244165bb8e5dee4892cf865da 100644 (file)
@@ -6,7 +6,7 @@ import unittest
 _module_path, _module_name = os.path.split(__file__)
 sys.path.append(os.path.join(_module_path, '..', 'scripts'))
 
-import srf2named_fastq
+from htsworkflow.pipelines import srf2fastq
 
 class testSrf2Fastq(unittest.TestCase):
     def test_split_good(self):
@@ -18,7 +18,7 @@ IIIIB+++
         target1 = StringIO()
         target2 = StringIO()
 
-        srf2named_fastq.convert_single_to_two_fastq(source, target1, target2)
+        srf2fastq.convert_single_to_two_fastq(source, target1, target2)
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -49,7 +49,7 @@ IIIIB+++
         target1 = StringIO()
         target2 = StringIO()
 
-        srf2named_fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
+        srf2fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -79,7 +79,7 @@ IIIIB+++
 """)
         target1 = StringIO()
 
-        srf2named_fastq.convert_single_to_fastq(source, target1)
+        srf2fastq.convert_single_to_fastq(source, target1)
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -101,7 +101,7 @@ IIIIB+++
 """)
         target1 = StringIO()
 
-        srf2named_fastq.convert_single_to_fastq(source, target1, header="foo_")
+        srf2fastq.convert_single_to_fastq(source, target1, header="foo_")
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -117,7 +117,7 @@ IIIIB+++
         cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
         cnf1_path = os.path.join(_module_path, cnf1_srf)
         
-        is_srf = srf2named_fastq.is_srf
+        is_srf = srf2fastq.is_srf
         self.failUnlessEqual(is_srf(__file__), False)
         self.failUnlessEqual(is_srf(cnf4_path), True)
         self.failUnlessEqual(is_srf(cnf1_path), True)
@@ -128,7 +128,7 @@ IIIIB+++
         cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
         cnf1_path = os.path.join(_module_path, cnf1_srf)
         
-        is_cnf1 = srf2named_fastq.is_cnf1
+        is_cnf1 = srf2fastq.is_cnf1
         self.failUnlessRaises(ValueError, is_cnf1, __file__)
         self.failUnlessEqual(is_cnf1(cnf4_path), False)
         self.failUnlessEqual(is_cnf1(cnf1_path), True)
diff --git a/version.py b/version.py
new file mode 100644 (file)
index 0000000..b850b4a
--- /dev/null
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+# Author: Douglas Creager <dcreager@dcreager.net>
+# This file is placed into the public domain.
+
+# Calculates the current version number.  If possible, this is the
+# output of “git describe”, modified to conform to the versioning
+# scheme that setuptools uses.  If “git describe” returns an error
+# (most likely because we're in an unpacked copy of a release tarball,
+# rather than in a git working copy), then we fall back on reading the
+# contents of the RELEASE-VERSION file.
+#
+# To use this script, simply import it in your setup.py file, and use the
+# results of get_git_version() as your package version:
+#
+# from version import *
+#
+# setup(
+#     version=get_git_version(),
+#     .
+#     .
+#     .
+# )
+#
+# This will automatically update the RELEASE-VERSION file, if
+# necessary.  Note that the RELEASE-VERSION file should *not* be
+# checked into git; please add it to your top-level .gitignore file.
+#
+# You'll probably want to distribute the RELEASE-VERSION file in your
+# sdist tarballs; to do this, just create a MANIFEST.in file that
+# contains the following line:
+#
+#   include RELEASE-VERSION
+
+__all__ = ("get_git_version")
+
+from subprocess import Popen, PIPE
+
+
+def call_git_describe(abbrev=4):
+    try:
+        p = Popen(['git', 'describe', '--abbrev=%d' % abbrev],
+                  stdout=PIPE, stderr=PIPE)
+        p.stderr.close()
+        line = p.stdout.readlines()[0]
+        return line.strip()
+
+    except:
+        return None
+
+
+def read_release_version():
+    try:
+        f = open("RELEASE-VERSION", "r")
+
+        try:
+            version = f.readlines()[0]
+            return version.strip()
+
+        finally:
+            f.close()
+
+    except:
+        return None
+
+
+def write_release_version(version):
+    f = open("RELEASE-VERSION", "w")
+    f.write("%s\n" % version)
+    f.close()
+
+
+def get_git_version(abbrev=4):
+    # Read in the version that's currently in RELEASE-VERSION.
+
+    release_version = read_release_version()
+
+    # First try to get the current version using “git describe”.
+
+    version = call_git_describe(abbrev)
+
+    # If that doesn't work, fall back on the value that's in
+    # RELEASE-VERSION.
+
+    if version is None:
+        version = release_version
+
+    # If we still don't have anything, that's an error.
+
+    if version is None:
+        raise ValueError("Cannot find the version number!")
+
+    # If the current version is different from what's in the
+    # RELEASE-VERSION file, update the file to be current.
+
+    if version != release_version:
+        write_release_version(version)
+
+    # Finally, return the current version.
+
+    return version
+
+
+if __name__ == "__main__":
+    print get_git_version()