Report version number derived from git tag.
author    Diane Trout <diane@caltech.edu>
          Fri, 22 Oct 2010 23:40:13 +0000 (16:40 -0700)
committer Diane Trout <diane@caltech.edu>
          Fri, 22 Oct 2010 23:40:13 +0000 (16:40 -0700)
This patch includes the infrastructure needed to support that feature,
and version reporting has been added to qseq2fastq and srf2fastq.
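
A minimal sketch of how that wiring looks inside each tool (the option
name and the version() helper below mirror what this patch adds):

    from optparse import OptionParser
    from htsworkflow.version import version

    def make_parser():
        parser = OptionParser()
        # --version short-circuits normal processing and just prints the
        # version that pkg_resources reports for the htsworkflow package
        parser.add_option("--version", default=False, action="store_true",
                          help="report software version")
        return parser

    def main(cmdline=None):
        opts, args = make_parser().parse_args(cmdline)
        if opts.version:
            print version()
            return 0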

Additionally, to improve the testability of qseq2fastq and srf2fastq,
the original standalone scripts were moved into htsworkflow.pipelines as
modules, and small stub scripts were placed in scripts.
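
Each stub now just re-exports main() from the matching pipelines module
and forwards the command line to it; for example, the new
scripts/srf2fastq reduces to:

    #!/usr/bin/python
    import sys
    # the conversion code now lives in the package where the test suite
    # can import it; the script only forwards the command line
    from htsworkflow.pipelines.srf2fastq import main

    if __name__ == "__main__":
        sys.exit(main(sys.argv[1:]))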

12 files changed:
.gitignore
MANIFEST.in [new file with mode: 0644]
extra/ucsc_encode_submission/ucsc_gather.py
htsworkflow/pipelines/qseq2fastq.py [new file with mode: 0755]
htsworkflow/pipelines/srf2fastq.py [new file with mode: 0755]
htsworkflow/version.py [new file with mode: 0644]
scripts/qseq2fastq
scripts/srf2fastq [new file with mode: 0755]
scripts/srf2named_fastq.py [deleted file]
setup.py
test/test_srf2fastq.py
version.py [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
index 3f57fc752af39b4aad0676b326c3bf77033ebde8..921a57b3b720e8cb8b78e3edfca3fb695fcf073b 100644 (file)
@@ -1,3 +1,5 @@
 *.py[co~]
 .coverage
 *.egg-info
+dist
+RELEASE-VERSION
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644 (file)
index 0000000..c4e95cd
--- /dev/null
@@ -0,0 +1,2 @@
+include RELEASE-VERSION
+include version.py
diff --git a/extra/ucsc_encode_submission/ucsc_gather.py b/extra/ucsc_encode_submission/ucsc_gather.py
index c69240548b5a5d0e5588f5d2d911416eab017876..191281c8580c308e5ff71f5088cb37e04b4af587 100755 (executable)
@@ -132,7 +132,7 @@ log=log/qseq2fastq.log
     qseq_condor_entries = []
     srf_condor_header = """
 Universe=vanilla
-executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py
+executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq
 output=log/srf_pair_fastq.out.$(process).log
 error=log/srf_pair_fastq.err.$(process).log
 log=log/srf_pair_fastq.log
diff --git a/htsworkflow/pipelines/qseq2fastq.py b/htsworkflow/pipelines/qseq2fastq.py
new file mode 100755 (executable)
index 0000000..286663b
--- /dev/null
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+from glob import glob
+import os
+from optparse import OptionParser
+import numpy
+import sys
+import tarfile
+
+from htsworkflow.version import version
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.version:
+        print version()
+        return 0
+    
+    if opts.infile is not None:
+        qseq_generator = tarfile_generator(opts.infile)
+    elif len(args) > 0:
+        qseq_generator = file_generator(args)
+    else:
+        qseq_generator = [sys.stdin]
+            
+    if opts.output is not None:
+        output = open(opts.output, 'w')
+    else:
+        output = sys.stdout
+
+    if opts.nopass_output is not None:
+        nopass_output = open(opts.nopass_output, 'w')
+    else:
+        nopass_output = None
+
+    qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
+    qseq_parser.fastq = not opts.fasta
+    qseq_parser.flowcell_id = opts.flowcell
+    qseq_parser.trim = parse_slice(opts.slice)
+    qseq_parser.reportFilter = opts.pf
+
+    qseq_parser.run()
+
+
+def make_parser():
+    usage = "%prog: [options] *_qseq.txt"
+    parser = OptionParser(usage)
+    parser.add_option("-a", "--fasta", default=False, action="store_true",
+                      help="produce fasta files instead of fastq files")
+    parser.add_option("-f", "--flowcell", default=None, 
+                      help="Set flowcell ID for output file")
+    parser.add_option("-i", "--infile", default=None,
+      help='source tar file (if reading from an archive instead of a directory)')
+    parser.add_option("-o", "--output", default=None,
+                      help="output fastq file")
+    parser.add_option("-n", "--nopass-output", default=None,
+                      help="if provided send files that failed illumina filter "\
+                           "to a differentfile")
+    parser.add_option("-s", "--slice",
+                      help="specify python slice, e.g. 0:75, 0:-1",
+                      default=None)
+    parser.add_option("--pf", help="report pass filter flag", default=False,
+                      action="store_true")
+    parser.add_option("--version", default=False, action="store_true",
+                      help="report software version")
+    
+    return parser
+
+            
+def file_generator(pattern_list):
+    for pattern in pattern_list:
+        for filename in glob(pattern):
+            yield open(filename,"r")
+
+
+def tarfile_generator(tarfilename):
+    archive = tarfile.open(tarfilename,'r|*')
+    for tarinfo in archive:
+        yield archive.extractfile(tarinfo)
+
+
+def parse_slice(slice_text):
+    if slice_text is None or len(slice_text) == 0:
+        return slice(None)
+        
+    slice_data = []
+    for element in slice_text.split(':'):
+        if len(element) == 0:
+            element = None
+        else:
+            element = int(element)
+        slice_data.append(element)
+
+    return slice(*slice_data)
+
+
+class Qseq2Fastq(object):
+    """
+    Convert qseq files to fastq (or fasta) files.
+    """
+    def __init__(self, sources, pass_destination, nopass_destination=None):
+        self.sources = sources
+        self.pass_destination = pass_destination
+        if nopass_destination is not None:
+            self.nopass_destination = nopass_destination
+        else:
+            self.nopass_destination = pass_destination
+        
+        self.fastq = True
+        self.flowcell_id = None
+        self.trim = slice(None)
+        self.reportFilter = False
+
+    def _format_flowcell_id(self):
+        """
+        Return formatted flowcell ID
+        """
+        if self.flowcell_id is not None:
+            return self.flowcell_id+"_"
+        else:
+            return ""
+    
+    def _convert_illumina_quality(self, illumina_quality):
+        """
+        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
+        """
+        # Illumina scores are Phred + 64
+        # Fastq scores are Phred + 33
+        # the following code grabs the string, converts to short ints and
+        # subtracts 31 (64-33) to convert between the two score formats.
+        # The numpy solution is twice as fast as some of my other
+        # ideas for the conversion.
+        # sorry about the ugliness in changing from character to 8-bit int
+        # and back to a character array
+        quality = numpy.asarray(illumina_quality,'c')
+        quality.dtype = numpy.uint8
+        quality -= 31
+        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+        return quality
+        
+    def run(self):
+        if self.fastq:
+            header_template = '@' 
+        else:
+            # fasta case
+            header_template = '>'
+        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
+        
+        for qstream in self.sources:
+            for line in qstream:
+                # parse line
+                record = line.rstrip().split('\t')
+                machine_name = record[0]
+                run_number = record[1]
+                lane_number = record[2]
+                tile = record[3]
+                x = record[4]
+                y = record[5]
+                index = record[6]
+                read = record[7]
+                sequence = record[8].replace('.','N')
+                quality = self._convert_illumina_quality(record[9])
+
+                # add pass qc filter if we want it
+                pass_qc = int(record[10])
+                if self.reportFilter:
+                    pass_qc_msg = " pf=%s" % (pass_qc)
+                else:
+                    pass_qc_msg = ""
+
+                header = header_template  % ( \
+                    machine_name,
+                    run_number,
+                    lane_number,
+                    tile,
+                    x,
+                    y,
+                    read,
+                    pass_qc_msg,
+                    os.linesep)
+
+
+                # if we passed the filter write to the "good" file
+                if pass_qc:
+                    destination = self.pass_destination
+                else:
+                    destination = self.nopass_destination
+                
+                destination.write(header)
+                destination.write(sequence[self.trim])
+                destination.write(os.linesep)
+                if self.fastq:
+                    destination.write('+')
+                    destination.write(os.linesep)
+                    destination.write(quality[self.trim].tostring())
+                    destination.write(os.linesep)
+
+if __name__ == "__main__":
+    main()
diff --git a/htsworkflow/pipelines/srf2fastq.py b/htsworkflow/pipelines/srf2fastq.py
new file mode 100755 (executable)
index 0000000..adaa1b2
--- /dev/null
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+import logging
+import mmap
+from optparse import OptionParser
+import os
+from subprocess import Popen, PIPE
+import sys
+
+from htsworkflow.util.opener import autoopen
+from htsworkflow.version import version
+
+# constants for our fastq finite state machine
+FASTQ_HEADER = 0
+FASTQ_SEQUENCE = 1
+FASTQ_SEQUENCE_HEADER = 2
+FASTQ_QUALITY = 3
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    if opts.verbose:
+        logging.basicConfig(level=logging.INFO)
+    else:
+        logging.basicConfig(level=logging.WARN)
+
+    if opts.version:
+        print version()
+        return 0
+
+    if len(args) != 1:
+        parser.error("Requires one argument, got: %s" % (str(args)))
+
+    if opts.flowcell is not None:
+        header = "%s_" % (opts.flowcell,)
+    else:
+        header = ''
+
+    if opts.single:
+        left = open_write(opts.single, opts.force)
+    else:
+        left = open_write(opts.left, opts.force)
+        right = open_write(opts.right, opts.force)
+    
+    # open the srf, fastq, or compressed fastq
+    if is_srf(args[0]):
+        source = srf_open(args[0])
+    else:
+        source = autoopen(args[0])
+
+    if opts.single:
+        convert_single_to_fastq(source, left, header)
+    else:
+        convert_single_to_two_fastq(source, left, right, opts.mid, header)
+   
+    return 0
+
+def make_parser():
+    parser = OptionParser("""%prog: [options] file
+
+file can be either a fastq file or a srf file.
+You can also force the flowcell ID to be added to the header.""")
+
+    parser.add_option('--force', default=False, action="store_true",
+                      help="overwrite existing files.")
+    parser.add_option('--flowcell', default=None,
+                      help="add flowcell id header to sequence")
+    parser.add_option('-l','--left', default="r1.fastq",
+                      help='left side filename')
+    parser.add_option('-m','--mid', default=None, 
+                      help='actual sequence mid point')
+    parser.add_option('-r','--right', default="r2.fastq",
+                      help='right side filename')
+    parser.add_option('-s','--single', default=None,
+                      help="single fastq target name")
+    parser.add_option('-v', '--verbose', default=False, action="store_true",
+                      help="show information about what we're doing.")
+    parser.add_option('--version', default=False, action="store_true",
+                      help="Report software version")
+    return parser
+
+
+def srf_open(filename, cnf1=False):
+    """
+    Make a stream from srf file using srf2fastq
+    """
+    cmd = ['srf2fastq']
+    if is_cnf1(filename):
+        cmd.append('-c')
+    cmd.append(filename)
+      
+    logging.info('srf command: %s' % (" ".join(cmd),))
+    p = Popen(cmd, stdout=PIPE)
+    return p.stdout
+    
+
+def convert_single_to_fastq(instream, target1, header=''):
+
+    state = FASTQ_HEADER
+    for line in instream:
+        line = line.strip()
+        # sequence header
+        if state == FASTQ_HEADER:
+            write_header(target1, header, line)
+            state = FASTQ_SEQUENCE
+        # sequence
+        elif state == FASTQ_SEQUENCE:
+            write_sequence(target1, line)
+            state = FASTQ_SEQUENCE_HEADER
+        # quality header
+        elif state == FASTQ_SEQUENCE_HEADER:
+            # the sequence header isn't really sequence, but 
+            # we're just passing it through
+            write_sequence(target1, line)
+            state = FASTQ_QUALITY
+        # sequence or quality data
+        elif state == FASTQ_QUALITY:
+            write_sequence(target1, line)
+            state = FASTQ_HEADER
+        else:
+            raise RuntimeError("Unrecognized STATE in fastq split")
+
+
+        
+def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
+    """
+    read a fastq file where two paired ends have been run together into 
+    two halves.
+
+    instream is the source stream
+    target1 and target2 are the destination streams
+    """
+    if mid is not None:
+        mid = int(mid)
+
+    state = FASTQ_HEADER
+    for line in instream:
+        line = line.strip()
+        # sequence header
+        if state == FASTQ_HEADER:
+            write_header(target1, header, line, "/1")
+            write_header(target2, header, line, "/2")
+            state = FASTQ_SEQUENCE
+        # sequence
+        elif state == FASTQ_SEQUENCE:
+            if mid is None:
+                mid = len(line)/2
+            write_split_sequence(target1, target2, line, mid)
+            state = FASTQ_SEQUENCE_HEADER
+        # quality header
+        elif state == FASTQ_SEQUENCE_HEADER:
+            # the sequence header isn't really sequence, but 
+            # we're just passing it through
+            write_sequence(target1, line)
+            write_sequence(target2, line)
+
+            state = FASTQ_QUALITY
+        # sequence or quality data
+        elif state == FASTQ_QUALITY:
+            write_split_sequence(target1, target2, line, mid)
+            state = FASTQ_HEADER
+        else:
+            raise RuntimeError("Unrecognized STATE in fastq split")
+
+def write_header(target, prefix, line, suffix=''):
+    target.write('@')
+    target.write(prefix)
+    target.write(line[1:])
+    target.write(suffix)
+    target.write(os.linesep)
+
+def write_sequence(target, line):
+    target.write(line)
+    target.write(os.linesep)
+
+def write_split_sequence(target1, target2, line, mid):
+    target1.write(line[:mid])
+    target1.write(os.linesep)
+
+    target2.write(line[mid:])
+    target2.write(os.linesep)
+
+def is_srf(filename):
+    """
+    Check filename to see if it is likely to be a SRF file
+    """
+    f = open(filename, 'r')
+    header = f.read(4)
+    f.close()
+    return header == "SSRF"
+
+def is_cnf1(filename):
+    """
+    Brute force detection if a SRF file is using CNF1/CNF4 records
+    """
+    max_header = 1024 ** 2
+    PROGRAM_ID = 'PROGRAM_ID\000'
+    cnf4_apps = set(("solexa2srf v1.4", 
+                    "illumina2srf v1.11.5.Illumina.1.3"))
+
+    if not is_srf(filename):
+        raise ValueError("%s must be a srf file" % (filename,))
+
+    fd = os.open(filename, os.O_RDONLY)
+    f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+    # alas the max search length requires python 2.6+
+    program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
+    program_header_start = program_id_location+len(PROGRAM_ID)
+    next_null = f.find('\000', program_header_start) #, max_header)
+    program_id_header = f[program_header_start:next_null]
+    f.close()
+    os.close(fd)
+
+    if program_id_header in cnf4_apps:
+        return False
+    else:
+        return True
+
+def open_write(filename, force=False):
+    """
+    Open a file, but throw an exception if it already exists
+    """
+    if not force:
+        if os.path.exists(filename):
+            raise RuntimeError("%s exists" % (filename,))
+
+    return open(filename, 'w')
+
+def foo():
+    path, name = os.path.split(filename)
+    base, ext = os.path.splitext(name)
+
+    target1_name = base + '_r1.fastq'
+    target2_name = base + '_r2.fastq'
+
+    for target_name in [target1_name, target2_name]:
+        print 'target name', target_name
+        if os.path.exists(target_name):
+            raise RuntimeError("%s exists" % (target_name,))
+
+    instream = open(filename,'r')
+    target1 = open(target1_name,'w')
+    target2 = open(target2_name,'w')
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/htsworkflow/version.py b/htsworkflow/version.py
new file mode 100644 (file)
index 0000000..ca7390d
--- /dev/null
@@ -0,0 +1,19 @@
+import logging
+
+def version():
+    """Return version number    
+    """
+    version = None
+    try:
+        import pkg_resources
+    except ImportError, e:
+        logging.error("Can't find version number, please install setuptools")
+        raise e
+
+    try:
+        version = pkg_resources.get_distribution("htsworkflow")
+    except pkg_resources.DistributionNotFound, e:
+        logging.error("Package not installed")
+
+    return version
+        
diff --git a/scripts/qseq2fastq b/scripts/qseq2fastq
index 03970dd687ed0e364540095cbdc07a253666fa34..555e668ff1c8ff9914c389f7f85e6e2f14ac5057 100755 (executable)
@@ -1,188 +1,6 @@
-#!/usr/bin/env python
-
-from glob import glob
-import os
-from optparse import OptionParser
-import numpy
+#!/usr/bin/python
 import sys
-import tarfile
-
-class Qseq2Fastq(object):
-    """
-    Convert qseq files to fastq (or fasta) files.
-    """
-    def __init__(self, sources, pass_destination, nopass_destination=None):
-        self.sources = sources
-        self.pass_destination = pass_destination
-        if nopass_destination is not None:
-            self.nopass_destination = nopass_destination
-        else:
-            self.nopass_destination = pass_destination
-        
-        self.fastq = True
-        self.flowcell_id = None
-        self.trim = slice(None)
-        self.reportFilter = False
-
-    def _format_flowcell_id(self):
-        """
-        Return formatted flowcell ID
-        """
-        if self.flowcell_id is not None:
-            return self.flowcell_id+"_"
-        else:
-            return ""
-    
-    def _convert_illumina_quality(self, illumina_quality):
-        """
-        Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
-        """
-        # Illumina scores are Phred + 64
-        # Fastq scores are Phread + 33
-        # the following code grabs the string, converts to short ints and
-        # subtracts 31 (64-33) to convert between the two score formats.
-        # The numpy solution is twice as fast as some of my other
-        # ideas for the conversion.
-        # sorry about the uglyness in changing from character, to 8-bit int
-        # and back to a character array
-        quality = numpy.asarray(illumina_quality,'c')
-        quality.dtype = numpy.uint8
-        quality -= 31
-        quality.dtype = '|S1' # I'd like to know what the real numpy char type is
-        return quality
-        
-    def run(self):
-        if self.fastq:
-            header_template = '@' 
-        else:
-            # fasta case
-            header_template = '>'
-        header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
-        
-        for qstream in self.sources:
-            for line in qstream:
-                # parse line
-                record = line.rstrip().split('\t')
-                machine_name = record[0]
-                run_number = record[1]
-                lane_number = record[2]
-                tile = record[3]
-                x = record[4]
-                y = record[5]
-                index = record[6]
-                read = record[7]
-                sequence = record[8].replace('.','N')
-                quality = self._convert_illumina_quality(record[9])
-
-                # add pass qc filter if we want it
-                pass_qc = int(record[10])
-                if self.reportFilter:
-                    pass_qc_msg = " pf=%s" % (pass_qc)
-                else:
-                    pass_qc_msg = ""
-
-                header = header_template  % ( \
-                    machine_name,
-                    run_number,
-                    lane_number,
-                    tile,
-                    x,
-                    y,
-                    read,
-                    pass_qc_msg,
-                    os.linesep)
-
-
-                # if we passed the filter write to the "good" file
-                if pass_qc:
-                    destination = self.pass_destination
-                else:
-                    destination = self.nopass_destination
-                
-                destination.write(header)
-                destination.write(sequence[self.trim])
-                destination.write(os.linesep)
-                if self.fastq:
-                    destination.write('+')
-                    destination.write(os.linesep)
-                    destination.write(quality[self.trim].tostring())
-                    destination.write(os.linesep)
-            
-def file_generator(pattern_list):
-    for pattern in pattern_list:
-        for filename in glob(pattern):
-            yield open(filename,'r')
-
-def tarfile_generator(tarfilename):
-    archive = tarfile.open(tarfilename,'r|*')
-    for tarinfo in archive:
-        yield archive.extractfile(tarinfo)
-    
-
-def parse_slice(slice_text):
-    if slice_text is None or len(slice_text) == 0:
-        return slice(None)
-        
-    slice_data = []
-    for element in slice_text.split(':'):
-        if len(element) == 0:
-            element = None
-        else:
-            element = int(element)
-        slice_data.append(element)
-
-    return slice(*slice_data)
-        
-def make_parser():
-    usage = "%prog: [options] *_qseq.txt"
-    parser = OptionParser(usage)
-    parser.add_option('-a', '--fasta', default=False, action="store_true",
-                      help="produce fasta files instead of fastq files")
-    parser.add_option('-f', '--flowcell', default=None, 
-                      help="Set flowcell ID for output file")
-    parser.add_option('-i', '--infile', default=None,
-      help='source tar file (if reading from an archive instead of a directory)')
-    parser.add_option('-o', '--output', default=None,
-                      help='output fastq file')
-    parser.add_option('-n', '--nopass-output', default=None,
-                      help='if provided send files that failed illumina filter '\
-                           'to a differentfile')
-    parser.add_option('-s', '--slice',
-                      help='specify python slice, e.g. 0:75, 0:-1',
-                      default=None)
-    parser.add_option('--pf', help="report pass filter flag", default=False,
-                      action="store_true")
-    return parser
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if opts.infile is not None:
-        qseq_generator = tarfile_generator(opts.infile)
-    elif len(args) > 0:
-        qseq_generator = file_generator(args)
-    else:
-        qseq_generator = [sys.stdin]
-    
-        
-    if opts.output is not None:
-        output = open(opts.output, 'w')
-    else:
-        output = sys.stdout
-
-    if opts.nopass_output is not None:
-        nopass_output = open(opts.nopass_output, 'w')
-    else:
-        nopass_output = None
-
-    qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
-    qseq_parser.fastq = not opts.fasta
-    qseq_parser.flowcell_id = opts.flowcell
-    qseq_parser.trim = parse_slice(opts.slice)
-    qseq_parser.reportFilter = opts.pf
-
-    qseq_parser.run()
+from htsworkflow.pipelines.qseq2fastq import main
 
-if __name__ == "__main__":
-    main()
+if __name__ == "__main__":    
+    sys.exit(main(sys.argv[1:]))
diff --git a/scripts/srf2fastq b/scripts/srf2fastq
new file mode 100755 (executable)
index 0000000..0361e43
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+import sys
+from htsworkflow.pipelines.srf2fastq import main
+
+if __name__ == "__main__":    
+    sys.exit(main(sys.argv[1:]))
diff --git a/scripts/srf2named_fastq.py b/scripts/srf2named_fastq.py
deleted file mode 100755 (executable)
index d0403f6..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python
-import logging
-import mmap
-from optparse import OptionParser
-import os
-from subprocess import Popen, PIPE
-import sys
-
-from htsworkflow.util.opener import autoopen
-
-# constants for our fastq finite state machine
-FASTQ_HEADER = 0
-FASTQ_SEQUENCE = 1
-FASTQ_SEQUENCE_HEADER = 2
-FASTQ_QUALITY = 3
-
-
-def main(cmdline=None):
-    parser = make_parser()
-    opts, args = parser.parse_args(cmdline)
-
-    if len(args) != 1:
-        parser.error("Requires one argument, got: %s" % (str(args)))
-
-    if opts.verbose:
-        logging.basicConfig(level=logging.INFO)
-    else:
-        logging.basicConfig(level=logging.WARN)
-
-    if opts.flowcell is not None:
-        header = "%s_" % (opts.flowcell,)
-    else:
-        header = ''
-
-    if opts.single:
-        left = open_write(opts.single, opts.force)
-    else:
-        left = open_write(opts.left, opts.force)
-        right = open_write(opts.right, opts.force)
-    
-    # open the srf, fastq, or compressed fastq
-    if is_srf(args[0]):
-        source = srf_open(args[0])
-    else:
-        source = autoopen(args[0])
-
-    if opts.single:
-        convert_single_to_fastq(source, left, header)
-    else:
-        convert_single_to_two_fastq(source, left, right, opts.mid, header)
-   
-    return 0
-
-def make_parser():
-    parser = OptionParser("""%prog: [options] file
-
-file can be either a fastq file or a srf file.
-You can also force the flowcell ID to be added to the header.""")
-
-    parser.add_option('--force', default=False, action="store_true",
-                      help="overwrite existing files.")
-    parser.add_option('--flowcell', default=None,
-                      help="add flowcell id header to sequence")
-    parser.add_option('-l','--left', default="r1.fastq",
-                      help='left side filename')
-    parser.add_option('-m','--mid', default=None, 
-                      help='actual sequence mid point')
-    parser.add_option('-r','--right', default="r2.fastq",
-                      help='right side filename')
-    parser.add_option('-s','--single', default=None,
-                      help="single fastq target name")
-    parser.add_option('-v', '--verbose', default=False, action="store_true",
-                      help="show information about what we're doing.")
-    return parser
-
-
-def srf_open(filename, cnf1=False):
-    """
-    Make a stream from srf file using srf2fastq
-    """
-    
-    cmd = ['srf2fastq']
-    if is_cnf1(filename):
-        cmd.append('-c')
-    cmd.append(filename)
-      
-    logging.info('srf command: %s' % (" ".join(cmd),))
-    p = Popen(cmd, stdout=PIPE)
-    return p.stdout
-    
-
-def convert_single_to_fastq(instream, target1, header=''):
-
-    state = FASTQ_HEADER
-    for line in instream:
-        line = line.strip()
-        # sequence header
-        if state == FASTQ_HEADER:
-            write_header(target1, header, line)
-            state = FASTQ_SEQUENCE
-        # sequence
-        elif state == FASTQ_SEQUENCE:
-            write_sequence(target1, line)
-            state = FASTQ_SEQUENCE_HEADER
-        # quality header
-        elif state == FASTQ_SEQUENCE_HEADER:
-            # the sequence header isn't really sequence, but 
-            # we're just passing it through
-            write_sequence(target1, line)
-            state = FASTQ_QUALITY
-        # sequence or quality data
-        elif state == FASTQ_QUALITY:
-            write_sequence(target1, line)
-            state = FASTQ_HEADER
-        else:
-            raise RuntimeError("Unrecognized STATE in fastq split")
-
-
-        
-def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
-    """
-    read a fastq file where two paired ends have been run together into 
-    two halves.
-
-    instream is the source stream
-    target1 and target2 are the destination streams
-    """
-    if mid is not None:
-        mid = int(mid)
-
-    state = FASTQ_HEADER
-    for line in instream:
-        line = line.strip()
-        # sequence header
-        if state == FASTQ_HEADER:
-            write_header(target1, header, line, "/1")
-            write_header(target2, header, line, "/2")
-            state = FASTQ_SEQUENCE
-        # sequence
-        elif state == FASTQ_SEQUENCE:
-            if mid is None:
-                mid = len(line)/2
-            write_split_sequence(target1, target2, line, mid)
-            state = FASTQ_SEQUENCE_HEADER
-        # quality header
-        elif state == FASTQ_SEQUENCE_HEADER:
-            # the sequence header isn't really sequence, but 
-            # we're just passing it through
-            write_sequence(target1, line)
-            write_sequence(target2, line)
-
-            state = FASTQ_QUALITY
-        # sequence or quality data
-        elif state == FASTQ_QUALITY:
-            write_split_sequence(target1, target2, line, mid)
-            state = FASTQ_HEADER
-        else:
-            raise RuntimeError("Unrecognized STATE in fastq split")
-
-def write_header(target, prefix, line, suffix=''):
-    target.write('@')
-    target.write(prefix)
-    target.write(line[1:])
-    target.write(suffix)
-    target.write(os.linesep)
-
-def write_sequence(target, line):
-    target.write(line)
-    target.write(os.linesep)
-
-def write_split_sequence(target1, target2, line, mid):
-    target1.write(line[:mid])
-    target1.write(os.linesep)
-
-    target2.write(line[mid:])
-    target2.write(os.linesep)
-
-def is_srf(filename):
-    """
-    Check filename to see if it is likely to be a SRF file
-    """
-    f = open(filename, 'r')
-    header = f.read(4)
-    f.close()
-    return header == "SSRF"
-
-def is_cnf1(filename):
-    """
-    Brute force detection if a SRF file is using CNF1/CNF4 records
-    """
-    max_header = 1024 ** 2
-    PROGRAM_ID = 'PROGRAM_ID\000'
-    cnf4_apps = set(("solexa2srf v1.4", 
-                    "illumina2srf v1.11.5.Illumina.1.3"))
-
-    if not is_srf(filename):
-        raise ValueError("%s must be a srf file" % (filename,))
-
-    fd = os.open(filename, os.O_RDONLY)
-    f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
-    # alas the max search length requires python 2.6+
-    program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
-    program_header_start = program_id_location+len(PROGRAM_ID)
-    next_null = f.find('\000', program_header_start) #, max_header)
-    program_id_header = f[program_header_start:next_null]
-    f.close()
-    os.close(fd)
-
-    if program_id_header in cnf4_apps:
-        return False
-    else:
-        return True
-
-def open_write(filename, force=False):
-    """
-    Open a file, but throw an exception if it already exists
-    """
-    if not force:
-        if os.path.exists(filename):
-            raise RuntimeError("%s exists" % (filename,))
-
-    return open(filename, 'w')
-
-def foo():
-    path, name = os.path.split(filename)
-    base, ext = os.path.splitext(name)
-
-    target1_name = base + '_r1.fastq'
-    target2_name = base + '_r2.fastq'
-
-    for target_name in [target1_name, target2_name]:
-        print 'target name', target_name
-        if os.path.exists(target_name):
-            raise RuntimeError("%s exists" % (target_name,))
-
-    instream = open(filename,'r')
-    target1 = open(target1_name,'w')
-    target2 = open(target2_name,'w')
-
-
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))
diff --git a/setup.py b/setup.py
index 40ee4eb6e1dd42e208c2d0694327262abd162068..a226a4fb886904251880b9364243d3450c9cb270 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -1,37 +1,37 @@
 from setuptools import setup
+from version import get_git_version
 
 setup(
-  name="htsworkflow",
-  description="some bots and other utilities to help deal with data from an illumina sequencer",
-  author="Diane Trout & Brandon King",
-  author_email="diane@caltech.edu",
-  packages=["htsworkflow", 
-            "htsworkflow.pipelines",
-            "htsworkflow.frontend",
-            "htsworkflow.frontend.analysis",
-            "htsworkflow.frontend.eland_config",
-            "htsworkflow.frontend.experiments",
-            "htsworkflow.frontend.inventory",
-            "htsworkflow.frontend.reports",
-            "htsworkflow.frontend.samples",
-            "htsworkflow.automation",
-            "htsworkflow.util"
-             ],
-  scripts=[
-        'scripts/configure_pipeline',
+    name="htsworkflow",
+    version=get_git_version(),
+    description="Utilities to help manage high-through-put sequencing",
+    author="Diane Trout, Brandon King",
+    author_email="diane@caltech.edu",
+    packages=["htsworkflow", 
+              "htsworkflow.automation",
+              "htsworkflow.pipelines",
+              "htsworkflow.util",
+              # django site
+              "htsworkflow.frontend",
+              "htsworkflow.frontend.analysis",
+              "htsworkflow.frontend.eland_config",
+              "htsworkflow.frontend.experiments",
+              "htsworkflow.frontend.inventory",
+              "htsworkflow.frontend.reports",
+              "htsworkflow.frontend.samples",
+              ],
+    scripts=[
         'scripts/copier',
-        'scripts/gerald2bed.py',
         'scripts/library.py',
         'scripts/makebed',
-       'scripts/make-library-tree',
+        'scripts/make-library-tree',
         'scripts/mark_archived_data',
         'scripts/qseq2fastq',
-        'scripts/rerun_eland.py',
         'scripts/retrieve_config',
         'scripts/runfolder',
         'scripts/runner',
         'scripts/spoolwatcher', 
         'scripts/srf',
-       'scripts/srf2named_fastq.py'
+        'scripts/srf2fastq'
         ],
-)
+    )
diff --git a/test/test_srf2fastq.py b/test/test_srf2fastq.py
index 83412c4b766e4a3a96688263d4b3a23fa72155f0..eee4152f65082cc244165bb8e5dee4892cf865da 100644 (file)
@@ -6,7 +6,7 @@ import unittest
 _module_path, _module_name = os.path.split(__file__)
 sys.path.append(os.path.join(_module_path, '..', 'scripts'))
 
-import srf2named_fastq
+from htsworkflow.pipelines import srf2fastq
 
 class testSrf2Fastq(unittest.TestCase):
     def test_split_good(self):
@@ -18,7 +18,7 @@ IIIIB+++
         target1 = StringIO()
         target2 = StringIO()
 
-        srf2named_fastq.convert_single_to_two_fastq(source, target1, target2)
+        srf2fastq.convert_single_to_two_fastq(source, target1, target2)
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -49,7 +49,7 @@ IIIIB+++
         target1 = StringIO()
         target2 = StringIO()
 
-        srf2named_fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
+        srf2fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -79,7 +79,7 @@ IIIIB+++
 """)
         target1 = StringIO()
 
-        srf2named_fastq.convert_single_to_fastq(source, target1)
+        srf2fastq.convert_single_to_fastq(source, target1)
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -101,7 +101,7 @@ IIIIB+++
 """)
         target1 = StringIO()
 
-        srf2named_fastq.convert_single_to_fastq(source, target1, header="foo_")
+        srf2fastq.convert_single_to_fastq(source, target1, header="foo_")
 
         target1.seek(0)
         lines1 = target1.readlines()
@@ -117,7 +117,7 @@ IIIIB+++
         cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
         cnf1_path = os.path.join(_module_path, cnf1_srf)
         
-        is_srf = srf2named_fastq.is_srf
+        is_srf = srf2fastq.is_srf
         self.failUnlessEqual(is_srf(__file__), False)
         self.failUnlessEqual(is_srf(cnf4_path), True)
         self.failUnlessEqual(is_srf(cnf1_path), True)
@@ -128,7 +128,7 @@ IIIIB+++
         cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
         cnf1_path = os.path.join(_module_path, cnf1_srf)
         
-        is_cnf1 = srf2named_fastq.is_cnf1
+        is_cnf1 = srf2fastq.is_cnf1
         self.failUnlessRaises(ValueError, is_cnf1, __file__)
         self.failUnlessEqual(is_cnf1(cnf4_path), False)
         self.failUnlessEqual(is_cnf1(cnf1_path), True)
diff --git a/version.py b/version.py
new file mode 100644 (file)
index 0000000..b850b4a
--- /dev/null
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+# Author: Douglas Creager <dcreager@dcreager.net>
+# This file is placed into the public domain.
+
+# Calculates the current version number.  If possible, this is the
+# output of “git describe”, modified to conform to the versioning
+# scheme that setuptools uses.  If “git describe” returns an error
+# (most likely because we're in an unpacked copy of a release tarball,
+# rather than in a git working copy), then we fall back on reading the
+# contents of the RELEASE-VERSION file.
+#
+# To use this script, simply import it in your setup.py file, and use the
+# results of get_git_version() as your package version:
+#
+# from version import *
+#
+# setup(
+#     version=get_git_version(),
+#     .
+#     .
+#     .
+# )
+#
+# This will automatically update the RELEASE-VERSION file, if
+# necessary.  Note that the RELEASE-VERSION file should *not* be
+# checked into git; please add it to your top-level .gitignore file.
+#
+# You'll probably want to distribute the RELEASE-VERSION file in your
+# sdist tarballs; to do this, just create a MANIFEST.in file that
+# contains the following line:
+#
+#   include RELEASE-VERSION
+
+__all__ = ("get_git_version")
+
+from subprocess import Popen, PIPE
+
+
+def call_git_describe(abbrev=4):
+    try:
+        p = Popen(['git', 'describe', '--abbrev=%d' % abbrev],
+                  stdout=PIPE, stderr=PIPE)
+        p.stderr.close()
+        line = p.stdout.readlines()[0]
+        return line.strip()
+
+    except:
+        return None
+
+
+def read_release_version():
+    try:
+        f = open("RELEASE-VERSION", "r")
+
+        try:
+            version = f.readlines()[0]
+            return version.strip()
+
+        finally:
+            f.close()
+
+    except:
+        return None
+
+
+def write_release_version(version):
+    f = open("RELEASE-VERSION", "w")
+    f.write("%s\n" % version)
+    f.close()
+
+
+def get_git_version(abbrev=4):
+    # Read in the version that's currently in RELEASE-VERSION.
+
+    release_version = read_release_version()
+
+    # First try to get the current version using “git describe”.
+
+    version = call_git_describe(abbrev)
+
+    # If that doesn't work, fall back on the value that's in
+    # RELEASE-VERSION.
+
+    if version is None:
+        version = release_version
+
+    # If we still don't have anything, that's an error.
+
+    if version is None:
+        raise ValueError("Cannot find the version number!")
+
+    # If the current version is different from what's in the
+    # RELEASE-VERSION file, update the file to be current.
+
+    if version != release_version:
+        write_release_version(version)
+
+    # Finally, return the current version.
+
+    return version
+
+
+if __name__ == "__main__":
+    print get_git_version()