*.py[co~]
.coverage
*.egg-info
+dist
+RELEASE-VERSION
--- /dev/null
+include RELEASE-VERSION
+version.py
qseq_condor_entries = []
srf_condor_header = """
Universe=vanilla
-executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2named_fastq.py
+executable=/woldlab/rattus/lvol0/mus/home/diane/proj/solexa/gaworkflow/scripts/srf2fastq
output=log/srf_pair_fastq.out.$(process).log
error=log/srf_pair_fastq.err.$(process).log
log=log/srf_pair_fastq.log
--- /dev/null
+#!/usr/bin/env python
+from glob import glob
+import os
+from optparse import OptionParser
+import numpy
+import sys
+import tarfile
+
+from htsworkflow.version import version
+
+def main(cmdline=None):
+ parser = make_parser()
+ opts, args = parser.parse_args(cmdline)
+
+ if opts.version:
+ print version()
+ return 0
+
+ if opts.infile is not None:
+ qseq_generator = tarfile_generator(opts.infile)
+ elif len(args) > 0:
+ qseq_generator = file_generator(args)
+ else:
+ qseq_generator = [sys.stdin]
+
+ if opts.output is not None:
+ output = open(opts.output, 'w')
+ else:
+ output = sys.stdout
+
+ if opts.nopass_output is not None:
+ nopass_output = open(opts.nopass_output, 'w')
+ else:
+ nopass_output = None
+
+ qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
+ qseq_parser.fastq = not opts.fasta
+ qseq_parser.flowcell_id = opts.flowcell
+ qseq_parser.trim = parse_slice(opts.slice)
+ qseq_parser.reportFilter = opts.pf
+
+ qseq_parser.run()
+
+
+def make_parser():
+ usage = "%prog: [options] *_qseq.txt"
+ parser = OptionParser(usage)
+ parser.add_option("-a", "--fasta", default=False, action="store_true",
+ help="produce fasta files instead of fastq files")
+ parser.add_option("-f", "--flowcell", default=None,
+ help="Set flowcell ID for output file")
+ parser.add_option("-i", "--infile", default=None,
+ help='source tar file (if reading from an archive instead of a directory)')
+ parser.add_option("-o", "--output", default=None,
+ help="output fastq file")
+ parser.add_option("-n", "--nopass-output", default=None,
+ help="if provided send files that failed illumina filter "\
+ "to a differentfile")
+ parser.add_option("-s", "--slice",
+ help="specify python slice, e.g. 0:75, 0:-1",
+ default=None)
+ parser.add_option("--pf", help="report pass filter flag", default=False,
+ action="store_true")
+ parser.add_option("--version", default=False, action="store_true",
+ help="report software version")
+
+ return parser
+
+
+def file_generator(pattern_list):
+ for pattern in pattern_list:
+ for filename in glob(pattern):
+ yield open(filename,"r")
+
+
+def tarfile_generator(tarfilename):
+ archive = tarfile.open(tarfilename,'r|*')
+ for tarinfo in archive:
+ yield archive.extractfile(tarinfo)
+
+
+def parse_slice(slice_text):
+ if slice_text is None or len(slice_text) == 0:
+ return slice(None)
+
+ slice_data = []
+ for element in slice_text.split(':'):
+ if len(element) == 0:
+ element = None
+ else:
+ element = int(element)
+ slice_data.append(element)
+
+ return slice(*slice_data)
+
+
+def file_generator(pattern_list):
+ for pattern in pattern_list:
+ for filename in glob(pattern):
+ yield open(filename,"r")
+
+
+def tarfile_generator(tarfilename):
+ archive = tarfile.open(tarfilename,'r|*')
+ for tarinfo in archive:
+ yield archive.extractfile(tarinfo)
+
+
+def parse_slice(slice_text):
+ if slice_text is None or len(slice_text) == 0:
+ return slice(None)
+
+ slice_data = []
+ for element in slice_text.split(':'):
+ if len(element) == 0:
+ element = None
+ else:
+ element = int(element)
+ slice_data.append(element)
+
+ return slice(*slice_data)
+
+
+class Qseq2Fastq(object):
+ """
+ Convert qseq files to fastq (or fasta) files.
+ """
+ def __init__(self, sources, pass_destination, nopass_destination=None):
+ self.sources = sources
+ self.pass_destination = pass_destination
+ if nopass_destination is not None:
+ self.nopass_destination = nopass_destination
+ else:
+ self.nopass_destination = pass_destination
+
+ self.fastq = True
+ self.flowcell_id = None
+ self.trim = slice(None)
+ self.reportFilter = False
+
+ def _format_flowcell_id(self):
+ """
+ Return formatted flowcell ID
+ """
+ if self.flowcell_id is not None:
+ return self.flowcell_id+"_"
+ else:
+ return ""
+
+ def _convert_illumina_quality(self, illumina_quality):
+ """
+ Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
+ """
+ # Illumina scores are Phred + 64
+        # Fastq scores are Phred + 33
+ # the following code grabs the string, converts to short ints and
+ # subtracts 31 (64-33) to convert between the two score formats.
+ # The numpy solution is twice as fast as some of my other
+ # ideas for the conversion.
+        # sorry about the ugliness in changing from character, to 8-bit int
+ # and back to a character array
+ quality = numpy.asarray(illumina_quality,'c')
+ quality.dtype = numpy.uint8
+ quality -= 31
+ quality.dtype = '|S1' # I'd like to know what the real numpy char type is
+ return quality
+
+ def run(self):
+ if self.fastq:
+ header_template = '@'
+ else:
+ # fasta case
+ header_template = '>'
+ header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
+
+ for qstream in self.sources:
+ for line in qstream:
+ # parse line
+ record = line.rstrip().split('\t')
+ machine_name = record[0]
+ run_number = record[1]
+ lane_number = record[2]
+ tile = record[3]
+ x = record[4]
+ y = record[5]
+ index = record[6]
+ read = record[7]
+ sequence = record[8].replace('.','N')
+ quality = self._convert_illumina_quality(record[9])
+
+ # add pass qc filter if we want it
+ pass_qc = int(record[10])
+ if self.reportFilter:
+ pass_qc_msg = " pf=%s" % (pass_qc)
+ else:
+ pass_qc_msg = ""
+
+ header = header_template % ( \
+ machine_name,
+ run_number,
+ lane_number,
+ tile,
+ x,
+ y,
+ read,
+ pass_qc_msg,
+ os.linesep)
+
+
+ # if we passed the filter write to the "good" file
+ if pass_qc:
+ destination = self.pass_destination
+ else:
+ destination = self.nopass_destination
+
+ destination.write(header)
+ destination.write(sequence[self.trim])
+ destination.write(os.linesep)
+ if self.fastq:
+ destination.write('+')
+ destination.write(os.linesep)
+ destination.write(quality[self.trim].tostring())
+ destination.write(os.linesep)
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+#!/usr/bin/env python
+import logging
+import mmap
+from optparse import OptionParser
+import os
+from subprocess import Popen, PIPE
+import sys
+
+from htsworkflow.util.opener import autoopen
+from htsworkflow.version import version
+
+# constants for our fastq finite state machine
+FASTQ_HEADER = 0
+FASTQ_SEQUENCE = 1
+FASTQ_SEQUENCE_HEADER = 2
+FASTQ_QUALITY = 3
+
+def main(cmdline=None):
+ parser = make_parser()
+ opts, args = parser.parse_args(cmdline)
+
+ if opts.verbose:
+ logging.basicConfig(level=logging.INFO)
+ else:
+ logging.basicConfig(level=logging.WARN)
+
+ if opts.version:
+ print version()
+ return 0
+
+ if len(args) != 1:
+ parser.error("Requires one argument, got: %s" % (str(args)))
+
+ if opts.flowcell is not None:
+ header = "%s_" % (opts.flowcell,)
+ else:
+ header = ''
+
+ if opts.single:
+ left = open_write(opts.single, opts.force)
+ else:
+ left = open_write(opts.left, opts.force)
+ right = open_write(opts.right, opts.force)
+
+ # open the srf, fastq, or compressed fastq
+ if is_srf(args[0]):
+ source = srf_open(args[0])
+ else:
+ source = autoopen(args[0])
+
+ if opts.single:
+ convert_single_to_fastq(source, left, header)
+ else:
+ convert_single_to_two_fastq(source, left, right, opts.mid, header)
+
+ return 0
+
+def make_parser():
+ parser = OptionParser("""%prog: [options] file
+
+file can be either a fastq file or a srf file.
+You can also force the flowcell ID to be added to the header.""")
+
+ parser.add_option('--force', default=False, action="store_true",
+ help="overwrite existing files.")
+ parser.add_option('--flowcell', default=None,
+ help="add flowcell id header to sequence")
+ parser.add_option('-l','--left', default="r1.fastq",
+ help='left side filename')
+ parser.add_option('-m','--mid', default=None,
+ help='actual sequence mid point')
+ parser.add_option('-r','--right', default="r2.fastq",
+ help='right side filename')
+ parser.add_option('-s','--single', default=None,
+ help="single fastq target name")
+ parser.add_option('-v', '--verbose', default=False, action="store_true",
+ help="show information about what we're doing.")
+ parser.add_option('--version', default=False, action="store_true",
+ help="Report software version")
+ return parser
+
+
+def srf_open(filename, cnf1=False):
+ """
+ Make a stream from srf file using srf2fastq
+ """
+ cmd = ['srf2fastq']
+ if is_cnf1(filename):
+ cmd.append('-c')
+ cmd.append(filename)
+
+ logging.info('srf command: %s' % (" ".join(cmd),))
+ p = Popen(cmd, stdout=PIPE)
+ return p.stdout
+
+
+def convert_single_to_fastq(instream, target1, header=''):
+
+ state = FASTQ_HEADER
+ for line in instream:
+ line = line.strip()
+ # sequence header
+ if state == FASTQ_HEADER:
+ write_header(target1, header, line)
+ state = FASTQ_SEQUENCE
+ # sequence
+ elif state == FASTQ_SEQUENCE:
+ write_sequence(target1, line)
+ state = FASTQ_SEQUENCE_HEADER
+ # quality header
+ elif state == FASTQ_SEQUENCE_HEADER:
+ # the sequence header isn't really sequence, but
+ # we're just passing it through
+ write_sequence(target1, line)
+ state = FASTQ_QUALITY
+ # sequence or quality data
+ elif state == FASTQ_QUALITY:
+ write_sequence(target1, line)
+ state = FASTQ_HEADER
+ else:
+ raise RuntimeError("Unrecognized STATE in fastq split")
+
+
+
+def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
+ """
+ read a fastq file where two paired ends have been run together into
+ two halves.
+
+ instream is the source stream
+ target1 and target2 are the destination streams
+ """
+ if mid is not None:
+ mid = int(mid)
+
+ state = FASTQ_HEADER
+ for line in instream:
+ line = line.strip()
+ # sequence header
+ if state == FASTQ_HEADER:
+ write_header(target1, header, line, "/1")
+ write_header(target2, header, line, "/2")
+ state = FASTQ_SEQUENCE
+ # sequence
+ elif state == FASTQ_SEQUENCE:
+ if mid is None:
+ mid = len(line)/2
+ write_split_sequence(target1, target2, line, mid)
+ state = FASTQ_SEQUENCE_HEADER
+ # quality header
+ elif state == FASTQ_SEQUENCE_HEADER:
+ # the sequence header isn't really sequence, but
+ # we're just passing it through
+ write_sequence(target1, line)
+ write_sequence(target2, line)
+
+ state = FASTQ_QUALITY
+ # sequence or quality data
+ elif state == FASTQ_QUALITY:
+ write_split_sequence(target1, target2, line, mid)
+ state = FASTQ_HEADER
+ else:
+ raise RuntimeError("Unrecognized STATE in fastq split")
+
+def write_header(target, prefix, line, suffix=''):
+ target.write('@')
+ target.write(prefix)
+ target.write(line[1:])
+ target.write(suffix)
+ target.write(os.linesep)
+
+def write_sequence(target, line):
+ target.write(line)
+ target.write(os.linesep)
+
+def write_split_sequence(target1, target2, line, mid):
+ target1.write(line[:mid])
+ target1.write(os.linesep)
+
+ target2.write(line[mid:])
+ target2.write(os.linesep)
+
+def is_srf(filename):
+ """
+ Check filename to see if it is likely to be a SRF file
+ """
+ f = open(filename, 'r')
+ header = f.read(4)
+ f.close()
+ return header == "SSRF"
+
+def is_cnf1(filename):
+ """
+ Brute force detection if a SRF file is using CNF1/CNF4 records
+ """
+ max_header = 1024 ** 2
+ PROGRAM_ID = 'PROGRAM_ID\000'
+ cnf4_apps = set(("solexa2srf v1.4",
+ "illumina2srf v1.11.5.Illumina.1.3"))
+
+ if not is_srf(filename):
+ raise ValueError("%s must be a srf file" % (filename,))
+
+ fd = os.open(filename, os.O_RDONLY)
+ f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
+ # alas the max search length requires python 2.6+
+ program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
+ program_header_start = program_id_location+len(PROGRAM_ID)
+ next_null = f.find('\000', program_header_start) #, max_header)
+ program_id_header = f[program_header_start:next_null]
+ f.close()
+ os.close(fd)
+
+ if program_id_header in cnf4_apps:
+ return False
+ else:
+ return True
+
+def open_write(filename, force=False):
+ """
+ Open a file, but throw an exception if it already exists
+ """
+ if not force:
+ if os.path.exists(filename):
+ raise RuntimeError("%s exists" % (filename,))
+
+ return open(filename, 'w')
+
+def foo():
+ path, name = os.path.split(filename)
+ base, ext = os.path.splitext(name)
+
+ target1_name = base + '_r1.fastq'
+ target2_name = base + '_r2.fastq'
+
+ for target_name in [target1_name, target2_name]:
+ print 'target name', target_name
+ if os.path.exists(target_name):
+ raise RuntimeError("%s exists" % (target_name,))
+
+ instream = open(filename,'r')
+ target1 = open(target1_name,'w')
+ target2 = open(target2_name,'w')
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
--- /dev/null
+import logging
+
+def version():
+ """Return version number
+ """
+ version = None
+ try:
+ import pkg_resources
+ except ImportError, e:
+ logging.error("Can't find version number, please install setuptools")
+ raise e
+
+ try:
+ version = pkg_resources.get_distribution("htsworkflow")
+ except pkg_resources.DistributionNotFound, e:
+ logging.error("Package not installed")
+
+ return version
+
-#!/usr/bin/env python
-
-from glob import glob
-import os
-from optparse import OptionParser
-import numpy
+#!/usr/bin/python
import sys
-import tarfile
-
-class Qseq2Fastq(object):
- """
- Convert qseq files to fastq (or fasta) files.
- """
- def __init__(self, sources, pass_destination, nopass_destination=None):
- self.sources = sources
- self.pass_destination = pass_destination
- if nopass_destination is not None:
- self.nopass_destination = nopass_destination
- else:
- self.nopass_destination = pass_destination
-
- self.fastq = True
- self.flowcell_id = None
- self.trim = slice(None)
- self.reportFilter = False
-
- def _format_flowcell_id(self):
- """
- Return formatted flowcell ID
- """
- if self.flowcell_id is not None:
- return self.flowcell_id+"_"
- else:
- return ""
-
- def _convert_illumina_quality(self, illumina_quality):
- """
- Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
- """
- # Illumina scores are Phred + 64
- # Fastq scores are Phread + 33
- # the following code grabs the string, converts to short ints and
- # subtracts 31 (64-33) to convert between the two score formats.
- # The numpy solution is twice as fast as some of my other
- # ideas for the conversion.
- # sorry about the uglyness in changing from character, to 8-bit int
- # and back to a character array
- quality = numpy.asarray(illumina_quality,'c')
- quality.dtype = numpy.uint8
- quality -= 31
- quality.dtype = '|S1' # I'd like to know what the real numpy char type is
- return quality
-
- def run(self):
- if self.fastq:
- header_template = '@'
- else:
- # fasta case
- header_template = '>'
- header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
-
- for qstream in self.sources:
- for line in qstream:
- # parse line
- record = line.rstrip().split('\t')
- machine_name = record[0]
- run_number = record[1]
- lane_number = record[2]
- tile = record[3]
- x = record[4]
- y = record[5]
- index = record[6]
- read = record[7]
- sequence = record[8].replace('.','N')
- quality = self._convert_illumina_quality(record[9])
-
- # add pass qc filter if we want it
- pass_qc = int(record[10])
- if self.reportFilter:
- pass_qc_msg = " pf=%s" % (pass_qc)
- else:
- pass_qc_msg = ""
-
- header = header_template % ( \
- machine_name,
- run_number,
- lane_number,
- tile,
- x,
- y,
- read,
- pass_qc_msg,
- os.linesep)
-
-
- # if we passed the filter write to the "good" file
- if pass_qc:
- destination = self.pass_destination
- else:
- destination = self.nopass_destination
-
- destination.write(header)
- destination.write(sequence[self.trim])
- destination.write(os.linesep)
- if self.fastq:
- destination.write('+')
- destination.write(os.linesep)
- destination.write(quality[self.trim].tostring())
- destination.write(os.linesep)
-
-def file_generator(pattern_list):
- for pattern in pattern_list:
- for filename in glob(pattern):
- yield open(filename,'r')
-
-def tarfile_generator(tarfilename):
- archive = tarfile.open(tarfilename,'r|*')
- for tarinfo in archive:
- yield archive.extractfile(tarinfo)
-
-
-def parse_slice(slice_text):
- if slice_text is None or len(slice_text) == 0:
- return slice(None)
-
- slice_data = []
- for element in slice_text.split(':'):
- if len(element) == 0:
- element = None
- else:
- element = int(element)
- slice_data.append(element)
-
- return slice(*slice_data)
-
-def make_parser():
- usage = "%prog: [options] *_qseq.txt"
- parser = OptionParser(usage)
- parser.add_option('-a', '--fasta', default=False, action="store_true",
- help="produce fasta files instead of fastq files")
- parser.add_option('-f', '--flowcell', default=None,
- help="Set flowcell ID for output file")
- parser.add_option('-i', '--infile', default=None,
- help='source tar file (if reading from an archive instead of a directory)')
- parser.add_option('-o', '--output', default=None,
- help='output fastq file')
- parser.add_option('-n', '--nopass-output', default=None,
- help='if provided send files that failed illumina filter '\
- 'to a differentfile')
- parser.add_option('-s', '--slice',
- help='specify python slice, e.g. 0:75, 0:-1',
- default=None)
- parser.add_option('--pf', help="report pass filter flag", default=False,
- action="store_true")
- return parser
-
-def main(cmdline=None):
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
-
- if opts.infile is not None:
- qseq_generator = tarfile_generator(opts.infile)
- elif len(args) > 0:
- qseq_generator = file_generator(args)
- else:
- qseq_generator = [sys.stdin]
-
-
- if opts.output is not None:
- output = open(opts.output, 'w')
- else:
- output = sys.stdout
-
- if opts.nopass_output is not None:
- nopass_output = open(opts.nopass_output, 'w')
- else:
- nopass_output = None
-
- qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
- qseq_parser.fastq = not opts.fasta
- qseq_parser.flowcell_id = opts.flowcell
- qseq_parser.trim = parse_slice(opts.slice)
- qseq_parser.reportFilter = opts.pf
-
- qseq_parser.run()
+from htsworkflow.pipelines.qseq2fastq import main
-if __name__ == "__main__":
- main()
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
--- /dev/null
+#!/usr/bin/python
+import sys
+from htsworkflow.pipelines.srf2fastq import main
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
+++ /dev/null
-#!/usr/bin/env python
-import logging
-import mmap
-from optparse import OptionParser
-import os
-from subprocess import Popen, PIPE
-import sys
-
-from htsworkflow.util.opener import autoopen
-
-# constants for our fastq finite state machine
-FASTQ_HEADER = 0
-FASTQ_SEQUENCE = 1
-FASTQ_SEQUENCE_HEADER = 2
-FASTQ_QUALITY = 3
-
-
-def main(cmdline=None):
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
-
- if len(args) != 1:
- parser.error("Requires one argument, got: %s" % (str(args)))
-
- if opts.verbose:
- logging.basicConfig(level=logging.INFO)
- else:
- logging.basicConfig(level=logging.WARN)
-
- if opts.flowcell is not None:
- header = "%s_" % (opts.flowcell,)
- else:
- header = ''
-
- if opts.single:
- left = open_write(opts.single, opts.force)
- else:
- left = open_write(opts.left, opts.force)
- right = open_write(opts.right, opts.force)
-
- # open the srf, fastq, or compressed fastq
- if is_srf(args[0]):
- source = srf_open(args[0])
- else:
- source = autoopen(args[0])
-
- if opts.single:
- convert_single_to_fastq(source, left, header)
- else:
- convert_single_to_two_fastq(source, left, right, opts.mid, header)
-
- return 0
-
-def make_parser():
- parser = OptionParser("""%prog: [options] file
-
-file can be either a fastq file or a srf file.
-You can also force the flowcell ID to be added to the header.""")
-
- parser.add_option('--force', default=False, action="store_true",
- help="overwrite existing files.")
- parser.add_option('--flowcell', default=None,
- help="add flowcell id header to sequence")
- parser.add_option('-l','--left', default="r1.fastq",
- help='left side filename')
- parser.add_option('-m','--mid', default=None,
- help='actual sequence mid point')
- parser.add_option('-r','--right', default="r2.fastq",
- help='right side filename')
- parser.add_option('-s','--single', default=None,
- help="single fastq target name")
- parser.add_option('-v', '--verbose', default=False, action="store_true",
- help="show information about what we're doing.")
- return parser
-
-
-def srf_open(filename, cnf1=False):
- """
- Make a stream from srf file using srf2fastq
- """
-
- cmd = ['srf2fastq']
- if is_cnf1(filename):
- cmd.append('-c')
- cmd.append(filename)
-
- logging.info('srf command: %s' % (" ".join(cmd),))
- p = Popen(cmd, stdout=PIPE)
- return p.stdout
-
-
-def convert_single_to_fastq(instream, target1, header=''):
-
- state = FASTQ_HEADER
- for line in instream:
- line = line.strip()
- # sequence header
- if state == FASTQ_HEADER:
- write_header(target1, header, line)
- state = FASTQ_SEQUENCE
- # sequence
- elif state == FASTQ_SEQUENCE:
- write_sequence(target1, line)
- state = FASTQ_SEQUENCE_HEADER
- # quality header
- elif state == FASTQ_SEQUENCE_HEADER:
- # the sequence header isn't really sequence, but
- # we're just passing it through
- write_sequence(target1, line)
- state = FASTQ_QUALITY
- # sequence or quality data
- elif state == FASTQ_QUALITY:
- write_sequence(target1, line)
- state = FASTQ_HEADER
- else:
- raise RuntimeError("Unrecognized STATE in fastq split")
-
-
-
-def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
- """
- read a fastq file where two paired ends have been run together into
- two halves.
-
- instream is the source stream
- target1 and target2 are the destination streams
- """
- if mid is not None:
- mid = int(mid)
-
- state = FASTQ_HEADER
- for line in instream:
- line = line.strip()
- # sequence header
- if state == FASTQ_HEADER:
- write_header(target1, header, line, "/1")
- write_header(target2, header, line, "/2")
- state = FASTQ_SEQUENCE
- # sequence
- elif state == FASTQ_SEQUENCE:
- if mid is None:
- mid = len(line)/2
- write_split_sequence(target1, target2, line, mid)
- state = FASTQ_SEQUENCE_HEADER
- # quality header
- elif state == FASTQ_SEQUENCE_HEADER:
- # the sequence header isn't really sequence, but
- # we're just passing it through
- write_sequence(target1, line)
- write_sequence(target2, line)
-
- state = FASTQ_QUALITY
- # sequence or quality data
- elif state == FASTQ_QUALITY:
- write_split_sequence(target1, target2, line, mid)
- state = FASTQ_HEADER
- else:
- raise RuntimeError("Unrecognized STATE in fastq split")
-
-def write_header(target, prefix, line, suffix=''):
- target.write('@')
- target.write(prefix)
- target.write(line[1:])
- target.write(suffix)
- target.write(os.linesep)
-
-def write_sequence(target, line):
- target.write(line)
- target.write(os.linesep)
-
-def write_split_sequence(target1, target2, line, mid):
- target1.write(line[:mid])
- target1.write(os.linesep)
-
- target2.write(line[mid:])
- target2.write(os.linesep)
-
-def is_srf(filename):
- """
- Check filename to see if it is likely to be a SRF file
- """
- f = open(filename, 'r')
- header = f.read(4)
- f.close()
- return header == "SSRF"
-
-def is_cnf1(filename):
- """
- Brute force detection if a SRF file is using CNF1/CNF4 records
- """
- max_header = 1024 ** 2
- PROGRAM_ID = 'PROGRAM_ID\000'
- cnf4_apps = set(("solexa2srf v1.4",
- "illumina2srf v1.11.5.Illumina.1.3"))
-
- if not is_srf(filename):
- raise ValueError("%s must be a srf file" % (filename,))
-
- fd = os.open(filename, os.O_RDONLY)
- f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
- # alas the max search length requires python 2.6+
- program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
- program_header_start = program_id_location+len(PROGRAM_ID)
- next_null = f.find('\000', program_header_start) #, max_header)
- program_id_header = f[program_header_start:next_null]
- f.close()
- os.close(fd)
-
- if program_id_header in cnf4_apps:
- return False
- else:
- return True
-
-def open_write(filename, force=False):
- """
- Open a file, but throw an exception if it already exists
- """
- if not force:
- if os.path.exists(filename):
- raise RuntimeError("%s exists" % (filename,))
-
- return open(filename, 'w')
-
-def foo():
- path, name = os.path.split(filename)
- base, ext = os.path.splitext(name)
-
- target1_name = base + '_r1.fastq'
- target2_name = base + '_r2.fastq'
-
- for target_name in [target1_name, target2_name]:
- print 'target name', target_name
- if os.path.exists(target_name):
- raise RuntimeError("%s exists" % (target_name,))
-
- instream = open(filename,'r')
- target1 = open(target1_name,'w')
- target2 = open(target2_name,'w')
-
-
-
-if __name__ == "__main__":
- sys.exit(main(sys.argv[1:]))
from setuptools import setup
+from version import get_git_version
setup(
- name="htsworkflow",
- description="some bots and other utilities to help deal with data from an illumina sequencer",
- author="Diane Trout & Brandon King",
- author_email="diane@caltech.edu",
- packages=["htsworkflow",
- "htsworkflow.pipelines",
- "htsworkflow.frontend",
- "htsworkflow.frontend.analysis",
- "htsworkflow.frontend.eland_config",
- "htsworkflow.frontend.experiments",
- "htsworkflow.frontend.inventory",
- "htsworkflow.frontend.reports",
- "htsworkflow.frontend.samples",
- "htsworkflow.automation",
- "htsworkflow.util"
- ],
- scripts=[
- 'scripts/configure_pipeline',
+ name="htsworkflow",
+ version=get_git_version(),
+ description="Utilities to help manage high-through-put sequencing",
+ author="Diane Trout, Brandon King",
+ author_email="diane@caltech.edu",
+ packages=["htsworkflow",
+ "htsworkflow.automation",
+ "htsworkflow.pipelines",
+ "htsworkflow.util",
+ # django site
+ "htsworkflow.frontend",
+ "htsworkflow.frontend.analysis",
+ "htsworkflow.frontend.eland_config",
+ "htsworkflow.frontend.experiments",
+ "htsworkflow.frontend.inventory",
+ "htsworkflow.frontend.reports",
+ "htsworkflow.frontend.samples",
+ ],
+ scripts=[
'scripts/copier',
- 'scripts/gerald2bed.py',
'scripts/library.py',
'scripts/makebed',
- 'scripts/make-library-tree',
+ 'scripts/make-library-tree',
'scripts/mark_archived_data',
'scripts/qseq2fastq',
- 'scripts/rerun_eland.py',
'scripts/retrieve_config',
'scripts/runfolder',
'scripts/runner',
'scripts/spoolwatcher',
'scripts/srf',
- 'scripts/srf2named_fastq.py'
+ 'scripts/srf2fastq'
],
-)
+ )
_module_path, _module_name = os.path.split(__file__)
sys.path.append(os.path.join(_module_path, '..', 'scripts'))
-import srf2named_fastq
+from htsworkflow.pipelines import srf2fastq
class testSrf2Fastq(unittest.TestCase):
def test_split_good(self):
target1 = StringIO()
target2 = StringIO()
- srf2named_fastq.convert_single_to_two_fastq(source, target1, target2)
+ srf2fastq.convert_single_to_two_fastq(source, target1, target2)
target1.seek(0)
lines1 = target1.readlines()
target1 = StringIO()
target2 = StringIO()
- srf2named_fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
+ srf2fastq.convert_single_to_two_fastq(source, target1, target2, header="foo_")
target1.seek(0)
lines1 = target1.readlines()
""")
target1 = StringIO()
- srf2named_fastq.convert_single_to_fastq(source, target1)
+ srf2fastq.convert_single_to_fastq(source, target1)
target1.seek(0)
lines1 = target1.readlines()
""")
target1 = StringIO()
- srf2named_fastq.convert_single_to_fastq(source, target1, header="foo_")
+ srf2fastq.convert_single_to_fastq(source, target1, header="foo_")
target1.seek(0)
lines1 = target1.readlines()
cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
cnf1_path = os.path.join(_module_path, cnf1_srf)
- is_srf = srf2named_fastq.is_srf
+ is_srf = srf2fastq.is_srf
self.failUnlessEqual(is_srf(__file__), False)
self.failUnlessEqual(is_srf(cnf4_path), True)
self.failUnlessEqual(is_srf(cnf1_path), True)
cnf1_srf = 'woldlab_090512_HWI-EAS229_0114_428NNAAXX_5.srf'
cnf1_path = os.path.join(_module_path, cnf1_srf)
- is_cnf1 = srf2named_fastq.is_cnf1
+ is_cnf1 = srf2fastq.is_cnf1
self.failUnlessRaises(ValueError, is_cnf1, __file__)
self.failUnlessEqual(is_cnf1(cnf4_path), False)
self.failUnlessEqual(is_cnf1(cnf1_path), True)
--- /dev/null
+# -*- coding: utf-8 -*-
+# Author: Douglas Creager <dcreager@dcreager.net>
+# This file is placed into the public domain.
+
+# Calculates the current version number. If possible, this is the
+# output of “git describe”, modified to conform to the versioning
+# scheme that setuptools uses. If “git describe” returns an error
+# (most likely because we're in an unpacked copy of a release tarball,
+# rather than in a git working copy), then we fall back on reading the
+# contents of the RELEASE-VERSION file.
+#
+# To use this script, simply import it into your setup.py file, and use the
+# results of get_git_version() as your package version:
+#
+# from version import *
+#
+# setup(
+# version=get_git_version(),
+# .
+# .
+# .
+# )
+#
+# This will automatically update the RELEASE-VERSION file, if
+# necessary. Note that the RELEASE-VERSION file should *not* be
+# checked into git; please add it to your top-level .gitignore file.
+#
+# You'll probably want to distribute the RELEASE-VERSION file in your
+# sdist tarballs; to do this, just create a MANIFEST.in file that
+# contains the following line:
+#
+# include RELEASE-VERSION
+
+__all__ = ("get_git_version")
+
+from subprocess import Popen, PIPE
+
+
+def call_git_describe(abbrev=4):
+ try:
+ p = Popen(['git', 'describe', '--abbrev=%d' % abbrev],
+ stdout=PIPE, stderr=PIPE)
+ p.stderr.close()
+ line = p.stdout.readlines()[0]
+ return line.strip()
+
+ except:
+ return None
+
+
+def read_release_version():
+ try:
+ f = open("RELEASE-VERSION", "r")
+
+ try:
+ version = f.readlines()[0]
+ return version.strip()
+
+ finally:
+ f.close()
+
+ except:
+ return None
+
+
+def write_release_version(version):
+ f = open("RELEASE-VERSION", "w")
+ f.write("%s\n" % version)
+ f.close()
+
+
+def get_git_version(abbrev=4):
+ # Read in the version that's currently in RELEASE-VERSION.
+
+ release_version = read_release_version()
+
+ # First try to get the current version using “git describe”.
+
+ version = call_git_describe(abbrev)
+
+ # If that doesn't work, fall back on the value that's in
+ # RELEASE-VERSION.
+
+ if version is None:
+ version = release_version
+
+ # If we still don't have anything, that's an error.
+
+ if version is None:
+ raise ValueError("Cannot find the version number!")
+
+ # If the current version is different from what's in the
+ # RELEASE-VERSION file, update the file to be current.
+
+ if version != release_version:
+ write_release_version(version)
+
+ # Finally, return the current version.
+
+ return version
+
+
+if __name__ == "__main__":
+ print get_git_version()