4 from optparse import OptionParser
9 from htsworkflow.version import version
11 def main(cmdline=None):
12 parser = make_parser()
13 opts, args = parser.parse_args(cmdline)
19 if opts.infile is not None:
20 qseq_generator = tarfile_generator(opts.infile)
22 qseq_generator = file_generator(args)
24 qseq_generator = [sys.stdin]
26 if opts.output is not None:
27 output = open(opts.output, 'w')
31 if opts.nopass_output is not None:
32 nopass_output = open(opts.nopass_output, 'w')
36 qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
37 qseq_parser.fastq = not opts.fasta
38 qseq_parser.flowcell_id = opts.flowcell
39 qseq_parser.trim = parse_slice(opts.slice)
40 qseq_parser.reportFilter = opts.pf
46 usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
47 parser = OptionParser(usage)
48 parser.add_option("-a", "--fasta", default=False, action="store_true",
49 help="produce fasta files instead of fastq files")
50 parser.add_option("-f", "--flowcell", default=None,
51 help="Set flowcell ID for output file")
52 parser.add_option("-i", "--infile", default=None,
53 help='source tar file (if reading from an archive instead of a directory)')
54 parser.add_option("-o", "--output", default=None,
55 help="output fastq file")
56 parser.add_option("-n", "--nopass-output", default=None,
57 help="if provided send files that failed illumina filter "\
59 parser.add_option("-s", "--slice",
60 help="specify python slice, e.g. 0:75, 0:-1",
62 parser.add_option("--pf", help="report pass filter flag", default=False,
64 parser.add_option("--version", default=False, action="store_true",
65 help="report software version")
70 def file_generator(pattern_list):
71 for pattern in pattern_list:
72 for filename in glob(pattern):
73 yield open(filename,"r")
76 def tarfile_generator(tarfilename):
77 archive = tarfile.open(tarfilename,'r|*')
78 for tarinfo in archive:
79 yield archive.extractfile(tarinfo)
82 def parse_slice(slice_text):
83 if slice_text is None or len(slice_text) == 0:
87 for element in slice_text.split(':'):
91 element = int(element)
92 slice_data.append(element)
94 return slice(*slice_data)
97 def file_generator(pattern_list):
98 for pattern in pattern_list:
99 for filename in glob(pattern):
100 yield open(filename,"r")
103 def tarfile_generator(tarfilename):
104 archive = tarfile.open(tarfilename,'r|*')
105 for tarinfo in archive:
106 yield archive.extractfile(tarinfo)
109 def parse_slice(slice_text):
110 if slice_text is None or len(slice_text) == 0:
114 for element in slice_text.split(':'):
115 if len(element) == 0:
118 element = int(element)
119 slice_data.append(element)
121 return slice(*slice_data)
124 class Qseq2Fastq(object):
126 Convert qseq files to fastq (or fasta) files.
128 def __init__(self, sources, pass_destination, nopass_destination=None):
129 self.sources = sources
130 self.pass_destination = pass_destination
131 if nopass_destination is not None:
132 self.nopass_destination = nopass_destination
134 self.nopass_destination = pass_destination
137 self.flowcell_id = None
138 self.trim = slice(None)
139 self.reportFilter = False
141 def _format_flowcell_id(self):
143 Return formatted flowcell ID
145 if self.flowcell_id is not None:
146 return self.flowcell_id+"_"
150 def _convert_illumina_quality(self, illumina_quality):
152 Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
154 # Illumina scores are Phred + 64
155 # Fastq scores are Phread + 33
156 # the following code grabs the string, converts to short ints and
157 # subtracts 31 (64-33) to convert between the two score formats.
158 # The numpy solution is twice as fast as some of my other
159 # ideas for the conversion.
160 # sorry about the uglyness in changing from character, to 8-bit int
161 # and back to a character array
162 quality = numpy.asarray(illumina_quality,'c')
163 quality.dtype = numpy.uint8
165 quality.dtype = '|S1' # I'd like to know what the real numpy char type is
170 header_template = '@'
173 header_template = '>'
174 header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
176 for qstream in self.sources:
179 record = line.rstrip().split('\t')
180 machine_name = record[0]
181 run_number = record[1]
182 lane_number = record[2]
188 sequence = record[8].replace('.','N')
189 quality = self._convert_illumina_quality(record[9])
191 # add pass qc filter if we want it
192 pass_qc = int(record[10])
193 if self.reportFilter:
194 pass_qc_msg = " pf=%s" % (pass_qc)
198 header = header_template % ( \
210 # if we passed the filter write to the "good" file
212 destination = self.pass_destination
214 destination = self.nopass_destination
216 destination.write(header)
217 destination.write(sequence[self.trim])
218 destination.write(os.linesep)
220 destination.write('+')
221 destination.write(os.linesep)
222 destination.write(quality[self.trim].tostring())
223 destination.write(os.linesep)
225 if __name__ == "__main__":