2 from __future__ import print_function, unicode_literals
6 from optparse import OptionParser
8 from subprocess import Popen, PIPE
11 from htsworkflow.util.opener import autoopen
12 from htsworkflow.util.version import version
14 LOGGER = logging.getLogger(__name__)
16 # constants for our fastq finite state machine
19 FASTQ_SEQUENCE_HEADER = 2
22 def main(cmdline=None):
23 parser = make_parser()
24 opts, args = parser.parse_args(cmdline)
27 logging.basicConfig(level=logging.INFO)
29 logging.basicConfig(level=logging.WARN)
36 parser.error("Requires one argument, got: %s" % (str(args)))
38 if opts.flowcell is not None:
39 header = "%s_" % (opts.flowcell,)
44 left = open_write(opts.single, opts.force)
46 left = open_write(opts.left, opts.force)
47 right = open_write(opts.right, opts.force)
49 # open the srf, fastq, or compressed fastq
51 source = srf_open(args[0], opts.srf2fastq, opts.cnf1)
53 source = autoopen(args[0])
56 convert_single_to_fastq(source, left, header)
58 convert_single_to_two_fastq(source, left, right, opts.mid, header)
63 parser = OptionParser("""%prog: [options] file
65 file can be either a fastq file or a srf file.
66 You can also force the flowcell ID to be added to the header.""")
68 parser.add_option('--force', default=False, action="store_true",
69 help="overwrite existing files.")
70 parser.add_option('--flowcell', default=None,
71 help="add flowcell id header to sequence")
72 parser.add_option('-l','--left', default="r1.fastq",
73 help='left side filename')
74 parser.add_option('-m','--mid', default=None,
75 help='actual sequence mid point')
76 parser.add_option('-r','--right', default="r2.fastq",
77 help='right side filename')
78 parser.add_option('-s','--single', default=None,
79 help="single fastq target name")
80 parser.add_option('-v', '--verbose', default=False, action="store_true",
81 help="show information about what we're doing.")
82 parser.add_option('--version', default=False, action="store_true",
83 help="Report software version")
84 parser.add_option('--cnf1', default=False, action="store_true",
85 help="Force cnf1 mode in srf2fastq")
86 parser.add_option('--srf2fastq', default='srf2fastq',
87 help='specify srf2fastq command')
91 def srf_open(filename, srf2fastq_cmd, cnf1=False):
93 Make a stream from srf file using srf2fastq
95 if not os.path.exists(srf2fastq_cmd):
96 LOGGER.error("srf command: %s doesn't exist" % (srf2fastq_cmd,))
98 if cnf1 or is_cnf1(filename):
102 LOGGER.info('srf command: %s' % (" ".join(cmd),))
103 p = Popen(cmd, stdout=PIPE)
107 def convert_single_to_fastq(instream, target1, header=''):
109 for line in instream:
112 if state == FASTQ_HEADER:
113 write_header(target1, header, line)
114 state = FASTQ_SEQUENCE
116 elif state == FASTQ_SEQUENCE:
117 write_sequence(target1, line)
118 state = FASTQ_SEQUENCE_HEADER
120 elif state == FASTQ_SEQUENCE_HEADER:
121 # the sequence header isn't really sequence, but
122 # we're just passing it through
123 write_sequence(target1, line)
124 state = FASTQ_QUALITY
125 # sequence or quality data
126 elif state == FASTQ_QUALITY:
127 write_sequence(target1, line)
130 raise RuntimeError("Unrecognized STATE in fastq split")
134 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
136 read a fastq file where two paired ends have been run together into
139 instream is the source stream
140 target1 and target2 are the destination streams
146 for line in instream:
149 if state == FASTQ_HEADER:
150 write_header(target1, header, line, "/1")
151 write_header(target2, header, line, "/2")
152 state = FASTQ_SEQUENCE
154 elif state == FASTQ_SEQUENCE:
157 write_split_sequence(target1, target2, line, mid)
158 state = FASTQ_SEQUENCE_HEADER
160 elif state == FASTQ_SEQUENCE_HEADER:
161 # the sequence header isn't really sequence, but
162 # we're just passing it through
163 write_sequence(target1, line)
164 write_sequence(target2, line)
166 state = FASTQ_QUALITY
167 # sequence or quality data
168 elif state == FASTQ_QUALITY:
169 write_split_sequence(target1, target2, line, mid)
172 raise RuntimeError("Unrecognized STATE in fastq split")
174 def write_header(target, prefix, line, suffix=''):
177 target.write(line[1:])
179 target.write(os.linesep)
181 def write_sequence(target, line):
183 target.write(os.linesep)
185 def write_split_sequence(target1, target2, line, mid):
186 target1.write(line[:mid])
187 target1.write(os.linesep)
189 target2.write(line[mid:])
190 target2.write(os.linesep)
192 def is_srf(filename):
194 Check filename to see if it is likely to be a SRF file
196 f = open(filename, 'rb')
199 return header == b"SSRF"
201 def is_cnf1(filename):
203 Brute force detection if a SRF file is using CNF1/CNF4 records
205 max_header = 1024 ** 2
206 PROGRAM_ID = b'PROGRAM_ID\000'
207 cnf4_apps = set((b"solexa2srf v1.4",
208 b"illumina2srf v1.11.5.Illumina.1.3"))
210 if not is_srf(filename):
211 raise ValueError("%s must be a srf file" % (filename,))
213 fd = os.open(filename, os.O_RDONLY)
214 f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
215 # alas the max search length requires python 2.6+
216 program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
217 program_header_start = program_id_location+len(PROGRAM_ID)
218 next_null = f.find(b'\000', program_header_start) #, max_header)
219 program_id_header = f[program_header_start:next_null]
223 if program_id_header in cnf4_apps:
228 def open_write(filename, force=False):
230 Open a file, but throw an exception if it already exists
233 if os.path.exists(filename):
234 raise RuntimeError("%s exists" % (filename,))
236 return open(filename, 'w')
239 path, name = os.path.split(filename)
240 base, ext = os.path.splitext(name)
242 target1_name = base + '_r1.fastq'
243 target2_name = base + '_r2.fastq'
245 for target_name in [target1_name, target2_name]:
246 print('target name', target_name)
247 if os.path.exists(target_name):
248 raise RuntimeError("%s exists" % (target_name,))
250 instream = open(filename,'r')
251 target1 = open(target1_name,'w')
252 target2 = open(target2_name,'w')
255 if __name__ == "__main__":
256 sys.exit(main(sys.argv[1:]))