4 from optparse import OptionParser
6 from subprocess import Popen, PIPE
9 from htsworkflow.util.opener import autoopen
11 # constants for our fastq finite state machine
14 FASTQ_SEQUENCE_HEADER = 2
18 def main(cmdline=None):
19 parser = make_parser()
20 opts, args = parser.parse_args(cmdline)
23 parser.error("Requires one argument, got: %s" % (str(args)))
26 logging.basicConfig(level=logging.INFO)
28 logging.basicConfig(level=logging.WARN)
30 if opts.flowcell is not None:
31 header = "%s_" % (opts.flowcell,)
36 left = open_write(opts.single, opts.force)
38 left = open_write(opts.left, opts.force)
39 right = open_write(opts.right, opts.force)
41 # open the srf, fastq, or compressed fastq
43 source = srf_open(args[0])
45 source = autoopen(args[0])
48 convert_single_to_fastq(source, left, header)
50 convert_single_to_two_fastq(source, left, right, opts.mid, header)
55 parser = OptionParser("""%prog: [options] file
57 file can be either a fastq file or a srf file.
58 You can also force the flowcell ID to be added to the header.""")
60 parser.add_option('--force', default=False, action="store_true",
61 help="overwrite existing files.")
62 parser.add_option('--flowcell', default=None,
63 help="add flowcell id header to sequence")
64 parser.add_option('-l','--left', default="r1.fastq",
65 help='left side filename')
66 parser.add_option('-r','--right', default="r2.fastq",
67 help='right side filename')
68 parser.add_option('-m','--mid', default=None,
69 help='actual sequence mid point')
70 parser.add_option('-s','--single', default=None,
71 help="single fastq target name")
72 parser.add_option('-v', '--verbose', default=False, action="store_true",
73 help="show information about what we're doing.")
77 def srf_open(filename, cnf1=False):
79 Make a stream from srf file using srf2fastq
87 logging.info('srf command: %s' % (" ".join(cmd),))
88 p = Popen(cmd, stdout=PIPE)
92 def convert_single_to_fastq(instream, target1, header=''):
98 if state == FASTQ_HEADER:
99 write_header(target1, header, line)
100 state = FASTQ_SEQUENCE
102 elif state == FASTQ_SEQUENCE:
103 write_sequence(target1, line)
104 state = FASTQ_SEQUENCE_HEADER
106 elif state == FASTQ_SEQUENCE_HEADER:
107 # the sequence header isn't really sequence, but
108 # we're just passing it through
109 write_sequence(target1, line)
110 state = FASTQ_QUALITY
111 # sequence or quality data
112 elif state == FASTQ_QUALITY:
113 write_sequence(target1, line)
116 raise RuntimeError("Unrecognized STATE in fastq split")
120 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
122 read a fastq file where two paired ends have been run together into
125 instream is the source stream
126 target1 and target2 are the destination streams
132 for line in instream:
135 if state == FASTQ_HEADER:
136 write_header(target1, header, line, "/1")
137 write_header(target2, header, line, "/2")
138 state = FASTQ_SEQUENCE
140 elif state == FASTQ_SEQUENCE:
143 write_split_sequence(target1, target2, line, mid)
144 state = FASTQ_SEQUENCE_HEADER
146 elif state == FASTQ_SEQUENCE_HEADER:
147 # the sequence header isn't really sequence, but
148 # we're just passing it through
149 write_sequence(target1, line)
150 write_sequence(target2, line)
152 state = FASTQ_QUALITY
153 # sequence or quality data
154 elif state == FASTQ_QUALITY:
155 write_split_sequence(target1, target2, line, mid)
158 raise RuntimeError("Unrecognized STATE in fastq split")
160 def write_header(target, prefix, line, suffix=''):
163 target.write(line[1:])
165 target.write(os.linesep)
167 def write_sequence(target, line):
169 target.write(os.linesep)
171 def write_split_sequence(target1, target2, line, mid):
172 target1.write(line[:mid])
173 target1.write(os.linesep)
175 target2.write(line[mid:])
176 target2.write(os.linesep)
178 def is_srf(filename):
180 Check filename to see if it is likely to be a SRF file
182 f = open(filename, 'r')
185 return header == "SSRF"
187 def is_cnf1(filename):
189 Brute force detection if a SRF file is using CNF1/CNF4 records
191 max_header = 1024 ** 2
192 PROGRAM_ID = 'PROGRAM_ID\000'
194 if not is_srf(filename):
195 raise ValueError("%s must be a srf file" % (filename,))
197 fd = os.open(filename, os.O_RDONLY)
198 f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
199 # alas the max search length requires python 2.6+
200 program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
201 program_header_start = program_id_location+len(PROGRAM_ID)
202 next_null = f.find('\000', program_header_start) #, max_header)
203 program_id_header = f[program_header_start:next_null]
207 if program_id_header == "solexa2srf v1.4":
212 def open_write(filename, force=False):
214 Open a file, but throw an exception if it already exists
217 if os.path.exists(filename):
218 raise RuntimeError("%s exists" % (filename,))
220 return open(filename, 'w')
223 path, name = os.path.split(filename)
224 base, ext = os.path.splitext(name)
226 target1_name = base + '_r1.fastq'
227 target2_name = base + '_r2.fastq'
229 for target_name in [target1_name, target2_name]:
230 print 'target name', target_name
231 if os.path.exists(target_name):
232 raise RuntimeError("%s exists" % (target_name,))
234 instream = open(filename,'r')
235 target1 = open(target1_name,'w')
236 target2 = open(target2_name,'w')
240 if __name__ == "__main__":
241 sys.exit(main(sys.argv[1:]))