4 from optparse import OptionParser
6 from subprocess import Popen, PIPE
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.version import version
12 LOGGER = logging.getLogger(__name__)
14 # constants for our fastq finite state machine
17 FASTQ_SEQUENCE_HEADER = 2
20 def main(cmdline=None):
21 parser = make_parser()
22 opts, args = parser.parse_args(cmdline)
25 logging.basicConfig(level=logging.INFO)
27 logging.basicConfig(level=logging.WARN)
34 parser.error("Requires one argument, got: %s" % (str(args)))
36 if opts.flowcell is not None:
37 header = "%s_" % (opts.flowcell,)
42 left = open_write(opts.single, opts.force)
44 left = open_write(opts.left, opts.force)
45 right = open_write(opts.right, opts.force)
47 # open the srf, fastq, or compressed fastq
49 source = srf_open(args[0], opts.cnf1)
51 source = autoopen(args[0])
54 convert_single_to_fastq(source, left, header)
56 convert_single_to_two_fastq(source, left, right, opts.mid, header)
61 parser = OptionParser("""%prog: [options] file
63 file can be either a fastq file or a srf file.
64 You can also force the flowcell ID to be added to the header.""")
66 parser.add_option('--force', default=False, action="store_true",
67 help="overwrite existing files.")
68 parser.add_option('--flowcell', default=None,
69 help="add flowcell id header to sequence")
70 parser.add_option('-l','--left', default="r1.fastq",
71 help='left side filename')
72 parser.add_option('-m','--mid', default=None,
73 help='actual sequence mid point')
74 parser.add_option('-r','--right', default="r2.fastq",
75 help='right side filename')
76 parser.add_option('-s','--single', default=None,
77 help="single fastq target name")
78 parser.add_option('-v', '--verbose', default=False, action="store_true",
79 help="show information about what we're doing.")
80 parser.add_option('--version', default=False, action="store_true",
81 help="Report software version")
82 parser.add_option('--cnf1', default=False, action="store_true",
83 help="Force cnf1 mode in srf2fastq")
87 def srf_open(filename, cnf1=False):
89 Make a stream from srf file using srf2fastq
92 if cnf1 or is_cnf1(filename):
96 LOGGER.info('srf command: %s' % (" ".join(cmd),))
97 p = Popen(cmd, stdout=PIPE)
101 def convert_single_to_fastq(instream, target1, header=''):
104 for line in instream:
107 if state == FASTQ_HEADER:
108 write_header(target1, header, line)
109 state = FASTQ_SEQUENCE
111 elif state == FASTQ_SEQUENCE:
112 write_sequence(target1, line)
113 state = FASTQ_SEQUENCE_HEADER
115 elif state == FASTQ_SEQUENCE_HEADER:
116 # the sequence header isn't really sequence, but
117 # we're just passing it through
118 write_sequence(target1, line)
119 state = FASTQ_QUALITY
120 # sequence or quality data
121 elif state == FASTQ_QUALITY:
122 write_sequence(target1, line)
125 raise RuntimeError("Unrecognized STATE in fastq split")
129 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
131 read a fastq file where two paired ends have been run together into
134 instream is the source stream
135 target1 and target2 are the destination streams
141 for line in instream:
144 if state == FASTQ_HEADER:
145 write_header(target1, header, line, "/1")
146 write_header(target2, header, line, "/2")
147 state = FASTQ_SEQUENCE
149 elif state == FASTQ_SEQUENCE:
152 write_split_sequence(target1, target2, line, mid)
153 state = FASTQ_SEQUENCE_HEADER
155 elif state == FASTQ_SEQUENCE_HEADER:
156 # the sequence header isn't really sequence, but
157 # we're just passing it through
158 write_sequence(target1, line)
159 write_sequence(target2, line)
161 state = FASTQ_QUALITY
162 # sequence or quality data
163 elif state == FASTQ_QUALITY:
164 write_split_sequence(target1, target2, line, mid)
167 raise RuntimeError("Unrecognized STATE in fastq split")
169 def write_header(target, prefix, line, suffix=''):
172 target.write(line[1:])
174 target.write(os.linesep)
176 def write_sequence(target, line):
178 target.write(os.linesep)
180 def write_split_sequence(target1, target2, line, mid):
181 target1.write(line[:mid])
182 target1.write(os.linesep)
184 target2.write(line[mid:])
185 target2.write(os.linesep)
187 def is_srf(filename):
189 Check filename to see if it is likely to be a SRF file
191 f = open(filename, 'r')
194 return header == "SSRF"
196 def is_cnf1(filename):
198 Brute force detection if a SRF file is using CNF1/CNF4 records
200 max_header = 1024 ** 2
201 PROGRAM_ID = 'PROGRAM_ID\000'
202 cnf4_apps = set(("solexa2srf v1.4",
203 "illumina2srf v1.11.5.Illumina.1.3"))
205 if not is_srf(filename):
206 raise ValueError("%s must be a srf file" % (filename,))
208 fd = os.open(filename, os.O_RDONLY)
209 f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
210 # alas the max search length requires python 2.6+
211 program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
212 program_header_start = program_id_location+len(PROGRAM_ID)
213 next_null = f.find('\000', program_header_start) #, max_header)
214 program_id_header = f[program_header_start:next_null]
218 if program_id_header in cnf4_apps:
223 def open_write(filename, force=False):
225 Open a file, but throw an exception if it already exists
228 if os.path.exists(filename):
229 raise RuntimeError("%s exists" % (filename,))
231 return open(filename, 'w')
234 path, name = os.path.split(filename)
235 base, ext = os.path.splitext(name)
237 target1_name = base + '_r1.fastq'
238 target2_name = base + '_r2.fastq'
240 for target_name in [target1_name, target2_name]:
241 print 'target name', target_name
242 if os.path.exists(target_name):
243 raise RuntimeError("%s exists" % (target_name,))
245 instream = open(filename,'r')
246 target1 = open(target1_name,'w')
247 target2 = open(target2_name,'w')
250 if __name__ == "__main__":
251 sys.exit(main(sys.argv[1:]))