3 from optparse import OptionParser
5 from subprocess import Popen, PIPE
8 from htsworkflow.util.opener import autoopen
10 # constants for our fastq finite state machine
13 FASTQ_SEQUENCE_HEADER = 2
17 def main(cmdline=None):
18 parser = make_parser()
19 opts, args = parser.parse_args(cmdline)
22 parser.error("Requires one argument")
25 logging.basicConfig(level=logging.INFO)
27 logging.basicConfig(level=logging.WARN)
29 if opts.flowcell is not None:
30 header = "%s_" % (opts.flowcell,)
35 left = open_write(opts.single, opts.force)
37 left = open_write(opts.left, opts.force)
38 right = open_write(opts.right, opts.force)
40 # open the srf, fastq, or compressed fastq
42 source = srf_open(args[0], opts.cnf1)
44 source = autoopen(args[0])
47 convert_single_to_fastq(source, left, header)
49 convert_single_to_two_fastq(source, left, right, opts.mid, header)
54 parser = OptionParser("""%prog: [options] file
56 file can be either a fastq file or a srf file.
57 You can also force the flowcell ID to be added to the header.""")
58 parser.add_option('-c','--cnf1',default=False, action="store_true",
59 help="pass -c to srf2fastq, needed for calibrated quality values"
61 parser.add_option('--force', default=False, action="store_true",
62 help="overwrite existing files.")
63 parser.add_option('--flowcell', default=None,
64 help="add flowcell id header to sequence")
65 parser.add_option('-l','--left', default="r1.fastq",
66 help='left side filename')
67 parser.add_option('-r','--right', default="r2.fastq",
68 help='right side filename')
69 parser.add_option('-m','--mid', default=None,
70 help='actual sequence mid point')
71 parser.add_option('-s','--single', default=None,
72 help="single fastq target name")
73 parser.add_option('-v', '--verbose', default=False, action="store_true",
74 help="show information about what we're doing.")
78 def srf_open(filename, cnf1=False):
80 Make a stream from srf file using srf2fastq
88 logging.info('srf command: %s' % (" ".join(cmd),))
89 p = Popen(cmd, stdout=PIPE)
93 def convert_single_to_fastq(instream, target1, header=''):
99 if state == FASTQ_HEADER:
100 write_header(target1, header, line)
101 state = FASTQ_SEQUENCE
103 elif state == FASTQ_SEQUENCE:
104 write_sequence(target1, line)
105 state = FASTQ_SEQUENCE_HEADER
107 elif state == FASTQ_SEQUENCE_HEADER:
108 # the sequence header isn't really sequence, but
109 # we're just passing it through
110 write_sequence(target1, line)
111 state = FASTQ_QUALITY
112 # sequence or quality data
113 elif state == FASTQ_QUALITY:
114 write_sequence(target1, line)
117 raise RuntimeError("Unrecognized STATE in fastq split")
121 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
123 read a fastq file where two paired ends have been run together into
126 instream is the source stream
127 target1 and target2 are the destination streams
133 for line in instream:
136 if state == FASTQ_HEADER:
137 write_header(target1, header, line, "/1")
138 write_header(target2, header, line, "/2")
139 state = FASTQ_SEQUENCE
141 elif state == FASTQ_SEQUENCE:
144 write_split_sequence(target1, target2, line, mid)
145 state = FASTQ_SEQUENCE_HEADER
147 elif state == FASTQ_SEQUENCE_HEADER:
148 # the sequence header isn't really sequence, but
149 # we're just passing it through
150 write_sequence(target1, line)
151 write_sequence(target2, line)
153 state = FASTQ_QUALITY
154 # sequence or quality data
155 elif state == FASTQ_QUALITY:
156 write_split_sequence(target1, target2, line, mid)
159 raise RuntimeError("Unrecognized STATE in fastq split")
161 def write_header(target, prefix, line, suffix=''):
164 target.write(line[1:])
166 target.write(os.linesep)
168 def write_sequence(target, line):
170 target.write(os.linesep)
172 def write_split_sequence(target1, target2, line, mid):
173 target1.write(line[:mid])
174 target1.write(os.linesep)
176 target2.write(line[mid:])
177 target2.write(os.linesep)
179 def is_srf(filename):
181 Check filename to see if it is likely to be a SRF file
183 f = open(filename, 'r')
186 return header == "SSRF"
188 def open_write(filename, force=False):
190 Open a file, but throw an exception if it already exists
193 if os.path.exists(filename):
194 raise RuntimeError("%s exists" % (filename,))
196 return open(filename, 'w')
199 path, name = os.path.split(filename)
200 base, ext = os.path.splitext(name)
202 target1_name = base + '_r1.fastq'
203 target2_name = base + '_r2.fastq'
205 for target_name in [target1_name, target2_name]:
206 print 'target name', target_name
207 if os.path.exists(target_name):
208 raise RuntimeError("%s exists" % (target_name,))
210 instream = open(filename,'r')
211 target1 = open(target1_name,'w')
212 target2 = open(target2_name,'w')
216 if __name__ == "__main__":
217 sys.exit(main(sys.argv[1:]))