update srf2fastq detection code to work with byte arrays
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 from __future__ import print_function, unicode_literals
3
4 import logging
5 import mmap
6 from optparse import OptionParser
7 import os
8 from subprocess import Popen, PIPE
9 import sys
10
11 from htsworkflow.util.opener import autoopen
12 from htsworkflow.util.version import version
13
14 LOGGER = logging.getLogger(__name__)
15
16 # constants for our fastq finite state machine
17 FASTQ_HEADER = 0
18 FASTQ_SEQUENCE = 1
19 FASTQ_SEQUENCE_HEADER = 2
20 FASTQ_QUALITY = 3
21
22 def main(cmdline=None):
23     parser = make_parser()
24     opts, args = parser.parse_args(cmdline)
25
26     if opts.verbose:
27         logging.basicConfig(level=logging.INFO)
28     else:
29         logging.basicConfig(level=logging.WARN)
30
31     if opts.version:
32         print(version())
33         return 0
34
35     if len(args) != 1:
36         parser.error("Requires one argument, got: %s" % (str(args)))
37
38     if opts.flowcell is not None:
39         header = "%s_" % (opts.flowcell,)
40     else:
41         header = ''
42
43     if opts.single:
44         left = open_write(opts.single, opts.force)
45     else:
46         left = open_write(opts.left, opts.force)
47         right = open_write(opts.right, opts.force)
48
49     # open the srf, fastq, or compressed fastq
50     if is_srf(args[0]):
51         source = srf_open(args[0], opts.srf2fastq, opts.cnf1)
52     else:
53         source = autoopen(args[0])
54
55     if opts.single:
56         convert_single_to_fastq(source, left, header)
57     else:
58         convert_single_to_two_fastq(source, left, right, opts.mid, header)
59
60     return 0
61
62 def make_parser():
63     parser = OptionParser("""%prog: [options] file
64
65 file can be either a fastq file or a srf file.
66 You can also force the flowcell ID to be added to the header.""")
67
68     parser.add_option('--force', default=False, action="store_true",
69                       help="overwrite existing files.")
70     parser.add_option('--flowcell', default=None,
71                       help="add flowcell id header to sequence")
72     parser.add_option('-l','--left', default="r1.fastq",
73                       help='left side filename')
74     parser.add_option('-m','--mid', default=None,
75                       help='actual sequence mid point')
76     parser.add_option('-r','--right', default="r2.fastq",
77                       help='right side filename')
78     parser.add_option('-s','--single', default=None,
79                       help="single fastq target name")
80     parser.add_option('-v', '--verbose', default=False, action="store_true",
81                       help="show information about what we're doing.")
82     parser.add_option('--version', default=False, action="store_true",
83                       help="Report software version")
84     parser.add_option('--cnf1', default=False, action="store_true",
85                       help="Force cnf1 mode in srf2fastq")
86     parser.add_option('--srf2fastq', default='srf2fastq',
87                       help='specify srf2fastq command')
88     return parser
89
90
91 def srf_open(filename, srf2fastq_cmd, cnf1=False):
92     """
93     Make a stream from srf file using srf2fastq
94     """
95     if not os.path.exists(srf2fastq_cmd):
96         LOGGER.error("srf command: %s doesn't exist" % (srf2fastq_cmd,))
97     cmd = [srf2fastq_cmd]
98     if cnf1 or is_cnf1(filename):
99         cmd.append('-c')
100     cmd.append(filename)
101
102     LOGGER.info('srf command: %s' % (" ".join(cmd),))
103     p = Popen(cmd, stdout=PIPE)
104     return p.stdout
105
106
107 def convert_single_to_fastq(instream, target1, header=''):
108     state = FASTQ_HEADER
109     for line in instream:
110         line = line.strip()
111         # sequence header
112         if state == FASTQ_HEADER:
113             write_header(target1, header, line)
114             state = FASTQ_SEQUENCE
115         # sequence
116         elif state == FASTQ_SEQUENCE:
117             write_sequence(target1, line)
118             state = FASTQ_SEQUENCE_HEADER
119         # quality header
120         elif state == FASTQ_SEQUENCE_HEADER:
121             # the sequence header isn't really sequence, but
122             # we're just passing it through
123             write_sequence(target1, line)
124             state = FASTQ_QUALITY
125         # sequence or quality data
126         elif state == FASTQ_QUALITY:
127             write_sequence(target1, line)
128             state = FASTQ_HEADER
129         else:
130             raise RuntimeError("Unrecognized STATE in fastq split")
131
132
133
134 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
135     """
136     read a fastq file where two paired ends have been run together into
137     two halves.
138
139     instream is the source stream
140     target1 and target2 are the destination streams
141     """
142     if mid is not None:
143         mid = int(mid)
144
145     state = FASTQ_HEADER
146     for line in instream:
147         line = line.strip()
148         # sequence header
149         if state == FASTQ_HEADER:
150             write_header(target1, header, line, "/1")
151             write_header(target2, header, line, "/2")
152             state = FASTQ_SEQUENCE
153         # sequence
154         elif state == FASTQ_SEQUENCE:
155             if mid is None:
156                 mid = len(line) // 2
157             write_split_sequence(target1, target2, line, mid)
158             state = FASTQ_SEQUENCE_HEADER
159         # quality header
160         elif state == FASTQ_SEQUENCE_HEADER:
161             # the sequence header isn't really sequence, but
162             # we're just passing it through
163             write_sequence(target1, line)
164             write_sequence(target2, line)
165
166             state = FASTQ_QUALITY
167         # sequence or quality data
168         elif state == FASTQ_QUALITY:
169             write_split_sequence(target1, target2, line, mid)
170             state = FASTQ_HEADER
171         else:
172             raise RuntimeError("Unrecognized STATE in fastq split")
173
174 def write_header(target, prefix, line, suffix=''):
175     target.write('@')
176     target.write(prefix)
177     target.write(line[1:])
178     target.write(suffix)
179     target.write(os.linesep)
180
181 def write_sequence(target, line):
182     target.write(line)
183     target.write(os.linesep)
184
185 def write_split_sequence(target1, target2, line, mid):
186     target1.write(line[:mid])
187     target1.write(os.linesep)
188
189     target2.write(line[mid:])
190     target2.write(os.linesep)
191
192 def is_srf(filename):
193     """
194     Check filename to see if it is likely to be a SRF file
195     """
196     f = open(filename, 'rb')
197     header = f.read(4)
198     f.close()
199     return header == b"SSRF"
200
201 def is_cnf1(filename):
202     """
203     Brute force detection if a SRF file is using CNF1/CNF4 records
204     """
205     max_header = 1024 ** 2
206     PROGRAM_ID = b'PROGRAM_ID\000'
207     cnf4_apps = set((b"solexa2srf v1.4",
208                      b"illumina2srf v1.11.5.Illumina.1.3"))
209
210     if not is_srf(filename):
211         raise ValueError("%s must be a srf file" % (filename,))
212
213     fd = os.open(filename, os.O_RDONLY)
214     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
215     # alas the max search length requires python 2.6+
216     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
217     program_header_start = program_id_location+len(PROGRAM_ID)
218     next_null = f.find(b'\000', program_header_start) #, max_header)
219     program_id_header = f[program_header_start:next_null]
220     f.close()
221     os.close(fd)
222
223     if program_id_header in cnf4_apps:
224         return False
225     else:
226         return True
227
228 def open_write(filename, force=False):
229     """
230     Open a file, but throw an exception if it already exists
231     """
232     if not force:
233         if os.path.exists(filename):
234             raise RuntimeError("%s exists" % (filename,))
235
236     return open(filename, 'w')
237
238 def foo():
239     path, name = os.path.split(filename)
240     base, ext = os.path.splitext(name)
241
242     target1_name = base + '_r1.fastq'
243     target2_name = base + '_r2.fastq'
244
245     for target_name in [target1_name, target2_name]:
246         print('target name', target_name)
247         if os.path.exists(target_name):
248             raise RuntimeError("%s exists" % (target_name,))
249
250     instream = open(filename,'r')
251     target1 = open(target1_name,'w')
252     target2 = open(target2_name,'w')
253
254
255 if __name__ == "__main__":
256     sys.exit(main(sys.argv[1:]))