Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.util.version import version
11
12 LOGGER = logging.getLogger(__name__)
13
14 # constants for our fastq finite state machine
15 FASTQ_HEADER = 0
16 FASTQ_SEQUENCE = 1
17 FASTQ_SEQUENCE_HEADER = 2
18 FASTQ_QUALITY = 3
19
20 def main(cmdline=None):
21     parser = make_parser()
22     opts, args = parser.parse_args(cmdline)
23
24     if opts.verbose:
25         logging.basicConfig(level=logging.INFO)
26     else:
27         logging.basicConfig(level=logging.WARN)
28
29     if opts.version:
30         print(version())
31         return 0
32
33     if len(args) != 1:
34         parser.error("Requires one argument, got: %s" % (str(args)))
35
36     if opts.flowcell is not None:
37         header = "%s_" % (opts.flowcell,)
38     else:
39         header = ''
40
41     if opts.single:
42         left = open_write(opts.single, opts.force)
43     else:
44         left = open_write(opts.left, opts.force)
45         right = open_write(opts.right, opts.force)
46
47     # open the srf, fastq, or compressed fastq
48     if is_srf(args[0]):
49         source = srf_open(args[0], opts.srf2fastq, opts.cnf1)
50     else:
51         source = autoopen(args[0])
52
53     if opts.single:
54         convert_single_to_fastq(source, left, header)
55     else:
56         convert_single_to_two_fastq(source, left, right, opts.mid, header)
57
58     return 0
59
60 def make_parser():
61     parser = OptionParser("""%prog: [options] file
62
63 file can be either a fastq file or a srf file.
64 You can also force the flowcell ID to be added to the header.""")
65
66     parser.add_option('--force', default=False, action="store_true",
67                       help="overwrite existing files.")
68     parser.add_option('--flowcell', default=None,
69                       help="add flowcell id header to sequence")
70     parser.add_option('-l','--left', default="r1.fastq",
71                       help='left side filename')
72     parser.add_option('-m','--mid', default=None,
73                       help='actual sequence mid point')
74     parser.add_option('-r','--right', default="r2.fastq",
75                       help='right side filename')
76     parser.add_option('-s','--single', default=None,
77                       help="single fastq target name")
78     parser.add_option('-v', '--verbose', default=False, action="store_true",
79                       help="show information about what we're doing.")
80     parser.add_option('--version', default=False, action="store_true",
81                       help="Report software version")
82     parser.add_option('--cnf1', default=False, action="store_true",
83                       help="Force cnf1 mode in srf2fastq")
84     parser.add_option('--srf2fastq', default='srf2fastq',
85                       help='specify srf2fastq command')
86     return parser
87
88
89 def srf_open(filename, srf2fastq_cmd, cnf1=False):
90     """
91     Make a stream from srf file using srf2fastq
92     """
93     if not os.path.exists(srf2fastq_cmd):
94         LOGGER.error("srf command: %s doesn't exist" % (srf2fastq_cmd,))
95     cmd = [srf2fastq_cmd]
96     if cnf1 or is_cnf1(filename):
97         cmd.append('-c')
98     cmd.append(filename)
99
100     LOGGER.info('srf command: %s' % (" ".join(cmd),))
101     p = Popen(cmd, stdout=PIPE)
102     return p.stdout
103
104
105 def convert_single_to_fastq(instream, target1, header=''):
106     state = FASTQ_HEADER
107     for line in instream:
108         line = line.strip()
109         # sequence header
110         if state == FASTQ_HEADER:
111             write_header(target1, header, line)
112             state = FASTQ_SEQUENCE
113         # sequence
114         elif state == FASTQ_SEQUENCE:
115             write_sequence(target1, line)
116             state = FASTQ_SEQUENCE_HEADER
117         # quality header
118         elif state == FASTQ_SEQUENCE_HEADER:
119             # the sequence header isn't really sequence, but
120             # we're just passing it through
121             write_sequence(target1, line)
122             state = FASTQ_QUALITY
123         # sequence or quality data
124         elif state == FASTQ_QUALITY:
125             write_sequence(target1, line)
126             state = FASTQ_HEADER
127         else:
128             raise RuntimeError("Unrecognized STATE in fastq split")
129
130
131
132 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
133     """
134     read a fastq file where two paired ends have been run together into
135     two halves.
136
137     instream is the source stream
138     target1 and target2 are the destination streams
139     """
140     if mid is not None:
141         mid = int(mid)
142
143     state = FASTQ_HEADER
144     for line in instream:
145         line = line.strip()
146         # sequence header
147         if state == FASTQ_HEADER:
148             write_header(target1, header, line, "/1")
149             write_header(target2, header, line, "/2")
150             state = FASTQ_SEQUENCE
151         # sequence
152         elif state == FASTQ_SEQUENCE:
153             if mid is None:
154                 mid = len(line)/2
155             write_split_sequence(target1, target2, line, mid)
156             state = FASTQ_SEQUENCE_HEADER
157         # quality header
158         elif state == FASTQ_SEQUENCE_HEADER:
159             # the sequence header isn't really sequence, but
160             # we're just passing it through
161             write_sequence(target1, line)
162             write_sequence(target2, line)
163
164             state = FASTQ_QUALITY
165         # sequence or quality data
166         elif state == FASTQ_QUALITY:
167             write_split_sequence(target1, target2, line, mid)
168             state = FASTQ_HEADER
169         else:
170             raise RuntimeError("Unrecognized STATE in fastq split")
171
172 def write_header(target, prefix, line, suffix=''):
173     target.write('@')
174     target.write(prefix)
175     target.write(line[1:])
176     target.write(suffix)
177     target.write(os.linesep)
178
179 def write_sequence(target, line):
180     target.write(line)
181     target.write(os.linesep)
182
183 def write_split_sequence(target1, target2, line, mid):
184     target1.write(line[:mid])
185     target1.write(os.linesep)
186
187     target2.write(line[mid:])
188     target2.write(os.linesep)
189
190 def is_srf(filename):
191     """
192     Check filename to see if it is likely to be a SRF file
193     """
194     f = open(filename, 'r')
195     header = f.read(4)
196     f.close()
197     return header == "SSRF"
198
199 def is_cnf1(filename):
200     """
201     Brute force detection if a SRF file is using CNF1/CNF4 records
202     """
203     max_header = 1024 ** 2
204     PROGRAM_ID = 'PROGRAM_ID\000'
205     cnf4_apps = set(("solexa2srf v1.4",
206                     "illumina2srf v1.11.5.Illumina.1.3"))
207
208     if not is_srf(filename):
209         raise ValueError("%s must be a srf file" % (filename,))
210
211     fd = os.open(filename, os.O_RDONLY)
212     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
213     # alas the max search length requires python 2.6+
214     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
215     program_header_start = program_id_location+len(PROGRAM_ID)
216     next_null = f.find('\000', program_header_start) #, max_header)
217     program_id_header = f[program_header_start:next_null]
218     f.close()
219     os.close(fd)
220
221     if program_id_header in cnf4_apps:
222         return False
223     else:
224         return True
225
226 def open_write(filename, force=False):
227     """
228     Open a file, but throw an exception if it already exists
229     """
230     if not force:
231         if os.path.exists(filename):
232             raise RuntimeError("%s exists" % (filename,))
233
234     return open(filename, 'w')
235
236 def foo():
237     path, name = os.path.split(filename)
238     base, ext = os.path.splitext(name)
239
240     target1_name = base + '_r1.fastq'
241     target2_name = base + '_r2.fastq'
242
243     for target_name in [target1_name, target2_name]:
244         print('target name', target_name)
245         if os.path.exists(target_name):
246             raise RuntimeError("%s exists" % (target_name,))
247
248     instream = open(filename,'r')
249     target1 = open(target1_name,'w')
250     target2 = open(target2_name,'w')
251
252
253 if __name__ == "__main__":
254     sys.exit(main(sys.argv[1:]))