c73bb2cc3f93953de73210a751dfd9f463c8b832
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.version import version
11
12 LOGGER = logging.getLogger(__name__)
13
14 # constants for our fastq finite state machine
15 FASTQ_HEADER = 0
16 FASTQ_SEQUENCE = 1
17 FASTQ_SEQUENCE_HEADER = 2
18 FASTQ_QUALITY = 3
19
20 def main(cmdline=None):
21     parser = make_parser()
22     opts, args = parser.parse_args(cmdline)
23
24     if opts.verbose:
25         logging.basicConfig(level=logging.INFO)
26     else:
27         logging.basicConfig(level=logging.WARN)
28
29     if opts.version:
30         print version()
31         return 0
32
33     if len(args) != 1:
34         parser.error("Requires one argument, got: %s" % (str(args)))
35
36     if opts.flowcell is not None:
37         header = "%s_" % (opts.flowcell,)
38     else:
39         header = ''
40
41     if opts.single:
42         left = open_write(opts.single, opts.force)
43     else:
44         left = open_write(opts.left, opts.force)
45         right = open_write(opts.right, opts.force)
46
47     # open the srf, fastq, or compressed fastq
48     if is_srf(args[0]):
49         source = srf_open(args[0], opts.srf2fastq, opts.cnf1)
50     else:
51         source = autoopen(args[0])
52
53     if opts.single:
54         convert_single_to_fastq(source, left, header)
55     else:
56         convert_single_to_two_fastq(source, left, right, opts.mid, header)
57
58     return 0
59
60 def make_parser():
61     parser = OptionParser("""%prog: [options] file
62
63 file can be either a fastq file or a srf file.
64 You can also force the flowcell ID to be added to the header.""")
65
66     parser.add_option('--force', default=False, action="store_true",
67                       help="overwrite existing files.")
68     parser.add_option('--flowcell', default=None,
69                       help="add flowcell id header to sequence")
70     parser.add_option('-l','--left', default="r1.fastq",
71                       help='left side filename')
72     parser.add_option('-m','--mid', default=None,
73                       help='actual sequence mid point')
74     parser.add_option('-r','--right', default="r2.fastq",
75                       help='right side filename')
76     parser.add_option('-s','--single', default=None,
77                       help="single fastq target name")
78     parser.add_option('-v', '--verbose', default=False, action="store_true",
79                       help="show information about what we're doing.")
80     parser.add_option('--version', default=False, action="store_true",
81                       help="Report software version")
82     parser.add_option('--cnf1', default=False, action="store_true",
83                       help="Force cnf1 mode in srf2fastq")
84     parser.add_option('--srf2fastq', default='srf2fastq',
85                       help='specify srf2fastq command')
86     return parser
87
88
89 def srf_open(filename, srf2fastq_cmd, cnf1=False):
90     """
91     Make a stream from srf file using srf2fastq
92     """
93     if not os.path.exists(srf2fastq_cmd):
94         LOGGER.error("srf command: %s doesn't exist" % (srf2fastq_cmd,))
95     cmd = [srf2fastq_cmd]
96     if cnf1 or is_cnf1(filename):
97         cmd.append('-c')
98     cmd.append(filename)
99
100     LOGGER.info('srf command: %s' % (" ".join(cmd),))
101     p = Popen(cmd, stdout=PIPE)
102     return p.stdout
103
104
105 def convert_single_to_fastq(instream, target1, header=''):
106
107     state = FASTQ_HEADER
108     for line in instream:
109         line = line.strip()
110         # sequence header
111         if state == FASTQ_HEADER:
112             write_header(target1, header, line)
113             state = FASTQ_SEQUENCE
114         # sequence
115         elif state == FASTQ_SEQUENCE:
116             write_sequence(target1, line)
117             state = FASTQ_SEQUENCE_HEADER
118         # quality header
119         elif state == FASTQ_SEQUENCE_HEADER:
120             # the sequence header isn't really sequence, but
121             # we're just passing it through
122             write_sequence(target1, line)
123             state = FASTQ_QUALITY
124         # sequence or quality data
125         elif state == FASTQ_QUALITY:
126             write_sequence(target1, line)
127             state = FASTQ_HEADER
128         else:
129             raise RuntimeError("Unrecognized STATE in fastq split")
130
131
132
133 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
134     """
135     read a fastq file where two paired ends have been run together into
136     two halves.
137
138     instream is the source stream
139     target1 and target2 are the destination streams
140     """
141     if mid is not None:
142         mid = int(mid)
143
144     state = FASTQ_HEADER
145     for line in instream:
146         line = line.strip()
147         # sequence header
148         if state == FASTQ_HEADER:
149             write_header(target1, header, line, "/1")
150             write_header(target2, header, line, "/2")
151             state = FASTQ_SEQUENCE
152         # sequence
153         elif state == FASTQ_SEQUENCE:
154             if mid is None:
155                 mid = len(line)/2
156             write_split_sequence(target1, target2, line, mid)
157             state = FASTQ_SEQUENCE_HEADER
158         # quality header
159         elif state == FASTQ_SEQUENCE_HEADER:
160             # the sequence header isn't really sequence, but
161             # we're just passing it through
162             write_sequence(target1, line)
163             write_sequence(target2, line)
164
165             state = FASTQ_QUALITY
166         # sequence or quality data
167         elif state == FASTQ_QUALITY:
168             write_split_sequence(target1, target2, line, mid)
169             state = FASTQ_HEADER
170         else:
171             raise RuntimeError("Unrecognized STATE in fastq split")
172
173 def write_header(target, prefix, line, suffix=''):
174     target.write('@')
175     target.write(prefix)
176     target.write(line[1:])
177     target.write(suffix)
178     target.write(os.linesep)
179
180 def write_sequence(target, line):
181     target.write(line)
182     target.write(os.linesep)
183
184 def write_split_sequence(target1, target2, line, mid):
185     target1.write(line[:mid])
186     target1.write(os.linesep)
187
188     target2.write(line[mid:])
189     target2.write(os.linesep)
190
191 def is_srf(filename):
192     """
193     Check filename to see if it is likely to be a SRF file
194     """
195     f = open(filename, 'r')
196     header = f.read(4)
197     f.close()
198     return header == "SSRF"
199
200 def is_cnf1(filename):
201     """
202     Brute force detection if a SRF file is using CNF1/CNF4 records
203     """
204     max_header = 1024 ** 2
205     PROGRAM_ID = 'PROGRAM_ID\000'
206     cnf4_apps = set(("solexa2srf v1.4",
207                     "illumina2srf v1.11.5.Illumina.1.3"))
208
209     if not is_srf(filename):
210         raise ValueError("%s must be a srf file" % (filename,))
211
212     fd = os.open(filename, os.O_RDONLY)
213     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
214     # alas the max search length requires python 2.6+
215     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
216     program_header_start = program_id_location+len(PROGRAM_ID)
217     next_null = f.find('\000', program_header_start) #, max_header)
218     program_id_header = f[program_header_start:next_null]
219     f.close()
220     os.close(fd)
221
222     if program_id_header in cnf4_apps:
223         return False
224     else:
225         return True
226
227 def open_write(filename, force=False):
228     """
229     Open a file, but throw an exception if it already exists
230     """
231     if not force:
232         if os.path.exists(filename):
233             raise RuntimeError("%s exists" % (filename,))
234
235     return open(filename, 'w')
236
237 def foo():
238     path, name = os.path.split(filename)
239     base, ext = os.path.splitext(name)
240
241     target1_name = base + '_r1.fastq'
242     target2_name = base + '_r2.fastq'
243
244     for target_name in [target1_name, target2_name]:
245         print 'target name', target_name
246         if os.path.exists(target_name):
247             raise RuntimeError("%s exists" % (target_name,))
248
249     instream = open(filename,'r')
250     target1 = open(target1_name,'w')
251     target2 = open(target2_name,'w')
252
253
254 if __name__ == "__main__":
255     sys.exit(main(sys.argv[1:]))