61f93c77fe892af71ea5a130ced53e0f12a2b530
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.version import version
11
12 LOGGER = logging.getLogger(__name__)
13
14 # constants for our fastq finite state machine
15 FASTQ_HEADER = 0
16 FASTQ_SEQUENCE = 1
17 FASTQ_SEQUENCE_HEADER = 2
18 FASTQ_QUALITY = 3
19
20 def main(cmdline=None):
21     parser = make_parser()
22     opts, args = parser.parse_args(cmdline)
23
24     if opts.verbose:
25         logging.basicConfig(level=logging.INFO)
26     else:
27         logging.basicConfig(level=logging.WARN)
28
29     if opts.version:
30         print version()
31         return 0
32
33     if len(args) != 1:
34         parser.error("Requires one argument, got: %s" % (str(args)))
35
36     if opts.flowcell is not None:
37         header = "%s_" % (opts.flowcell,)
38     else:
39         header = ''
40
41     if opts.single:
42         left = open_write(opts.single, opts.force)
43     else:
44         left = open_write(opts.left, opts.force)
45         right = open_write(opts.right, opts.force)
46
47     # open the srf, fastq, or compressed fastq
48     if is_srf(args[0]):
49         source = srf_open(args[0], opts.cnf1)
50     else:
51         source = autoopen(args[0])
52
53     if opts.single:
54         convert_single_to_fastq(source, left, header)
55     else:
56         convert_single_to_two_fastq(source, left, right, opts.mid, header)
57
58     return 0
59
60 def make_parser():
61     parser = OptionParser("""%prog: [options] file
62
63 file can be either a fastq file or a srf file.
64 You can also force the flowcell ID to be added to the header.""")
65
66     parser.add_option('--force', default=False, action="store_true",
67                       help="overwrite existing files.")
68     parser.add_option('--flowcell', default=None,
69                       help="add flowcell id header to sequence")
70     parser.add_option('-l','--left', default="r1.fastq",
71                       help='left side filename')
72     parser.add_option('-m','--mid', default=None,
73                       help='actual sequence mid point')
74     parser.add_option('-r','--right', default="r2.fastq",
75                       help='right side filename')
76     parser.add_option('-s','--single', default=None,
77                       help="single fastq target name")
78     parser.add_option('-v', '--verbose', default=False, action="store_true",
79                       help="show information about what we're doing.")
80     parser.add_option('--version', default=False, action="store_true",
81                       help="Report software version")
82     parser.add_option('--cnf1', default=False, action="store_true",
83                       help="Force cnf1 mode in srf2fastq")
84     return parser
85
86
87 def srf_open(filename, cnf1=False):
88     """
89     Make a stream from srf file using srf2fastq
90     """
91     cmd = ['srf2fastq']
92     if cnf1 or is_cnf1(filename):
93         cmd.append('-c')
94     cmd.append(filename)
95
96     LOGGER.info('srf command: %s' % (" ".join(cmd),))
97     p = Popen(cmd, stdout=PIPE)
98     return p.stdout
99
100
101 def convert_single_to_fastq(instream, target1, header=''):
102
103     state = FASTQ_HEADER
104     for line in instream:
105         line = line.strip()
106         # sequence header
107         if state == FASTQ_HEADER:
108             write_header(target1, header, line)
109             state = FASTQ_SEQUENCE
110         # sequence
111         elif state == FASTQ_SEQUENCE:
112             write_sequence(target1, line)
113             state = FASTQ_SEQUENCE_HEADER
114         # quality header
115         elif state == FASTQ_SEQUENCE_HEADER:
116             # the sequence header isn't really sequence, but
117             # we're just passing it through
118             write_sequence(target1, line)
119             state = FASTQ_QUALITY
120         # sequence or quality data
121         elif state == FASTQ_QUALITY:
122             write_sequence(target1, line)
123             state = FASTQ_HEADER
124         else:
125             raise RuntimeError("Unrecognized STATE in fastq split")
126
127
128
129 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
130     """
131     read a fastq file where two paired ends have been run together into
132     two halves.
133
134     instream is the source stream
135     target1 and target2 are the destination streams
136     """
137     if mid is not None:
138         mid = int(mid)
139
140     state = FASTQ_HEADER
141     for line in instream:
142         line = line.strip()
143         # sequence header
144         if state == FASTQ_HEADER:
145             write_header(target1, header, line, "/1")
146             write_header(target2, header, line, "/2")
147             state = FASTQ_SEQUENCE
148         # sequence
149         elif state == FASTQ_SEQUENCE:
150             if mid is None:
151                 mid = len(line)/2
152             write_split_sequence(target1, target2, line, mid)
153             state = FASTQ_SEQUENCE_HEADER
154         # quality header
155         elif state == FASTQ_SEQUENCE_HEADER:
156             # the sequence header isn't really sequence, but
157             # we're just passing it through
158             write_sequence(target1, line)
159             write_sequence(target2, line)
160
161             state = FASTQ_QUALITY
162         # sequence or quality data
163         elif state == FASTQ_QUALITY:
164             write_split_sequence(target1, target2, line, mid)
165             state = FASTQ_HEADER
166         else:
167             raise RuntimeError("Unrecognized STATE in fastq split")
168
169 def write_header(target, prefix, line, suffix=''):
170     target.write('@')
171     target.write(prefix)
172     target.write(line[1:])
173     target.write(suffix)
174     target.write(os.linesep)
175
176 def write_sequence(target, line):
177     target.write(line)
178     target.write(os.linesep)
179
180 def write_split_sequence(target1, target2, line, mid):
181     target1.write(line[:mid])
182     target1.write(os.linesep)
183
184     target2.write(line[mid:])
185     target2.write(os.linesep)
186
187 def is_srf(filename):
188     """
189     Check filename to see if it is likely to be a SRF file
190     """
191     f = open(filename, 'r')
192     header = f.read(4)
193     f.close()
194     return header == "SSRF"
195
196 def is_cnf1(filename):
197     """
198     Brute force detection if a SRF file is using CNF1/CNF4 records
199     """
200     max_header = 1024 ** 2
201     PROGRAM_ID = 'PROGRAM_ID\000'
202     cnf4_apps = set(("solexa2srf v1.4",
203                     "illumina2srf v1.11.5.Illumina.1.3"))
204
205     if not is_srf(filename):
206         raise ValueError("%s must be a srf file" % (filename,))
207
208     fd = os.open(filename, os.O_RDONLY)
209     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
210     # alas the max search length requires python 2.6+
211     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
212     program_header_start = program_id_location+len(PROGRAM_ID)
213     next_null = f.find('\000', program_header_start) #, max_header)
214     program_id_header = f[program_header_start:next_null]
215     f.close()
216     os.close(fd)
217
218     if program_id_header in cnf4_apps:
219         return False
220     else:
221         return True
222
223 def open_write(filename, force=False):
224     """
225     Open a file, but throw an exception if it already exists
226     """
227     if not force:
228         if os.path.exists(filename):
229             raise RuntimeError("%s exists" % (filename,))
230
231     return open(filename, 'w')
232
233 def foo():
234     path, name = os.path.split(filename)
235     base, ext = os.path.splitext(name)
236
237     target1_name = base + '_r1.fastq'
238     target2_name = base + '_r2.fastq'
239
240     for target_name in [target1_name, target2_name]:
241         print 'target name', target_name
242         if os.path.exists(target_name):
243             raise RuntimeError("%s exists" % (target_name,))
244
245     instream = open(filename,'r')
246     target1 = open(target1_name,'w')
247     target2 = open(target2_name,'w')
248
249
250 if __name__ == "__main__":
251     sys.exit(main(sys.argv[1:]))