Update srf2named_fastq to try to detect if the srf file is CNF1 or CNF4
[htsworkflow.git] / scripts / srf2named_fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10
11 # constants for our fastq finite state machine
12 FASTQ_HEADER = 0
13 FASTQ_SEQUENCE = 1
14 FASTQ_SEQUENCE_HEADER = 2
15 FASTQ_QUALITY = 3
16
17
18 def main(cmdline=None):
19     parser = make_parser()
20     opts, args = parser.parse_args(cmdline)
21
22     if len(args) != 1:
23         parser.error("Requires one argument, got: %s" % (str(args)))
24
25     if opts.verbose:
26         logging.basicConfig(level=logging.INFO)
27     else:
28         logging.basicConfig(level=logging.WARN)
29
30     if opts.flowcell is not None:
31         header = "%s_" % (opts.flowcell,)
32     else:
33         header = ''
34
35     if opts.single:
36         left = open_write(opts.single, opts.force)
37     else:
38         left = open_write(opts.left, opts.force)
39         right = open_write(opts.right, opts.force)
40     
41     # open the srf, fastq, or compressed fastq
42     if is_srf(args[0]):
43         source = srf_open(args[0])
44     else:
45         source = autoopen(args[0])
46
47     if opts.single:
48         convert_single_to_fastq(source, left, header)
49     else:
50         convert_single_to_two_fastq(source, left, right, opts.mid, header)
51    
52     return 0
53
54 def make_parser():
55     parser = OptionParser("""%prog: [options] file
56
57 file can be either a fastq file or a srf file.
58 You can also force the flowcell ID to be added to the header.""")
59
60     parser.add_option('--force', default=False, action="store_true",
61                       help="overwrite existing files.")
62     parser.add_option('--flowcell', default=None,
63                       help="add flowcell id header to sequence")
64     parser.add_option('-l','--left', default="r1.fastq",
65                       help='left side filename')
66     parser.add_option('-r','--right', default="r2.fastq",
67                       help='right side filename')
68     parser.add_option('-m','--mid', default=None, 
69                       help='actual sequence mid point')
70     parser.add_option('-s','--single', default=None,
71                       help="single fastq target name")
72     parser.add_option('-v', '--verbose', default=False, action="store_true",
73                       help="show information about what we're doing.")
74     return parser
75
76
77 def srf_open(filename, cnf1=False):
78     """
79     Make a stream from srf file using srf2fastq
80     """
81     
82     cmd = ['srf2fastq']
83     if is_cnf1(filename):
84         cmd.append('-c')
85     cmd.append(filename)
86       
87     logging.info('srf command: %s' % (" ".join(cmd),))
88     p = Popen(cmd, stdout=PIPE)
89     return p.stdout
90     
91
92 def convert_single_to_fastq(instream, target1, header=''):
93
94     state = FASTQ_HEADER
95     for line in instream:
96         line = line.strip()
97         # sequence header
98         if state == FASTQ_HEADER:
99             write_header(target1, header, line)
100             state = FASTQ_SEQUENCE
101         # sequence
102         elif state == FASTQ_SEQUENCE:
103             write_sequence(target1, line)
104             state = FASTQ_SEQUENCE_HEADER
105         # quality header
106         elif state == FASTQ_SEQUENCE_HEADER:
107             # the sequence header isn't really sequence, but 
108             # we're just passing it through
109             write_sequence(target1, line)
110             state = FASTQ_QUALITY
111         # sequence or quality data
112         elif state == FASTQ_QUALITY:
113             write_sequence(target1, line)
114             state = FASTQ_HEADER
115         else:
116             raise RuntimeError("Unrecognized STATE in fastq split")
117
118
119         
120 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
121     """
122     read a fastq file where two paired ends have been run together into 
123     two halves.
124
125     instream is the source stream
126     target1 and target2 are the destination streams
127     """
128     if mid is not None:
129         mid = int(mid)
130
131     state = FASTQ_HEADER
132     for line in instream:
133         line = line.strip()
134         # sequence header
135         if state == FASTQ_HEADER:
136             write_header(target1, header, line, "/1")
137             write_header(target2, header, line, "/2")
138             state = FASTQ_SEQUENCE
139         # sequence
140         elif state == FASTQ_SEQUENCE:
141             if mid is None:
142                 mid = len(line)/2
143             write_split_sequence(target1, target2, line, mid)
144             state = FASTQ_SEQUENCE_HEADER
145         # quality header
146         elif state == FASTQ_SEQUENCE_HEADER:
147             # the sequence header isn't really sequence, but 
148             # we're just passing it through
149             write_sequence(target1, line)
150             write_sequence(target2, line)
151
152             state = FASTQ_QUALITY
153         # sequence or quality data
154         elif state == FASTQ_QUALITY:
155             write_split_sequence(target1, target2, line, mid)
156             state = FASTQ_HEADER
157         else:
158             raise RuntimeError("Unrecognized STATE in fastq split")
159
160 def write_header(target, prefix, line, suffix=''):
161     target.write('@')
162     target.write(prefix)
163     target.write(line[1:])
164     target.write(suffix)
165     target.write(os.linesep)
166
167 def write_sequence(target, line):
168     target.write(line)
169     target.write(os.linesep)
170
171 def write_split_sequence(target1, target2, line, mid):
172     target1.write(line[:mid])
173     target1.write(os.linesep)
174
175     target2.write(line[mid:])
176     target2.write(os.linesep)
177
178 def is_srf(filename):
179     """
180     Check filename to see if it is likely to be a SRF file
181     """
182     f = open(filename, 'r')
183     header = f.read(4)
184     f.close()
185     return header == "SSRF"
186
187 def is_cnf1(filename):
188     """
189     Brute force detection if a SRF file is using CNF1/CNF4 records
190     """
191     max_header = 1024 ** 2
192     PROGRAM_ID = 'PROGRAM_ID\000'
193
194     if not is_srf(filename):
195         raise ValueError("%s must be a srf file" % (filename,))
196
197     fd = os.open(filename, os.O_RDONLY)
198     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
199     # alas the max search length requires python 2.6+
200     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
201     program_header_start = program_id_location+len(PROGRAM_ID)
202     next_null = f.find('\000', program_header_start) #, max_header)
203     program_id_header = f[program_header_start:next_null]
204     f.close()
205     os.close(fd)
206
207     if program_id_header == "solexa2srf v1.4":
208         return False
209     else:
210         return True
211
212 def open_write(filename, force=False):
213     """
214     Open a file, but throw an exception if it already exists
215     """
216     if not force:
217         if os.path.exists(filename):
218             raise RuntimeError("%s exists" % (filename,))
219
220     return open(filename, 'w')
221
222 def foo():
223     path, name = os.path.split(filename)
224     base, ext = os.path.splitext(name)
225
226     target1_name = base + '_r1.fastq'
227     target2_name = base + '_r2.fastq'
228
229     for target_name in [target1_name, target2_name]:
230         print 'target name', target_name
231         if os.path.exists(target_name):
232             raise RuntimeError("%s exists" % (target_name,))
233
234     instream = open(filename,'r')
235     target1 = open(target1_name,'w')
236     target2 = open(target2_name,'w')
237
238
239
240 if __name__ == "__main__":
241     sys.exit(main(sys.argv[1:]))