d0403f6e014ae68a7b456d1e7ef3d5a61a6dc2fa
[htsworkflow.git] / scripts / srf2named_fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10
11 # constants for our fastq finite state machine
12 FASTQ_HEADER = 0
13 FASTQ_SEQUENCE = 1
14 FASTQ_SEQUENCE_HEADER = 2
15 FASTQ_QUALITY = 3
16
17
18 def main(cmdline=None):
19     parser = make_parser()
20     opts, args = parser.parse_args(cmdline)
21
22     if len(args) != 1:
23         parser.error("Requires one argument, got: %s" % (str(args)))
24
25     if opts.verbose:
26         logging.basicConfig(level=logging.INFO)
27     else:
28         logging.basicConfig(level=logging.WARN)
29
30     if opts.flowcell is not None:
31         header = "%s_" % (opts.flowcell,)
32     else:
33         header = ''
34
35     if opts.single:
36         left = open_write(opts.single, opts.force)
37     else:
38         left = open_write(opts.left, opts.force)
39         right = open_write(opts.right, opts.force)
40     
41     # open the srf, fastq, or compressed fastq
42     if is_srf(args[0]):
43         source = srf_open(args[0])
44     else:
45         source = autoopen(args[0])
46
47     if opts.single:
48         convert_single_to_fastq(source, left, header)
49     else:
50         convert_single_to_two_fastq(source, left, right, opts.mid, header)
51    
52     return 0
53
54 def make_parser():
55     parser = OptionParser("""%prog: [options] file
56
57 file can be either a fastq file or a srf file.
58 You can also force the flowcell ID to be added to the header.""")
59
60     parser.add_option('--force', default=False, action="store_true",
61                       help="overwrite existing files.")
62     parser.add_option('--flowcell', default=None,
63                       help="add flowcell id header to sequence")
64     parser.add_option('-l','--left', default="r1.fastq",
65                       help='left side filename')
66     parser.add_option('-m','--mid', default=None, 
67                       help='actual sequence mid point')
68     parser.add_option('-r','--right', default="r2.fastq",
69                       help='right side filename')
70     parser.add_option('-s','--single', default=None,
71                       help="single fastq target name")
72     parser.add_option('-v', '--verbose', default=False, action="store_true",
73                       help="show information about what we're doing.")
74     return parser
75
76
77 def srf_open(filename, cnf1=False):
78     """
79     Make a stream from srf file using srf2fastq
80     """
81     
82     cmd = ['srf2fastq']
83     if is_cnf1(filename):
84         cmd.append('-c')
85     cmd.append(filename)
86       
87     logging.info('srf command: %s' % (" ".join(cmd),))
88     p = Popen(cmd, stdout=PIPE)
89     return p.stdout
90     
91
92 def convert_single_to_fastq(instream, target1, header=''):
93
94     state = FASTQ_HEADER
95     for line in instream:
96         line = line.strip()
97         # sequence header
98         if state == FASTQ_HEADER:
99             write_header(target1, header, line)
100             state = FASTQ_SEQUENCE
101         # sequence
102         elif state == FASTQ_SEQUENCE:
103             write_sequence(target1, line)
104             state = FASTQ_SEQUENCE_HEADER
105         # quality header
106         elif state == FASTQ_SEQUENCE_HEADER:
107             # the sequence header isn't really sequence, but 
108             # we're just passing it through
109             write_sequence(target1, line)
110             state = FASTQ_QUALITY
111         # sequence or quality data
112         elif state == FASTQ_QUALITY:
113             write_sequence(target1, line)
114             state = FASTQ_HEADER
115         else:
116             raise RuntimeError("Unrecognized STATE in fastq split")
117
118
119         
120 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
121     """
122     read a fastq file where two paired ends have been run together into 
123     two halves.
124
125     instream is the source stream
126     target1 and target2 are the destination streams
127     """
128     if mid is not None:
129         mid = int(mid)
130
131     state = FASTQ_HEADER
132     for line in instream:
133         line = line.strip()
134         # sequence header
135         if state == FASTQ_HEADER:
136             write_header(target1, header, line, "/1")
137             write_header(target2, header, line, "/2")
138             state = FASTQ_SEQUENCE
139         # sequence
140         elif state == FASTQ_SEQUENCE:
141             if mid is None:
142                 mid = len(line)/2
143             write_split_sequence(target1, target2, line, mid)
144             state = FASTQ_SEQUENCE_HEADER
145         # quality header
146         elif state == FASTQ_SEQUENCE_HEADER:
147             # the sequence header isn't really sequence, but 
148             # we're just passing it through
149             write_sequence(target1, line)
150             write_sequence(target2, line)
151
152             state = FASTQ_QUALITY
153         # sequence or quality data
154         elif state == FASTQ_QUALITY:
155             write_split_sequence(target1, target2, line, mid)
156             state = FASTQ_HEADER
157         else:
158             raise RuntimeError("Unrecognized STATE in fastq split")
159
160 def write_header(target, prefix, line, suffix=''):
161     target.write('@')
162     target.write(prefix)
163     target.write(line[1:])
164     target.write(suffix)
165     target.write(os.linesep)
166
167 def write_sequence(target, line):
168     target.write(line)
169     target.write(os.linesep)
170
171 def write_split_sequence(target1, target2, line, mid):
172     target1.write(line[:mid])
173     target1.write(os.linesep)
174
175     target2.write(line[mid:])
176     target2.write(os.linesep)
177
178 def is_srf(filename):
179     """
180     Check filename to see if it is likely to be a SRF file
181     """
182     f = open(filename, 'r')
183     header = f.read(4)
184     f.close()
185     return header == "SSRF"
186
187 def is_cnf1(filename):
188     """
189     Brute force detection if a SRF file is using CNF1/CNF4 records
190     """
191     max_header = 1024 ** 2
192     PROGRAM_ID = 'PROGRAM_ID\000'
193     cnf4_apps = set(("solexa2srf v1.4", 
194                     "illumina2srf v1.11.5.Illumina.1.3"))
195
196     if not is_srf(filename):
197         raise ValueError("%s must be a srf file" % (filename,))
198
199     fd = os.open(filename, os.O_RDONLY)
200     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
201     # alas the max search length requires python 2.6+
202     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
203     program_header_start = program_id_location+len(PROGRAM_ID)
204     next_null = f.find('\000', program_header_start) #, max_header)
205     program_id_header = f[program_header_start:next_null]
206     f.close()
207     os.close(fd)
208
209     if program_id_header in cnf4_apps:
210         return False
211     else:
212         return True
213
214 def open_write(filename, force=False):
215     """
216     Open a file, but throw an exception if it already exists
217     """
218     if not force:
219         if os.path.exists(filename):
220             raise RuntimeError("%s exists" % (filename,))
221
222     return open(filename, 'w')
223
224 def foo():
225     path, name = os.path.split(filename)
226     base, ext = os.path.splitext(name)
227
228     target1_name = base + '_r1.fastq'
229     target2_name = base + '_r2.fastq'
230
231     for target_name in [target1_name, target2_name]:
232         print 'target name', target_name
233         if os.path.exists(target_name):
234             raise RuntimeError("%s exists" % (target_name,))
235
236     instream = open(filename,'r')
237     target1 = open(target1_name,'w')
238     target2 = open(target2_name,'w')
239
240
241
242 if __name__ == "__main__":
243     sys.exit(main(sys.argv[1:]))