ebf33232a1c126a0ff1f7d0e7e3ce3d3bb1179fa
[htsworkflow.git] / scripts / srf2named_fastq.py
1 #!/usr/bin/env python
2 import logging
3 from optparse import OptionParser
4 import os
5 from subprocess import Popen, PIPE
6 import sys
7
8 from htsworkflow.util.opener import autoopen
9
10 # constants for our fastq finite state machine
11 FASTQ_HEADER = 0
12 FASTQ_SEQUENCE = 1
13 FASTQ_SEQUENCE_HEADER = 2
14 FASTQ_QUALITY = 3
15
16
17 def main(cmdline=None):
18     parser = make_parser()
19     opts, args = parser.parse_args(cmdline)
20
21     if len(args) != 1:
22         parser.error("Requires one argument")
23
24     if opts.verbose:
25         logging.basicConfig(level=logging.INFO)
26     else:
27         logging.basicConfig(level=logging.WARN)
28
29     if opts.flowcell is not None:
30         header = "%s_" % (opts.flowcell,)
31     else:
32         header = ''
33
34     if opts.single:
35         left = open_write(opts.single, opts.force)
36     else:
37         left = open_write(opts.left, opts.force)
38         right = open_write(opts.right, opts.force)
39     
40     # open the srf, fastq, or compressed fastq
41     if is_srf(args[0]):
42         source = srf_open(args[0], opts.cnf1)
43     else:
44         source = autoopen(args[0])
45
46     if opts.single:
47         convert_single_to_fastq(source, left, header)
48     else:
49         convert_single_to_two_fastq(source, left, right, opts.mid, header)
50    
51     return 0
52
53 def make_parser():
54     parser = OptionParser("""%prog: [options] file
55
56 file can be either a fastq file or a srf file.
57 You can also force the flowcell ID to be added to the header.""")
58     parser.add_option('-c','--cnf1',default=False, action="store_true",
59       help="pass -c to srf2fastq, needed for calibrated quality values"
60     )
61     parser.add_option('--force', default=False, action="store_true",
62                       help="overwrite existing files.")
63     parser.add_option('--flowcell', default=None,
64                       help="add flowcell id header to sequence")
65     parser.add_option('-l','--left', default="r1.fastq",
66                       help='left side filename')
67     parser.add_option('-r','--right', default="r2.fastq",
68                       help='right side filename')
69     parser.add_option('-m','--mid', default=None, 
70                       help='actual sequence mid point')
71     parser.add_option('-s','--single', default=None,
72                       help="single fastq target name")
73     parser.add_option('-v', '--verbose', default=False, action="store_true",
74                       help="show information about what we're doing.")
75     return parser
76
77
78 def srf_open(filename, cnf1=False):
79     """
80     Make a stream from srf file using srf2fastq
81     """
82     
83     cmd = ['srf2fastq']
84     if cnf1:
85         cmd.append('-c')
86     cmd.append(filename)
87       
88     logging.info('srf command: %s' % (" ".join(cmd),))
89     p = Popen(cmd, stdout=PIPE)
90     return p.stdout
91     
92
93 def convert_single_to_fastq(instream, target1, header=''):
94
95     state = FASTQ_HEADER
96     for line in instream:
97         line = line.strip()
98         # sequence header
99         if state == FASTQ_HEADER:
100             write_header(target1, header, line)
101             state = FASTQ_SEQUENCE
102         # sequence
103         elif state == FASTQ_SEQUENCE:
104             write_sequence(target1, line)
105             state = FASTQ_SEQUENCE_HEADER
106         # quality header
107         elif state == FASTQ_SEQUENCE_HEADER:
108             # the sequence header isn't really sequence, but 
109             # we're just passing it through
110             write_sequence(target1, line)
111             state = FASTQ_QUALITY
112         # sequence or quality data
113         elif state == FASTQ_QUALITY:
114             write_sequence(target1, line)
115             state = FASTQ_HEADER
116         else:
117             raise RuntimeError("Unrecognized STATE in fastq split")
118
119
120         
121 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
122     """
123     read a fastq file where two paired ends have been run together into 
124     two halves.
125
126     instream is the source stream
127     target1 and target2 are the destination streams
128     """
129     if mid is not None:
130         mid = int(mid)
131
132     state = FASTQ_HEADER
133     for line in instream:
134         line = line.strip()
135         # sequence header
136         if state == FASTQ_HEADER:
137             write_header(target1, header, line, "/1")
138             write_header(target2, header, line, "/2")
139             state = FASTQ_SEQUENCE
140         # sequence
141         elif state == FASTQ_SEQUENCE:
142             if mid is None:
143                 mid = len(line)/2
144             write_split_sequence(target1, target2, line, mid)
145             state = FASTQ_SEQUENCE_HEADER
146         # quality header
147         elif state == FASTQ_SEQUENCE_HEADER:
148             # the sequence header isn't really sequence, but 
149             # we're just passing it through
150             write_sequence(target1, line)
151             write_sequence(target2, line)
152
153             state = FASTQ_QUALITY
154         # sequence or quality data
155         elif state == FASTQ_QUALITY:
156             write_split_sequence(target1, target2, line, mid)
157             state = FASTQ_HEADER
158         else:
159             raise RuntimeError("Unrecognized STATE in fastq split")
160
161 def write_header(target, prefix, line, suffix=''):
162     target.write('@')
163     target.write(prefix)
164     target.write(line[1:])
165     target.write(suffix)
166     target.write(os.linesep)
167
168 def write_sequence(target, line):
169     target.write(line)
170     target.write(os.linesep)
171
172 def write_split_sequence(target1, target2, line, mid):
173     target1.write(line[:mid])
174     target1.write(os.linesep)
175
176     target2.write(line[mid:])
177     target2.write(os.linesep)
178
179 def is_srf(filename):
180     """
181     Check filename to see if it is likely to be a SRF file
182     """
183     f = open(filename, 'r')
184     header = f.read(4)
185     f.close()
186     return header == "SSRF"
187
188 def open_write(filename, force=False):
189     """
190     Open a file, but throw an exception if it already exists
191     """
192     if not force:
193         if os.path.exists(filename):
194             raise RuntimeError("%s exists" % (filename,))
195
196     return open(filename, 'w')
197
198 def foo():
199     path, name = os.path.split(filename)
200     base, ext = os.path.splitext(name)
201
202     target1_name = base + '_r1.fastq'
203     target2_name = base + '_r2.fastq'
204
205     for target_name in [target1_name, target2_name]:
206         print 'target name', target_name
207         if os.path.exists(target_name):
208             raise RuntimeError("%s exists" % (target_name,))
209
210     instream = open(filename,'r')
211     target1 = open(target1_name,'w')
212     target2 = open(target2_name,'w')
213
214
215
216 if __name__ == "__main__":
217     sys.exit(main(sys.argv[1:]))