Fix srf2named_fastq to output the proper /2 in paired end reads
[htsworkflow.git] / scripts / srf2named_fastq.py
1 #!/usr/bin/env python
2 import logging
3 from optparse import OptionParser
4 import os
5 from subprocess import Popen, PIPE
6 import sys
7
8 from htsworkflow.util.opener import autoopen
9
10
11 def main(cmdline=None):
12     parser = make_parser()
13     opts, args = parser.parse_args(cmdline)
14
15     if len(args) != 1:
16         parser.error("Requires one argument")
17
18     if opts.verbose:
19         logging.basicConfig(level=logging.INFO)
20     else:
21         logging.basicConfig(level=logging.WARN)
22
23     if opts.flowcell is not None:
24         header = "%s_" % (opts.flowcell,)
25     else:
26         header = ''
27
28     if opts.single:
29         left = open_write(opts.single, opts.force)
30     else:
31         left = open_write(opts.left, opts.force)
32         right = open_write(opts.right, opts.force)
33     
34     # open the srf, fastq, or compressed fastq
35     if is_srf(args[0]):
36         source = srf_open(args[0], opts.cnf1)
37     else:
38         source = autoopen(args[0])
39
40     if opts.single:
41         convert_single_to_fastq(source, left, header)
42     else:
43         convert_single_to_two_fastq(source, left, right, opts.mid, header)
44    
45     return 0
46
47 def make_parser():
48     parser = OptionParser("""%prog: [options] file
49
50 file can be either a fastq file or a srf file.
51 You can also force the flowcell ID to be added to the header.""")
52     parser.add_option('-c','--cnf1',default=False, action="store_true",
53       help="pass -c to srf2fastq, needed for calibrated quality values"
54     )
55     parser.add_option('--force', default=False, action="store_true",
56                       help="overwrite existing files.")
57     parser.add_option('--flowcell', default=None,
58                       help="add flowcell id header to sequence")
59     parser.add_option('-l','--left', default="r1.fastq",
60                       help='left side filename')
61     parser.add_option('-r','--right', default="r2.fastq",
62                       help='right side filename')
63     parser.add_option('-m','--mid', default=None, 
64                       help='actual sequence mid point')
65     parser.add_option('-s','--single', default=None,
66                       help="single fastq target name")
67     parser.add_option('-v', '--verbose', default=False, action="store_true",
68                       help="show information about what we're doing.")
69     return parser
70
71
72 def srf_open(filename, cnf1=False):
73     """
74     Make a stream from srf file using srf2fastq
75     """
76     
77     cmd = ['srf2fastq']
78     if cnf1:
79         cmd.append('-c')
80     cmd.append(filename)
81       
82     logging.info('srf command: %s' % (" ".join(cmd),))
83     p = Popen(cmd, stdout=PIPE)
84     return p.stdout
85     
86
87 def convert_single_to_fastq(instream, target1, header=''):
88     for line in instream:
89         # sequence header
90         if line[0] == '@':
91             line = line.strip()
92             target1.write('@')
93             target1.write(header)
94             target1.write(line[1:])
95             target1.write(os.linesep)
96
97         # quality header
98         elif line[0] == '+':
99             target1.write(line)
100         # sequence or quality data
101         else:
102             target1.write(line)
103         
104 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
105     if mid is not None:
106         mid = int(mid)
107
108     for line in instream:
109         # sequence header
110         if line[0] == '@':
111             line = line.strip()
112             target1.write('@')
113             target1.write(header)
114             target1.write(line[1:])
115             target1.write("/1")
116             target1.write(os.linesep)
117
118             target2.write('@')
119             target2.write(header)
120             target2.write(line[1:])
121             target2.write("/2")
122             target2.write(os.linesep)
123
124         # quality header
125         elif line[0] == '+':
126             target1.write(line)
127             target2.write(line)
128         # sequence or quality data
129         else:
130             line = line.strip()
131             if mid is None:
132                 mid = len(line)/2
133             target1.write(line[:mid])
134             target1.write(os.linesep)
135             target2.write(line[mid:])
136             target2.write(os.linesep)
137
138 def is_srf(filename):
139     """
140     Check filename to see if it is likely to be a SRF file
141     """
142     f = open(filename, 'r')
143     header = f.read(4)
144     f.close()
145     return header == "SSRF"
146
147 def open_write(filename, force=False):
148     """
149     Open a file, but throw an exception if it already exists
150     """
151     if not force:
152         if os.path.exists(filename):
153             raise RuntimeError("%s exists" % (filename,))
154
155     return open(filename, 'w')
156
157 def foo():
158     path, name = os.path.split(filename)
159     base, ext = os.path.splitext(name)
160
161     target1_name = base + '_r1.fastq'
162     target2_name = base + '_r2.fastq'
163
164     for target_name in [target1_name, target2_name]:
165         print 'target name', target_name
166         if os.path.exists(target_name):
167             raise RuntimeError("%s exists" % (target_name,))
168
169     instream = open(filename,'r')
170     target1 = open(target1_name,'w')
171     target2 = open(target2_name,'w')
172
173
174
175 if __name__ == "__main__":
176     sys.exit(main(sys.argv[1:]))