adaa1b2dfe4efcfcd5186bfa86a350ce093de88e
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.version import version
11
12 # constants for our fastq finite state machine
13 FASTQ_HEADER = 0
14 FASTQ_SEQUENCE = 1
15 FASTQ_SEQUENCE_HEADER = 2
16 FASTQ_QUALITY = 3
17
18 def main(cmdline=None):
19     parser = make_parser()
20     opts, args = parser.parse_args(cmdline)
21
22     if opts.verbose:
23         logging.basicConfig(level=logging.INFO)
24     else:
25         logging.basicConfig(level=logging.WARN)
26
27     if opts.version:
28         print version()
29         return 0
30
31     if len(args) != 1:
32         parser.error("Requires one argument, got: %s" % (str(args)))
33
34     if opts.flowcell is not None:
35         header = "%s_" % (opts.flowcell,)
36     else:
37         header = ''
38
39     if opts.single:
40         left = open_write(opts.single, opts.force)
41     else:
42         left = open_write(opts.left, opts.force)
43         right = open_write(opts.right, opts.force)
44     
45     # open the srf, fastq, or compressed fastq
46     if is_srf(args[0]):
47         source = srf_open(args[0])
48     else:
49         source = autoopen(args[0])
50
51     if opts.single:
52         convert_single_to_fastq(source, left, header)
53     else:
54         convert_single_to_two_fastq(source, left, right, opts.mid, header)
55    
56     return 0
57
58 def make_parser():
59     parser = OptionParser("""%prog: [options] file
60
61 file can be either a fastq file or a srf file.
62 You can also force the flowcell ID to be added to the header.""")
63
64     parser.add_option('--force', default=False, action="store_true",
65                       help="overwrite existing files.")
66     parser.add_option('--flowcell', default=None,
67                       help="add flowcell id header to sequence")
68     parser.add_option('-l','--left', default="r1.fastq",
69                       help='left side filename')
70     parser.add_option('-m','--mid', default=None, 
71                       help='actual sequence mid point')
72     parser.add_option('-r','--right', default="r2.fastq",
73                       help='right side filename')
74     parser.add_option('-s','--single', default=None,
75                       help="single fastq target name")
76     parser.add_option('-v', '--verbose', default=False, action="store_true",
77                       help="show information about what we're doing.")
78     parser.add_option('--version', default=False, action="store_true",
79                       help="Report software version")
80     return parser
81
82
83 def srf_open(filename, cnf1=False):
84     """
85     Make a stream from srf file using srf2fastq
86     """
87     cmd = ['srf2fastq']
88     if is_cnf1(filename):
89         cmd.append('-c')
90     cmd.append(filename)
91       
92     logging.info('srf command: %s' % (" ".join(cmd),))
93     p = Popen(cmd, stdout=PIPE)
94     return p.stdout
95     
96
97 def convert_single_to_fastq(instream, target1, header=''):
98
99     state = FASTQ_HEADER
100     for line in instream:
101         line = line.strip()
102         # sequence header
103         if state == FASTQ_HEADER:
104             write_header(target1, header, line)
105             state = FASTQ_SEQUENCE
106         # sequence
107         elif state == FASTQ_SEQUENCE:
108             write_sequence(target1, line)
109             state = FASTQ_SEQUENCE_HEADER
110         # quality header
111         elif state == FASTQ_SEQUENCE_HEADER:
112             # the sequence header isn't really sequence, but 
113             # we're just passing it through
114             write_sequence(target1, line)
115             state = FASTQ_QUALITY
116         # sequence or quality data
117         elif state == FASTQ_QUALITY:
118             write_sequence(target1, line)
119             state = FASTQ_HEADER
120         else:
121             raise RuntimeError("Unrecognized STATE in fastq split")
122
123
124         
125 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
126     """
127     read a fastq file where two paired ends have been run together into 
128     two halves.
129
130     instream is the source stream
131     target1 and target2 are the destination streams
132     """
133     if mid is not None:
134         mid = int(mid)
135
136     state = FASTQ_HEADER
137     for line in instream:
138         line = line.strip()
139         # sequence header
140         if state == FASTQ_HEADER:
141             write_header(target1, header, line, "/1")
142             write_header(target2, header, line, "/2")
143             state = FASTQ_SEQUENCE
144         # sequence
145         elif state == FASTQ_SEQUENCE:
146             if mid is None:
147                 mid = len(line)/2
148             write_split_sequence(target1, target2, line, mid)
149             state = FASTQ_SEQUENCE_HEADER
150         # quality header
151         elif state == FASTQ_SEQUENCE_HEADER:
152             # the sequence header isn't really sequence, but 
153             # we're just passing it through
154             write_sequence(target1, line)
155             write_sequence(target2, line)
156
157             state = FASTQ_QUALITY
158         # sequence or quality data
159         elif state == FASTQ_QUALITY:
160             write_split_sequence(target1, target2, line, mid)
161             state = FASTQ_HEADER
162         else:
163             raise RuntimeError("Unrecognized STATE in fastq split")
164
165 def write_header(target, prefix, line, suffix=''):
166     target.write('@')
167     target.write(prefix)
168     target.write(line[1:])
169     target.write(suffix)
170     target.write(os.linesep)
171
172 def write_sequence(target, line):
173     target.write(line)
174     target.write(os.linesep)
175
176 def write_split_sequence(target1, target2, line, mid):
177     target1.write(line[:mid])
178     target1.write(os.linesep)
179
180     target2.write(line[mid:])
181     target2.write(os.linesep)
182
183 def is_srf(filename):
184     """
185     Check filename to see if it is likely to be a SRF file
186     """
187     f = open(filename, 'r')
188     header = f.read(4)
189     f.close()
190     return header == "SSRF"
191
192 def is_cnf1(filename):
193     """
194     Brute force detection if a SRF file is using CNF1/CNF4 records
195     """
196     max_header = 1024 ** 2
197     PROGRAM_ID = 'PROGRAM_ID\000'
198     cnf4_apps = set(("solexa2srf v1.4", 
199                     "illumina2srf v1.11.5.Illumina.1.3"))
200
201     if not is_srf(filename):
202         raise ValueError("%s must be a srf file" % (filename,))
203
204     fd = os.open(filename, os.O_RDONLY)
205     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
206     # alas the max search length requires python 2.6+
207     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
208     program_header_start = program_id_location+len(PROGRAM_ID)
209     next_null = f.find('\000', program_header_start) #, max_header)
210     program_id_header = f[program_header_start:next_null]
211     f.close()
212     os.close(fd)
213
214     if program_id_header in cnf4_apps:
215         return False
216     else:
217         return True
218
219 def open_write(filename, force=False):
220     """
221     Open a file, but throw an exception if it already exists
222     """
223     if not force:
224         if os.path.exists(filename):
225             raise RuntimeError("%s exists" % (filename,))
226
227     return open(filename, 'w')
228
229 def foo():
230     path, name = os.path.split(filename)
231     base, ext = os.path.splitext(name)
232
233     target1_name = base + '_r1.fastq'
234     target2_name = base + '_r2.fastq'
235
236     for target_name in [target1_name, target2_name]:
237         print 'target name', target_name
238         if os.path.exists(target_name):
239             raise RuntimeError("%s exists" % (target_name,))
240
241     instream = open(filename,'r')
242     target1 = open(target1_name,'w')
243     target2 = open(target2_name,'w')
244
245
246 if __name__ == "__main__":
247     sys.exit(main(sys.argv[1:]))