Move parse_flowcell_id out of frontend.samples.results
[htsworkflow.git] / htsworkflow / pipelines / srf2fastq.py
1 #!/usr/bin/env python
2 import logging
3 import mmap
4 from optparse import OptionParser
5 import os
6 from subprocess import Popen, PIPE
7 import sys
8
9 from htsworkflow.util.opener import autoopen
10 from htsworkflow.version import version
11
12 # constants for our fastq finite state machine
13 FASTQ_HEADER = 0
14 FASTQ_SEQUENCE = 1
15 FASTQ_SEQUENCE_HEADER = 2
16 FASTQ_QUALITY = 3
17
18 def main(cmdline=None):
19     parser = make_parser()
20     opts, args = parser.parse_args(cmdline)
21
22     if opts.verbose:
23         logging.basicConfig(level=logging.INFO)
24     else:
25         logging.basicConfig(level=logging.WARN)
26
27     if opts.version:
28         print version()
29         return 0
30
31     if len(args) != 1:
32         parser.error("Requires one argument, got: %s" % (str(args)))
33
34     if opts.flowcell is not None:
35         header = "%s_" % (opts.flowcell,)
36     else:
37         header = ''
38
39     if opts.single:
40         left = open_write(opts.single, opts.force)
41     else:
42         left = open_write(opts.left, opts.force)
43         right = open_write(opts.right, opts.force)
44     
45     # open the srf, fastq, or compressed fastq
46     if is_srf(args[0]):
47         source = srf_open(args[0], opts.cnf1)
48     else:
49         source = autoopen(args[0])
50
51     if opts.single:
52         convert_single_to_fastq(source, left, header)
53     else:
54         convert_single_to_two_fastq(source, left, right, opts.mid, header)
55    
56     return 0
57
58 def make_parser():
59     parser = OptionParser("""%prog: [options] file
60
61 file can be either a fastq file or a srf file.
62 You can also force the flowcell ID to be added to the header.""")
63
64     parser.add_option('--force', default=False, action="store_true",
65                       help="overwrite existing files.")
66     parser.add_option('--flowcell', default=None,
67                       help="add flowcell id header to sequence")
68     parser.add_option('-l','--left', default="r1.fastq",
69                       help='left side filename')
70     parser.add_option('-m','--mid', default=None, 
71                       help='actual sequence mid point')
72     parser.add_option('-r','--right', default="r2.fastq",
73                       help='right side filename')
74     parser.add_option('-s','--single', default=None,
75                       help="single fastq target name")
76     parser.add_option('-v', '--verbose', default=False, action="store_true",
77                       help="show information about what we're doing.")
78     parser.add_option('--version', default=False, action="store_true",
79                       help="Report software version")
80     parser.add_option('--cnf1', default=False, action="store_true",
81                       help="Force cnf1 mode in srf2fastq")
82     return parser
83
84
85 def srf_open(filename, cnf1=False):
86     """
87     Make a stream from srf file using srf2fastq
88     """
89     cmd = ['srf2fastq']
90     if cnf1 or is_cnf1(filename):
91         cmd.append('-c')
92     cmd.append(filename)
93       
94     logging.info('srf command: %s' % (" ".join(cmd),))
95     p = Popen(cmd, stdout=PIPE)
96     return p.stdout
97     
98
99 def convert_single_to_fastq(instream, target1, header=''):
100
101     state = FASTQ_HEADER
102     for line in instream:
103         line = line.strip()
104         # sequence header
105         if state == FASTQ_HEADER:
106             write_header(target1, header, line)
107             state = FASTQ_SEQUENCE
108         # sequence
109         elif state == FASTQ_SEQUENCE:
110             write_sequence(target1, line)
111             state = FASTQ_SEQUENCE_HEADER
112         # quality header
113         elif state == FASTQ_SEQUENCE_HEADER:
114             # the sequence header isn't really sequence, but 
115             # we're just passing it through
116             write_sequence(target1, line)
117             state = FASTQ_QUALITY
118         # sequence or quality data
119         elif state == FASTQ_QUALITY:
120             write_sequence(target1, line)
121             state = FASTQ_HEADER
122         else:
123             raise RuntimeError("Unrecognized STATE in fastq split")
124
125
126         
127 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
128     """
129     read a fastq file where two paired ends have been run together into 
130     two halves.
131
132     instream is the source stream
133     target1 and target2 are the destination streams
134     """
135     if mid is not None:
136         mid = int(mid)
137
138     state = FASTQ_HEADER
139     for line in instream:
140         line = line.strip()
141         # sequence header
142         if state == FASTQ_HEADER:
143             write_header(target1, header, line, "/1")
144             write_header(target2, header, line, "/2")
145             state = FASTQ_SEQUENCE
146         # sequence
147         elif state == FASTQ_SEQUENCE:
148             if mid is None:
149                 mid = len(line)/2
150             write_split_sequence(target1, target2, line, mid)
151             state = FASTQ_SEQUENCE_HEADER
152         # quality header
153         elif state == FASTQ_SEQUENCE_HEADER:
154             # the sequence header isn't really sequence, but 
155             # we're just passing it through
156             write_sequence(target1, line)
157             write_sequence(target2, line)
158
159             state = FASTQ_QUALITY
160         # sequence or quality data
161         elif state == FASTQ_QUALITY:
162             write_split_sequence(target1, target2, line, mid)
163             state = FASTQ_HEADER
164         else:
165             raise RuntimeError("Unrecognized STATE in fastq split")
166
167 def write_header(target, prefix, line, suffix=''):
168     target.write('@')
169     target.write(prefix)
170     target.write(line[1:])
171     target.write(suffix)
172     target.write(os.linesep)
173
174 def write_sequence(target, line):
175     target.write(line)
176     target.write(os.linesep)
177
178 def write_split_sequence(target1, target2, line, mid):
179     target1.write(line[:mid])
180     target1.write(os.linesep)
181
182     target2.write(line[mid:])
183     target2.write(os.linesep)
184
185 def is_srf(filename):
186     """
187     Check filename to see if it is likely to be a SRF file
188     """
189     f = open(filename, 'r')
190     header = f.read(4)
191     f.close()
192     return header == "SSRF"
193
194 def is_cnf1(filename):
195     """
196     Brute force detection if a SRF file is using CNF1/CNF4 records
197     """
198     max_header = 1024 ** 2
199     PROGRAM_ID = 'PROGRAM_ID\000'
200     cnf4_apps = set(("solexa2srf v1.4", 
201                     "illumina2srf v1.11.5.Illumina.1.3"))
202
203     if not is_srf(filename):
204         raise ValueError("%s must be a srf file" % (filename,))
205
206     fd = os.open(filename, os.O_RDONLY)
207     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
208     # alas the max search length requires python 2.6+
209     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
210     program_header_start = program_id_location+len(PROGRAM_ID)
211     next_null = f.find('\000', program_header_start) #, max_header)
212     program_id_header = f[program_header_start:next_null]
213     f.close()
214     os.close(fd)
215
216     if program_id_header in cnf4_apps:
217         return False
218     else:
219         return True
220
221 def open_write(filename, force=False):
222     """
223     Open a file, but throw an exception if it already exists
224     """
225     if not force:
226         if os.path.exists(filename):
227             raise RuntimeError("%s exists" % (filename,))
228
229     return open(filename, 'w')
230
231 def foo():
232     path, name = os.path.split(filename)
233     base, ext = os.path.splitext(name)
234
235     target1_name = base + '_r1.fastq'
236     target2_name = base + '_r2.fastq'
237
238     for target_name in [target1_name, target2_name]:
239         print 'target name', target_name
240         if os.path.exists(target_name):
241             raise RuntimeError("%s exists" % (target_name,))
242
243     instream = open(filename,'r')
244     target1 = open(target1_name,'w')
245     target2 = open(target2_name,'w')
246
247
248 if __name__ == "__main__":
249     sys.exit(main(sys.argv[1:]))