scripts/srf2named_fastq.py

   1 #!/usr/bin/env python
   2 import logging
   3 import mmap
   4 from optparse import OptionParser
   5 import os
   6 from subprocess import Popen, PIPE
   7 import sys
   8
   9 from htsworkflow.util.opener import autoopen
  10
  11 # constants for our fastq finite state machine
  12 FASTQ_HEADER = 0
  13 FASTQ_SEQUENCE = 1
  14 FASTQ_SEQUENCE_HEADER = 2
  15 FASTQ_QUALITY = 3
  16
  17
  18 def main(cmdline=None):
  19     parser = make_parser()
  20     opts, args = parser.parse_args(cmdline)
  21
  22     if len(args) != 1:
  23         parser.error("Requires one argument, got: %s" % (str(args)))
  24
  25     if opts.verbose:
  26         logging.basicConfig(level=logging.INFO)
  27     else:
  28         logging.basicConfig(level=logging.WARN)
  29
  30     if opts.flowcell is not None:
  31         header = "%s_" % (opts.flowcell,)
  32     else:
  33         header = ''
  34
  35     if opts.single:
  36         left = open_write(opts.single, opts.force)
  37     else:
  38         left = open_write(opts.left, opts.force)
  39         right = open_write(opts.right, opts.force)
  40
  41     # open the srf, fastq, or compressed fastq
  42     if is_srf(args[0]):
  43         source = srf_open(args[0])
  44     else:
  45         source = autoopen(args[0])
  46
  47     if opts.single:
  48         convert_single_to_fastq(source, left, header)
  49     else:
  50         convert_single_to_two_fastq(source, left, right, opts.mid, header)
  51
  52     return 0
  53
  54 def make_parser():
  55     parser = OptionParser("""%prog: [options] file
  56
  57 file can be either a fastq file or a srf file.
  58 You can also force the flowcell ID to be added to the header.""")
  59
  60     parser.add_option('--force', default=False, action="store_true",
  61                       help="overwrite existing files.")
  62     parser.add_option('--flowcell', default=None,
  63                       help="add flowcell id header to sequence")
  64     parser.add_option('-l','--left', default="r1.fastq",
  65                       help='left side filename')
  66     parser.add_option('-r','--right', default="r2.fastq",
  67                       help='right side filename')
  68     parser.add_option('-m','--mid', default=None,
  69                       help='actual sequence mid point')
  70     parser.add_option('-s','--single', default=None,
  71                       help="single fastq target name")
  72     parser.add_option('-v', '--verbose', default=False, action="store_true",
  73                       help="show information about what we're doing.")
  74     return parser
  75
  76
  77 def srf_open(filename, cnf1=False):
  78     """
  79     Make a stream from srf file using srf2fastq
  80     """
  81
  82     cmd = ['srf2fastq']
  83     if is_cnf1(filename):
  84         cmd.append('-c')
  85     cmd.append(filename)
  86
  87     logging.info('srf command: %s' % (" ".join(cmd),))
  88     p = Popen(cmd, stdout=PIPE)
  89     return p.stdout
  90
  91
  92 def convert_single_to_fastq(instream, target1, header=''):
  93
  94     state = FASTQ_HEADER
  95     for line in instream:
  96         line = line.strip()
  97         # sequence header
  98         if state == FASTQ_HEADER:
  99             write_header(target1, header, line)
 100             state = FASTQ_SEQUENCE
 101         # sequence
 102         elif state == FASTQ_SEQUENCE:
 103             write_sequence(target1, line)
 104             state = FASTQ_SEQUENCE_HEADER
 105         # quality header
 106         elif state == FASTQ_SEQUENCE_HEADER:
 107             # the sequence header isn't really sequence, but
 108             # we're just passing it through
 109             write_sequence(target1, line)
 110             state = FASTQ_QUALITY
 111         # sequence or quality data
 112         elif state == FASTQ_QUALITY:
 113             write_sequence(target1, line)
 114             state = FASTQ_HEADER
 115         else:
 116             raise RuntimeError("Unrecognized STATE in fastq split")
 117
 118
 119
 120 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
 121     """
 122     read a fastq file where two paired ends have been run together into
 123     two halves.
 124
 125     instream is the source stream
 126     target1 and target2 are the destination streams
 127     """
 128     if mid is not None:
 129         mid = int(mid)
 130
 131     state = FASTQ_HEADER
 132     for line in instream:
 133         line = line.strip()
 134         # sequence header
 135         if state == FASTQ_HEADER:
 136             write_header(target1, header, line, "/1")
 137             write_header(target2, header, line, "/2")
 138             state = FASTQ_SEQUENCE
 139         # sequence
 140         elif state == FASTQ_SEQUENCE:
 141             if mid is None:
 142                 mid = len(line)/2
 143             write_split_sequence(target1, target2, line, mid)
 144             state = FASTQ_SEQUENCE_HEADER
 145         # quality header
 146         elif state == FASTQ_SEQUENCE_HEADER:
 147             # the sequence header isn't really sequence, but
 148             # we're just passing it through
 149             write_sequence(target1, line)
 150             write_sequence(target2, line)
 151
 152             state = FASTQ_QUALITY
 153         # sequence or quality data
 154         elif state == FASTQ_QUALITY:
 155             write_split_sequence(target1, target2, line, mid)
 156             state = FASTQ_HEADER
 157         else:
 158             raise RuntimeError("Unrecognized STATE in fastq split")
 159
 160 def write_header(target, prefix, line, suffix=''):
 161     target.write('@')
 162     target.write(prefix)
 163     target.write(line[1:])
 164     target.write(suffix)
 165     target.write(os.linesep)
 166
 167 def write_sequence(target, line):
 168     target.write(line)
 169     target.write(os.linesep)
 170
 171 def write_split_sequence(target1, target2, line, mid):
 172     target1.write(line[:mid])
 173     target1.write(os.linesep)
 174
 175     target2.write(line[mid:])
 176     target2.write(os.linesep)
 177
 178 def is_srf(filename):
 179     """
 180     Check filename to see if it is likely to be a SRF file
 181     """
 182     f = open(filename, 'r')
 183     header = f.read(4)
 184     f.close()
 185     return header == "SSRF"
 186
 187 def is_cnf1(filename):
 188     """
 189     Brute force detection if a SRF file is using CNF1/CNF4 records
 190     """
 191     max_header = 1024 ** 2
 192     PROGRAM_ID = 'PROGRAM_ID\000'
 193
 194     if not is_srf(filename):
 195         raise ValueError("%s must be a srf file" % (filename,))
 196
 197     fd = os.open(filename, os.O_RDONLY)
 198     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
 199     # alas the max search length requires python 2.6+
 200     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
 201     program_header_start = program_id_location+len(PROGRAM_ID)
 202     next_null = f.find('\000', program_header_start) #, max_header)
 203     program_id_header = f[program_header_start:next_null]
 204     f.close()
 205     os.close(fd)
 206
 207     if program_id_header == "solexa2srf v1.4":
 208         return False
 209     else:
 210         return True
 211
 212 def open_write(filename, force=False):
 213     """
 214     Open a file, but throw an exception if it already exists
 215     """
 216     if not force:
 217         if os.path.exists(filename):
 218             raise RuntimeError("%s exists" % (filename,))
 219
 220     return open(filename, 'w')
 221
 222 def foo():
 223     path, name = os.path.split(filename)
 224     base, ext = os.path.splitext(name)
 225
 226     target1_name = base + '_r1.fastq'
 227     target2_name = base + '_r2.fastq'
 228
 229     for target_name in [target1_name, target2_name]:
 230         print 'target name', target_name
 231         if os.path.exists(target_name):
 232             raise RuntimeError("%s exists" % (target_name,))
 233
 234     instream = open(filename,'r')
 235     target1 = open(target1_name,'w')
 236     target2 = open(target2_name,'w')
 237
 238
 239
 240 if __name__ == "__main__":
 241     sys.exit(main(sys.argv[1:]))