scripts/srf2named_fastq.py

   1 #!/usr/bin/env python
   2 import logging
   3 import mmap
   4 from optparse import OptionParser
   5 import os
   6 from subprocess import Popen, PIPE
   7 import sys
   8
   9 from htsworkflow.util.opener import autoopen
  10
  11 # constants for our fastq finite state machine
  12 FASTQ_HEADER = 0
  13 FASTQ_SEQUENCE = 1
  14 FASTQ_SEQUENCE_HEADER = 2
  15 FASTQ_QUALITY = 3
  16
  17
  18 def main(cmdline=None):
  19     parser = make_parser()
  20     opts, args = parser.parse_args(cmdline)
  21
  22     if len(args) != 1:
  23         parser.error("Requires one argument, got: %s" % (str(args)))
  24
  25     if opts.verbose:
  26         logging.basicConfig(level=logging.INFO)
  27     else:
  28         logging.basicConfig(level=logging.WARN)
  29
  30     if opts.flowcell is not None:
  31         header = "%s_" % (opts.flowcell,)
  32     else:
  33         header = ''
  34
  35     if opts.single:
  36         left = open_write(opts.single, opts.force)
  37     else:
  38         left = open_write(opts.left, opts.force)
  39         right = open_write(opts.right, opts.force)
  40
  41     # open the srf, fastq, or compressed fastq
  42     if is_srf(args[0]):
  43         source = srf_open(args[0])
  44     else:
  45         source = autoopen(args[0])
  46
  47     if opts.single:
  48         convert_single_to_fastq(source, left, header)
  49     else:
  50         convert_single_to_two_fastq(source, left, right, opts.mid, header)
  51
  52     return 0
  53
  54 def make_parser():
  55     parser = OptionParser("""%prog: [options] file
  56
  57 file can be either a fastq file or a srf file.
  58 You can also force the flowcell ID to be added to the header.""")
  59
  60     parser.add_option('--force', default=False, action="store_true",
  61                       help="overwrite existing files.")
  62     parser.add_option('--flowcell', default=None,
  63                       help="add flowcell id header to sequence")
  64     parser.add_option('-l','--left', default="r1.fastq",
  65                       help='left side filename')
  66     parser.add_option('-m','--mid', default=None,
  67                       help='actual sequence mid point')
  68     parser.add_option('-r','--right', default="r2.fastq",
  69                       help='right side filename')
  70     parser.add_option('-s','--single', default=None,
  71                       help="single fastq target name")
  72     parser.add_option('-v', '--verbose', default=False, action="store_true",
  73                       help="show information about what we're doing.")
  74     return parser
  75
  76
  77 def srf_open(filename, cnf1=False):
  78     """
  79     Make a stream from srf file using srf2fastq
  80     """
  81
  82     cmd = ['srf2fastq']
  83     if is_cnf1(filename):
  84         cmd.append('-c')
  85     cmd.append(filename)
  86
  87     logging.info('srf command: %s' % (" ".join(cmd),))
  88     p = Popen(cmd, stdout=PIPE)
  89     return p.stdout
  90
  91
  92 def convert_single_to_fastq(instream, target1, header=''):
  93
  94     state = FASTQ_HEADER
  95     for line in instream:
  96         line = line.strip()
  97         # sequence header
  98         if state == FASTQ_HEADER:
  99             write_header(target1, header, line)
 100             state = FASTQ_SEQUENCE
 101         # sequence
 102         elif state == FASTQ_SEQUENCE:
 103             write_sequence(target1, line)
 104             state = FASTQ_SEQUENCE_HEADER
 105         # quality header
 106         elif state == FASTQ_SEQUENCE_HEADER:
 107             # the sequence header isn't really sequence, but
 108             # we're just passing it through
 109             write_sequence(target1, line)
 110             state = FASTQ_QUALITY
 111         # sequence or quality data
 112         elif state == FASTQ_QUALITY:
 113             write_sequence(target1, line)
 114             state = FASTQ_HEADER
 115         else:
 116             raise RuntimeError("Unrecognized STATE in fastq split")
 117
 118
 119
 120 def convert_single_to_two_fastq(instream, target1, target2, mid=None, header=''):
 121     """
 122     read a fastq file where two paired ends have been run together into
 123     two halves.
 124
 125     instream is the source stream
 126     target1 and target2 are the destination streams
 127     """
 128     if mid is not None:
 129         mid = int(mid)
 130
 131     state = FASTQ_HEADER
 132     for line in instream:
 133         line = line.strip()
 134         # sequence header
 135         if state == FASTQ_HEADER:
 136             write_header(target1, header, line, "/1")
 137             write_header(target2, header, line, "/2")
 138             state = FASTQ_SEQUENCE
 139         # sequence
 140         elif state == FASTQ_SEQUENCE:
 141             if mid is None:
 142                 mid = len(line)/2
 143             write_split_sequence(target1, target2, line, mid)
 144             state = FASTQ_SEQUENCE_HEADER
 145         # quality header
 146         elif state == FASTQ_SEQUENCE_HEADER:
 147             # the sequence header isn't really sequence, but
 148             # we're just passing it through
 149             write_sequence(target1, line)
 150             write_sequence(target2, line)
 151
 152             state = FASTQ_QUALITY
 153         # sequence or quality data
 154         elif state == FASTQ_QUALITY:
 155             write_split_sequence(target1, target2, line, mid)
 156             state = FASTQ_HEADER
 157         else:
 158             raise RuntimeError("Unrecognized STATE in fastq split")
 159
 160 def write_header(target, prefix, line, suffix=''):
 161     target.write('@')
 162     target.write(prefix)
 163     target.write(line[1:])
 164     target.write(suffix)
 165     target.write(os.linesep)
 166
 167 def write_sequence(target, line):
 168     target.write(line)
 169     target.write(os.linesep)
 170
 171 def write_split_sequence(target1, target2, line, mid):
 172     target1.write(line[:mid])
 173     target1.write(os.linesep)
 174
 175     target2.write(line[mid:])
 176     target2.write(os.linesep)
 177
 178 def is_srf(filename):
 179     """
 180     Check filename to see if it is likely to be a SRF file
 181     """
 182     f = open(filename, 'r')
 183     header = f.read(4)
 184     f.close()
 185     return header == "SSRF"
 186
 187 def is_cnf1(filename):
 188     """
 189     Brute force detection if a SRF file is using CNF1/CNF4 records
 190     """
 191     max_header = 1024 ** 2
 192     PROGRAM_ID = 'PROGRAM_ID\000'
 193     cnf4_apps = set(("solexa2srf v1.4",
 194                     "illumina2srf v1.11.5.Illumina.1.3"))
 195
 196     if not is_srf(filename):
 197         raise ValueError("%s must be a srf file" % (filename,))
 198
 199     fd = os.open(filename, os.O_RDONLY)
 200     f = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
 201     # alas the max search length requires python 2.6+
 202     program_id_location = f.find(PROGRAM_ID, 0) #, max_header)
 203     program_header_start = program_id_location+len(PROGRAM_ID)
 204     next_null = f.find('\000', program_header_start) #, max_header)
 205     program_id_header = f[program_header_start:next_null]
 206     f.close()
 207     os.close(fd)
 208
 209     if program_id_header in cnf4_apps:
 210         return False
 211     else:
 212         return True
 213
 214 def open_write(filename, force=False):
 215     """
 216     Open a file, but throw an exception if it already exists
 217     """
 218     if not force:
 219         if os.path.exists(filename):
 220             raise RuntimeError("%s exists" % (filename,))
 221
 222     return open(filename, 'w')
 223
 224 def foo():
 225     path, name = os.path.split(filename)
 226     base, ext = os.path.splitext(name)
 227
 228     target1_name = base + '_r1.fastq'
 229     target2_name = base + '_r2.fastq'
 230
 231     for target_name in [target1_name, target2_name]:
 232         print 'target name', target_name
 233         if os.path.exists(target_name):
 234             raise RuntimeError("%s exists" % (target_name,))
 235
 236     instream = open(filename,'r')
 237     target1 = open(target1_name,'w')
 238     target2 = open(target2_name,'w')
 239
 240
 241
 242 if __name__ == "__main__":
 243     sys.exit(main(sys.argv[1:]))