htsworkflow/pipelines/qseq2fastq.py

   1 #!/usr/bin/env python
   2 from glob import glob
   3 import os
   4 from optparse import OptionParser
   5 import numpy
   6 import sys
   7 import tarfile
   8
   9 from htsworkflow.version import version
  10
  11 def main(cmdline=None):
  12     parser = make_parser()
  13     opts, args = parser.parse_args(cmdline)
  14
  15     if opts.version:
  16         print version()
  17         return 0
  18
  19     if opts.infile is not None:
  20         qseq_generator = tarfile_generator(opts.infile)
  21     elif len(args) > 0:
  22         qseq_generator = file_generator(args)
  23     else:
  24         qseq_generator = [sys.stdin]
  25
  26     if opts.output is not None:
  27         output = open(opts.output, 'w')
  28     else:
  29         output = sys.stdout
  30
  31     if opts.nopass_output is not None:
  32         nopass_output = open(opts.nopass_output, 'w')
  33     else:
  34         nopass_output = None
  35
  36     qseq_parser = Qseq2Fastq(qseq_generator, output, nopass_output)
  37     qseq_parser.fastq = not opts.fasta
  38     qseq_parser.flowcell_id = opts.flowcell
  39     qseq_parser.trim = parse_slice(opts.slice)
  40     qseq_parser.reportFilter = opts.pf
  41
  42     qseq_parser.run()
  43
  44
  45 def make_parser():
  46     usage = "%prog: [options] *_qseq.txt\nProduces Phred33 files by default"
  47     parser = OptionParser(usage)
  48     parser.add_option("-a", "--fasta", default=False, action="store_true",
  49                       help="produce fasta files instead of fastq files")
  50     parser.add_option("-f", "--flowcell", default=None,
  51                       help="Set flowcell ID for output file")
  52     parser.add_option("-i", "--infile", default=None,
  53       help='source tar file (if reading from an archive instead of a directory)')
  54     parser.add_option("-o", "--output", default=None,
  55                       help="output fastq file")
  56     parser.add_option("-n", "--nopass-output", default=None,
  57                       help="if provided send files that failed illumina filter "\
  58                            "to a differentfile")
  59     parser.add_option("-s", "--slice",
  60                       help="specify python slice, e.g. 0:75, 0:-1",
  61                       default=None)
  62     parser.add_option("--pf", help="report pass filter flag", default=False,
  63                       action="store_true")
  64     parser.add_option("--version", default=False, action="store_true",
  65                       help="report software version")
  66
  67     return parser
  68
  69
  70 def file_generator(pattern_list):
  71     for pattern in pattern_list:
  72         for filename in glob(pattern):
  73             yield open(filename,"r")
  74
  75
  76 def tarfile_generator(tarfilename):
  77     archive = tarfile.open(tarfilename,'r|*')
  78     for tarinfo in archive:
  79         yield archive.extractfile(tarinfo)
  80
  81
  82 def parse_slice(slice_text):
  83     if slice_text is None or len(slice_text) == 0:
  84         return slice(None)
  85
  86     slice_data = []
  87     for element in slice_text.split(':'):
  88         if len(element) == 0:
  89             element = None
  90         else:
  91             element = int(element)
  92         slice_data.append(element)
  93
  94     return slice(*slice_data)
  95
  96
  97 def file_generator(pattern_list):
  98     for pattern in pattern_list:
  99         for filename in glob(pattern):
 100             yield open(filename,"r")
 101
 102
 103 def tarfile_generator(tarfilename):
 104     archive = tarfile.open(tarfilename,'r|*')
 105     for tarinfo in archive:
 106         yield archive.extractfile(tarinfo)
 107
 108
 109 def parse_slice(slice_text):
 110     if slice_text is None or len(slice_text) == 0:
 111         return slice(None)
 112
 113     slice_data = []
 114     for element in slice_text.split(':'):
 115         if len(element) == 0:
 116             element = None
 117         else:
 118             element = int(element)
 119         slice_data.append(element)
 120
 121     return slice(*slice_data)
 122
 123
 124 class Qseq2Fastq(object):
 125     """
 126     Convert qseq files to fastq (or fasta) files.
 127     """
 128     def __init__(self, sources, pass_destination, nopass_destination=None):
 129         self.sources = sources
 130         self.pass_destination = pass_destination
 131         if nopass_destination is not None:
 132             self.nopass_destination = nopass_destination
 133         else:
 134             self.nopass_destination = pass_destination
 135
 136         self.fastq = True
 137         self.flowcell_id = None
 138         self.trim = slice(None)
 139         self.reportFilter = False
 140
 141     def _format_flowcell_id(self):
 142         """
 143         Return formatted flowcell ID
 144         """
 145         if self.flowcell_id is not None:
 146             return self.flowcell_id+"_"
 147         else:
 148             return ""
 149
 150     def _convert_illumina_quality(self, illumina_quality):
 151         """
 152         Convert an Illumina ASCII encoded quality score to a Phred ASCII quality score.
 153         """
 154         # Illumina scores are Phred + 64
 155         # Fastq scores are Phread + 33
 156         # the following code grabs the string, converts to short ints and
 157         # subtracts 31 (64-33) to convert between the two score formats.
 158         # The numpy solution is twice as fast as some of my other
 159         # ideas for the conversion.
 160         # sorry about the uglyness in changing from character, to 8-bit int
 161         # and back to a character array
 162         quality = numpy.asarray(illumina_quality,'c')
 163         quality.dtype = numpy.uint8
 164         quality -= 31
 165         quality.dtype = '|S1' # I'd like to know what the real numpy char type is
 166         return quality
 167
 168     def run(self):
 169         if self.fastq:
 170             header_template = '@'
 171         else:
 172             # fasta case
 173             header_template = '>'
 174         header_template += self._format_flowcell_id() + '%s_%s:%s:%s:%s:%s/%s%s%s'
 175
 176         for qstream in self.sources:
 177             for line in qstream:
 178                 # parse line
 179                 record = line.rstrip().split('\t')
 180                 machine_name = record[0]
 181                 run_number = record[1]
 182                 lane_number = record[2]
 183                 tile = record[3]
 184                 x = record[4]
 185                 y = record[5]
 186                 index = record[6]
 187                 read = record[7]
 188                 sequence = record[8].replace('.','N')
 189                 quality = self._convert_illumina_quality(record[9])
 190
 191                 # add pass qc filter if we want it
 192                 pass_qc = int(record[10])
 193                 if self.reportFilter:
 194                     pass_qc_msg = " pf=%s" % (pass_qc)
 195                 else:
 196                     pass_qc_msg = ""
 197
 198                 header = header_template  % ( \
 199                     machine_name,
 200                     run_number,
 201                     lane_number,
 202                     tile,
 203                     x,
 204                     y,
 205                     read,
 206                     pass_qc_msg,
 207                     os.linesep)
 208
 209
 210                 # if we passed the filter write to the "good" file
 211                 if pass_qc:
 212                     destination = self.pass_destination
 213                 else:
 214                     destination = self.nopass_destination
 215
 216                 destination.write(header)
 217                 destination.write(sequence[self.trim])
 218                 destination.write(os.linesep)
 219                 if self.fastq:
 220                     destination.write('+')
 221                     destination.write(os.linesep)
 222                     destination.write(quality[self.trim].tostring())
 223                     destination.write(os.linesep)
 224
 225 if __name__ == "__main__":
 226     main()