htsworkflow/pipelines/srf.py

   1 import optparse
   2 from glob import glob
   3 import logging
   4 import os
   5 import shutil
   6
   7 from htsworkflow.util import queuecommands
   8 from htsworkflow.pipelines.samplekey import SampleKey
   9
  10 LOGGER = logging.getLogger(__name__)
  11
  12 SOLEXA2SRF = 0
  13 ILLUMINA2SRF10 = 1
  14 ILLUMINA2SRF11 = 2
  15
  16 def pathname_to_run_name(base):
  17   """
  18   Convert a pathname to a base runfolder name
  19   handle the case with a trailing /
  20
  21   >>> print pathname_to_run_name("/a/b/c/run")
  22   run
  23   >>> print pathname_to_run_name("/a/b/c/run/")
  24   run
  25   >>> print pathname_to_run_name("run")
  26   run
  27   >>> print pathname_to_run_name("run/")
  28   run
  29   >>> print pathname_to_run_name("../run")
  30   run
  31   >>> print pathname_to_run_name("../run/")
  32   run
  33   """
  34   name = ""
  35   while len(name) == 0:
  36     base, name = os.path.split(base)
  37     if len(base) == 0:
  38       break
  39   return name
  40
  41 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
  42   """
  43   make a subprocess-friendly list of command line arguments to run solexa2srf
  44   generates files like:
  45   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
  46   site        run name                    lane
  47
  48   run_name - most of the file name (run folder name is a good choice)
  49   lanes - list of integers corresponding to which lanes to process
  50   site_name - name of your "sequencing site" or "Individual"
  51   destdir - where to write all the srf files
  52   """
  53   # clean up pathname
  54   LOGGER.info("run_name %s" % (run_name,))
  55
  56   cmd_list = []
  57   for key in lanes:
  58     if not isinstance(key, SampleKey):
  59        errmsg = "Expected %s got %s"
  60        raise ValueError(errmsg % (str(SampleKey), str(type(key))))
  61     name_prefix = '%s_%%l_' % (run_name,)
  62     destname = '%s_%s_%d.srf' % (site_name, run_name, key.lane)
  63     destdir = os.path.normpath(destdir)
  64     dest_path = os.path.join(destdir, destname)
  65     seq_pattern = 's_%d_*_seq.txt' % (key.lane,)
  66
  67     if cmdlevel == SOLEXA2SRF:
  68         cmd = ['solexa2srf',
  69                '-N', name_prefix,
  70                '-n', '%t:%3x:%3y',
  71                '-o', dest_path,
  72                seq_pattern]
  73     elif cmdlevel == ILLUMINA2SRF10:
  74         cmd = ['illumina2srf',
  75                '-v1.0',
  76                '-o', dest_path,
  77                seq_pattern]
  78     elif cmdlevel == ILLUMINA2SRF11:
  79         seq_pattern = 's_%d_*_qseq.txt' % (key.lane,)
  80         cmd = ['illumina2srf',
  81                '-o', dest_path,
  82                seq_pattern]
  83     else:
  84         raise ValueError("Unrecognized run level %d" % (cmdlevel,))
  85
  86     LOGGER.info("Generated command: " + " ".join(cmd))
  87     cmd_list.append(" ".join(cmd))
  88   return cmd_list
  89
  90 def create_qseq_patterns(bustard_dir):
  91     """Scan a bustard directory for qseq files and determine a glob pattern
  92     """
  93     # grab one tile for each lane.
  94     qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
  95     # handle old runfolders
  96     if len(qseqs) == 0:
  97       qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
  98     if len(qseqs) == 0:
  99       r
 100     qseqs = [ os.path.split(x)[-1] for x in qseqs ]
 101     if len(qseqs[0].split('_')) == 4:
 102       # single ended
 103       return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
 104     elif len(qseqs[0].split('_')) == 5:
 105       # more than 1 read
 106       # build a dictionary of read numbers by lane
 107       # ( just in case we didn't run all 8 lanes )
 108       lanes = {}
 109       for q in qseqs:
 110         sample, lane, read, tile, extension = q.split('_')
 111         lanes.setdefault(lane, []).append(read)
 112       qseq_patterns = []
 113       # grab a lane from the dictionary
 114       # I don't think it matters which one.
 115       k = lanes.keys()[0]
 116       # build the list of patterns
 117       for read in lanes[k]:
 118         read = int(read)
 119         qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
 120       return qseq_patterns
 121     else:
 122       raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
 123
 124 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
 125   """
 126   make a subprocess-friendly list of command line arguments to run solexa2srf
 127   generates files like:
 128   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
 129    site        run name                    lane
 130
 131   run_name - most of the file name (run folder name is a good choice)
 132   lanes - list of integers corresponding to which lanes to process
 133   site_name - name of your "sequencing site" or "Individual"
 134   destdir - where to write all the srf files
 135   """
 136   # clean up pathname
 137   LOGGER.info("run_name %s" % (run_name,))
 138
 139   cmd_list = []
 140   for key in lanes:
 141     if not isinstance(key, SampleKey):
 142       errmsg = "Expected %s got %s"
 143       raise ValueError(errmsg % (str(SampleKey), str(type(key))))
 144     name_prefix = '%s_%%l_%%t_' % (run_name,)
 145     destdir = os.path.normpath(destdir)
 146     qseq_patterns = create_qseq_patterns(bustard_dir)
 147
 148     for read, pattern in qseq_patterns:
 149       if read is None:
 150         destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, key.lane)
 151         dest_path = os.path.join(destdir, destname)
 152       else:
 153         destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, key.lane, read)
 154         dest_path = os.path.join(destdir, destname)
 155
 156       cmd = " ".join(['tar', 'cjf', dest_path, pattern % (key.lane,) ])
 157       LOGGER.info("Generated command: " + cmd)
 158       cmd_list.append(cmd)
 159
 160   return cmd_list
 161
 162 def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
 163     """
 164     make a subprocess-friendly list of command line arguments to save HiSeq fastq files
 165
 166     run_name - most of the file name (run folder name is a good choice)
 167     basecall_dir - location of unaligned files.
 168     site_name - name of your "sequencing site" or "Individual"
 169     destdir - root of where to save fastq files
 170     """
 171     # clean up pathname
 172     LOGGER.info("run_name %s" % (run_name,))
 173
 174     cmd_list = []
 175     project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
 176     for project_dir in project_dirs:
 177         _, project_name = os.path.split(project_dir)
 178         sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
 179         project_dest = os.path.join(destdir, project_name)
 180         if not os.path.exists(project_dest):
 181             LOGGER.info("Making: %s" % (project_dest))
 182             os.mkdir(project_dest)
 183
 184         for fastq_file in sample_files:
 185             shutil.copy(fastq_file, project_dest)
 186
 187
 188 def run_commands(new_dir, cmd_list, num_jobs):
 189     LOGGER.info("chdir to %s" % (new_dir,))
 190     curdir = os.getcwd()
 191     os.chdir(new_dir)
 192     q = queuecommands.QueueCommands(cmd_list, num_jobs)
 193     q.run()
 194     os.chdir(curdir)
 195
 196 def make_md5_commands(destdir):
 197   """
 198   Scan the cycle dir and create md5s for the contents
 199   """
 200   cmd_list = []
 201   destdir = os.path.abspath(destdir)
 202   bz2s = glob(os.path.join(destdir, "*.bz2"))
 203   gzs = glob(os.path.join(destdir, "*gz"))
 204   srfs = glob(os.path.join(destdir, "*.srf"))
 205
 206   file_list = bz2s + gzs + srfs
 207
 208   for f in file_list:
 209       cmd = " ".join(['md5sum', f, '>', f + '.md5'])
 210       LOGGER.info('generated command: ' + cmd)
 211       cmd_list.append(cmd)
 212
 213   return cmd_list
 214
 215 def main(cmdline=None):
 216     parser = make_parser()
 217     opts, args = parser.parse_args(cmdline)
 218
 219     logging.basicConfig(level = logging.DEBUG)
 220     if not opts.name:
 221         parser.error("Specify run name. Usually runfolder name")
 222     if not opts.destination:
 223         parser.error("Specify where to write sequence files")
 224     if not opts.site_name:
 225         parser.error("Specify site name")
 226     if len(args) != 1:
 227         parser.error("Can only process one directory")
 228
 229     source = args[0]
 230     LOGGER.info("Raw Format is: %s" % (opts.format, ))
 231     seq_cmds = []
 232     if opts.format == 'fastq':
 233         LOGGER.info("raw data = %s" % (source,))
 234         copy_hiseq_project_fastqs(opts.name, source, opts.site_name, opts.destination)
 235     elif opts.format == 'qseq':
 236         seq_cmds = make_qseq_commands(opts.name, source, opts.lanes, opts.site_name, opts.destination)
 237     elif opts.format == 'srf':
 238         seq_cmds = make_srf_commands(opts.name, source, opts.lanes, opts.site_name, opts.destination, 0)
 239     else:
 240         raise ValueError('Unknown --format=%s' % (opts.format))
 241     print seq_cmds
 242     srf.run_commands(args.source, seq_cmds, num_jobs)
 243
 244 def make_parser():
 245     parser = optparse.OptionParser()
 246     parser.add_option('-f', '--format', default='fastq',
 247                         help="Format raw data is in")
 248     parser.add_option('-n', '--name', default=None,
 249                         help="Specify run name")
 250     parser.add_option('-d', '--destination', default=None,
 251                         help='specify where to write files  (cycle dir)')
 252     parser.add_option('-s', '--site-name', default=None,
 253                         help="specify site name")
 254     parser.add_option('-l', '--lanes', default="1,2,3,4,5,6,7,8",
 255                         help="what lanes to process, defaults to all")
 256     return parser
 257 if __name__ == "__main__":
 258     main()