htsworkflow/pipelines/srf.py

   1 from glob import glob
   2 import logging
   3 import os
   4 import shutil
   5
   6 from htsworkflow.util import queuecommands
   7 from htsworkflow.pipelines.samplekey import SampleKey
   8
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Values accepted by the cmdlevel argument of make_srf_commands:
# SOLEXA2SRF selects the solexa2srf command line (reads s_<lane>_*_seq.txt).
SOLEXA2SRF = 0
# ILLUMINA2SRF10 selects illumina2srf with -v1.0 (reads s_<lane>_*_seq.txt).
ILLUMINA2SRF10 = 1
# ILLUMINA2SRF11 selects illumina2srf without -v1.0 (reads s_<lane>_*_qseq.txt).
ILLUMINA2SRF11 = 2
  14
  15 def pathname_to_run_name(base):
  16   """
  17   Convert a pathname to a base runfolder name
  18   handle the case with a trailing /
  19
  20   >>> print pathname_to_run_name("/a/b/c/run")
  21   run
  22   >>> print pathname_to_run_name("/a/b/c/run/")
  23   run
  24   >>> print pathname_to_run_name("run")
  25   run
  26   >>> print pathname_to_run_name("run/")
  27   run
  28   >>> print pathname_to_run_name("../run")
  29   run
  30   >>> print pathname_to_run_name("../run/")
  31   run
  32   """
  33   name = ""
  34   while len(name) == 0:
  35     base, name = os.path.split(base)
  36     if len(base) == 0:
  37       break
  38   return name
  39
  40 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
  41   """
  42   make a subprocess-friendly list of command line arguments to run solexa2srf
  43   generates files like:
  44   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
  45   site        run name                    lane
  46
  47   run_name - most of the file name (run folder name is a good choice)
  48   lanes - list of integers corresponding to which lanes to process
  49   site_name - name of your "sequencing site" or "Individual"
  50   destdir - where to write all the srf files
  51   """
  52   # clean up pathname
  53   LOGGER.info("run_name %s" % (run_name,))
  54
  55   cmd_list = []
  56   for key in lanes:
  57     if not isinstance(key, SampleKey):
  58        errmsg = "Expected %s got %s"
  59        raise ValueError(errmsg % (str(SampleKey), str(type(key))))
  60     name_prefix = '%s_%%l_' % (run_name,)
  61     destname = '%s_%s_%d.srf' % (site_name, run_name, key.lane)
  62     destdir = os.path.normpath(destdir)
  63     dest_path = os.path.join(destdir, destname)
  64     seq_pattern = 's_%d_*_seq.txt' % (key.lane,)
  65
  66     if cmdlevel == SOLEXA2SRF:
  67         cmd = ['solexa2srf',
  68                '-N', name_prefix,
  69                '-n', '%t:%3x:%3y',
  70                '-o', dest_path,
  71                seq_pattern]
  72     elif cmdlevel == ILLUMINA2SRF10:
  73         cmd = ['illumina2srf',
  74                '-v1.0',
  75                '-o', dest_path,
  76                seq_pattern]
  77     elif cmdlevel == ILLUMINA2SRF11:
  78         seq_pattern = 's_%d_*_qseq.txt' % (key.lane,)
  79         cmd = ['illumina2srf',
  80                '-o', dest_path,
  81                seq_pattern]
  82     else:
  83         raise ValueError("Unrecognized run level %d" % (cmdlevel,))
  84
  85     LOGGER.info("Generated command: " + " ".join(cmd))
  86     cmd_list.append(" ".join(cmd))
  87   return cmd_list
  88
  89 def create_qseq_patterns(bustard_dir):
  90     """Scan a bustard directory for qseq files and determine a glob pattern
  91     """
  92     # grab one tile for each lane.
  93     qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
  94     # handle old runfolders
  95     if len(qseqs) == 0:
  96       qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
  97     if len(qseqs) == 0:
  98       r
  99     qseqs = [ os.path.split(x)[-1] for x in qseqs ]
 100     if len(qseqs[0].split('_')) == 4:
 101       # single ended
 102       return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
 103     elif len(qseqs[0].split('_')) == 5:
 104       # more than 1 read
 105       # build a dictionary of read numbers by lane
 106       # ( just in case we didn't run all 8 lanes )
 107       lanes = {}
 108       for q in qseqs:
 109         sample, lane, read, tile, extension = q.split('_')
 110         lanes.setdefault(lane, []).append(read)
 111       qseq_patterns = []
 112       # grab a lane from the dictionary
 113       # I don't think it matters which one.
 114       k = list(lanes.keys())[0]
 115       # build the list of patterns
 116       for read in lanes[k]:
 117         read = int(read)
 118         qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
 119       return qseq_patterns
 120     else:
 121       raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
 122
 123 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
 124   """
 125   make a subprocess-friendly list of command line arguments to run solexa2srf
 126   generates files like:
 127   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
 128    site        run name                    lane
 129
 130   run_name - most of the file name (run folder name is a good choice)
 131   lanes - list of integers corresponding to which lanes to process
 132   site_name - name of your "sequencing site" or "Individual"
 133   destdir - where to write all the srf files
 134   """
 135   # clean up pathname
 136   LOGGER.info("run_name %s" % (run_name,))
 137
 138   cmd_list = []
 139   for key in lanes:
 140     if not isinstance(key, SampleKey):
 141       errmsg = "Expected %s got %s"
 142       raise ValueError(errmsg % (str(SampleKey), str(type(key))))
 143     name_prefix = '%s_%%l_%%t_' % (run_name,)
 144     destdir = os.path.normpath(destdir)
 145     qseq_patterns = create_qseq_patterns(bustard_dir)
 146
 147     for read, pattern in qseq_patterns:
 148       if read is None:
 149         destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, key.lane)
 150         dest_path = os.path.join(destdir, destname)
 151       else:
 152         destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, key.lane, read)
 153         dest_path = os.path.join(destdir, destname)
 154
 155       cmd = " ".join(['tar', 'cjf', dest_path, pattern % (key.lane,) ])
 156       LOGGER.info("Generated command: " + cmd)
 157       cmd_list.append(cmd)
 158
 159   return cmd_list
 160
 161 def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
 162     """
 163     make a subprocess-friendly list of command line arguments to save HiSeq fastq files
 164
 165     run_name - most of the file name (run folder name is a good choice)
 166     basecall_dir - location of unaligned files.
 167     site_name - name of your "sequencing site" or "Individual"
 168     destdir - root of where to save fastq files
 169     """
 170     # clean up pathname
 171     LOGGER.info("run_name %s" % (run_name,))
 172
 173     cmd_list = []
 174     project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
 175     for project_dir in project_dirs:
 176         _, project_name = os.path.split(project_dir)
 177         sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
 178         project_dest = os.path.join(destdir, project_name)
 179         if not os.path.exists(project_dest):
 180             LOGGER.info("Making: %s" % (project_dest))
 181             os.mkdir(project_dest)
 182
 183         for fastq_file in sample_files:
 184             shutil.copy(fastq_file, project_dest)
 185
 186
 187 def run_commands(new_dir, cmd_list, num_jobs):
 188     LOGGER.info("chdir to %s" % (new_dir,))
 189     curdir = os.getcwd()
 190     os.chdir(new_dir)
 191     q = queuecommands.QueueCommands(cmd_list, num_jobs)
 192     q.run()
 193     os.chdir(curdir)
 194
 195 def make_md5_commands(destdir):
 196   """
 197   Scan the cycle dir and create md5s for the contents
 198   """
 199   cmd_list = []
 200   destdir = os.path.abspath(destdir)
 201   bz2s = glob(os.path.join(destdir, "*.bz2"))
 202   gzs = glob(os.path.join(destdir, "*gz"))
 203   srfs = glob(os.path.join(destdir, "*.srf"))
 204
 205   file_list = bz2s + gzs + srfs
 206
 207   for f in file_list:
 208       cmd = " ".join(['md5sum', f, '>', f + '.md5'])
 209       LOGGER.info('generated command: ' + cmd)
 210       cmd_list.append(cmd)
 211
 212   return cmd_list
 213