htsworkflow/pipelines/srf.py

   1 from glob import glob
   2 import logging
   3 import os
   4 import shutil
   5
   6 from htsworkflow.util import queuecommands
   7
   8 LOGGER = logging.getLogger(__name__)
   9
  10 SOLEXA2SRF = 0
  11 ILLUMINA2SRF10 = 1
  12 ILLUMINA2SRF11 = 2
  13
  14 def pathname_to_run_name(base):
  15   """
  16   Convert a pathname to a base runfolder name
  17   handle the case with a trailing /
  18
  19   >>> print pathname_to_run_name("/a/b/c/run")
  20   run
  21   >>> print pathname_to_run_name("/a/b/c/run/")
  22   run
  23   >>> print pathname_to_run_name("run")
  24   run
  25   >>> print pathname_to_run_name("run/")
  26   run
  27   >>> print pathname_to_run_name("../run")
  28   run
  29   >>> print pathname_to_run_name("../run/")
  30   run
  31   """
  32   name = ""
  33   while len(name) == 0:
  34     base, name = os.path.split(base)
  35     if len(base) == 0:
  36       break
  37   return name
  38
  39 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
  40   """
  41   make a subprocess-friendly list of command line arguments to run solexa2srf
  42   generates files like:
  43   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
  44    site        run name                    lane
  45
  46   run_name - most of the file name (run folder name is a good choice)
  47   lanes - list of integers corresponding to which lanes to process
  48   site_name - name of your "sequencing site" or "Individual"
  49   destdir - where to write all the srf files
  50   """
  51   # clean up pathname
  52   LOGGER.info("run_name %s" % (run_name,))
  53
  54   cmd_list = []
  55   for lane in lanes:
  56     name_prefix = '%s_%%l_' % (run_name,)
  57     destname = '%s_%s_%d.srf' % (site_name, run_name, lane)
  58     destdir = os.path.normpath(destdir)
  59     dest_path = os.path.join(destdir, destname)
  60     seq_pattern = 's_%d_*_seq.txt' % (lane,)
  61
  62     if cmdlevel == SOLEXA2SRF:
  63         cmd = ['solexa2srf',
  64                '-N', name_prefix,
  65                '-n', '%t:%3x:%3y',
  66                '-o', dest_path,
  67                seq_pattern]
  68     elif cmdlevel == ILLUMINA2SRF10:
  69         cmd = ['illumina2srf',
  70                '-v1.0',
  71                '-o', dest_path,
  72                seq_pattern]
  73     elif cmdlevel == ILLUMINA2SRF11:
  74         seq_pattern = 's_%d_*_qseq.txt' % (lane,)
  75         cmd = ['illumina2srf',
  76                '-o', dest_path,
  77                seq_pattern]
  78     else:
  79         raise ValueError("Unrecognized run level %d" % (cmdlevel,))
  80
  81     LOGGER.info("Generated command: " + " ".join(cmd))
  82     cmd_list.append(" ".join(cmd))
  83   return cmd_list
  84
  85 def create_qseq_patterns(bustard_dir):
  86     """Scan a bustard directory for qseq files and determine a glob pattern
  87     """
  88     # grab one tile for each lane.
  89     qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
  90     # handle old runfolders
  91     if len(qseqs) == 0:
  92       qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
  93     if len(qseqs) == 0:
  94       r
  95     qseqs = [ os.path.split(x)[-1] for x in qseqs ]
  96     if len(qseqs[0].split('_')) == 4:
  97       # single ended
  98       return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
  99     elif len(qseqs[0].split('_')) == 5:
 100       # more than 1 read
 101       # build a dictionary of read numbers by lane
 102       # ( just in case we didn't run all 8 lanes )
 103       lanes = {}
 104       for q in qseqs:
 105         sample, lane, read, tile, extension = q.split('_')
 106         lanes.setdefault(lane, []).append(read)
 107       qseq_patterns = []
 108       # grab a lane from the dictionary
 109       # I don't think it matters which one.
 110       k = lanes.keys()[0]
 111       # build the list of patterns
 112       for read in lanes[k]:
 113         read = int(read)
 114         qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
 115       return qseq_patterns
 116     else:
 117       raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
 118
 119 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
 120   """
 121   make a subprocess-friendly list of command line arguments to run solexa2srf
 122   generates files like:
 123   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
 124    site        run name                    lane
 125
 126   run_name - most of the file name (run folder name is a good choice)
 127   lanes - list of integers corresponding to which lanes to process
 128   site_name - name of your "sequencing site" or "Individual"
 129   destdir - where to write all the srf files
 130   """
 131   # clean up pathname
 132   LOGGER.info("run_name %s" % (run_name,))
 133
 134   cmd_list = []
 135   for lane in lanes:
 136     name_prefix = '%s_%%l_%%t_' % (run_name,)
 137     destdir = os.path.normpath(destdir)
 138     qseq_patterns = create_qseq_patterns(bustard_dir)
 139
 140     for read, pattern in qseq_patterns:
 141       if read is None:
 142         destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, lane)
 143         dest_path = os.path.join(destdir, destname)
 144       else:
 145         destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, lane, read)
 146         dest_path = os.path.join(destdir, destname)
 147
 148       cmd = " ".join(['tar', 'cjf', dest_path, pattern % (lane,) ])
 149       LOGGER.info("Generated command: " + cmd)
 150       cmd_list.append(cmd)
 151
 152   return cmd_list
 153
 154 def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
 155     """
 156     make a subprocess-friendly list of command line arguments to save HiSeq fastq files
 157
 158     run_name - most of the file name (run folder name is a good choice)
 159     basecall_dir - location of unaligned files.
 160     site_name - name of your "sequencing site" or "Individual"
 161     destdir - root of where to save fastq files
 162     """
 163     # clean up pathname
 164     LOGGER.info("run_name %s" % (run_name,))
 165
 166     cmd_list = []
 167     project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
 168     for project_dir in project_dirs:
 169         _, project_name = os.path.split(project_dir)
 170         sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
 171         project_dest = os.path.join(destdir, project_name)
 172         if not os.path.exists(project_dest):
 173             LOGGER.info("Making: %s" % (project_dest))
 174             os.mkdir(project_dest)
 175
 176         for fastq_file in sample_files:
 177             shutil.copy(fastq_file, project_dest)
 178
 179
 180 def run_commands(new_dir, cmd_list, num_jobs):
 181     LOGGER.info("chdir to %s" % (new_dir,))
 182     curdir = os.getcwd()
 183     os.chdir(new_dir)
 184     q = queuecommands.QueueCommands(cmd_list, num_jobs)
 185     q.run()
 186     os.chdir(curdir)
 187
 188 def make_md5_commands(destdir):
 189   """
 190   Scan the cycle dir and create md5s for the contents
 191   """
 192   cmd_list = []
 193   destdir = os.path.abspath(destdir)
 194   bz2s = glob(os.path.join(destdir, "*.bz2"))
 195   gzs = glob(os.path.join(destdir, "*gz"))
 196   srfs = glob(os.path.join(destdir, "*.srf"))
 197
 198   file_list = bz2s + gzs + srfs
 199
 200   for f in file_list:
 201       cmd = " ".join(['md5sum', f, '>', f + '.md5'])
 202       LOGGER.info('generated command: ' + cmd)
 203       cmd_list.append(cmd)
 204
 205   return cmd_list
 206