htsworkflow/pipelines/srf.py

   1 from glob import glob
   2 import logging
   3 import os
   4
   5 from htsworkflow.util import queuecommands
   6
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Values accepted by the ``cmdlevel`` argument of make_srf_commands; they
# select which converter command line is generated:
#   SOLEXA2SRF     - ``solexa2srf`` reading ``s_<lane>_*_seq.txt``
#   ILLUMINA2SRF10 - ``illumina2srf -v1.0`` reading ``s_<lane>_*_seq.txt``
#   ILLUMINA2SRF11 - ``illumina2srf`` reading ``s_<lane>_*_qseq.txt``
SOLEXA2SRF = 0
ILLUMINA2SRF10 = 1
ILLUMINA2SRF11 = 2
  12
  13 def pathname_to_run_name(base):
  14   """
  15   Convert a pathname to a base runfolder name
  16   handle the case with a trailing /
  17
  18   >>> print pathname_to_run_name("/a/b/c/run")
  19   run
  20   >>> print pathname_to_run_name("/a/b/c/run/")
  21   run
  22   >>> print pathname_to_run_name("run")
  23   run
  24   >>> print pathname_to_run_name("run/")
  25   run
  26   >>> print pathname_to_run_name("../run")
  27   run
  28   >>> print pathname_to_run_name("../run/")
  29   run
  30   """
  31   name = ""
  32   while len(name) == 0:
  33     base, name = os.path.split(base)
  34     if len(base) == 0:
  35       break
  36   return name
  37
  38 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
  39   """
  40   make a subprocess-friendly list of command line arguments to run solexa2srf
  41   generates files like:
  42   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
  43    site        run name                    lane
  44
  45   run_name - most of the file name (run folder name is a good choice)
  46   lanes - list of integers corresponding to which lanes to process
  47   site_name - name of your "sequencing site" or "Individual"
  48   destdir - where to write all the srf files
  49   """
  50   # clean up pathname
  51   LOGGER.info("run_name %s" % (run_name,))
  52
  53   cmd_list = []
  54   for lane in lanes:
  55     name_prefix = '%s_%%l_' % (run_name,)
  56     destname = '%s_%s_%d.srf' % (site_name, run_name, lane)
  57     destdir = os.path.normpath(destdir)
  58     dest_path = os.path.join(destdir, destname)
  59     seq_pattern = 's_%d_*_seq.txt' % (lane,)
  60
  61     if cmdlevel == SOLEXA2SRF:
  62         cmd = ['solexa2srf',
  63                '-N', name_prefix,
  64                '-n', '%t:%3x:%3y',
  65                '-o', dest_path,
  66                seq_pattern]
  67     elif cmdlevel == ILLUMINA2SRF10:
  68         cmd = ['illumina2srf',
  69                '-v1.0',
  70                '-o', dest_path,
  71                seq_pattern]
  72     elif cmdlevel == ILLUMINA2SRF11:
  73         seq_pattern = 's_%d_*_qseq.txt' % (lane,)
  74         cmd = ['illumina2srf',
  75                '-o', dest_path,
  76                seq_pattern]
  77     else:
  78         raise ValueError("Unrecognized run level %d" % (cmdlevel,))
  79
  80     LOGGER.info("Generated command: " + " ".join(cmd))
  81     cmd_list.append(" ".join(cmd))
  82   return cmd_list
  83
  84 def create_qseq_patterns(bustard_dir):
  85     """Scan a bustard directory for qseq files and determine a glob pattern
  86     """
  87     # grab one tile for each lane.
  88     qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
  89     # handle old runfolders
  90     if len(qseqs) == 0:
  91       qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
  92     if len(qseqs) == 0:
  93       r
  94     qseqs = [ os.path.split(x)[-1] for x in qseqs ]
  95     if len(qseqs[0].split('_')) == 4:
  96       # single ended
  97       return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
  98     elif len(qseqs[0].split('_')) == 5:
  99       # more than 1 read
 100       # build a dictionary of read numbers by lane
 101       # ( just in case we didn't run all 8 lanes )
 102       lanes = {}
 103       for q in qseqs:
 104         sample, lane, read, tile, extension = q.split('_')
 105         lanes.setdefault(lane, []).append(read)
 106       qseq_patterns = []
 107       # grab a lane from the dictionary
 108       # I don't think it matters which one.
 109       k = lanes.keys()[0]
 110       # build the list of patterns
 111       for read in lanes[k]:
 112         read = int(read)
 113         qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
 114       return qseq_patterns
 115     else:
 116       raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
 117
 118 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
 119   """
 120   make a subprocess-friendly list of command line arguments to run solexa2srf
 121   generates files like:
 122   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
 123    site        run name                    lane
 124
 125   run_name - most of the file name (run folder name is a good choice)
 126   lanes - list of integers corresponding to which lanes to process
 127   site_name - name of your "sequencing site" or "Individual"
 128   destdir - where to write all the srf files
 129   """
 130   # clean up pathname
 131   LOGGER.info("run_name %s" % (run_name,))
 132
 133   cmd_list = []
 134   for lane in lanes:
 135     name_prefix = '%s_%%l_%%t_' % (run_name,)
 136     destdir = os.path.normpath(destdir)
 137     qseq_patterns = create_qseq_patterns(bustard_dir)
 138
 139     for read, pattern in qseq_patterns:
 140       if read is None:
 141         destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, lane)
 142         dest_path = os.path.join(destdir, destname)
 143       else:
 144         destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, lane, read)
 145         dest_path = os.path.join(destdir, destname)
 146
 147       cmd = " ".join(['tar', 'cjf', dest_path, pattern % (lane,) ])
 148       LOGGER.info("Generated command: " + cmd)
 149       cmd_list.append(cmd)
 150
 151   return cmd_list
 152
 153 def run_commands(new_dir, cmd_list, num_jobs):
 154     LOGGER.info("chdir to %s" % (new_dir,))
 155     curdir = os.getcwd()
 156     os.chdir(new_dir)
 157     q = queuecommands.QueueCommands(cmd_list, num_jobs)
 158     q.run()
 159     os.chdir(curdir)
 160
 161 def make_md5_commands(destdir):
 162   """
 163   Scan the cycle dir and create md5s for the contents
 164   """
 165   cmd_list = []
 166   destdir = os.path.abspath(destdir)
 167   bz2s = glob(os.path.join(destdir, "*.bz2"))
 168   gzs = glob(os.path.join(destdir, "*gz"))
 169   srfs = glob(os.path.join(destdir, "*.srf"))
 170
 171   file_list = bz2s + gzs + srfs
 172
 173   for f in file_list:
 174       cmd = " ".join(['md5sum', f, '>', f + '.md5'])
 175       LOGGER.info('generated command: ' + cmd)
 176       cmd_list.append(cmd)
 177
 178   return cmd_list
 179