htsworkflow/pipelines/srf.py

   1 from glob import glob
   2 import logging
   3 import os
   4
   5 from htsworkflow.util import queuecommands
   6
# Values accepted by the ``cmdlevel`` argument of make_srf_commands();
# they select which converter command line is generated.

# run solexa2srf on the s_<lane>_*_seq.txt files
SOLEXA2SRF = 0
# run illumina2srf with the -v1.0 flag on the s_<lane>_*_seq.txt files
ILLUMINA2SRF10 = 1
# run illumina2srf on the s_<lane>_*_qseq.txt files (the default)
ILLUMINA2SRF11 = 2
  10
  11 def pathname_to_run_name(base):
  12   """
  13   Convert a pathname to a base runfolder name
  14   handle the case with a trailing /
  15
  16   >>> print pathname_to_run_name("/a/b/c/run")
  17   run
  18   >>> print pathname_to_run_name("/a/b/c/run/")
  19   run
  20   >>> print pathname_to_run_name("run")
  21   run
  22   >>> print pathname_to_run_name("run/")
  23   run
  24   >>> print pathname_to_run_name("../run")
  25   run
  26   >>> print pathname_to_run_name("../run/")
  27   run
  28   """
  29   name = ""
  30   while len(name) == 0:
  31     base, name = os.path.split(base)
  32     if len(base) == 0:
  33       break
  34   return name
  35
  36 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
  37   """
  38   make a subprocess-friendly list of command line arguments to run solexa2srf
  39   generates files like:
  40   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
  41    site        run name                    lane
  42
  43   run_name - most of the file name (run folder name is a good choice)
  44   lanes - list of integers corresponding to which lanes to process
  45   site_name - name of your "sequencing site" or "Individual"
  46   destdir - where to write all the srf files
  47   """
  48   # clean up pathname
  49   logging.info("run_name %s" % (run_name,))
  50
  51   cmd_list = []
  52   for lane in lanes:
  53     name_prefix = '%s_%%l_%%t_' % (run_name,)
  54     destname = '%s_%s_%d.srf' % (site_name, run_name, lane)
  55     destdir = os.path.normpath(destdir)
  56     dest_path = os.path.join(destdir, destname)
  57     seq_pattern = 's_%d_*_seq.txt' % (lane,)
  58
  59     if cmdlevel == SOLEXA2SRF:
  60         cmd = ['solexa2srf',
  61                '-N', name_prefix,
  62                '-n', '%t:%3x:%3y',
  63                '-o', dest_path,
  64                seq_pattern]
  65     elif cmdlevel == ILLUMINA2SRF10:
  66         cmd = ['illumina2srf',
  67                '-v1.0',
  68                '-o', dest_path,
  69                seq_pattern]
  70     elif cmdlevel == ILLUMINA2SRF11:
  71         seq_pattern = 's_%d_*_qseq.txt' % (lane,)
  72         cmd = ['illumina2srf',
  73                '-o', dest_path,
  74                seq_pattern]
  75     else:
  76         raise ValueError("Unrecognized run level %d" % (cmdlevel,))
  77
  78     logging.info("Generated command: " + " ".join(cmd))
  79     cmd_list.append(" ".join(cmd))
  80   return cmd_list
  81
  82 def create_qseq_patterns(bustard_dir):
  83     """Scan a bustard directory for qseq files and determine a glob pattern
  84     """
  85     # grab one tile for each lane.
  86     qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
  87     # handle old runfolders
  88     if len(qseqs) == 0:
  89       qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
  90     if len(qseqs) == 0:
  91       r
  92     qseqs = [ os.path.split(x)[-1] for x in qseqs ]
  93     if len(qseqs[0].split('_')) == 4:
  94       # single ended
  95       return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
  96     elif len(qseqs[0].split('_')) == 5:
  97       # more than 1 read
  98       # build a dictionary of read numbers by lane
  99       # ( just in case we didn't run all 8 lanes )
 100       lanes = {}
 101       for q in qseqs:
 102         sample, lane, read, tile, extension = q.split('_')
 103         lanes.setdefault(lane, []).append(read)
 104       qseq_patterns = []
 105       # grab a lane from the dictionary
 106       # I don't think it matters which one.
 107       k = lanes.keys()[0]
 108       # build the list of patterns
 109       for read in lanes[k]:
 110         read = int(read)
 111         qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
 112       return qseq_patterns
 113     else:
 114       raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
 115
 116 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
 117   """
 118   make a subprocess-friendly list of command line arguments to run solexa2srf
 119   generates files like:
 120   woldlab:080514_HWI-EAS229_0029_20768AAXX:8.srf
 121    site        run name                    lane
 122
 123   run_name - most of the file name (run folder name is a good choice)
 124   lanes - list of integers corresponding to which lanes to process
 125   site_name - name of your "sequencing site" or "Individual"
 126   destdir - where to write all the srf files
 127   """
 128   # clean up pathname
 129   logging.info("run_name %s" % (run_name,))
 130
 131   cmd_list = []
 132   for lane in lanes:
 133     name_prefix = '%s_%%l_%%t_' % (run_name,)
 134     destdir = os.path.normpath(destdir)
 135     qseq_patterns = create_qseq_patterns(bustard_dir)
 136
 137     for read, pattern in qseq_patterns:
 138       if read is None:
 139         destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, lane)
 140         dest_path = os.path.join(destdir, destname)
 141       else:
 142         destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, lane, read)
 143         dest_path = os.path.join(destdir, destname)
 144
 145       cmd = " ".join(['tar', 'cjf', dest_path, pattern % (lane,) ])
 146       logging.info("Generated command: " + cmd)
 147       cmd_list.append(cmd)
 148
 149   return cmd_list
 150
 151 def run_commands(new_dir, cmd_list, num_jobs):
 152     logging.info("chdir to %s" % (new_dir,))
 153     curdir = os.getcwd()
 154     os.chdir(new_dir)
 155     q = queuecommands.QueueCommands(cmd_list, num_jobs)
 156     q.run()
 157     os.chdir(curdir)
 158
 159 def make_md5_commands(destdir):
 160   """
 161   Scan the cycle dir and create md5s for the contents
 162   """
 163   cmd_list = []
 164   destdir = os.path.abspath(destdir)
 165   bz2s = glob(os.path.join(destdir, "*.bz2"))
 166   gzs = glob(os.path.join(destdir, "*gz"))
 167   srfs = glob(os.path.join(destdir, "*.srf"))
 168
 169   file_list = bz2s + gzs + srfs
 170
 171   for f in file_list:
 172       cmd = " ".join(['md5sum', f, '>', f + '.md5'])
 173       logging.info('generated command: ' + cmd)
 174       cmd_list.append(cmd)
 175
 176   return cmd_list