6 from htsworkflow.util import queuecommands
7 from htsworkflow.pipelines.samplekey import SampleKey
9 LOGGER = logging.getLogger(__name__)
15 def pathname_to_run_name(base):
17 Convert a pathname to a base runfolder name
18 handle the case with a trailing /
20 >>> print pathname_to_run_name("/a/b/c/run")
22 >>> print pathname_to_run_name("/a/b/c/run/")
24 >>> print pathname_to_run_name("run")
26 >>> print pathname_to_run_name("run/")
28 >>> print pathname_to_run_name("../run")
30 >>> print pathname_to_run_name("../run/")
35 base, name = os.path.split(base)
40 def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
42 make a list of command line strings that run solexa2srf or illumina2srf (selected by cmdlevel)
44 woldlab_080514_HWI-EAS229_0029_20768AAXX_8.srf
47 run_name - most of the file name (run folder name is a good choice)
48 lanes - list of integers corresponding to which lanes to process
49 site_name - name of your "sequencing site" or "Individual"
50 destdir - where to write all the srf files
53 LOGGER.info("run_name %s" % (run_name,))
57 if not isinstance(key, SampleKey):
58 errmsg = "Expected %s got %s"
59 raise ValueError(errmsg % (str(SampleKey), str(type(key))))
60 name_prefix = '%s_%%l_' % (run_name,)
61 destname = '%s_%s_%d.srf' % (site_name, run_name, key.lane)
62 destdir = os.path.normpath(destdir)
63 dest_path = os.path.join(destdir, destname)
64 seq_pattern = 's_%d_*_seq.txt' % (key.lane,)
66 if cmdlevel == SOLEXA2SRF:
72 elif cmdlevel == ILLUMINA2SRF10:
73 cmd = ['illumina2srf',
77 elif cmdlevel == ILLUMINA2SRF11:
78 seq_pattern = 's_%d_*_qseq.txt' % (key.lane,)
79 cmd = ['illumina2srf',
83 raise ValueError("Unrecognized run level %d" % (cmdlevel,))
85 LOGGER.info("Generated command: " + " ".join(cmd))
86 cmd_list.append(" ".join(cmd))
89 def create_qseq_patterns(bustard_dir):
90 """Scan a bustard directory for qseq files and determine a glob pattern
92 # grab one tile for each lane.
93 qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
94 # handle old runfolders
96 qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
99 qseqs = [ os.path.split(x)[-1] for x in qseqs ]
100 if len(qseqs[0].split('_')) == 4:
102 return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]
103 elif len(qseqs[0].split('_')) == 5:
105 # build a dictionary of read numbers by lane
106 # ( just in case we didn't run all 8 lanes )
109 sample, lane, read, tile, extension = q.split('_')
110 lanes.setdefault(lane, []).append(read)
112 # grab a lane from the dictionary
113 # I don't think it matters which one.
115 # build the list of patterns
116 for read in lanes[k]:
118 qseq_patterns.append((read, 's_%d_' + '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)))
121 raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
123 def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
125 make a list of tar command lines that pack a lane's qseq files into bzip2-compressed archives
126 generates files like:
127 woldlab_080514_HWI-EAS229_0029_20768AAXX_l8.tar.bz2
130 run_name - most of the file name (run folder name is a good choice)
131 lanes - list of integers corresponding to which lanes to process
132 site_name - name of your "sequencing site" or "Individual"
133 destdir - where to write all the srf files
136 LOGGER.info("run_name %s" % (run_name,))
140 if not isinstance(key, SampleKey):
141 errmsg = "Expected %s got %s"
142 raise ValueError(errmsg % (str(SampleKey), str(type(key))))
143 name_prefix = '%s_%%l_%%t_' % (run_name,)
144 destdir = os.path.normpath(destdir)
145 qseq_patterns = create_qseq_patterns(bustard_dir)
147 for read, pattern in qseq_patterns:
149 destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, key.lane)
150 dest_path = os.path.join(destdir, destname)
152 destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, key.lane, read)
153 dest_path = os.path.join(destdir, destname)
155 cmd = " ".join(['tar', 'cjf', dest_path, pattern % (key.lane,) ])
156 LOGGER.info("Generated command: " + cmd)
161 def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
163 copy the fastq files from each HiSeq Project_* directory into a same-named directory under destdir
165 run_name - most of the file name (run folder name is a good choice)
166 basecall_dir - location of unaligned files.
167 site_name - name of your "sequencing site" or "Individual"
168 destdir - root of where to save fastq files
171 LOGGER.info("run_name %s" % (run_name,))
174 project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
175 for project_dir in project_dirs:
176 _, project_name = os.path.split(project_dir)
177 sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
178 project_dest = os.path.join(destdir, project_name)
179 if not os.path.exists(project_dest):
180 LOGGER.info("Making: %s" % (project_dest))
181 os.mkdir(project_dest)
183 for fastq_file in sample_files:
184 shutil.copy(fastq_file, project_dest)
187 def run_commands(new_dir, cmd_list, num_jobs):
188 LOGGER.info("chdir to %s" % (new_dir,))
191 q = queuecommands.QueueCommands(cmd_list, num_jobs)
195 def make_md5_commands(destdir):
197 Scan destdir and create md5sum commands for the *.bz2, *gz and *.srf files in it
200 destdir = os.path.abspath(destdir)
201 bz2s = glob(os.path.join(destdir, "*.bz2"))
202 gzs = glob(os.path.join(destdir, "*gz"))
203 srfs = glob(os.path.join(destdir, "*.srf"))
205 file_list = bz2s + gzs + srfs
208 cmd = " ".join(['md5sum', f, '>', f + '.md5'])
209 LOGGER.info('generated command: ' + cmd)