7 from htsworkflow.util import queuecommands
8 from htsworkflow.pipelines.samplekey import SampleKey
10 LOGGER = logging.getLogger(__name__)
def pathname_to_run_name(base):
    """
    Convert a pathname to a base runfolder name,
    handling the case with a trailing /

    Only the last path component is returned; any number of trailing
    separators is ignored.  An empty or separator-only path yields "".

    >>> print(pathname_to_run_name("/a/b/c/run"))
    run
    >>> print(pathname_to_run_name("/a/b/c/run/"))
    run
    >>> print(pathname_to_run_name("run"))
    run
    >>> print(pathname_to_run_name("run/"))
    run
    >>> print(pathname_to_run_name("../run"))
    run
    >>> print(pathname_to_run_name("../run/"))
    run
    """
    # os.altsep is None on POSIX; include it so "run\\" style endings are
    # also trimmed on platforms that define an alternate separator.
    separators = os.sep + (os.altsep or "")
    return os.path.basename(base.rstrip(separators))
def make_srf_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
    """
    make a subprocess-friendly list of command lines that build srf files

    generates one file per lane, named like:
    woldlab_080514_HWI-EAS229_0029_20768AAXX_8.srf
     site        run name                    lane

    run_name - most of the file name (run folder name is a good choice)
    bustard_dir - not read here; the sequence file patterns are relative
                  globs (presumably resolved from inside this directory
                  when the commands run -- confirm against the caller)
    lanes - iterable of SampleKey objects; the .lane attribute of each
            one selects the lane to process
    site_name - name of your "sequencing site" or "Individual"
    destdir - where to write all the srf files
    cmdlevel - which converter to use: SOLEXA2SRF, ILLUMINA2SRF10 or
               ILLUMINA2SRF11

    Returns a list of command line strings, one per entry in lanes.
    Raises ValueError for a non-SampleKey lane or an unknown cmdlevel.
    """
    LOGGER.info("run_name %s" % (run_name,))

    # the destination directory does not change from lane to lane
    destdir = os.path.normpath(destdir)

    cmd_list = []
    for key in lanes:
        if not isinstance(key, SampleKey):
            errmsg = "Expected %s got %s"
            raise ValueError(errmsg % (str(SampleKey), str(type(key))))

        destname = '%s_%s_%d.srf' % (site_name, run_name, key.lane)
        dest_path = os.path.join(destdir, destname)

        # NOTE(review): the converter argument lists below were restored
        # from the surrounding structure; confirm them against the
        # solexa2srf / illumina2srf usage text.
        if cmdlevel == SOLEXA2SRF:
            # %%l is left as a literal %l for the converter to expand
            name_prefix = '%s_%%l_' % (run_name,)
            seq_pattern = 's_%d_*_seq.txt' % (key.lane,)
            cmd = ['solexa2srf',
                   '-N', name_prefix,
                   '-n', '%t:%3x:%3y',
                   '-o', dest_path,
                   seq_pattern]
        elif cmdlevel == ILLUMINA2SRF10:
            seq_pattern = 's_%d_*_seq.txt' % (key.lane,)
            cmd = ['illumina2srf',
                   '-v1.0',
                   '-o', dest_path,
                   seq_pattern]
        elif cmdlevel == ILLUMINA2SRF11:
            seq_pattern = 's_%d_*_qseq.txt' % (key.lane,)
            cmd = ['illumina2srf',
                   '-o', dest_path,
                   seq_pattern]
        else:
            # %s rather than %d: a non-numeric level must still surface
            # as ValueError instead of a formatting TypeError
            raise ValueError("Unrecognized run level %s" % (cmdlevel,))

        command = " ".join(cmd)
        LOGGER.info("Generated command: " + command)
        cmd_list.append(command)
    return cmd_list
def create_qseq_patterns(bustard_dir):
    """Scan a bustard directory for qseq files and determine a glob pattern

    One tile per lane is sampled (tile 1101, or tile 0001 for old
    runfolders) and the file name layout decides the result:

    s_<lane>_<tile>_qseq.txt          -> [(None, pattern)]
    s_<lane>_<read>_<tile>_qseq.txt   -> [(read, pattern), ...] one entry
                                         per read number, in ascending order

    Every returned pattern still contains a single %d placeholder that the
    caller fills in with the lane number.

    Raises RuntimeError when no qseq files are found or the file names
    match neither layout.
    """
    # grab one tile for each lane.
    qseqs = glob(os.path.join(bustard_dir, '*_1101_qseq.txt'))
    # handle old runfolders
    if not qseqs:
        qseqs = glob(os.path.join(bustard_dir, '*_0001_qseq.txt'))
    if not qseqs:
        raise RuntimeError("No qseq files found in %s" % (bustard_dir,))

    # glob order is arbitrary; sort so the result is reproducible
    qseqs = sorted(os.path.basename(x) for x in qseqs)
    field_count = len(qseqs[0].split('_'))

    if field_count == 4:
        # single read
        return [(None, "s_%d_[0-9][0-9][0-9][0-9]_qseq.txt")]

    if field_count == 5:
        # build a dictionary of read numbers by lane
        # ( just in case we didn't run all 8 lanes )
        lanes = {}
        for q in qseqs:
            sample, lane, read, tile, extension = q.split('_')
            lanes.setdefault(lane, []).append(int(read))

        # grab a lane from the dictionary
        # I don't think it matters which one.
        k = min(lanes)

        # build the list of patterns; only the read number is filled in
        # here, the leading %d stays in place for the lane
        qseq_patterns = []
        for read in sorted(lanes[k]):
            read_part = '%d_[0-9][0-9][0-9][0-9]_qseq.txt' % (read,)
            qseq_patterns.append((read, 's_%d_' + read_part))
        return qseq_patterns

    raise RuntimeError('unrecognized qseq pattern, not a single or multiple read pattern')
def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdlevel=ILLUMINA2SRF11):
    """
    make a subprocess-friendly list of command lines that tar up qseq files

    generates bzip2 compressed tar files named like:
    <site_name>_<run_name>_l<lane>.tar.bz2            (single read)
    <site_name>_<run_name>_l<lane>_r<read>.tar.bz2    (one per read)

    run_name - most of the file name (run folder name is a good choice)
    bustard_dir - directory scanned to work out how the qseq files are named
    lanes - iterable of SampleKey objects; the .lane attribute of each
            one selects the lane to archive
    site_name - name of your "sequencing site" or "Individual"
    destdir - where to write all the tar files
    cmdlevel - accepted for signature parity with make_srf_commands;
               not used when building tar commands

    Returns a list of command line strings.
    Raises ValueError when an entry in lanes is not a SampleKey.
    """
    LOGGER.info("run_name %s" % (run_name,))

    destdir = os.path.normpath(destdir)
    # The directory layout is the same for every lane, so scan it once,
    # and only after the first lane key has been validated.
    qseq_patterns = None

    cmd_list = []
    for key in lanes:
        if not isinstance(key, SampleKey):
            errmsg = "Expected %s got %s"
            raise ValueError(errmsg % (str(SampleKey), str(type(key))))
        if qseq_patterns is None:
            qseq_patterns = create_qseq_patterns(bustard_dir)

        for read, pattern in qseq_patterns:
            if read is None:
                destname = '%s_%s_l%d.tar.bz2' % (site_name, run_name, key.lane)
            else:
                destname = '%s_%s_l%d_r%d.tar.bz2' % (site_name, run_name, key.lane, read)
            dest_path = os.path.join(destdir, destname)

            # pattern is a relative glob with one %d slot for the lane
            cmd = " ".join(['tar', 'cjf', dest_path, pattern % (key.lane,)])
            LOGGER.info("Generated command: " + cmd)
            cmd_list.append(cmd)

    return cmd_list
def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
    """
    copy HiSeq fastq files into per-project directories under destdir

    Every <basecall_dir>/Project_*/Sample*/*.fastq* file is copied into
    <destdir>/<Project_name>/, creating that directory when needed.
    The copy happens immediately; nothing is queued.

    run_name - only logged
    basecall_dir - location of unaligned files.
    site_name - not used when copying
    destdir - root of where to save fastq files

    Returns an empty list, because no command lines are left to run.
    NOTE(review): the original return value is not visible in this
    extract -- confirm no caller relies on something else.
    """
    LOGGER.info("run_name %s" % (run_name,))

    project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
    for project_dir in project_dirs:
        _, project_name = os.path.split(project_dir)
        sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
        project_dest = os.path.join(destdir, project_name)
        if not os.path.exists(project_dest):
            LOGGER.info("Making: %s" % (project_dest))
            # makedirs so a not-yet-existing destdir is created as well
            os.makedirs(project_dest)

        for fastq_file in sample_files:
            shutil.copy(fastq_file, project_dest)

    return []
def run_commands(new_dir, cmd_list, num_jobs):
    """
    Run cmd_list through a QueueCommands instance from inside new_dir

    new_dir - directory to change into while the commands run
    cmd_list - command lines handed to queuecommands.QueueCommands
    num_jobs - second argument handed to queuecommands.QueueCommands

    The previous working directory is restored afterwards, even when
    running the queue raises.
    """
    LOGGER.info("chdir to %s" % (new_dir,))
    curdir = os.getcwd()
    os.chdir(new_dir)
    try:
        q = queuecommands.QueueCommands(cmd_list, num_jobs)
        # NOTE(review): assumes QueueCommands exposes run(); confirm
        # against htsworkflow.util.queuecommands
        q.run()
    finally:
        os.chdir(curdir)
def make_md5_commands(destdir):
    """
    Scan the cycle dir and create md5s for the contents

    destdir - directory searched for *.bz2, *gz and *.srf files

    Returns a list of shell command strings of the form
    "md5sum <file> > <file>.md5", using absolute paths.  The commands
    rely on shell redirection, so the file names are shell-quoted.
    """
    # shlex.quote is missing on Python 2, where pipes.quote does the same
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    destdir = os.path.abspath(destdir)

    # keep the bz2 / gz / srf grouping, but make each group's order stable
    file_list = []
    for pattern in ("*.bz2", "*gz", "*.srf"):
        file_list.extend(sorted(glob(os.path.join(destdir, pattern))))

    cmd_list = []
    for f in file_list:
        cmd = " ".join(['md5sum', quote(f), '>', quote(f + '.md5')])
        LOGGER.info('generated command: ' + cmd)
        cmd_list.append(cmd)

    return cmd_list
def main(cmdline=None):
    """
    Command line entry point: archive the raw sequence data of one run

    cmdline - list of arguments to parse, or None to let optparse read
              sys.argv[1:]

    Exactly one positional argument (the source directory) is required,
    along with --name, --destination and --site-name.  Missing options
    are reported through parser.error.
    Raises ValueError for an unrecognised --format.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    logging.basicConfig(level=logging.DEBUG)
    if not opts.name:
        parser.error("Specify run name. Usually runfolder name")
    if not opts.destination:
        parser.error("Specify where to write sequence files")
    if not opts.site_name:
        parser.error("Specify site name")
    if len(args) != 1:
        parser.error("Can only process one directory")

    source = args[0]
    LOGGER.info("Raw Format is: %s" % (opts.format, ))

    # fastq files are copied directly, so that branch queues nothing
    seq_cmds = []
    # NOTE(review): opts.lanes is the raw --lanes string, while the
    # make_*_commands helpers reject anything that is not a SampleKey;
    # confirm where the conversion is meant to happen.
    if opts.format == 'fastq':
        LOGGER.info("raw data = %s" % (source,))
        copy_hiseq_project_fastqs(opts.name, source, opts.site_name, opts.destination)
    elif opts.format == 'qseq':
        seq_cmds = make_qseq_commands(opts.name, source, opts.lanes, opts.site_name, opts.destination)
    elif opts.format == 'srf':
        # NOTE(review): 0 is passed as cmdlevel; presumably SOLEXA2SRF -- confirm
        seq_cmds = make_srf_commands(opts.name, source, opts.lanes, opts.site_name, opts.destination, 0)
    else:
        raise ValueError('Unknown --format=%s' % (opts.format))

    # The parser may not define a job-count option; fall back to one job.
    num_jobs = getattr(opts, 'jobs', None) or 1
    if seq_cmds:
        run_commands(source, seq_cmds, num_jobs)
def make_parser():
    """
    Build the optparse parser for the command line interface

    Options: --format (default 'fastq'), --name, --destination,
    --site-name, --lanes (default "1,2,3,4,5,6,7,8") and --jobs
    (default 1).  The source directory is a positional argument.
    """
    parser = optparse.OptionParser(usage="%prog [options] source_directory")
    parser.add_option('-f', '--format', default='fastq',
                      help="Format raw data is in")
    parser.add_option('-n', '--name', default=None,
                      help="Specify run name")
    parser.add_option('-d', '--destination', default=None,
                      help='specify where to write files (cycle dir)')
    parser.add_option('-s', '--site-name', default=None,
                      help="specify site name")
    parser.add_option('-l', '--lanes', default="1,2,3,4,5,6,7,8",
                      help="what lanes to process, defaults to all")
    # the command runner needs a job count, which had no option before
    parser.add_option('-j', '--jobs', default=1, type='int',
                      help="how many commands to run at once, defaults to 1")
    return parser
257 if __name__ == "__main__":