#!/usr/bin/python
"""
Make a tree of hard links organized by library id.

For every library known to the flowcell tracking database a directory
named after the library id is created under ``<archive>/libraries``.
The eland result files and srf files belonging to each (flowcell, lane)
pair of that library are then hard linked into that directory.
"""
from glob import glob
import logging
from optparse import OptionParser
import os
import stat
import sys

try:
    from ConfigParser import SafeConfigParser
except ImportError:
    # Python 3 renamed the module, and its ConfigParser is the "safe" variant.
    from configparser import ConfigParser as SafeConfigParser


def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Return the eland files found for one lane of a flowcell.

    The files are searched for with the glob pattern
    ``<flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_*``.

    :param flowcell_dir: directory containing one directory per flowcell
    :param flowcell_id: name of the flowcell directory to look in
    :param lane: lane identifier substituted into the file name pattern
    :return: list of matching path names (possibly empty)
    """
    lane_name = "s_%s_eland_*" % (lane,)
    pattern = os.path.join(flowcell_dir, flowcell_id, "*", lane_name)
    return glob(pattern)


def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Make a flat file name from the eland result path name.

    The leading flowcell_dir is stripped from lane_pathname and every
    remaining path separator is replaced with an underscore.

    :param flowcell_dir: prefix to strip; include the trailing separator,
        otherwise the result starts with an underscore
    :param lane_pathname: full path name of an eland result file
    :return: the flattened name, or None if lane_pathname does not start
        with flowcell_dir
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None
    subpath = lane_pathname[len(flowcell_dir):]
    return subpath.replace(os.path.sep, "_")


def parse_srf_directory(srf_dir):
    """
    Search srf_dir for *.srf files.

    The base name of each file is expected to have six underscore
    separated fields: site, date, machine, run id, flowcell id and lane id.
    Files that do not have exactly six fields are reported with
    logging.error and skipped.

    :param srf_dir: directory to search
    :return: dictionary indexed by flowcell id whose values are the first
        five fields joined by underscores (the file name without the lane
        id and extension). If several files share a flowcell id, the one
        processed last wins.
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        record = basename.split('_')
        if len(record) != 6:
            logging.error(
                "Unrecognized srf file: %s expected 6 fields got %d",
                pathname, len(record))
            continue

        # record is (site, date, machine, runid, flowcellid, laneid);
        # the lane id is dropped so one entry covers all lanes of a run.
        flowcellid = record[4]
        flowcells[flowcellid] = "_".join(record[:5])
    return flowcells


def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists.

    Check whether the link already exists and is the same as the link we
    want to make. If the destination already exists and is a different
    file, raise an error.

    :param source: existing file to link to; if it is missing a warning is
        logged and nothing is done
    :param destination: path name of the link to create
    :param dry_run: when true only log what would be linked
    :raises IOError: if destination exists and is not the same file as
        source
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            logging.debug('SAME: %s -> %s', source, destination)
            return
        raise IOError('%s and %s are different files' %
                      (source, destination))

    logging.info('Linking: %s -> %s', source, destination)

    if dry_run:
        return

    os.link(source, destination)
    # Make the archived copy read-only. A hard link shares its permission
    # bits with the source, so this changes the mode of both names.
    os.chmod(destination, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)


def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane,
                         dry_run):
    """
    Find eland files at different alignment lengths and link each of
    them into the library directory.

    :param library_path: directory the links are created in
    :param flowcell_dir: directory containing one directory per flowcell
    :param flowcell_id: flowcell to look in
    :param lane: lane whose eland files should be linked
    :param dry_run: when true only log what would be linked
    """
    for lane_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        long_name = make_long_lane_name(flowcell_dir, lane_pathname)
        if long_name is None:
            # Without a name there is nothing sensible to link to, and
            # os.path.join would fail on None.
            logging.warning("%s is not below %s, skipping",
                            lane_pathname, flowcell_dir)
            continue
        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname, long_pathname, dry_run)


def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane,
                   dry_run):
    """
    Link srf files into our library directories.

    The srf files must be named:
    <site>_<date>_<machine>_<runid>_<flowcellid>_<laneid>.srf

    :param srf_names: dictionary mapping flowcell id to the srf file name
        without lane id and extension, as built by parse_srf_directory
    :param library_path: directory the link is created in
    :param srf_dir: directory containing the srf files
    :param flowcell_id: flowcell to look up in srf_names
    :param lane: lane id appended to the srf base name
    :param dry_run: when true only log what would be linked
    """
    srf_basename = srf_names.get(flowcell_id, None)
    if srf_basename is None:
        logging.info("srf file for %s was not found", flowcell_id)
        return

    srf_filename = "%s_%s.srf" % (srf_basename, lane)
    source = os.path.join(srf_dir, srf_filename)
    destination = os.path.join(library_path, srf_filename)
    carefully_make_hardlink(source, destination, dry_run)


def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Iterate over the libraries and link their result files.

    For each entry of fcdb.library a directory named after the library id
    is created in library_dir (unless it already exists), then the eland
    and srf files for every (flowcell_id, lane) pair in the library's
    'lanes' list are linked into it.

    :param fcdb: object whose ``library`` attribute maps library ids to
        dictionaries with an optional 'lanes' list of (flowcell_id, lane)
        pairs
    :param library_dir: directory to create the per-library directories in
    :param flowcell_dir: directory containing one directory per flowcell
    :param srfs_dir: directory containing the srf files
    :param dry_run: when true nothing is created or linked; the links that
        would be made are still logged
    """
    # The trailing separator matters: make_long_lane_name strips
    # flowcell_dir as a plain string prefix.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        if not os.path.exists(library_path):
            logging.info('Creating library directory: %s', library_path)
            # A dry run must not modify the filesystem.
            if not dry_run:
                os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path, flowcell_dir, flowcell_id,
                                 lane, dry_run)
            link_srf_lanes(srf_names, library_path, srfs_dir, flowcell_id,
                           lane, dry_run)


def make_parser():
    """
    Make the command line option parser.

    :return: an OptionParser accepting --config, --database,
        --sequence-archive, --where, --verbose, --debug and --dry-run
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')

    parser.add_option("--database", dest="database",
                      help="path to the fctracker.db",
                      default=None)
    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')
    parser.add_option("-w", "--where", dest="where",
                      help="add a where clause",
                      default=None)

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser


def main(argv=None):
    """
    Build the library tree described by the command line and config file.

    The database location and the sequence archive root are taken from the
    command line, falling back to the configuration file (by default
    ~/.htsworkflow.ini and /etc/htsworkflow.ini). The parser reports an
    error and exits if either is missing.

    :param argv: list of command line arguments without the program name;
        None is treated as an empty list
    :return: 0 on success
    """
    # Imported here rather than at module level so the linking helpers can
    # be imported without the database layer being available.
    from htsworkflow.util import fctracker

    FRONTEND_NAME = 'frontend'
    SECTION_NAME = 'sequence_archive'
    DATABASE_OPT = 'database_name'
    ARCHIVE_OPT = 'archive_path'

    if argv is None:
        argv = []
    parser = make_parser()

    # parse command line arguments
    opt, _ = parser.parse_args(argv)

    # setup logging
    level = logging.WARN
    if opt.verbose:
        level = logging.INFO
    if opt.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # The return value is not needed here.
    # NOTE(review): presumably this call is what fills in the 'lanes'
    # entries of fcdb.library read by make_library_tree -- confirm against
    # fctracker before removing it.
    fcdb._get_flowcells(opt.where)
    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))