#!/usr/bin/env python
from ConfigParser import SafeConfigParser
import logging
import os
from optparse import OptionParser
import stat
import shelve

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import scan_for_sequences

LOGGER = logging.getLogger(__name__)


def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences and return
    a fully populated database.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell
        flowcell_info = None

        # get info about the flowcell from the server or the shelf
        if flowcell not in fcdb:
            url = api.flowcell_url(baseurl, flowcell)
            flowcell_info = api.retrieve_info(url, apidata)
            if flowcell_info is not None:
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # build the library id database
        if flowcell_info is not None:
            lane_collection = flowcell_info['lane_set'][unicode(seq.lane)]
            if not isinstance(lane_collection, list):
                lane_collection = [lane_collection]
            for sample in lane_collection:
                seq_library_id = sample['library_id']
                libdb.setdefault(seq_library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb


def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link unless a conflicting file already exists.

    If the destination already exists and is the same file as the source,
    nothing needs to be done. If it exists but is a different file, log an
    error and skip it.

    Return 0 if nothing was updated, 1 if a link was (or would have been)
    made.
    """
    LOGGER.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        LOGGER.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            LOGGER.debug('SAME: %s -> %s' % (source, destination))
            return 0
        else:
            LOGGER.error('%s and %s are different files, skipping' %
                         (source, destination))
            return 0
    LOGGER.debug('Linking: %s -> %s' % (source, destination))

    # past this point we would modify the filesystem, so in a dry run
    # just report that a link would have been made
    if dry_run:
        return 1

    os.link(source, destination)
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

    return 1


def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of sequencer roots organized by library id.

    root is the root of the library tree.
    library_db is a dictionary of SequenceFiles organized by library id.
    """
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            LOGGER.info("mkdir %s" % (target_dir,))
            if not dry_run:
                os.mkdir(target_dir)

        for s in sequences:
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count


def configure_logging(opts):
    # setup logging
    level = logging.WARN
    if opts.verbose:
        level = logging.INFO
    if opts.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)


def configure_opts(opts):
    """
    Load in options from the config file.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from the config file if not overridden on the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)

    if opts.sequence_archive is not None:
        # leave complaining about a missing archive path to main()
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts


def make_parser():
    """
    Build the command line option parser.
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    parser.add_option('--host', default=None,
                      help="specify http://host for querying flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retrieving information")
    parser.add_option("--apikey", default=None,
                      help="API key to use when retrieving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser


def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    seq_dirs = [opts.flowcells, opts.srfs]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host,
                                    opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb,
                                 dry_run=opts.dry_run)

    LOGGER.warning("%s flowcells in database" % (len(fcdb),))
    LOGGER.warning("found %s sequence files" % (len(seqs),))
    LOGGER.warning("%s libraries being checked" % (len(libdb),))
    LOGGER.warning("%s sequence files were linked" % (updates,))

    return 0


if __name__ == "__main__":
    main()
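
# Illustrative usage (a sketch only; the paths, host, API values, and script
# name below are placeholders, not defaults shipped with htsworkflow).
#
# A [sequence_archive] section like this in ~/.htsworkflow.ini, or in the
# file passed with --config, supplies every option configure_opts() reads:
#
#   [sequence_archive]
#   sequence_archive = /data/sequence_archive
#   cache = /data/sequence_archive/flowcelldb.shelve
#   host = http://htsworkflow.example.org
#   apiid = example-api-id
#   apikey = example-api-key
#
# With that in place, a dry run that only reports which hard links would be
# created under <sequence_archive>/libraries:
#
#   python make_library_links.py --dry-run --verbose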