X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=scripts%2Fhtsw-update-archive;fp=scripts%2Fhtsw-update-archive;h=2ccbec6632717902e3ff2de3513d918952b93cef;hb=2ac07634853fe713a7e9ebdd748acef99e17b1c3;hp=0000000000000000000000000000000000000000;hpb=a6984cfa2e80ef3f70255e3e26c5954210ef2dac;p=htsworkflow.git

diff --git a/scripts/htsw-update-archive b/scripts/htsw-update-archive
new file mode 100755
index 0000000..2ccbec6
--- /dev/null
+++ b/scripts/htsw-update-archive
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+
+from ConfigParser import SafeConfigParser
+
+import logging
+import os
+from optparse import OptionParser
+import stat
+import shelve
+
+from htsworkflow.util import api
+from htsworkflow.pipelines.sequences import scan_for_sequences
+
+def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
+    """
+    Compare our flowcell database with our list of sequences and return
+    a fully populated database.
+    """
+    fcdb = shelve.open(fcdb_filename)
+    libdb = {}
+    apidata = {'apiid': apiid, 'apikey': apikey}
+    for seq in sequences:
+        flowcell = seq.flowcell
+        flowcell_info = None
+
+        # get info about the flowcell from the server or the shelf
+        if flowcell not in fcdb:
+            url = api.flowcell_url(baseurl, flowcell)
+            flowcell_info = api.retrieve_info(url, apidata)
+            if flowcell_info is not None:
+                fcdb[flowcell] = flowcell_info
+        else:
+            flowcell_info = fcdb[flowcell]
+
+        # group the sequence files by library id
+        if flowcell_info is not None:
+            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
+            libdb.setdefault(seq_library_id, []).append(seq)
+
+    fcdb.sync()
+    return fcdb, libdb
+
+def carefully_make_hardlink(source, destination, dry_run=False):
+    """
+    Make a hard link, refusing to replace a different existing file.
+
+    If the destination already exists and is the same file as the source,
+    nothing needs to be done.  If it exists but points at a different file,
+    log an error and skip it.
+
+    Returns 0 if nothing was updated, 1 if a new link was made.
+    """
+    logging.debug("CHECKING: %s -> %s", source, destination)
+
+    if not os.path.exists(source):
+        logging.warning("%s doesn't exist", source)
+        return 0
+
+    if os.path.exists(destination):
+        if os.path.samefile(source, destination):
+            logging.debug('SAME: %s -> %s' % (source, destination))
+            return 0
+        else:
+            logging.error('%s and %s are different files, skipping' % \
+                          (source, destination))
+            return 0
+    logging.debug('Linking: %s -> %s' % (source, destination))
+
+    # past this point we would modify the filesystem
+    if dry_run: return 1
+
+    os.link(source, destination)
+    os.chmod(destination,
+             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
+    return 1
+
+def make_library_links(root, library_db, dry_run=False):
+    """
+    Make a tree of sequencer roots organized by library id.
+
+    root is the root of the library tree
+    library_db is a dictionary of SequenceFiles organized by library id
+    """
+    count = 0
+    root = os.path.abspath(root)
+    for lib_id, sequences in library_db.items():
+        target_dir = os.path.join(root, lib_id)
+        if not os.path.exists(target_dir):
+            logging.info("mkdir %s" % (target_dir,))
+            if not dry_run:
+                os.mkdir(target_dir)
+
+        for s in sequences:
+            count += carefully_make_hardlink(s.path,
+                                             s.make_target_name(target_dir),
+                                             dry_run=dry_run)
+    return count
+
+def configure_logging(opts):
+    # set up logging
+    level = logging.WARN
+    if opts.verbose:
+        level = logging.INFO
+    if opts.debug:
+        level = logging.DEBUG
+    logging.basicConfig(level=level)
+
+
+def configure_opts(opts):
+    """
+    Load in options from the config file.
+    """
+    SECTION_NAME = 'sequence_archive'
+    ARCHIVE_OPT = 'sequence_archive'
+    CACHE_OPT = 'cache'
+    HOST_OPT = 'host'
+    APIID_OPT = 'apiid'
+    APIKEY_OPT = 'apikey'
+
+    # figure out what config file to read
+    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
+                   '/etc/htsworkflow.ini']
+    if opts.config is not None:
+        config_path = [opts.config]
+    # parse options from config file
+    config_file = SafeConfigParser()
+    config_file.read(config_path)
+
+    # load defaults from config file if not overridden by the command line
+    if opts.cache is None:
+        if config_file.has_option(SECTION_NAME, CACHE_OPT):
+            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
+        else:
+            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')
+
+    if opts.sequence_archive is None and \
+       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
+        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
+
+    if opts.sequence_archive is not None:
+        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)
+        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
+        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
+        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
+        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
+
+    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
+        opts.host = config_file.get(SECTION_NAME, HOST_OPT)
+
+    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
+        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)
+
+    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
+        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
+
+    return opts
+
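+# For reference, configure_opts() above expects an INI file along these lines.
+# The section and option names come from the constants in that function; the
+# values shown here are only illustrative placeholders, not defaults that ship
+# with htsworkflow:
+#
+#   [sequence_archive]
+#   sequence_archive = /archive/sequences
+#   cache = ~/.flowcelldb.shelve
+#   host = http://localhost:8000
+#   apiid = 0
+#   apikey = replace-with-your-key
+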
+def make_parser():
+    """
+    Build the command line option parser.
+    """
+    parser = OptionParser()
+    parser.add_option('-c', '--config', default=None,
+                      help='path to a configuration file containing a '
+                           'sequence archive section')
+    parser.add_option('--cache', default=None,
+                      help="default flowcell cache")
+
+    parser.add_option('--host', default=None,
+                      help="specify http://host for querying flowcell information")
+    parser.add_option('--apiid', default=None,
+                      help="API ID to use when retrieving information")
+    parser.add_option("--apikey", default=None,
+                      help="API key to use when retrieving information")
+
+    parser.add_option('-a', '--sequence-archive', default=None,
+                      help='path to where the sequence archive lives')
+
+    parser.add_option('-v', '--verbose', action='store_true', default=False,
+                      help='be more verbose')
+    parser.add_option('-d', '--debug', action='store_true', default=False,
+                      help='report everything')
+
+    parser.add_option("--dry-run", dest="dry_run", action="store_true",
+                      default=False,
+                      help="Don't modify the filesystem")
+    return parser
+
+def main(cmdline=None):
+    parser = make_parser()
+    opts, args = parser.parse_args(cmdline)
+
+    configure_logging(opts)
+    opts = configure_opts(opts)
+
+    # complain if critical things are missing
+    if opts.cache is None:
+        parser.error('Need location of htsworkflow frontend database')
+
+    if opts.sequence_archive is None:
+        parser.error('Need the root path for the sequence archive')
+
+    seq_dirs = [ opts.flowcells, opts.srfs ]
+    if len(args) > 0:
+        seq_dirs = [os.path.abspath(f) for f in args]
+
+    seqs = scan_for_sequences(seq_dirs)
+    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
+    updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
+
+    logging.warn("%s flowcells in database" % (len(fcdb),))
+    logging.warn("found %s sequence files" % (len(seqs),))
+    logging.warn("%s libraries being checked" % (len(libdb),))
+    logging.warn("%s sequence files were linked" % (updates,))
+
+    return 0
+
+if __name__ == "__main__":
+    main()
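
A minimal sketch of an invocation, assuming the host and API credentials come
from ~/.htsworkflow.ini as above and an archive rooted at /archive/sequences
(the path is illustrative, not a default):

    htsw-update-archive -v --dry-run -a /archive/sequences

With no positional arguments the script scans <archive>/flowcells and
<archive>/srfs for sequence files, looks up each flowcell's lane-to-library
mapping from the frontend, and reports how many files would be hardlinked into
<archive>/libraries/<library_id>/; dropping --dry-run actually creates the
links.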