X-Git-Url: http://woldlab.caltech.edu/gitweb/?a=blobdiff_plain;f=scripts%2Fmake-library-tree;fp=scripts%2Fmake-library-tree;h=0000000000000000000000000000000000000000;hb=67bb7faa500c2d74358fb128944f899a06d3f0a7;hp=2ccbec6632717902e3ff2de3513d918952b93cef;hpb=7e62631cfd9d17dad4adc401a38bdaa8d3b3d926;p=htsworkflow.git

diff --git a/scripts/make-library-tree b/scripts/make-library-tree
deleted file mode 100644
index 2ccbec6..0000000
--- a/scripts/make-library-tree
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env python
-
-from ConfigParser import SafeConfigParser
-
-import logging
-import os
-from optparse import OptionParser
-import stat
-import shelve
-
-from htsworkflow.util import api
-from htsworkflow.pipelines.sequences import scan_for_sequences
-
-def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
-    """
-    compare our flowcell database with our list of sequences and return
-    a fully populated database
-    """
-    fcdb = shelve.open(fcdb_filename)
-    libdb = {}
-    apidata = {'apiid': apiid, 'apikey': apikey}
-    for seq in sequences:
-        flowcell = seq.flowcell
-        flowcell_info = None
-
-        # get info about flowcell from server or shelf
-        if not fcdb.has_key(flowcell):
-            url = api.flowcell_url(baseurl, flowcell)
-            flowcell_info = api.retrieve_info(url, apidata)
-            if flowcell_info is not None:
-                fcdb[flowcell] = flowcell_info
-        else:
-            flowcell_info = fcdb[flowcell]
-
-        # make library id db
-        if flowcell_info is not None:
-            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
-            libdb.setdefault(seq_library_id, []).append(seq)
-
-    fcdb.sync()
-    return fcdb, libdb
-
-def carefully_make_hardlink(source, destination, dry_run=False):
-    """
-    Make a hard link, failing if a different link already exists
-
-    Checks whether the destination already exists and is the same
-    file as the link we want to make.
-    If the destination exists but is a different file, log an error and skip it.
-
-    Returns 0 if nothing was updated, or 1 if a link was made
-    (or would have been made during a dry run).
- """ - logging.debug("CHECKING: %s -> %s", source, destination) - - if not os.path.exists(source): - logging.warning("%s doesn't exist", source) - return 0 - - if os.path.exists(destination): - if os.path.samefile(source, destination): - logging.debug('SAME: %s -> %s' % (source, destination)) - return 0 - else: - logging.error('%s and %s are different files, skipping' % \ - (source, destination)) - return 0 - logging.debug('Linking: %s -> %s' % (source, destination)) - - # we would do something by this part - if dry_run: return 1 - - os.link(source, destination) - os.chmod(destination, - stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH ) - return 1 - -def make_library_links(root, library_db, dry_run=False): - """ - Make a tree of sequencer roots organized by library id - - Root is the root of the library tree - library_db is a dictionary of SequenceFiles organized by library id - """ - count = 0 - root = os.path.abspath(root) - for lib_id, sequences in library_db.items(): - target_dir = os.path.join(root, lib_id) - if not os.path.exists(target_dir): - logging.info("mkdir %s" % (target_dir,)) - if not dry_run: - os.mkdir(target_dir) - - for s in sequences: - count += carefully_make_hardlink(s.path, - s.make_target_name(target_dir), - dry_run=dry_run) - return count - -def configure_logging(opts): - # setup logging - level = logging.WARN - if opts.verbose: - level = logging.INFO - if opts.debug: - level = logging.DEBUG - logging.basicConfig(level=level) - - -def configure_opts(opts): - """ - Load in options from config file - """ - SECTION_NAME = 'sequence_archive' - ARCHIVE_OPT = 'sequence_archive' - CACHE_OPT = 'cache' - HOST_OPT = 'host' - APIID_OPT = 'apiid' - APIKEY_OPT = 'apikey' - - # figure out what config file to read - config_path = [os.path.expanduser('~/.htsworkflow.ini'), - '/etc/htsworkflow.ini'] - if opts.config is not None: - config_path = [opts.config] - # parse options from config file - config_file = SafeConfigParser() - config_file.read(config_path) - - # load defaults from config file if not overriden by the command line - if opts.cache is None: - if config_file.has_option(SECTION_NAME, CACHE_OPT): - opts.cache = config_file.get(FRONTEND_NAME, CACHE_OPT) - else: - opts.cache = os.path.expanduser('~/.flowcelldb.shelve') - - if opts.sequence_archive is None and \ - config_file.has_option(SECTION_NAME, ARCHIVE_OPT): - opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT) - opts.sequence_archive = os.path.expanduser(opts.sequence_archive) - - opts.sequence_archive = os.path.abspath(opts.sequence_archive) - opts.library_tree = os.path.join(opts.sequence_archive, 'libraries') - opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells') - opts.srfs = os.path.join(opts.sequence_archive, 'srfs') - - if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT): - opts.host = config_file.get(SECTION_NAME, HOST_OPT) - - if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT): - opts.apiid = config_file.get(SECTION_NAME, APIID_OPT) - - if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT): - opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT) - - return opts - -def make_parser(): - """ - Make parser - """ - parser = OptionParser() - parser.add_option('-c', '--config', default=None, - help='path to a configuration file containing a ' - 'sequence archive section') - parser.add_option('--cache', default=None, - help="default flowcell cache") - - parser.add_option('--host', default=None, - help="specify http://host for 
quering flowcell information") - parser.add_option('--apiid', default=None, - help="API ID to use when retriving information") - parser.add_option("--apikey", default=None, - help="API Key for when retriving information") - - parser.add_option('-a', '--sequence-archive', default=None, - help='path to where the sequence archive lives') - - parser.add_option('-v', '--verbose', action='store_true', default=False, - help='be more verbose') - parser.add_option('-d', '--debug', action='store_true', default=False, - help='report everything') - - parser.add_option("--dry-run", dest="dry_run", action="store_true", - default=False, - help="Don't modify the filesystem") - return parser - -def main(cmdline=None): - parser = make_parser() - opts, args = parser.parse_args(cmdline) - - configure_logging(opts) - opts = configure_opts(opts) - - # complain if critical things are missing - if opts.cache is None: - parser.error('Need location of htsworkflow frontend database') - - if opts.sequence_archive is None: - parser.error('Need the root path for the sequence archive') - - seq_dirs = [ opts.flowcells, opts.srfs ] - if len(args) > 0: - seq_dirs = [os.path.abspath(f) for f in args] - - seqs = scan_for_sequences(seq_dirs) - fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey) - updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run) - - logging.warn("%s flowcells in database" % (len(fcdb),)) - logging.warn("found %s sequence files" % (len(seqs),)) - logging.warn("%s libraries being checked" % (len(libdb),)) - logging.warn("%s sequence files were linked" % (updates,)) - - return 0 - -if __name__ == "__main__": - main()
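
For reference, the core behaviour the deleted script implemented (hard-linking each sequence file into a per-library directory under the archive root) can be sketched in modern Python 3. The sketch below is only an approximation reconstructed from the code in this diff, not part of the htsworkflow API: it stands in for the htsworkflow SequenceFile objects with plain file paths, substitutes os.path.basename() for make_target_name(), and uses hypothetical example paths and library ids.

#!/usr/bin/env python3
"""Minimal sketch of the removed make-library-tree linking logic."""

import logging
import os
import stat

logger = logging.getLogger("make-library-tree-sketch")


def carefully_make_hardlink(source, destination, dry_run=False):
    """Hard-link source to destination unless a conflicting file already exists.

    Returns 1 if a link was (or would have been) created, 0 otherwise.
    """
    if not os.path.exists(source):
        logger.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # already linked to the same file; nothing to do
            return 0
        logger.error("%s and %s are different files, skipping", source, destination)
        return 0

    logger.debug("Linking: %s -> %s", source, destination)
    if dry_run:
        return 1

    os.link(source, destination)
    # make the linked copy read-only, mirroring the original script
    os.chmod(destination, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    return 1


def make_library_links(root, library_db, dry_run=False):
    """Build root/<library_id>/<basename> hard links for every sequence file.

    library_db is assumed to map library ids to lists of file paths; the real
    script used SequenceFile objects and their make_target_name() method.
    """
    count = 0
    root = os.path.abspath(root)
    for lib_id, paths in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            logger.info("mkdir %s", target_dir)
            if not dry_run:
                os.makedirs(target_dir)
        for path in paths:
            destination = os.path.join(target_dir, os.path.basename(path))
            count += carefully_make_hardlink(path, destination, dry_run=dry_run)
    return count


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # hypothetical library-to-sequence mapping; the real data came from the flowcell db
    example_db = {"12345": ["/tmp/archive/flowcells/FC001/s_1_sequence.txt"]}
    made = make_library_links("/tmp/archive/libraries", example_db, dry_run=True)
    print("%d sequence files would be linked" % made)

Hard links keep the per-library tree in step with the flowcell-organized archive without duplicating the sequence data on disk; the chmod to read-only mirrors what the original script did to the linked copies.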