+++ /dev/null
-#!/usr/bin/env python
-
-from ConfigParser import SafeConfigParser
-
-import logging
-import os
-from optparse import OptionParser
-import stat
-import shelve
-
-from htsworkflow.util import api
-from htsworkflow.pipelines.sequences import scan_for_sequences
-
-def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
- """
- compare our flowcell database with our list of sequences and return
- a fully populated database
- """
- fcdb = shelve.open(fcdb_filename)
- libdb = {}
- apidata = {'apiid': apiid, 'apikey': apikey}
- for seq in sequences:
- flowcell = seq.flowcell
- flowcell_info = None
-
- # get info about flowcell from server or shelf
- if not fcdb.has_key(flowcell):
- url = api.flowcell_url(baseurl, flowcell)
- flowcell_info = api.retrieve_info(url, apidata)
- if flowcell_info is not None:
- fcdb[flowcell] = flowcell_info
- else:
- flowcell_info = fcdb[flowcell]
-
- # make library id db
- if flowcell_info is not None:
- seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
- libdb.setdefault(seq_library_id, []).append(seq)
-
- fcdb.sync()
- return fcdb, libdb
-
-def carefully_make_hardlink(source, destination, dry_run=False):
- """
- Make a hard link, failing if a different link already exists
-
- Checking to see if the link already exists and is
- the same as the link we want to make.
- If the link already exists and is different, throw an error.
-
- If we didn't update anything return 0, if we did update
- return 1.
- """
- logging.debug("CHECKING: %s -> %s", source, destination)
-
- if not os.path.exists(source):
- logging.warning("%s doesn't exist", source)
- return 0
-
- if os.path.exists(destination):
- if os.path.samefile(source, destination):
- logging.debug('SAME: %s -> %s' % (source, destination))
- return 0
- else:
- logging.error('%s and %s are different files, skipping' % \
- (source, destination))
- return 0
- logging.debug('Linking: %s -> %s' % (source, destination))
-
- # we would do something by this part
- if dry_run: return 1
-
- os.link(source, destination)
- os.chmod(destination,
- stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
- return 1
-
-def make_library_links(root, library_db, dry_run=False):
- """
- Make a tree of sequencer roots organized by library id
-
- Root is the root of the library tree
- library_db is a dictionary of SequenceFiles organized by library id
- """
- count = 0
- root = os.path.abspath(root)
- for lib_id, sequences in library_db.items():
- target_dir = os.path.join(root, lib_id)
- if not os.path.exists(target_dir):
- logging.info("mkdir %s" % (target_dir,))
- if not dry_run:
- os.mkdir(target_dir)
-
- for s in sequences:
- count += carefully_make_hardlink(s.path,
- s.make_target_name(target_dir),
- dry_run=dry_run)
- return count
-
-def configure_logging(opts):
- # setup logging
- level = logging.WARN
- if opts.verbose:
- level = logging.INFO
- if opts.debug:
- level = logging.DEBUG
- logging.basicConfig(level=level)
-
-
-def configure_opts(opts):
- """
- Load in options from config file
- """
- SECTION_NAME = 'sequence_archive'
- ARCHIVE_OPT = 'sequence_archive'
- CACHE_OPT = 'cache'
- HOST_OPT = 'host'
- APIID_OPT = 'apiid'
- APIKEY_OPT = 'apikey'
-
- # figure out what config file to read
- config_path = [os.path.expanduser('~/.htsworkflow.ini'),
- '/etc/htsworkflow.ini']
- if opts.config is not None:
- config_path = [opts.config]
- # parse options from config file
- config_file = SafeConfigParser()
- config_file.read(config_path)
-
- # load defaults from config file if not overriden by the command line
- if opts.cache is None:
- if config_file.has_option(SECTION_NAME, CACHE_OPT):
- opts.cache = config_file.get(FRONTEND_NAME, CACHE_OPT)
- else:
- opts.cache = os.path.expanduser('~/.flowcelldb.shelve')
-
- if opts.sequence_archive is None and \
- config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
- opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
- opts.sequence_archive = os.path.expanduser(opts.sequence_archive)
-
- opts.sequence_archive = os.path.abspath(opts.sequence_archive)
- opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
- opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
- opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
-
- if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
- opts.host = config_file.get(SECTION_NAME, HOST_OPT)
-
- if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
- opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)
-
- if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
- opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
-
- return opts
-
-def make_parser():
- """
- Make parser
- """
- parser = OptionParser()
- parser.add_option('-c', '--config', default=None,
- help='path to a configuration file containing a '
- 'sequence archive section')
- parser.add_option('--cache', default=None,
- help="default flowcell cache")
-
- parser.add_option('--host', default=None,
- help="specify http://host for quering flowcell information")
- parser.add_option('--apiid', default=None,
- help="API ID to use when retriving information")
- parser.add_option("--apikey", default=None,
- help="API Key for when retriving information")
-
- parser.add_option('-a', '--sequence-archive', default=None,
- help='path to where the sequence archive lives')
-
- parser.add_option('-v', '--verbose', action='store_true', default=False,
- help='be more verbose')
- parser.add_option('-d', '--debug', action='store_true', default=False,
- help='report everything')
-
- parser.add_option("--dry-run", dest="dry_run", action="store_true",
- default=False,
- help="Don't modify the filesystem")
- return parser
-
-def main(cmdline=None):
- parser = make_parser()
- opts, args = parser.parse_args(cmdline)
-
- configure_logging(opts)
- opts = configure_opts(opts)
-
- # complain if critical things are missing
- if opts.cache is None:
- parser.error('Need location of htsworkflow frontend database')
-
- if opts.sequence_archive is None:
- parser.error('Need the root path for the sequence archive')
-
- seq_dirs = [ opts.flowcells, opts.srfs ]
- if len(args) > 0:
- seq_dirs = [os.path.abspath(f) for f in args]
-
- seqs = scan_for_sequences(seq_dirs)
- fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
- updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
-
- logging.warn("%s flowcells in database" % (len(fcdb),))
- logging.warn("found %s sequence files" % (len(seqs),))
- logging.warn("%s libraries being checked" % (len(libdb),))
- logging.warn("%s sequence files were linked" % (updates,))
-
- return 0
-
-if __name__ == "__main__":
- main()