+#!/usr/bin/env python
+
+from ConfigParser import SafeConfigParser
+
+import logging
+import os
+from optparse import OptionParser
+import stat
+import shelve
+
+from htsworkflow.util import api
+from htsworkflow.pipelines.sequences import scan_for_sequences
+
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences and return
    a fully populated database.

    fcdb_filename -- path of the shelve file caching per-flowcell info
    sequences -- iterable of sequence objects carrying .flowcell and
                 .lane attributes (as produced by scan_for_sequences)
    baseurl -- base URL of the htsworkflow frontend
    apiid, apikey -- credentials sent with each API request

    Returns (fcdb, libdb): the open shelve mapping flowcell id ->
    flowcell info, and a dict mapping library id -> list of sequences.
    Flowcells the server doesn't know about are skipped silently.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell
        flowcell_info = None

        # get info about flowcell from the shelf, falling back to the server
        # (was fcdb.has_key(flowcell); `in` is the idiomatic membership test)
        if flowcell not in fcdb:
            url = api.flowcell_url(baseurl, flowcell)
            flowcell_info = api.retrieve_info(url, apidata)
            if flowcell_info is not None:
                # cache a successful lookup so we don't hit the server again
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # group sequences by the library id recorded for their lane
        if flowcell_info is not None:
            # lane_set keys are text strings; seq.lane may be an int
            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    # flush cached flowcell info to disk before handing the shelf back
    fcdb.sync()
    return fcdb, libdb
+
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists.

    Check whether the destination already exists and is the same file
    as the source; if it exists but is a different file, log an error
    and skip it instead of clobbering anything.

    source -- existing file to link from
    destination -- path of the link to create
    dry_run -- if True, report what would happen but touch nothing

    Returns 1 if a link was (or, under dry_run, would have been)
    created, 0 if nothing needed to change or linking was unsafe.
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # already linked; nothing to do
            # (logging now uses lazy %-args consistently, matching the
            # debug/warning calls above instead of eager % formatting)
            logging.debug('SAME: %s -> %s', source, destination)
            return 0
        else:
            logging.error('%s and %s are different files, skipping',
                          source, destination)
            return 0
    logging.debug('Linking: %s -> %s', source, destination)

    # past this point we would modify the filesystem
    if dry_run:
        return 1

    os.link(source, destination)
    # mark the new link read-only for everyone
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    return 1
+
def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of sequencer roots organized by library id

    root -- root directory of the library tree
    library_db -- dictionary of SequenceFiles organized by library id
    dry_run -- if True, log intended changes without touching the disk

    Returns the number of links that were updated.
    """
    updated = 0
    root = os.path.abspath(root)
    for library_id, seq_files in library_db.items():
        library_dir = os.path.join(root, library_id)
        if not os.path.exists(library_dir):
            logging.info("mkdir %s", library_dir)
            if not dry_run:
                os.mkdir(library_dir)

        for seq_file in seq_files:
            target = seq_file.make_target_name(library_dir)
            updated += carefully_make_hardlink(seq_file.path, target,
                                               dry_run=dry_run)
    return updated
+
def configure_logging(opts):
    """Set the root logging level from the --verbose/--debug flags."""
    # --debug wins over --verbose; default is warnings only
    if opts.debug:
        level = logging.DEBUG
    elif opts.verbose:
        level = logging.INFO
    else:
        level = logging.WARN
    logging.basicConfig(level=level)
+
+
def configure_opts(opts):
    """
    Fill in unset command-line options from the config file.

    Reads opts.config if given, otherwise ~/.htsworkflow.ini then
    /etc/htsworkflow.ini, and takes defaults for cache, sequence
    archive, host, apiid and apikey from the [sequence_archive]
    section.  Also derives the library_tree/flowcells/srfs paths from
    the sequence archive root when one is configured.

    Returns the updated opts object.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overriden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            # BUGFIX: previously read config_file.get(FRONTEND_NAME, ...),
            # but FRONTEND_NAME was never defined and raised NameError
            # whenever the cache path came from the config file.
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)

    # BUGFIX: os.path.abspath(None) used to crash here when no archive was
    # configured, before main() could report its friendly parser.error.
    # Leave opts.sequence_archive as None and let main() complain.
    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts
+
def make_parser():
    """Build the OptionParser describing this script's command line."""
    opt_parser = OptionParser()

    # configuration sources
    opt_parser.add_option('-c', '--config', default=None,
                          help='path to a configuration file containing a '
                               'sequence archive section')
    opt_parser.add_option('--cache', default=None,
                          help="default flowcell cache")

    # frontend connection details
    opt_parser.add_option('--host', default=None,
                          help="specify http://host for quering flowcell information")
    opt_parser.add_option('--apiid', default=None,
                          help="API ID to use when retriving information")
    opt_parser.add_option("--apikey", default=None,
                          help="API Key for when retriving information")

    # filesystem layout
    opt_parser.add_option('-a', '--sequence-archive', default=None,
                          help='path to where the sequence archive lives')

    # verbosity / safety switches
    opt_parser.add_option('-v', '--verbose', action='store_true', default=False,
                          help='be more verbose')
    opt_parser.add_option('-d', '--debug', action='store_true', default=False,
                          help='report everything')
    opt_parser.add_option("--dry-run", dest="dry_run", action="store_true",
                          default=False,
                          help="Don't modify the filesystem")

    return opt_parser
+
def main(cmdline=None):
    """
    Scan sequence directories and hardlink sequences into a
    per-library tree.

    cmdline -- optional argument list for testing; defaults to sys.argv.
    Returns 0 on success (parser.error exits on missing configuration).
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    # positional arguments override the default archive subdirectories
    seq_dirs = [opts.flowcells, opts.srfs]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host,
                                    opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb,
                                 dry_run=opts.dry_run)

    # logging.warn is deprecated; use warning() with lazy %-args
    logging.warning("%s flowcells in database", len(fcdb))
    logging.warning("found %s sequence files", len(seqs))
    logging.warning("%s libraries being checked", len(libdb))
    logging.warning("%s sequence files were linked", updates)

    return 0
+
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()