Rename all the scripts to start with htsw-
[htsworkflow.git] / scripts / htsw-update-archive
diff --git a/scripts/htsw-update-archive b/scripts/htsw-update-archive
new file mode 100755 (executable)
index 0000000..2ccbec6
--- /dev/null
@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+
from ConfigParser import SafeConfigParser
from optparse import OptionParser

import logging
import os
import shelve
import stat
import sys

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import scan_for_sequences
+
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences and return
    a fully populated database.

    fcdb_filename: path to the shelve file caching per-flowcell info
    sequences: iterable of SequenceFile-like objects (need .flowcell, .lane)
    baseurl, apiid, apikey: htsworkflow frontend location and credentials

    Returns (fcdb, libdb): fcdb is the (still open) shelve mapping
    flowcell ids to the info dict retrieved from the server; libdb maps
    library ids to lists of the sequences belonging to that library.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell

        # use the cached info when we have it, otherwise ask the server
        # (has_key() is deprecated; use the "in" operator)
        if flowcell in fcdb:
            flowcell_info = fcdb[flowcell]
        else:
            url = api.flowcell_url(baseurl, flowcell)
            flowcell_info = api.retrieve_info(url, apidata)
            if flowcell_info is not None:
                fcdb[flowcell] = flowcell_info

        # group sequences by the library id recorded for their lane
        if flowcell_info is not None:
            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb
+
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists

    Checks whether the destination already exists and is the same file
    as the source; if it exists but is a different file, log an error
    and skip it rather than clobbering it.

    source: existing file to link from
    destination: path of the link to create
    dry_run: if True, report what would be done without touching disk

    Returns 1 if a link was (or, under dry_run, would have been) made,
    0 if nothing needed to be done or the link was skipped.
    """
    # use lazy %-args consistently so formatting only happens when the
    # message is actually emitted
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # already hard-linked; nothing to do
            logging.debug('SAME: %s -> %s', source, destination)
            return 0
        else:
            # refuse to replace an unrelated file
            logging.error('%s and %s are different files, skipping',
                          source, destination)
            return 0
    logging.debug('Linking: %s -> %s', source, destination)

    # everything below this point modifies the filesystem
    if dry_run: return 1

    os.link(source, destination)
    # archived copies are read-only for everyone
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
    return 1
+    
def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of sequencer roots organized by library id

    root: the root of the library tree
    library_db: a dictionary of SequenceFiles organized by library id
    dry_run: if True, log what would be done without touching the disk

    Returns the number of sequence files that were (or would be) linked.
    """
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            # lazy %-args: only format when the message is emitted
            logging.info("mkdir %s", target_dir)
            if not dry_run:
                os.mkdir(target_dir)

        for s in sequences:
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count
+
def configure_logging(opts):
    """Initialize the root logger from the --verbose/--debug options.

    --debug wins over --verbose; the default level is WARN.
    """
    if opts.debug:
        level = logging.DEBUG
    elif opts.verbose:
        level = logging.INFO
    else:
        level = logging.WARN
    logging.basicConfig(level=level)
+
def configure_opts(opts):
    """
    Load in options from config file

    Any option left as None on the command line is filled in from the
    [sequence_archive] section of the first config file found
    (--config, else ~/.htsworkflow.ini, else /etc/htsworkflow.ini).

    Returns the updated opts.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overriden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            # bug fix: this used to read config_file.get(FRONTEND_NAME, ...)
            # where FRONTEND_NAME was never defined, raising NameError
            # whenever the config file supplied a cache path
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)

    # only derive the sub-directories when we actually have an archive
    # root; os.path.abspath(None) would raise TypeError here, preventing
    # main() from reporting the missing-archive error properly
    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts
+
def make_parser():
    """
    Build the OptionParser for this script.

    All options default to None/False so configure_opts() can tell which
    ones were explicitly set on the command line.
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    # typo fixes in user-facing help: "quering" -> "querying",
    # "retriving" -> "retrieving"
    parser.add_option('--host', default=None,
                      help="specify http://host for querying flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retrieving information")
    parser.add_option("--apikey", default=None,
                      help="API Key for when retrieving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser
+
def main(cmdline=None):
    """
    Update the by-library tree of hard links into the sequence archive.

    cmdline: optional argument list (defaults to sys.argv[1:] inside
    optparse). Positional arguments, if given, override the default
    flowcell/srf directories to scan.

    Returns 0 on success; exits via parser.error() on missing config.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    seq_dirs = [opts.flowcells, opts.srfs]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host,
                                    opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb,
                                 dry_run=opts.dry_run)

    # logging.warn is deprecated; use warning() with lazy %-args
    logging.warning("%s flowcells in database", len(fcdb))
    logging.warning("found %s sequence files", len(seqs))
    logging.warning("%s libraries being checked", len(libdb))
    logging.warning("%s sequence files were linked", updates)

    return 0
+    
if __name__ == "__main__":
    # propagate main()'s return code (0 on success) as the exit status
    sys.exit(main())