import logging
import os
import shelve
import stat

from ConfigParser import SafeConfigParser
from optparse import OptionParser

from htsworkflow.util import api
from htsworkflow.pipelines.sequences import scan_for_sequences
14 LOGGER = logging.getLogger(__name__)
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences and return
    a fully populated database.

    fcdb_filename -- path to the shelve file caching flowcell metadata
    sequences -- iterable of SequenceFile-like objects (need .flowcell, .lane)
    baseurl -- root URL of the htsworkflow frontend
    apiid, apikey -- API credentials passed with every request

    Returns (fcdb, libdb): the (still open) flowcell shelf, and a dict
    mapping library_id -> list of sequences belonging to that library.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell

        # get info about flowcell from server or shelf
        # (has_key is deprecated; "in" works on shelve objects too)
        if flowcell not in fcdb:
            url = api.flowcell_url(baseurl, flowcell)
            flowcell_info = api.retrieve_info(url, apidata)
            if flowcell_info is not None:
                # cache the answer so we don't re-query the server
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # group this sequence under the library id assigned to its lane
        if flowcell_info is not None:
            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists.

    Checking to see if the link already exists and is
    the same as the link we want to make.
    If the link already exists and is different, log an error and skip.

    If we didn't update anything return 0, if we did update
    (or would have, under dry_run) return 1.
    """
    LOGGER.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        LOGGER.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # already linked to the same inode; nothing to do
            LOGGER.debug('SAME: %s -> %s', source, destination)
            return 0
        else:
            LOGGER.error('%s and %s are different files, skipping',
                         source, destination)
            return 0
    LOGGER.debug('Linking: %s -> %s', source, destination)

    # in a dry run we would do something by this part, so report it,
    # but touch nothing on disk
    if dry_run:
        return 1

    os.link(source, destination)
    # linked sequence data is shared between trees; make it read-only
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    return 1
def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of sequencer roots organized by library id.

    root is the root of the library tree.
    library_db is a dictionary of SequenceFiles organized by library id.

    Returns the number of links created (or that would have been
    created, if dry_run is set).
    """
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            LOGGER.info("mkdir %s", target_dir)
            if not dry_run:
                os.mkdir(target_dir)
        for s in sequences:
            # each sequence knows how to name itself inside a target dir
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count
102 def configure_logging(opts):
108 level = logging.DEBUG
109 logging.basicConfig(level=level)
def configure_opts(opts):
    """
    Load in options from config file, filling in any option left
    unset on the command line.

    Also derives opts.library_tree, opts.flowcells and opts.srfs
    from the sequence archive root. Returns the updated opts.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overriden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            # was config_file.get(FRONTEND_NAME, CACHE_OPT): FRONTEND_NAME
            # is undefined and would raise NameError
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # guard: archive may still be None here; main() reports that case.
    # Without the guard expanduser(None) would raise before main() can.
    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts
def make_parser():
    """Build the OptionParser for this script's command line."""
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    parser.add_option('--host', default=None,
                      help="specify http://host for querying flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retrieving information")
    parser.add_option("--apikey", default=None,
                      help="API Key for when retrieving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser
def main(cmdline=None):
    """Scan the sequence archive and hard-link files into a library tree.

    cmdline -- argument list for the parser (None means sys.argv[1:]).
    Returns 0 on success; parser.error() exits for missing configuration.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    # default to the archive's flowcell and srf trees unless
    # explicit directories were given on the command line
    seq_dirs = [opts.flowcells, opts.srfs]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host,
                                    opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb,
                                 dry_run=opts.dry_run)

    # warn() is a deprecated alias for warning()
    LOGGER.warning("%s flowcells in database", len(fcdb))
    LOGGER.warning("found %s sequence files", len(seqs))
    LOGGER.warning("%s libraries being checked", len(libdb))
    LOGGER.warning("%s sequence files were linked", updates)
    return 0
if __name__ == "__main__":
    main()