3 from ConfigParser import SafeConfigParser
7 from optparse import OptionParser
11 from htsworkflow.util import api
12 from htsworkflow.pipelines.sequences import scan_for_sequences
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences and return
    a fully populated database.

    fcdb_filename is the path of the shelve file used to cache flowcell
    information between runs.
    sequences is an iterable of objects exposing ``flowcell`` and ``lane``
    attributes.
    baseurl, apiid and apikey are handed to htsworkflow.util.api to look
    up any flowcell that is not in the cache yet.

    Returns a tuple (fcdb, libdb): fcdb is the still-open shelf keyed by
    flowcell id (the caller is responsible for closing it), libdb is a
    dictionary mapping library id to the list of sequences from that
    library.

    Raises KeyError if the flowcell information has no entry for a
    sequence's lane.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell
        flowcell_info = None

        # get info about flowcell from server or shelf
        # ("in" works on both Python 2 and 3 shelves; has_key does not)
        if flowcell not in fcdb:
            url = api.flowcell_url(baseurl, flowcell)
            flowcell_info = api.retrieve_info(url, apidata)
            # only cache successful lookups so a failed one is retried
            # on the next run
            if flowcell_info is not None:
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # make library id db
        if flowcell_info is not None:
            # lane_set is keyed by the lane number as text; str() gives a
            # key that compares equal to the stored one on Python 2 and 3
            lane_info = flowcell_info['lane_set'][str(seq.lane)]
            seq_library_id = lane_info['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    # flush newly cached flowcells to disk before handing the shelf back
    fcdb.sync()
    return fcdb, libdb
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, skipping if a different file already exists

    Checking to see if the link already exists and is
    the same as the link we want to make.
    If the destination already exists and is a different file, log an
    error and leave it alone.

    With dry_run set, nothing on the filesystem is modified but the link
    that would have been made is still counted.

    If we didn't update anything return 0, if we did update
    (or would have, in a dry run) return 1.
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            logging.debug('SAME: %s -> %s', source, destination)
        else:
            logging.error('%s and %s are different files, skipping',
                          source, destination)
        # either way there is nothing for us to create
        return 0

    logging.debug('Linking: %s -> %s', source, destination)

    # we would do something by this part
    if dry_run:
        return 1

    os.link(source, destination)
    # a hard link shares its inode with the source, so this makes both
    # names read-only
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
    return 1
def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of sequencer roots organized by library id

    Root is the root of the library tree
    library_db is a dictionary of SequenceFiles organized by library id
    dry_run, when true, reports what would be done without creating
    directories or links.

    Returns the number of links made (or that would be made in a dry run).
    """
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            logging.info("mkdir %s", target_dir)
            # os.mkdir (not makedirs) on purpose: a missing root is
            # reported as an error rather than silently created
            if not dry_run:
                os.mkdir(target_dir)

        for s in sequences:
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count
def configure_logging(opts):
    """
    Set up the root logger from the command line options

    opts needs ``verbose`` and ``debug`` attributes; debug takes
    precedence over verbose. With neither set only warnings and errors
    are reported.
    """
    # NOTE(review): the default and verbose levels are reconstructed from
    # the --verbose / --debug option help text -- confirm against history
    level = logging.WARNING
    if opts.verbose:
        level = logging.INFO
    if opts.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)
def configure_opts(opts):
    """
    Load in options from config file

    Values given on the command line win; anything still None is filled
    in from the [sequence_archive] section of the config file. The file
    given by opts.config is used when set, otherwise ~/.htsworkflow.ini
    and /etc/htsworkflow.ini are read.

    Also derives opts.library_tree, opts.flowcells and opts.srfs from
    opts.sequence_archive; they are left as None when no archive location
    is known so the caller can report the missing option.

    Returns the updated opts object.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    # NOTE(review): cache/host/apiid names mirror the command line flags;
    # confirm they match the keys used in deployed config files
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overriden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            # read from the same section that was just tested
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)

    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
    else:
        # no archive configured anywhere: leave the derived paths unset
        # instead of crashing inside os.path.abspath(None)
        opts.library_tree = None
        opts.flowcells = None
        opts.srfs = None

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts
def make_parser():
    """
    Build the command line parser

    Every value option defaults to None so that configure_opts can tell
    "not given on the command line" apart from an explicit value and fall
    back to the config file. The boolean flags default to False.

    Returns the configured OptionParser.
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    parser.add_option('--host', default=None,
                      help="specify http://host for quering flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retriving information")
    parser.add_option("--apikey", default=None,
                      help="API Key for when retriving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser
def main(cmdline=None):
    """
    Link every sequence file found into the per-library tree

    cmdline is the argument list to parse; None means sys.argv[1:].
    Positional arguments are directories to scan; with none given the
    archive's flowcells and srfs directories are scanned.

    Returns 0 on success. Missing required settings are reported through
    parser.error, which exits the program.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    if len(args) == 0:
        seq_dirs = [opts.flowcells, opts.srfs]
    else:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
    try:
        updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)

        # the summary is logged at warning level so it is shown even
        # without --verbose
        logging.warning("%s flowcells in database", len(fcdb))
        logging.warning("found %s sequence files", len(seqs))
        logging.warning("%s libraries being checked", len(libdb))
        logging.warning("%s sequence files were linked", updates)
    finally:
        # the flowcell cache is an open shelf; make sure it is written
        # out and released even if linking fails
        fcdb.close()

    return 0
218 if __name__ == "__main__":