3 from ConfigParser import SafeConfigParser
7 from optparse import OptionParser
12 from htsworkflow.util import api
# Filename patterns used to recognise sequencer output.  They are applied
# with ``.match()``, so each one anchors at the start of the name only.
# Raw strings keep the ``\d`` escapes intact for the regex engine.

# ELAND result files: ``s_<lane>[_<read>]_eland_...``.  The optional
# ``read`` group captures its leading underscore as well (e.g. ``_2``).
eland_re = re.compile(r's_(?P<lane>\d)(?P<read>_\d)?_eland_')

# Prefix shared by the woldlab-named files:
# ``woldlab_<6 digits>_<field without '_'>_<digits>_<digits/capitals>``.
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')

# qseq tarballs append ``_l<digit>_r<digit>.tar.bz2`` to that prefix.  The
# dots are escaped so they match a literal '.' rather than any character.
qseq_re = re.compile(
    r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
# Record describing one sequence file: its file type, path, flowcell and
# lane, plus optional read, pf and cycle values.
# NOTE(review): several lines of this class are not visible in this excerpt,
# including the def lines of the methods whose return statements appear
# below; the comments only describe what the visible lines show.
18 class SequenceFile(object):
19 def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
20 self.filetype = filetype
22 self.flowcell = flowcell
# The hash is derived from key(), so objects with equal keys hash alike.
29 return hash(self.key())
# (flowcell, lane) pair -- presumably the body of key(); confirm.
32 return (self.flowcell, self.lane)
# Text form is the file path.
35 return unicode(self.path)
# Representation of the form <filetype flowcell lane path>.
38 return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
# Compute the path under `root` at which this file should be linked.
40 def make_target_name(self, root):
42 Create target name for where we need to link this sequence too
# Only the base name of the source path is carried over to the target.
44 path, basename = os.path.split(self.path)
45 # Because the names aren't unique we include the flowcell name
46 # because there were different eland files for different length
47 # analyses, we include the cycle length in the name.
48 if self.filetype == 'eland':
49 template = "%(flowcell)s_%(cycle)s_%(eland)s"
50 basename = template % { 'flowcell': self.flowcell,
54 # all the other file types have names that include flowcell/lane
55 # information and thus are unique so we don't have to do anything
56 return os.path.join(root, basename)
# Build a SequenceFile of type 'srf' from a directory path and a file name.
# NOTE(review): the line that assigns `flowcell` is not visible in this
# excerpt -- presumably it is taken from `records`; confirm.
58 def parse_srf(path, filename):
# Drop the last extension, then split the name into '_'-separated fields.
59 basename, ext = os.path.splitext(filename)
60 records = basename.split('_')
# The lane is the first character of the sixth field.
62 lane = int(records[5][0])
63 fullpath = os.path.join(path, filename)
64 return SequenceFile('srf', fullpath, flowcell, lane)
# Build a SequenceFile of type 'qseq' from a directory path and a file name.
# NOTE(review): the line that assigns `flowcell` is not visible in this
# excerpt -- presumably it is taken from `records`; confirm.
66 def parse_qseq(path, filename):
# splitext removes only the final extension; the name is then split on '_'.
67 basename, ext = os.path.splitext(filename)
68 records = basename.split('_')
69 fullpath = os.path.join(path, filename)
# Lane and read are the second character of the sixth and seventh fields
# (qseq_re shows these fields as 'l<digit>' and 'r<digit>').
71 lane = int(records[5][1])
72 read = int(records[6][1])
73 return SequenceFile('qseq', fullpath, flowcell, lane, read)
# Build a SequenceFile of type 'fastq' from a directory path and a file name.
# NOTE(review): the lines that assign `flowcell` and `pf` are not visible in
# this excerpt; only the test on the last field is shown.
75 def parse_fastq(path, filename):
# Drop the last extension, then split the name into '_'-separated fields.
76 basename, ext = os.path.splitext(filename)
77 records = basename.split('_')
78 fullpath = os.path.join(path, filename)
# Lane and read are the second character of the sixth and seventh fields.
80 lane = int(records[5][1])
81 read = int(records[6][1])
# The final field is compared with 'pass'; presumably this decides `pf`.
82 if records[-1] == 'pass':
86 return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
def parse_eland(path, filename, eland_match):
    """
    Build a SequenceFile of type 'eland' from a matched ELAND file name.

    path is the directory containing the file; its last component is used
    as the cycle and the component above that as the flowcell.
    filename is the base name of the file.
    eland_match is the match object produced by eland_re for filename.

    Lane and read are stored as integers (read is None when the name has
    no read suffix), like parse_qseq and parse_fastq do, so SequenceFile
    keys compare consistently across file types.
    """
    fullpath = os.path.join(path, filename)
    # Peel the two trailing directory names off the path, right to left.
    rest, cycle = os.path.split(path)
    rest, flowcell = os.path.split(rest)

    lane = int(eland_match.group('lane'))

    # The read group is optional and, with the current pattern, includes
    # its leading underscore (e.g. '_2'); strip it before converting.
    read = eland_match.group('read')
    if read is not None:
        read = int(read.lstrip('_'))

    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
# Walk directories and collect a SequenceFile for each recognised file name.
# NOTE(review): several lines are not visible in this excerpt (the loop
# headers over `dirs` and `filenames`, what happens for '.md5' files, and the
# return); the comments only describe the visible lines.
96 def scan_for_sequences(dirs):
99 logging.info("Scanning %s for sequences" % (d,))
100 for path, dirname, filenames in os.walk(d):
103 # find sequence files
# Names with the woldlab prefix are dispatched on their suffix.
104 if raw_seq_re.match(f):
105 if f.endswith('.md5'):
107 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
108 seq = parse_srf(path, f)
109 elif qseq_re.match(f):
110 seq = parse_qseq(path, f)
# NOTE(review): 'fastq' is tested without a leading dot, unlike the other
# suffixes -- confirm whether that is intended.
111 elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
112 seq = parse_fastq(path, f)
# ELAND result files are recognised by eland_re instead of the prefix.
113 eland_match = eland_re.match(f)
115 if f.endswith('.md5'):
117 seq = parse_eland(path, f, eland_match)
119 sequences.append(seq)
120 logging.debug("Found sequence at %s" % (f,))
# Look up flowcell information for each sequence and group the sequences by
# library id.
# NOTE(review): the initialisation of `libdb`, an `else` branch and the
# return are not visible in this excerpt; main() unpacks the result as
# (fcdb, libdb).
125 def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
127 compare our flowcell database with our list of sequences and return
128 a fully populated database
# The shelf acts as an on-disk cache of flowcell info keyed by flowcell.
# NOTE(review): no close() of the shelf appears in the visible lines.
130 fcdb = shelve.open(fcdb_filename)
# Credentials passed along with each request to the server.
132 apidata = {'apiid': apiid, 'apikey': apikey}
133 for seq in sequences:
134 flowcell = seq.flowcell
137 # get info about flowcell from server or shelf
# Unknown flowcell: ask the server and cache the answer unless it is None.
138 if not fcdb.has_key(flowcell):
139 url = api.flowcell_url(baseurl, flowcell)
140 flowcell_info = api.retrieve_info(url, apidata)
141 if flowcell_info is not None:
142 fcdb[flowcell] = flowcell_info
# Known flowcell: reuse the cached entry.
144 flowcell_info = fcdb[flowcell]
# The lane number, as a unicode key into 'lane_set', selects the library id
# under which this sequence is filed.
147 if flowcell_info is not None:
148 seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
149 libdb.setdefault(seq_library_id, []).append(seq)
# Create a hard link from `source` to `destination` unless one already exists.
# NOTE(review): the return statements and the handling of `dry_run` are not
# visible in this excerpt; make_library_links() adds the return value to a
# running count.
154 def carefully_make_hardlink(source, destination, dry_run=False):
156 Make a hard link, failing if a different link already exists
158 Checking to see if the link already exists and is
159 the same as the link we want to make.
160 If the link already exists and is different, throw an error.
162 If we didn't update anything return 0, if we did update
165 logging.debug("CHECKING: %s -> %s", source, destination)
# A missing source is reported as a warning.
167 if not os.path.exists(source):
168 logging.warning("%s doesn't exist", source)
# An existing destination is either already the same file as the source, or
# a different file, which is logged as an error.
171 if os.path.exists(destination):
172 if os.path.samefile(source, destination):
173 logging.debug('SAME: %s -> %s' % (source, destination))
176 logging.error('%s and %s are different files, skipping' % \
177 (source, destination))
179 logging.debug('Linking: %s -> %s' % (source, destination))
181 # we would do something by this part
# Create the link, then make it read-only for user, group and others.
# NOTE(review): a hard link shares its permission bits with the source file,
# so this chmod presumably makes the source read-only as well -- confirm
# that this is intended.
184 os.link(source, destination)
185 os.chmod(destination,
186 stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
# Link every sequence file into a per-library directory under `root`.
# NOTE(review): the initialisation of `count`, the directory creation call,
# the loop over `sequences` and the return are not visible in this excerpt.
189 def make_library_links(root, library_db, dry_run=False):
191 Make a tree of sequencer roots organized by library id
193 Root is the root of the library tree
194 library_db is a dictionary of SequenceFiles organized by library id
197 root = os.path.abspath(root)
# Each library id gets its own directory directly below the root.
198 for lib_id, sequences in library_db.items():
199 target_dir = os.path.join(root, lib_id)
200 if not os.path.exists(target_dir):
201 logging.info("mkdir %s" % (target_dir,))
# Accumulate whatever carefully_make_hardlink() returns for each file.
206 count += carefully_make_hardlink(s.path,
207 s.make_target_name(target_dir),
# Configure the root logger through logging.basicConfig.
# NOTE(review): the lines that select `level` from `opts` are mostly not
# visible in this excerpt; DEBUG is the only assignment shown.
211 def configure_logging(opts):
217 level = logging.DEBUG
218 logging.basicConfig(level=level)
# Fill in options left unset on the command line from a configuration file
# and derive the archive sub-directories.
# NOTE(review): the definitions of CACHE_OPT, HOST_OPT and APIID_OPT, an
# `else` branch and the return are not visible in this excerpt; main()
# rebinds `opts` to the result of this function.
221 def configure_opts(opts):
223 Load in options from config file
# Every option is looked up in the 'sequence_archive' section.
225 SECTION_NAME = 'sequence_archive'
226 ARCHIVE_OPT = 'sequence_archive'
230 APIKEY_OPT = 'apikey'
232 # figure out what config file to read
# Defaults are the per-user file and the system-wide file; an explicit
# --config replaces both.
233 config_path = [os.path.expanduser('~/.htsworkflow.ini'),
234 '/etc/htsworkflow.ini']
235 if opts.config is not None:
236 config_path = [opts.config]
237 # parse options from config file
238 config_file = SafeConfigParser()
# read() silently skips listed files that cannot be opened.
239 config_file.read(config_path)
241 # load defaults from config file if not overridden by the command line
242 if opts.cache is None:
# NOTE(review): the option is tested in SECTION_NAME but fetched from
# FRONTEND_NAME, which is not defined in any visible line -- this looks like
# it should be SECTION_NAME; confirm.
243 if config_file.has_option(SECTION_NAME, CACHE_OPT):
244 opts.cache = config_file.get(FRONTEND_NAME, CACHE_OPT)
# Fallback cache location in the user's home directory.
246 opts.cache = os.path.expanduser('~/.flowcelldb.shelve')
248 if opts.sequence_archive is None and \
249 config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
250 opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
251 opts.sequence_archive = os.path.expanduser(opts.sequence_archive)
# NOTE(review): if sequence_archive is still None at this point,
# os.path.abspath would raise before main() can report the missing option --
# confirm the nesting of these lines.
253 opts.sequence_archive = os.path.abspath(opts.sequence_archive)
# Sub-directories of the archive used by the rest of the script.
254 opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
255 opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
256 opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
# Server host and API credentials fall back to the config file too.
258 if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
259 opts.host = config_file.get(SECTION_NAME, HOST_OPT)
261 if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
262 opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)
264 if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
265 opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
# Command-line option definitions.
# NOTE(review): the enclosing def line and the return are not visible in this
# excerpt; main() obtains the parser from make_parser().
# Options that default to None are filled in later by configure_opts().
273 parser = OptionParser()
# Configuration file and flowcell cache locations.
274 parser.add_option('-c', '--config', default=None,
275 help='path to a configuration file containing a '
276 'sequence archive section')
277 parser.add_option('--cache', default=None,
278 help="default flowcell cache")
# Server and credentials used when querying flowcell information.
# NOTE(review): the help texts below spell "quering" and "retriving"; they
# are runtime strings, so they are left untouched here.
280 parser.add_option('--host', default=None,
281 help="specify http://host for quering flowcell information")
282 parser.add_option('--apiid', default=None,
283 help="API ID to use when retriving information")
284 parser.add_option("--apikey", default=None,
285 help="API Key for when retriving information")
# Root of the sequence archive.
287 parser.add_option('-a', '--sequence-archive', default=None,
288 help='path to where the sequence archive lives')
# Logging verbosity flags.
290 parser.add_option('-v', '--verbose', action='store_true', default=False,
291 help='be more verbose')
292 parser.add_option('-d', '--debug', action='store_true', default=False,
293 help='report everything')
# Flag stored as opts.dry_run.
295 parser.add_option("--dry-run", dest="dry_run", action="store_true",
297 help="Don't modify the filesystem")
# Script entry point: parse options, scan for sequence files, look up their
# flowcells and link the files into the library tree.
# NOTE(review): the condition guarding the second `seq_dirs` assignment and
# the return are not visible in this excerpt.
300 def main(cmdline=None):
301 parser = make_parser()
302 opts, args = parser.parse_args(cmdline)
# Logging is set up first, then unset options are filled in from the config.
304 configure_logging(opts)
305 opts = configure_opts(opts)
307 # complain if critical things are missing
308 if opts.cache is None:
309 parser.error('Need location of htsworkflow frontend database')
311 if opts.sequence_archive is None:
312 parser.error('Need the root path for the sequence archive')
# By default the archive's flowcell and srf directories are scanned;
# presumably positional arguments, when given, replace them -- confirm.
314 seq_dirs = [ opts.flowcells, opts.srfs ]
316 seq_dirs = [os.path.abspath(f) for f in args]
# Scan, resolve flowcells to libraries, then create the links.
318 seqs = scan_for_sequences(seq_dirs)
319 fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
320 updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
# The summary is logged at warning level.
322 logging.warn("%s flowcells in database" % (len(fcdb),))
323 logging.warn("found %s sequence files" % (len(seqs),))
324 logging.warn("%s libraries being checked" % (len(libdb),))
325 logging.warn("%s sequence files were linked" % (updates,))
329 if __name__ == "__main__":