3 from ConfigParser import SafeConfigParser
7 from optparse import OptionParser
15 eland_re = re.compile('s_(?P<lane>\d)(?P<read>_\d)?_eland_')
16 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
17 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d].tar.bz2')
19 class SequenceFile(object):
20 def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
21 self.filetype = filetype
23 self.flowcell = flowcell
30 return hash(self.key())
33 return (self.flowcell, self.lane)
36 return unicode(self.path)
39 return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
41 def make_target_name(self, root):
43 Create target name for where we need to link this sequence too
45 path, basename = os.path.split(self.path)
46 # Because the names aren't unque we include the flowcel name
47 # because there were different eland files for different length
48 # analyses, we include the cycle length in the name.
49 if self.filetype == 'eland':
50 template = "%(flowcell)s_%(cycle)s_%(eland)s"
51 basename = template % { 'flowcell': self.flowcell,
55 # all the other file types have names that include flowcell/lane
56 # information and thus are unique so we don't have to do anything
57 return os.path.join(root, basename)
59 def parse_srf(path, filename):
60 basename, ext = os.path.splitext(filename)
61 records = basename.split('_')
63 lane = int(records[5][0])
64 fullpath = os.path.join(path, filename)
65 return SequenceFile('srf', fullpath, flowcell, lane)
67 def parse_qseq(path, filename):
68 basename, ext = os.path.splitext(filename)
69 records = basename.split('_')
70 fullpath = os.path.join(path, filename)
72 lane = int(records[5][1])
73 read = int(records[6][1])
74 return SequenceFile('qseq', fullpath, flowcell, lane, read)
76 def parse_fastq(path, filename):
77 basename, ext = os.path.splitext(filename)
78 records = basename.split('_')
79 fullpath = os.path.join(path, filename)
81 lane = int(records[5][1])
82 read = int(records[6][1])
83 if records[-1] == 'pass':
87 return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
89 def parse_eland(path, filename, eland_match):
90 fullpath = os.path.join(path, filename)
91 rest, cycle = os.path.split(path)
92 rest, flowcell = os.path.split(rest)
93 lane = eland_match.group('lane')
94 read = eland_match.group('read')
95 return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
97 def scan_for_sequences(dirs):
100 logging.info("Scanning %s for sequences" % (d,))
101 for path, dirname, filenames in os.walk(d):
104 # find sequence files
105 if raw_seq_re.match(f):
106 if f.endswith('.md5'):
108 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
109 seq = parse_srf(path, f)
110 elif qseq_re.match(f):
111 seq = parse_qseq(path, f)
112 elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
113 seq = parse_fastq(path, f)
114 eland_match = eland_re.match(f)
116 if f.endswith('.md5'):
118 seq = parse_eland(path, f, eland_match)
120 sequences.append(seq)
121 logging.debug("Found sequence at %s" % (f,))
125 def retrieve_info(url, apidata):
127 Return a dictionary from the HTSworkflow API
130 apipayload = urllib.urlencode(apidata)
131 web = urllib2.urlopen(url, apipayload)
132 except urllib2.URLError, e:
134 logging.info("%s was not found" % (url,))
137 errmsg = 'URLError: %d %s' % (e.code, e.msg)
138 raise IOError(errmsg)
140 contents = web.read()
143 return json.loads(contents)
145 def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
147 compare our flowcell database with our list of sequences and return
148 a fully populated database
150 fcdb = shelve.open(fcdb_filename)
152 apidata = {'apiid': apiid, 'apikey': apikey}
153 for seq in sequences:
154 flowcell = seq.flowcell
157 # get info about flowcell from server or shelf
158 if not fcdb.has_key(flowcell):
159 url = urlparse.urljoin(baseurl, 'experiments/config/%s/json' % (flowcell,))
160 flowcell_info = retrieve_info(url, apidata)
161 if flowcell_info is not None:
162 fcdb[flowcell] = flowcell_info
164 flowcell_info = fcdb[flowcell]
167 if flowcell_info is not None:
168 seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
169 libdb.setdefault(seq_library_id, []).append(seq)
174 def carefully_make_hardlink(source, destination, dry_run=False):
176 Make a hard link, failing if a different link already exists
178 Checking to see if the link already exists and is
179 the same as the link we want to make.
180 If the link already exists and is different, throw an error.
182 If we didn't update anything return 0, if we did update
185 logging.debug("CHECKING: %s -> %s", source, destination)
187 if not os.path.exists(source):
188 logging.warning("%s doesn't exist", source)
191 if os.path.exists(destination):
192 if os.path.samefile(source, destination):
193 logging.debug('SAME: %s -> %s' % (source, destination))
196 raise IOError('%s and %s are different files' % \
197 (source, destination))
198 logging.debug('Linking: %s -> %s' % (source, destination))
200 # we would do something by this part
203 os.link(source, destination)
204 os.chmod(destination,
205 stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
208 def make_library_links(root, library_db, dry_run=False):
210 Make a tree of sequencer roots organized by library id
212 Root is the root of the library tree
213 library_db is a dictionary of SequenceFiles organized by library id
216 root = os.path.abspath(root)
217 for lib_id, sequences in library_db.items():
218 target_dir = os.path.join(root, lib_id)
219 if not os.path.exists(target_dir):
220 logging.info("mkdir %s" % (target_dir,))
225 count += carefully_make_hardlink(s.path,
226 s.make_target_name(target_dir),
230 def configure_logging(opts):
236 level = logging.DEBUG
237 logging.basicConfig(level=level)
240 def configure_opts(opts):
242 Load in options from config file
244 SECTION_NAME = 'sequence_archive'
245 ARCHIVE_OPT = 'sequence_archive'
249 APIKEY_OPT = 'apikey'
251 # figure out what config file to read
252 config_path = [os.path.expanduser('~/.htsworkflow.ini'),
253 '/etc/htsworkflow.ini']
254 if opts.config is not None:
255 config_path = [opts.config]
256 # parse options from config file
257 config_file = SafeConfigParser()
258 config_file.read(config_path)
260 # load defaults from config file if not overriden by the command line
261 if opts.cache is None:
262 if config_file.has_option(SECTION_NAME, CACHE_OPT):
263 opts.cache = config_file.get(FRONTEND_NAME, CACHE_OPT)
265 opts.cache = os.path.expanduser('~/.flowcelldb.shelve')
267 if opts.sequence_archive is None and \
268 config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
269 opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
271 opts.sequence_archive = os.path.abspath(opts.sequence_archive)
272 opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
273 opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
274 opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
276 if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
277 opts.host = config_file.get(SECTION_NAME, HOST_OPT)
279 if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
280 opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)
282 if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
283 opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
291 parser = OptionParser()
292 parser.add_option('-c', '--config', default=None,
293 help='path to a configuration file containing a '
294 'sequence archive section')
295 parser.add_option('--cache', default=None,
296 help="default flowcell cache")
298 parser.add_option('--host', default=None,
299 help="specify http://host for quering flowcell information")
300 parser.add_option('--apiid', default=None,
301 help="API ID to use when retriving information")
302 parser.add_option("--apikey", default=None,
303 help="API Key for when retriving information")
305 parser.add_option('-a', '--sequence-archive', default=None,
306 help='path to where the sequence archive lives')
308 parser.add_option('-v', '--verbose', action='store_true', default=False,
309 help='be more verbose')
310 parser.add_option('-d', '--debug', action='store_true', default=False,
311 help='report everything')
313 parser.add_option("--dry-run", dest="dry_run", action="store_true",
315 help="Don't modify the filesystem")
318 def main(cmdline=None):
319 parser = make_parser()
320 opts, args = parser.parse_args(cmdline)
322 configure_logging(opts)
323 opts = configure_opts(opts)
325 # complain if critical things are missing
326 if opts.cache is None:
327 parser.error('Need location of htsworkflow frontend database')
329 if opts.sequence_archive is None:
330 parser.error('Need the root path for the sequence archive')
332 seq_dirs = [ opts.flowcells, opts.srfs ]
334 seq_dirs = [os.path.abspath(f) for f in args]
336 seqs = scan_for_sequences(seq_dirs)
337 fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
338 updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
340 logging.warn("%s flowcells in database" % (len(fcdb),))
341 logging.warn("found %s sequence files" % (len(seqs),))
342 logging.warn("%s libraries being checked" % (len(libdb),))
343 logging.warn("%s sequence files were linked" % (updates,))
347 if __name__ == "__main__":