3 from ConfigParser import SafeConfigParser
4 # try to deal with python <2.6
8 import simplejson as json
12 from optparse import OptionParser
# Matches eland result file names.  Captures the lane digit as 'lane' and,
# when present, the read suffix as 'read' (the capture keeps its leading
# underscore, e.g. '_1').
eland_re = re.compile(r's_(?P<lane>\d)(?P<read>_\d)?_eland_')
# Matches the common prefix of raw sequence file names:
# woldlab_<6 digits>_<field without underscores>_<digits>_<digits/capitals>
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
# Matches qseq tarballs: the raw sequence prefix followed by
# _l<digit>_r<digit>.tar.bz2.  The dots are escaped so that they only
# match a literal '.' rather than any character.
qseq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
24 class SequenceFile(object):
25 def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
26 self.filetype = filetype
28 self.flowcell = flowcell
35 return hash(self.key())
38 return (self.flowcell, self.lane)
41 return unicode(self.path)
44 return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
46 def make_target_name(self, root):
48 Create target name for where we need to link this sequence to
50 path, basename = os.path.split(self.path)
51 # Because the names aren't unique we include the flowcell name
52 # because there were different eland files for different length
53 # analyses, we include the cycle length in the name.
54 if self.filetype == 'eland':
55 template = "%(flowcell)s_%(cycle)s_%(eland)s"
56 basename = template % { 'flowcell': self.flowcell,
60 # all the other file types have names that include flowcell/lane
61 # information and thus are unique so we don't have to do anything
62 return os.path.join(root, basename)
64 def parse_srf(path, filename):
65 basename, ext = os.path.splitext(filename)
66 records = basename.split('_')
68 lane = int(records[5][0])
69 fullpath = os.path.join(path, filename)
70 return SequenceFile('srf', fullpath, flowcell, lane)
72 def parse_qseq(path, filename):
73 basename, ext = os.path.splitext(filename)
74 records = basename.split('_')
75 fullpath = os.path.join(path, filename)
77 lane = int(records[5][1])
78 read = int(records[6][1])
79 return SequenceFile('qseq', fullpath, flowcell, lane, read)
81 def parse_fastq(path, filename):
82 basename, ext = os.path.splitext(filename)
83 records = basename.split('_')
84 fullpath = os.path.join(path, filename)
86 lane = int(records[5][1])
87 read = int(records[6][1])
88 if records[-1] == 'pass':
92 return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
def parse_eland(path, filename, eland_match):
    """
    Build a SequenceFile describing an eland result file.

    path is the directory containing the file; its last component is
    used as the cycle name and the component above that as the
    flowcell name.
    filename is the name of the eland file inside path.
    eland_match is a regular expression match object providing the
    named groups 'lane' and 'read' (either may be unmatched).

    Returns a SequenceFile of type 'eland'.  The lane and read values
    are converted to integers, as parse_srf, parse_qseq and parse_fastq
    do; a group that did not match is passed through as None.
    """
    fullpath = os.path.join(path, filename)
    # the two trailing directory components are <flowcell>/<cycle>
    rest, cycle = os.path.split(path)
    rest, flowcell = os.path.split(rest)

    lane = eland_match.group('lane')
    if lane is not None:
        lane = int(lane)

    # the read capture may carry its leading underscore (e.g. '_1'),
    # which int() would reject, so drop it before converting
    read = eland_match.group('read')
    if read is not None:
        read = int(read.lstrip('_'))

    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
102 def scan_for_sequences(dirs):
105 logging.info("Scanning %s for sequences" % (d,))
106 for path, dirname, filenames in os.walk(d):
109 # find sequence files
110 if raw_seq_re.match(f):
111 if f.endswith('.md5'):
113 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
114 seq = parse_srf(path, f)
115 elif qseq_re.match(f):
116 seq = parse_qseq(path, f)
117 elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
118 seq = parse_fastq(path, f)
119 eland_match = eland_re.match(f)
121 if f.endswith('.md5'):
123 seq = parse_eland(path, f, eland_match)
125 sequences.append(seq)
126 logging.debug("Found sequence at %s" % (f,))
130 def retrieve_info(url, apidata):
132 Return a dictionary from the HTSworkflow API
135 apipayload = urllib.urlencode(apidata)
136 web = urllib2.urlopen(url, apipayload)
137 except urllib2.URLError, e:
139 logging.info("%s was not found" % (url,))
142 errmsg = 'URLError: %d %s' % (e.code, e.msg)
143 raise IOError(errmsg)
145 contents = web.read()
148 return json.loads(contents)
150 def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
152 compare our flowcell database with our list of sequences and return
153 a fully populated database
155 fcdb = shelve.open(fcdb_filename)
157 apidata = {'apiid': apiid, 'apikey': apikey}
158 for seq in sequences:
159 flowcell = seq.flowcell
162 # get info about flowcell from server or shelf
163 if not fcdb.has_key(flowcell):
164 url = urlparse.urljoin(baseurl, 'experiments/config/%s/json' % (flowcell,))
165 flowcell_info = retrieve_info(url, apidata)
166 if flowcell_info is not None:
167 fcdb[flowcell] = flowcell_info
169 flowcell_info = fcdb[flowcell]
172 if flowcell_info is not None:
173 seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
174 libdb.setdefault(seq_library_id, []).append(seq)
179 def carefully_make_hardlink(source, destination, dry_run=False):
181 Make a hard link, failing if a different link already exists
183 Checking to see if the link already exists and is
184 the same as the link we want to make.
185 If the link already exists and is different, throw an error.
187 If we didn't update anything return 0, if we did update
190 logging.debug("CHECKING: %s -> %s", source, destination)
192 if not os.path.exists(source):
193 logging.warning("%s doesn't exist", source)
196 if os.path.exists(destination):
197 if os.path.samefile(source, destination):
198 logging.debug('SAME: %s -> %s' % (source, destination))
201 logging.error('%s and %s are different files, skipping' % \
202 (source, destination))
204 logging.debug('Linking: %s -> %s' % (source, destination))
206 # we would do something by this part
209 os.link(source, destination)
210 os.chmod(destination,
211 stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
214 def make_library_links(root, library_db, dry_run=False):
216 Make a tree of sequencer roots organized by library id
218 Root is the root of the library tree
219 library_db is a dictionary of SequenceFiles organized by library id
222 root = os.path.abspath(root)
223 for lib_id, sequences in library_db.items():
224 target_dir = os.path.join(root, lib_id)
225 if not os.path.exists(target_dir):
226 logging.info("mkdir %s" % (target_dir,))
231 count += carefully_make_hardlink(s.path,
232 s.make_target_name(target_dir),
236 def configure_logging(opts):
242 level = logging.DEBUG
243 logging.basicConfig(level=level)
246 def configure_opts(opts):
248 Load in options from config file
250 SECTION_NAME = 'sequence_archive'
251 ARCHIVE_OPT = 'sequence_archive'
255 APIKEY_OPT = 'apikey'
257 # figure out what config file to read
258 config_path = [os.path.expanduser('~/.htsworkflow.ini'),
259 '/etc/htsworkflow.ini']
260 if opts.config is not None:
261 config_path = [opts.config]
262 # parse options from config file
263 config_file = SafeConfigParser()
264 config_file.read(config_path)
266 # load defaults from config file if not overridden by the command line
267 if opts.cache is None:
268 if config_file.has_option(SECTION_NAME, CACHE_OPT):
269 opts.cache = config_file.get(FRONTEND_NAME, CACHE_OPT)
271 opts.cache = os.path.expanduser('~/.flowcelldb.shelve')
273 if opts.sequence_archive is None and \
274 config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
275 opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
277 opts.sequence_archive = os.path.abspath(opts.sequence_archive)
278 opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
279 opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
280 opts.srfs = os.path.join(opts.sequence_archive, 'srfs')
282 if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
283 opts.host = config_file.get(SECTION_NAME, HOST_OPT)
285 if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
286 opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)
288 if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
289 opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)
297 parser = OptionParser()
298 parser.add_option('-c', '--config', default=None,
299 help='path to a configuration file containing a '
300 'sequence archive section')
301 parser.add_option('--cache', default=None,
302 help="default flowcell cache")
304 parser.add_option('--host', default=None,
305 help="specify http://host for quering flowcell information")
306 parser.add_option('--apiid', default=None,
307 help="API ID to use when retriving information")
308 parser.add_option("--apikey", default=None,
309 help="API Key for when retriving information")
311 parser.add_option('-a', '--sequence-archive', default=None,
312 help='path to where the sequence archive lives')
314 parser.add_option('-v', '--verbose', action='store_true', default=False,
315 help='be more verbose')
316 parser.add_option('-d', '--debug', action='store_true', default=False,
317 help='report everything')
319 parser.add_option("--dry-run", dest="dry_run", action="store_true",
321 help="Don't modify the filesystem")
324 def main(cmdline=None):
325 parser = make_parser()
326 opts, args = parser.parse_args(cmdline)
328 configure_logging(opts)
329 opts = configure_opts(opts)
331 # complain if critical things are missing
332 if opts.cache is None:
333 parser.error('Need location of htsworkflow frontend database')
335 if opts.sequence_archive is None:
336 parser.error('Need the root path for the sequence archive')
338 seq_dirs = [ opts.flowcells, opts.srfs ]
340 seq_dirs = [os.path.abspath(f) for f in args]
342 seqs = scan_for_sequences(seq_dirs)
343 fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
344 updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)
346 logging.warn("%s flowcells in database" % (len(fcdb),))
347 logging.warn("found %s sequence files" % (len(seqs),))
348 logging.warn("%s libraries being checked" % (len(libdb),))
349 logging.warn("%s sequence files were linked" % (updates,))
353 if __name__ == "__main__":