Always return a count from carefully_make_hardlinks
[htsworkflow.git] / scripts / make-library-tree
#!/usr/bin/env python
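#
# Build a tree of hardlinks to sequence files organized by library id.
# Illustrative invocations (paths and config locations are placeholders):
#
#   make-library-tree -c ~/.htsworkflow.ini --dry-run
#   make-library-tree -v /path/to/flowcells /path/to/srfs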

from ConfigParser import SafeConfigParser
# try to deal with python <2.6
try:
  import json
except ImportError:
  import simplejson as json

import logging
import os
from optparse import OptionParser
import stat
import re
import shelve
import urllib
import urllib2
import urlparse

eland_re = re.compile('s_(?P<lane>\d)(?P<read>_\d)?_eland_')
raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
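
# Rough shape of the filenames these patterns target (field names are a best
# guess from the parse_* helpers below; placeholders, not real filenames):
#   raw srf/fastq: woldlab_<yymmdd>_<machine>_<run>_<flowcell>...
#   qseq tarball:  woldlab_<yymmdd>_<machine>_<run>_<flowcell>_l<lane>_r<read>.tar.bz2
#   eland:         s_<lane>[_<read>]_eland_...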

class SequenceFile(object):
    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        self.filetype = filetype
        self.path = path
        self.flowcell = flowcell
        self.lane = lane
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        return hash(self.key())

    def key(self):
        return (self.flowcell, self.lane)

    def unicode(self):
        return unicode(self.path)

    def __repr__(self):
        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)

    def make_target_name(self, root):
        """
        Create the target name for where we need to link this sequence to.
        """
        path, basename = os.path.split(self.path)
        # Because the eland names aren't unique we include the flowcell name,
        # and because there were different eland files for different length
        # analyses, we also include the cycle length in the name.
        if self.filetype == 'eland':
            template = "%(flowcell)s_%(cycle)s_%(eland)s"
            basename = template % { 'flowcell': self.flowcell,
                                    'cycle': self.cycle,
                                    'eland': basename }
        # all the other file types have names that include flowcell/lane
        # information and thus are unique, so we don't have to do anything
        return os.path.join(root, basename)

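# The parse_* helpers below split the underscore-delimited filename and pick
# the flowcell, lane, and read fields out by position; parse_eland instead
# recovers the flowcell and cycle length from the last two directory
# components of the file's path.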
def parse_srf(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    flowcell = records[4]
    lane = int(records[5][0])
    fullpath = os.path.join(path, filename)
    return SequenceFile('srf', fullpath, flowcell, lane)

def parse_qseq(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    return SequenceFile('qseq', fullpath, flowcell, lane, read)

def parse_fastq(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    if records[-1] == 'pass':
        pf = True
    else:
        pf = False
    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)

def parse_eland(path, filename, eland_match):
    fullpath = os.path.join(path, filename)
    rest, cycle = os.path.split(path)
    rest, flowcell = os.path.split(rest)
    lane = eland_match.group('lane')
    read = eland_match.group('read')
    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)

def scan_for_sequences(dirs):
    sequences = []
    for d in dirs:
        logging.info("Scanning %s for sequences" % (d,))
        for path, dirnames, filenames in os.walk(d):
            for f in filenames:
                seq = None
                # find sequence files
                if raw_seq_re.match(f):
                    if f.endswith('.md5'):
                        continue
                    elif f.endswith('.srf') or f.endswith('.srf.bz2'):
                        seq = parse_srf(path, f)
                    elif qseq_re.match(f):
                        seq = parse_qseq(path, f)
                    elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
                        seq = parse_fastq(path, f)
                eland_match = eland_re.match(f)
                if eland_match:
                    if f.endswith('.md5'):
                        continue
                    seq = parse_eland(path, f, eland_match)
                if seq:
                    sequences.append(seq)
                    logging.debug("Found sequence at %s" % (f,))

    return sequences

def retrieve_info(url, apidata):
    """
    Return a dictionary from the HTSworkflow API
    """
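    # passing apipayload as the data argument below makes urllib2 issue a
    # POST rather than a GET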
    try:
        apipayload = urllib.urlencode(apidata)
        web = urllib2.urlopen(url, apipayload)
    except urllib2.HTTPError, e:
        if e.code == 404:
            logging.info("%s was not found" % (url,))
            return None
        else:
            errmsg = 'HTTPError: %d %s' % (e.code, e.msg)
            raise IOError(errmsg)

    contents = web.read()
    headers = web.info()

    return json.loads(contents)

def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences, query the
    HTSworkflow server for any flowcells we haven't seen yet, and return
    the populated flowcell database along with a dictionary mapping
    library id to its SequenceFiles.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell
        flowcell_info = None

        # get info about flowcell from server or shelf
        if not fcdb.has_key(flowcell):
            url = urlparse.urljoin(baseurl, 'experiments/config/%s/json' % (flowcell,))
            flowcell_info = retrieve_info(url, apidata)
            if flowcell_info is not None:
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # make library id db
        if flowcell_info is not None:
            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb

def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link from source to destination.

    If the destination already exists and is the same file as the source,
    there is nothing to do. If it exists but is a different file, log an
    error and skip it.

    Return 1 if a link was made (or would have been made in a dry run),
    otherwise return 0.
    """
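    # note: os.link can only create hard links within a single filesystem;
    # a cross-device link attempt raises OSError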
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            logging.debug('SAME: %s -> %s' % (source, destination))
            return 0
        else:
            logging.error('%s and %s are different files, skipping' % \
                          (source, destination))
            return 0
    logging.debug('Linking: %s -> %s' % (source, destination))

    # past this point we would actually modify the filesystem
    if dry_run: return 1

    os.link(source, destination)
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
    return 1

def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of hard links to sequence files organized by library id

    root is the root of the library tree
    library_db is a dictionary of SequenceFiles organized by library id
    """
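    # The resulting layout is roughly:
    #   <root>/<library_id>/<hardlinks named by SequenceFile.make_target_name>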
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            logging.info("mkdir %s" % (target_dir,))
            if not dry_run:
                os.mkdir(target_dir)

        for s in sequences:
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count

def configure_logging(opts):
    # setup logging
    level = logging.WARN
    if opts.verbose:
        level = logging.INFO
    if opts.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)


def configure_opts(opts):
    """
    Load in options from config file
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

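    # The config file is expected to provide a [sequence_archive] section;
    # an illustrative layout (all values below are placeholders):
    #
    #   [sequence_archive]
    #   sequence_archive = /path/to/sequence_archive
    #   cache = /path/to/flowcelldb.shelve
    #   host = http://localhost/
    #   apiid = 0
    #   apikey = changeme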
    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # only derive the archive sub-directories if we actually have an archive
    # root; main() reports the error if we don't
    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts

def make_parser():
    """
    Make parser
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    parser.add_option('--host', default=None,
                      help="specify http://host for querying flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retrieving information")
    parser.add_option("--apikey", default=None,
                      help="API key to use when retrieving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser

def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    seq_dirs = [ opts.flowcells, opts.srfs ]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)

    logging.warn("%s flowcells in database" % (len(fcdb),))
    logging.warn("found %s sequence files" % (len(seqs),))
    logging.warn("%s libraries being checked" % (len(libdb),))
    logging.warn("%s sequence files were linked" % (updates,))

    return 0

if __name__ == "__main__":
    main()