htsworkflow/pipelines/sequences.py

   1 """
   2 Utilities to work with the various eras of sequence archive files
   3 """
   4 import logging
   5 import os
   6 import re
   7
   8 eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
   9 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
  10 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d].tar.bz2')
  11
  12 class SequenceFile(object):
  13     """
  14     Simple container class that holds the path to a sequence archive
  15     and basic descriptive information.
  16     """
  17     def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
  18         self.filetype = filetype
  19         self.path = path
  20         self.flowcell = flowcell
  21         self.lane = lane
  22         self.read = read
  23         self.pf = pf
  24         self.cycle = cycle
  25
  26     def __hash__(self):
  27         return hash(self.key())
  28
  29     def key(self):
  30         return (self.flowcell, self.lane)
  31
  32     def unicode(self):
  33         return unicode(self.path)
  34
  35     def __eq__(self, other):
  36         """
  37         Equality is defined if everything but the path matches
  38         """
  39         attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
  40         for a in attributes:
  41             if getattr(self, a) != getattr(other, a):
  42                 return False
  43
  44         return True
  45
  46     def __repr__(self):
  47         return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
  48
  49     def make_target_name(self, root):
  50         """
  51         Create target name for where we need to link this sequence too
  52         """
  53         path, basename = os.path.split(self.path)
  54         # Because the names aren't unque we include the flowcel name
  55         # because there were different eland files for different length
  56         # analyses, we include the cycle length in the name.
  57         if self.filetype == 'eland':
  58             template = "%(flowcell)s_%(cycle)s_%(eland)s"
  59             basename = template % { 'flowcell': self.flowcell,
  60                                     'cycle': self.cycle,
  61                                     'eland': basename }
  62         # else:
  63         # all the other file types have names that include flowcell/lane
  64         # information and thus are unique so we don't have to do anything
  65         return os.path.join(root, basename)
  66
  67 def parse_srf(path, filename):
  68     basename, ext = os.path.splitext(filename)
  69     records = basename.split('_')
  70     flowcell = records[4]
  71     lane = int(records[5][0])
  72     fullpath = os.path.join(path, filename)
  73     return SequenceFile('srf', fullpath, flowcell, lane)
  74
  75 def parse_qseq(path, filename):
  76     basename, ext = os.path.splitext(filename)
  77     records = basename.split('_')
  78     fullpath = os.path.join(path, filename)
  79     flowcell = records[4]
  80     lane = int(records[5][1])
  81     read = int(records[6][1])
  82     return SequenceFile('qseq', fullpath, flowcell, lane, read)
  83
  84 def parse_fastq(path, filename):
  85     basename, ext = os.path.splitext(filename)
  86     records = basename.split('_')
  87     fullpath = os.path.join(path, filename)
  88     flowcell = records[4]
  89     lane = int(records[5][1])
  90     read = int(records[6][1])
  91     if records[-1].startswith('pass'):
  92         pf = True
  93     elif records[-1].startswith('nopass'):
  94         pf = False
  95     else:
  96         raise ValueError("Unrecognized fastq name")
  97
  98     return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
  99
 100 def parse_eland(path, filename, eland_match=None):
 101     if eland_match is None:
 102         eland_match = eland_re.match(filename)
 103     fullpath = os.path.join(path, filename)
 104     rest, cycle = os.path.split(path)
 105     rest, flowcell = os.path.split(rest)
 106     if eland_match.group('lane'):
 107         lane = int(eland_match.group('lane'))
 108     else:
 109         lane = None
 110     if eland_match.group('read'):
 111         read = int(eland_match.group('read'))
 112     else:
 113         read = None
 114     return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
 115
 116 def scan_for_sequences(dirs):
 117     """
 118     Scan through a list of directories for sequence like files
 119     """
 120     # be forgiving if someone just gives us a string
 121     if type(dirs) != type([]):
 122         dirs = [dirs]
 123
 124     sequences = []
 125     for d in dirs:
 126         logging.info("Scanning %s for sequences" % (d,))
 127         for path, dirname, filenames in os.walk(d):
 128             for f in filenames:
 129                 seq = None
 130                 # find sequence files
 131                 if raw_seq_re.match(f):
 132                     if f.endswith('.md5'):
 133                         continue
 134                     elif f.endswith('.srf') or f.endswith('.srf.bz2'):
 135                         seq = parse_srf(path, f)
 136                     elif qseq_re.match(f):
 137                         seq = parse_qseq(path, f)
 138                     elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
 139                         seq = parse_fastq(path, f)
 140                 eland_match = eland_re.match(f)
 141                 if eland_match:
 142                     if f.endswith('.md5'):
 143                         continue
 144                     seq = parse_eland(path, f, eland_match)
 145                 if seq:
 146                     sequences.append(seq)
 147                     logging.debug("Found sequence at %s" % (f,))
 148
 149     return sequences