htsworkflow/pipelines/sequences.py

   1 """
   2 Utilities to work with the various eras of sequence archive files
   3 """
   4 import logging
   5 import os
   6 import re
   7
   8 eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
   9 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+')
  10 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d].tar.bz2')
  11
  12 SEQUENCE_TABLE_NAME = "sequences"
  13 def create_sequence_table(cursor):
  14     """
  15     Create a SQL table to hold  SequenceFile entries
  16     """
  17     sql = """
  18 CREATE TABLE %(table)s (
  19   filetype   CHAR(8),
  20   path       TEXT,
  21   flowcell   CHAR(8),
  22   lane       INTEGER,
  23   read       INTEGER,
  24   pf         BOOLEAN,
  25   cycle      CHAR(8)
  26 );
  27 """ %( {'table': SEQUENCE_TABLE_NAME} )
  28     return cursor.execute(sql)
  29
  30 class SequenceFile(object):
  31     """
  32     Simple container class that holds the path to a sequence archive
  33     and basic descriptive information.
  34     """
  35     def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
  36         self.filetype = filetype
  37         self.path = path
  38         self.flowcell = flowcell
  39         self.lane = lane
  40         self.read = read
  41         self.pf = pf
  42         self.cycle = cycle
  43
  44     def __hash__(self):
  45         return hash(self.key())
  46
  47     def key(self):
  48         return (self.flowcell, self.lane)
  49
  50     def unicode(self):
  51         return unicode(self.path)
  52
  53     def __eq__(self, other):
  54         """
  55         Equality is defined if everything but the path matches
  56         """
  57         attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
  58         for a in attributes:
  59             if getattr(self, a) != getattr(other, a):
  60                 return False
  61
  62         return True
  63
  64     def __repr__(self):
  65         return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
  66
  67     def make_target_name(self, root):
  68         """
  69         Create target name for where we need to link this sequence too
  70         """
  71         path, basename = os.path.split(self.path)
  72         # Because the names aren't unque we include the flowcel name
  73         # because there were different eland files for different length
  74         # analyses, we include the cycle length in the name.
  75         if self.filetype == 'eland':
  76             template = "%(flowcell)s_%(cycle)s_%(eland)s"
  77             basename = template % { 'flowcell': self.flowcell,
  78                                     'cycle': self.cycle,
  79                                     'eland': basename }
  80         # else:
  81         # all the other file types have names that include flowcell/lane
  82         # information and thus are unique so we don't have to do anything
  83         return os.path.join(root, basename)
  84
  85     def save(self, cursor):
  86         """
  87         Add this entry to a DB2.0 database.
  88         """
  89         #FIXME: NEEDS SQL ESCAPING
  90         header_macro = {'table': SEQUENCE_TABLE_NAME }
  91         sql_header = "insert into %(table)s (" % header_macro
  92         sql_columns = ['filetype','path','flowcell','lane']
  93         sql_middle = ") values ("
  94         sql_values = [self.filetype, self.path, self.flowcell, self.lane]
  95         sql_footer = ");"
  96         for name in ['read', 'pf', 'cycle']:
  97             value = getattr(self, name)
  98             if value is not None:
  99                 sql_columns.append(name)
 100                 sql_values.append(value)
 101
 102         sql = " ".join([sql_header,
 103                         ", ".join(sql_columns),
 104                         sql_middle,
 105                         # note the following makes a string like ?,?,?
 106                         ",".join(["?"] * len(sql_values)),
 107                         sql_footer])
 108
 109         return cursor.execute(sql, sql_values)
 110
 111 def get_flowcell_cycle(path):
 112     """
 113     Extract flowcell, cycle from pathname
 114     """
 115     rest, cycle = os.path.split(path)
 116     rest, flowcell = os.path.split(rest)
 117     cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
 118     if cycle_match is None:
 119         raise ValueError("Expected .../flowcell/cycle/ directory structure")
 120     start = cycle_match.group('start')
 121     if start is not None:
 122         start = int(start)
 123     stop = cycle_match.group('stop')
 124     if stop is not None:
 125         stop = int(stop)
 126
 127     return flowcell, start, stop
 128
 129 def parse_srf(path, filename):
 130     flowcell_dir, start, stop = get_flowcell_cycle(path)
 131     basename, ext = os.path.splitext(filename)
 132     records = basename.split('_')
 133     flowcell = records[4]
 134     lane = int(records[5][0])
 135     fullpath = os.path.join(path, filename)
 136
 137     if flowcell_dir != flowcell:
 138         logging.warn("flowcell %s found in wrong directory %s" % \
 139                          (flowcell, path))
 140
 141     return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
 142
 143 def parse_qseq(path, filename):
 144     flowcell_dir, start, stop = get_flowcell_cycle(path)
 145     basename, ext = os.path.splitext(filename)
 146     records = basename.split('_')
 147     fullpath = os.path.join(path, filename)
 148     flowcell = records[4]
 149     lane = int(records[5][1])
 150     read = int(records[6][1])
 151
 152     if flowcell_dir != flowcell:
 153         logging.warn("flowcell %s found in wrong directory %s" % \
 154                          (flowcell, path))
 155
 156     return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop)
 157
 158 def parse_fastq(path, filename):
 159     flowcell_dir, start, stop = get_flowcell_cycle(path)
 160     basename, ext = os.path.splitext(filename)
 161     records = basename.split('_')
 162     fullpath = os.path.join(path, filename)
 163     flowcell = records[4]
 164     lane = int(records[5][1])
 165     read = int(records[6][1])
 166     if records[-1].startswith('pass'):
 167         pf = True
 168     elif records[-1].startswith('nopass'):
 169         pf = False
 170     else:
 171         raise ValueError("Unrecognized fastq name")
 172
 173     if flowcell_dir != flowcell:
 174         logging.warn("flowcell %s found in wrong directory %s" % \
 175                          (flowcell, path))
 176
 177     return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
 178
 179 def parse_eland(path, filename, eland_match=None):
 180     if eland_match is None:
 181         eland_match = eland_re.match(filename)
 182     fullpath = os.path.join(path, filename)
 183     flowcell, start, stop = get_flowcell_cycle(path)
 184     if eland_match.group('lane'):
 185         lane = int(eland_match.group('lane'))
 186     else:
 187         lane = None
 188     if eland_match.group('read'):
 189         read = int(eland_match.group('read'))
 190     else:
 191         read = None
 192     return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop)
 193
 194 def scan_for_sequences(dirs):
 195     """
 196     Scan through a list of directories for sequence like files
 197     """
 198     sequences = []
 199     for d in dirs:
 200         logging.info("Scanning %s for sequences" % (d,))
 201         for path, dirname, filenames in os.walk(d):
 202             for f in filenames:
 203                 seq = None
 204                 # find sequence files
 205                 if raw_seq_re.match(f):
 206                     if f.endswith('.md5'):
 207                         continue
 208                     elif f.endswith('.srf') or f.endswith('.srf.bz2'):
 209                         seq = parse_srf(path, f)
 210                     elif qseq_re.match(f):
 211                         seq = parse_qseq(path, f)
 212                     elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
 213                         seq = parse_fastq(path, f)
 214                 eland_match = eland_re.match(f)
 215                 if eland_match:
 216                     if f.endswith('.md5'):
 217                         continue
 218                     seq = parse_eland(path, f, eland_match)
 219                 if seq:
 220                     sequences.append(seq)
 221                     logging.debug("Found sequence at %s" % (f,))
 222
 223     return sequences