Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow

[htsworkflow.git] / htsworkflow / pipelines / sequences.py
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index 993bcc94979cf5c3c6fe12eaa3c4b4b062a28581..772af7b432754245b7a5eddb9ab3b1de45842eb7 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -1,8 +1,10 @@
  """
  Utilities to work with the various eras of sequence archive files
  """
+import collections
  import logging
  import os
+import types
  import re
  
  LOGGER = logging.getLogger(__name__)
@@ -29,12 +31,34 @@ CREATE TABLE %(table)s (
  """ %( {'table': SEQUENCE_TABLE_NAME} )
      return cursor.execute(sql)
  
+FlowcellPath = collections.namedtuple('FlowcellPath',
+                                      'flowcell start stop project')
+
  class SequenceFile(object):
      """
      Simple container class that holds the path to a sequence archive
      and basic descriptive information.
      """
-    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
+    def __init__(self, filetype, path, flowcell, lane,
+                 read=None,
+                 pf=None,
+                 cycle=None,
+                 project=None,
+                 index=None,
+                 split=None):
+        """Store various fields used in our sequence files
+
+        filetype is one of 'qseq', 'srf', 'fastq'
+        path = location of file
+        flowcell = files flowcell id
+        lane = which lane
+        read = which sequencer read, usually 1 or 2
+        pf = did it pass filter
+        cycle = cycle dir name e.g. C1-202
+        project = projed name from HiSeq, probably library ID
+        index = HiSeq barcode index sequence
+        split = file fragment from HiSeq (Since one file is split into many)
+        """
          self.filetype = filetype
          self.path = path
          self.flowcell = flowcell
@@ -42,12 +66,15 @@ class SequenceFile(object):
          self.read = read
          self.pf = pf
          self.cycle = cycle
+        self.project = project
+        self.index = index
+        self.split = split
  
      def __hash__(self):
          return hash(self.key())
  
      def key(self):
-        return (self.flowcell, self.lane)
+        return (self.flowcell, self.lane, self.read, self.project, self.split)
  
      def unicode(self):
          return unicode(self.path)
@@ -56,7 +83,15 @@ class SequenceFile(object):
          """
          Equality is defined if everything but the path matches
          """
-        attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
+        attributes = ['filetype',
+                      'flowcell',
+                      'lane',
+                      'read',
+                      'pf',
+                      'cycle',
+                      'project',
+                      'index',
+                      'split']
          for a in attributes:
              if getattr(self, a) != getattr(other, a):
                  return False
@@ -114,7 +149,16 @@ def get_flowcell_cycle(path):
      """
      Extract flowcell, cycle from pathname
      """
-    rest, cycle = os.path.split(path)
+    path = os.path.normpath(path)
+    project = None
+    rest, tail = os.path.split(path)
+    if tail.startswith('Project_'):
+        # we're in a multiplexed sample
+        project = tail
+        rest, cycle = os.path.split(rest)
+    else:
+        cycle = tail
+
      rest, flowcell = os.path.split(rest)
      cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
      if cycle_match is None:
@@ -128,10 +172,10 @@ def get_flowcell_cycle(path):
      if stop is not None:
          stop = int(stop)
  
-    return flowcell, start, stop
+    return FlowcellPath(flowcell, start, stop, project)
  
  def parse_srf(path, filename):
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      flowcell = records[4]
@@ -145,7 +189,7 @@ def parse_srf(path, filename):
      return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
  
  def parse_qseq(path, filename):
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
@@ -162,20 +206,40 @@ def parse_qseq(path, filename):
  def parse_fastq(path, filename):
      """Parse fastq names
      """
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
-    basename, ext = os.path.splitext(filename)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
+    basename = re.sub('\.fastq(\.gz|\.bz2)?$', '', filename)
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
-    flowcell = records[4]
-    lane = int(records[5][1])
-    read = int(records[6][1])
-    pf = parse_fastq_pf_flag(records)
+    if project is not None:
+        # demultiplexed sample!
+        flowcell = flowcell_dir
+        lane = int(records[2][-1])
+        read = int(records[3][-1])
+        pf = True # as I understand it hiseq runs toss the ones that fail filter
+        index = records[1]
+        project_id = records[0]
+        split = records[4]
+        sequence_type = 'split_fastq'
+    else:
+        flowcell = records[4]
+        lane = int(records[5][1])
+        read = int(records[6][1])
+        pf = parse_fastq_pf_flag(records)
+        index = None
+        project_id = None
+        split = None
+        sequence_type = 'fastq'
  
      if flowcell_dir != flowcell:
          LOGGER.warn("flowcell %s found in wrong directory %s" % \
                           (flowcell, path))
  
-    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
+    return SequenceFile(sequence_type, fullpath, flowcell, lane, read,
+                        pf=pf,
+                        cycle=stop,
+                        project=project_id,
+                        index=index,
+                        split=split)
  
  def parse_fastq_pf_flag(records):
      """Take a fastq filename split on _ and look for the pass-filter flag
@@ -200,7 +264,7 @@ def parse_eland(path, filename, eland_match=None):
      if eland_match is None:
          eland_match = eland_re.match(filename)
      fullpath = os.path.join(path, filename)
-    flowcell, start, stop = get_flowcell_cycle(path)
+    flowcell, start, stop, project = get_flowcell_cycle(path)
      if eland_match.group('lane'):
          lane = int(eland_match.group('lane'))
      else:
@@ -216,6 +280,9 @@ def scan_for_sequences(dirs):
      Scan through a list of directories for sequence like files
      """
      sequences = []
+    if type(dirs) in types.StringTypes:
+        raise ValueError("You probably want a list or set, not a string")
+
      for d in dirs:
          LOGGER.info("Scanning %s for sequences" % (d,))
          if not os.path.exists(d):
@@ -226,15 +293,16 @@ def scan_for_sequences(dirs):
              for f in filenames:
                  seq = None
                  # find sequence files
-                if raw_seq_re.match(f):
-                    if f.endswith('.md5'):
-                        continue
-                    elif f.endswith('.srf') or f.endswith('.srf.bz2'):
-                        seq = parse_srf(path, f)
-                    elif qseq_re.match(f):
-                        seq = parse_qseq(path, f)
-                    elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
-                        seq = parse_fastq(path, f)
+                if f.endswith('.md5'):
+                    continue
+                elif f.endswith('.srf') or f.endswith('.srf.bz2'):
+                    seq = parse_srf(path, f)
+                elif qseq_re.match(f):
+                    seq = parse_qseq(path, f)
+                elif f.endswith('.fastq') or \
+                     f.endswith('.fastq.bz2') or \
+                     f.endswith('.fastq.gz'):
+                    seq = parse_fastq(path, f)
                  eland_match = eland_re.match(f)
                  if eland_match:
                      if f.endswith('.md5'):