Support scanning for split fastq files generated by HiSeq demultiplexing

author Diane Trout <diane@caltech.edu>

Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)

committer Diane Trout <diane@caltech.edu>

Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)
author Diane Trout <diane@caltech.edu>
Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)
committer Diane Trout <diane@caltech.edu>
Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index f3cc9fe6df28a3513d57f0f24f1b67939f429879..69681fa9db8b16b3bdaed8878f1f8a8d8bfc376c 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -39,9 +39,26 @@ class SequenceFile(object):
      Simple container class that holds the path to a sequence archive
      and basic descriptive information.
      """
-    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None,
+    def __init__(self, filetype, path, flowcell, lane,
+                 read=None,
+                 pf=None,
+                 cycle=None,
                   project=None,
-                 index=None):
+                 index=None,
+                 split=None):
+        """Store various fields used in our sequence files
+
+        filetype is one of 'qseq', 'srf', 'fastq'
+        path = location of file
+        flowcell = files flowcell id
+        lane = which lane
+        read = which sequencer read, usually 1 or 2
+        pf = did it pass filter
+        cycle = cycle dir name e.g. C1-202
+        project = projed name from HiSeq, probably library ID
+        index = HiSeq barcode index sequence
+        split = file fragment from HiSeq (Since one file is split into many)
+        """
          self.filetype = filetype
          self.path = path
          self.flowcell = flowcell
@@ -51,12 +68,13 @@ class SequenceFile(object):
          self.cycle = cycle
          self.project = project
          self.index = index
+        self.split = split
  
      def __hash__(self):
          return hash(self.key())
  
      def key(self):
-        return (self.flowcell, self.lane)
+        return (self.flowcell, self.lane, self.read, self.project, self.split)
  
      def unicode(self):
          return unicode(self.path)
@@ -181,7 +199,7 @@ def parse_fastq(path, filename):
      """Parse fastq names
      """
      flowcell_dir, start, stop, project = get_flowcell_cycle(path)
-    basename, ext = os.path.splitext(filename)
+    basename = re.sub('\.fastq(\.gz|\.bz2)?$', '', filename)
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
      if project is not None:
@@ -192,6 +210,8 @@ def parse_fastq(path, filename):
          pf = True # as I understand it hiseq runs toss the ones that fail filter
          index = records[1]
          project_id = records[0]
+        split = records[4]
+        sequence_type = 'split_fastq'
      else:
          flowcell = records[4]
          lane = int(records[5][1])
@@ -199,16 +219,19 @@ def parse_fastq(path, filename):
          pf = parse_fastq_pf_flag(records)
          index = None
          project_id = None
+        split = None
+        sequence_type = 'fastq'
  
      if flowcell_dir != flowcell:
          LOGGER.warn("flowcell %s found in wrong directory %s" % \
                           (flowcell, path))
  
-    return SequenceFile('fastq', fullpath, flowcell, lane, read,
+    return SequenceFile(sequence_type, fullpath, flowcell, lane, read,
                          pf=pf,
                          cycle=stop,
                          project=project_id,
-                        index=index)
+                        index=index,
+                        split=split)
  
  def parse_fastq_pf_flag(records):
      """Take a fastq filename split on _ and look for the pass-filter flag
@@ -262,17 +285,16 @@ def scan_for_sequences(dirs):
              for f in filenames:
                  seq = None
                  # find sequence files
-                if raw_seq_re.match(f):
-                    if f.endswith('.md5'):
-                        continue
-                    elif f.endswith('.srf') or f.endswith('.srf.bz2'):
-                        seq = parse_srf(path, f)
-                    elif qseq_re.match(f):
-                        seq = parse_qseq(path, f)
-                    elif f.endswith('.fastq') or \
-                         f.endswith('.fastq.bz2') or \
-                         f.endswith('.fastq.gz'):
-                        seq = parse_fastq(path, f)
+                if f.endswith('.md5'):
+                    continue
+                elif f.endswith('.srf') or f.endswith('.srf.bz2'):
+                    seq = parse_srf(path, f)
+                elif qseq_re.match(f):
+                    seq = parse_qseq(path, f)
+                elif f.endswith('.fastq') or \
+                     f.endswith('.fastq.bz2') or \
+                     f.endswith('.fastq.gz'):
+                    seq = parse_fastq(path, f)
                  eland_match = eland_re.match(f)
                  if eland_match:
                      if f.endswith('.md5'):
diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py

index 157246a94e3eae277e78df2813f5670aabb89621..e00f5ec8efaf12ee2a94d70749407c44fe754742 100644 (file)
--- a/htsworkflow/pipelines/test/test_sequences.py
+++ b/htsworkflow/pipelines/test/test_sequences.py
@@ -128,7 +128,7 @@ class SequenceFileTests(unittest.TestCase):
          pathname = os.path.join(path,name)
          f = sequences.parse_fastq(path, name)
  
-        self.failUnlessEqual(f.filetype, 'fastq')
+        self.failUnlessEqual(f.filetype, 'split_fastq')
          self.failUnlessEqual(f.path, pathname)
          self.failUnlessEqual(f.flowcell, '42BW9AAXX')
          self.failUnlessEqual(f.lane, 1)
@@ -142,7 +142,7 @@ class SequenceFileTests(unittest.TestCase):
          pathname = os.path.join(path,name)
          f = sequences.parse_fastq(path, name)
  
-        self.failUnlessEqual(f.filetype, 'fastq')
+        self.failUnlessEqual(f.filetype, 'split_fastq')
          self.failUnlessEqual(f.path, pathname)
          self.failUnlessEqual(f.flowcell, '42BW9AAXX')
          self.failUnlessEqual(f.lane, 1)
@@ -152,6 +152,22 @@ class SequenceFileTests(unittest.TestCase):
          self.failUnlessEqual(f.index, 'AAATTT')
          self.failUnlessEqual(f.cycle, 38)
  
+    def test_project_fastq_hashing(self):
+        """Can we tell the difference between sequence files?
+        """
+        path = '/root/42BW9AAXX/C1-38/Project_12345'
+        names = [('11111_NoIndex_L001_R1_001.fastq.gz',
+                  '11111_NoIndex_L001_R2_001.fastq.gz'),
+                 ('11112_NoIndex_L001_R1_001.fastq.gz',
+                  '11112_NoIndex_L001_R1_002.fastq.gz')
+                 ]
+        for a_name, b_name in names:
+            a = sequences.parse_fastq(path, a_name)
+            b = sequences.parse_fastq(path, b_name)
+            self.failIfEqual(a, b)
+            self.failIfEqual(a.key(), b.key())
+            self.failIfEqual(hash(a), hash(b))
+
      def test_eland(self):
          path = '/root/42BW9AAXX/C1-38'
          name = 's_4_eland_extended.txt.bz2'
author	Diane Trout <diane@caltech.edu>
	Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)
committer	Diane Trout <diane@caltech.edu>
	Tue, 6 Mar 2012 23:59:50 +0000 (15:59 -0800)
htsworkflow/pipelines/sequences.py		patch \| blob \| history
htsworkflow/pipelines/test/test_sequences.py		patch \| blob \| history