Core functions for saving and finding fastq files generated by a HiSeq.

author Diane Trout <diane@caltech.edu>

Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)

committer Diane Trout <diane@caltech.edu>

Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)
author Diane Trout <diane@caltech.edu>
Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)
committer Diane Trout <diane@caltech.edu>
Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py

index 4a2b4cdc3704fccb1b7fbdd5f48211c86851b6c4..4f871a2a210ab804e043fd340baa460ae8a3dd4a 100644 (file)
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -555,7 +555,10 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
              lanes.append(lane)
  
          run_name = srf.pathname_to_run_name(r.pathname)
-        if raw_format == 'qseq':
+        seq_cmds = []
+        if raw_format == 'fastq':
+            srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, lanes, site, cycle_dir)
+        elif raw_format == 'qseq':
              seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
          elif raw_format == 'srf':
              seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index 993bcc94979cf5c3c6fe12eaa3c4b4b062a28581..f21e48c66aea85a3a6f71fdfab93e6dfcde40c7e 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -34,7 +34,9 @@ class SequenceFile(object):
      Simple container class that holds the path to a sequence archive
      and basic descriptive information.
      """
-    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
+    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None,
+                 project=None,
+                 index=None):
          self.filetype = filetype
          self.path = path
          self.flowcell = flowcell
@@ -42,6 +44,8 @@ class SequenceFile(object):
          self.read = read
          self.pf = pf
          self.cycle = cycle
+        self.project = project
+        self.index = index
  
      def __hash__(self):
          return hash(self.key())
@@ -56,7 +60,7 @@ class SequenceFile(object):
          """
          Equality is defined if everything but the path matches
          """
-        attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
+        attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index']
          for a in attributes:
              if getattr(self, a) != getattr(other, a):
                  return False
@@ -114,7 +118,15 @@ def get_flowcell_cycle(path):
      """
      Extract flowcell, cycle from pathname
      """
-    rest, cycle = os.path.split(path)
+    project = None
+    rest, tail = os.path.split(path)
+    if tail.startswith('Project_'):
+        # we're in a multiplexed sample
+        project = tail
+        rest, cycle = os.path.split(rest)
+    else:
+        cycle = tail
+
      rest, flowcell = os.path.split(rest)
      cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
      if cycle_match is None:
@@ -128,10 +140,10 @@ def get_flowcell_cycle(path):
      if stop is not None:
          stop = int(stop)
  
-    return flowcell, start, stop
+    return flowcell, start, stop, project
  
  def parse_srf(path, filename):
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      flowcell = records[4]
@@ -145,7 +157,7 @@ def parse_srf(path, filename):
      return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
  
  def parse_qseq(path, filename):
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
@@ -162,20 +174,35 @@ def parse_qseq(path, filename):
  def parse_fastq(path, filename):
      """Parse fastq names
      """
-    flowcell_dir, start, stop = get_flowcell_cycle(path)
+    flowcell_dir, start, stop, project = get_flowcell_cycle(path)
      basename, ext = os.path.splitext(filename)
      records = basename.split('_')
      fullpath = os.path.join(path, filename)
-    flowcell = records[4]
-    lane = int(records[5][1])
-    read = int(records[6][1])
-    pf = parse_fastq_pf_flag(records)
+    if project is not None:
+        # demultiplexed sample!
+        flowcell = flowcell_dir
+        lane = int(records[2][-1])
+        read = int(records[3][-1])
+        pf = True # as I understand it hiseq runs toss the ones that fail filter
+        index = records[1]
+        project_id = records[0]
+    else:
+        flowcell = records[4]
+        lane = int(records[5][1])
+        read = int(records[6][1])
+        pf = parse_fastq_pf_flag(records)
+        index = None
+        project_id = None
  
      if flowcell_dir != flowcell:
          LOGGER.warn("flowcell %s found in wrong directory %s" % \
                           (flowcell, path))
  
-    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
+    return SequenceFile('fastq', fullpath, flowcell, lane, read,
+                        pf=pf,
+                        cycle=stop,
+                        project=project_id,
+                        index=index)
  
  def parse_fastq_pf_flag(records):
      """Take a fastq filename split on _ and look for the pass-filter flag
@@ -200,7 +227,7 @@ def parse_eland(path, filename, eland_match=None):
      if eland_match is None:
          eland_match = eland_re.match(filename)
      fullpath = os.path.join(path, filename)
-    flowcell, start, stop = get_flowcell_cycle(path)
+    flowcell, start, stop, project = get_flowcell_cycle(path)
      if eland_match.group('lane'):
          lane = int(eland_match.group('lane'))
      else:
@@ -233,7 +260,9 @@ def scan_for_sequences(dirs):
                          seq = parse_srf(path, f)
                      elif qseq_re.match(f):
                          seq = parse_qseq(path, f)
-                    elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
+                    elif f.endswith('.fastq') or \
+                         f.endswith('.fastq.bz2') or \
+                         f.endswith('.fastq.gz'):
                          seq = parse_fastq(path, f)
                  eland_match = eland_re.match(f)
                  if eland_match:
diff --git a/htsworkflow/pipelines/srf.py b/htsworkflow/pipelines/srf.py

index c313efe76e303db6a3508342204f482e87736a72..4f20ce1873fa42527b52619670ffc7caad8dd2ee 100644 (file)
--- a/htsworkflow/pipelines/srf.py
+++ b/htsworkflow/pipelines/srf.py
@@ -1,6 +1,7 @@
  from glob import glob
  import logging
  import os
+import shutil
  
  from htsworkflow.util import queuecommands
  
@@ -150,6 +151,32 @@ def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdleve
  
    return cmd_list
  
+def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
+    """
+    make a subprocess-friendly list of command line arguments to save HiSeq fastq files
+
+    run_name - most of the file name (run folder name is a good choice)
+    basecall_dir - location of unaligned files.
+    site_name - name of your "sequencing site" or "Individual"
+    destdir - root of where to save fastq files
+    """
+    # clean up pathname
+    LOGGER.info("run_name %s" % (run_name,))
+
+    cmd_list = []
+    project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
+    for project_dir in project_dirs:
+        _, project_name = os.path.split(project_dir)
+        sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
+        project_dest = os.path.join(destdir, project_name)
+        if not os.path.exists(project_dest):
+            LOGGER.info("Making: %s" % (project_dest))
+            os.mkdir(project_dest)
+
+        for fastq_file in sample_files:
+            shutil.copy(fastq_file, project_dest)
+
+
  def run_commands(new_dir, cmd_list, num_jobs):
      LOGGER.info("chdir to %s" % (new_dir,))
      curdir = os.getcwd()
diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py

index 3587e6505325571e8327291aa9e80d2ac93ded17..cede8c2b82644adf88bcfa13bb075ac1bce1f58a 100644 (file)
--- a/htsworkflow/pipelines/test/test_sequences.py
+++ b/htsworkflow/pipelines/test/test_sequences.py
@@ -13,11 +13,27 @@ class SequenceFileTests(unittest.TestCase):
          Make sure code to parse directory heirarchy works
          """
          path = '/root/42BW9AAXX/C1-152'
-        flowcell, start, stop = sequences.get_flowcell_cycle(path)
+        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
  
          self.failUnlessEqual(flowcell, '42BW9AAXX')
          self.failUnlessEqual(start, 1)
          self.failUnlessEqual(stop, 152)
+        self.failUnlessEqual(project, None)
+
+        path = '/root/42BW9AAXX/other'
+        self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
+
+    def test_flowcell_project_cycle(self):
+        """
+        Make sure code to parse directory heirarchy works
+        """
+        path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
+        flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
+
+        self.failUnlessEqual(flowcell, '42BW9AAXX')
+        self.failUnlessEqual(start, 1)
+        self.failUnlessEqual(stop, 152)
+        self.failUnlessEqual(project, 'Project_12345_Index1')
  
          path = '/root/42BW9AAXX/other'
          self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
@@ -90,6 +106,36 @@ class SequenceFileTests(unittest.TestCase):
          self.failUnlessEqual(f.pf, False)
          self.failUnlessEqual(f.cycle, 38)
  
+    def test_project_fastq(self):
+        path = '/root/42BW9AAXX/C1-38/Project_12345'
+        name = '11111_NoIndex_L001_R1_001.fastq.gz'
+        pathname = os.path.join(path,name)
+        f = sequences.parse_fastq(path, name)
+
+        self.failUnlessEqual(f.filetype, 'fastq')
+        self.failUnlessEqual(f.path, pathname)
+        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
+        self.failUnlessEqual(f.lane, 1)
+        self.failUnlessEqual(f.read, 1)
+        self.failUnlessEqual(f.pf, True)
+        self.failUnlessEqual(f.project, '11111')
+        self.failUnlessEqual(f.index, 'NoIndex')
+        self.failUnlessEqual(f.cycle, 38)
+
+        name = '11112_AAATTT_L001_R2_003.fastq.gz'
+        pathname = os.path.join(path,name)
+        f = sequences.parse_fastq(path, name)
+
+        self.failUnlessEqual(f.filetype, 'fastq')
+        self.failUnlessEqual(f.path, pathname)
+        self.failUnlessEqual(f.flowcell, '42BW9AAXX')
+        self.failUnlessEqual(f.lane, 1)
+        self.failUnlessEqual(f.read, 2)
+        self.failUnlessEqual(f.pf, True)
+        self.failUnlessEqual(f.project, '11112')
+        self.failUnlessEqual(f.index, 'AAATTT')
+        self.failUnlessEqual(f.cycle, 38)
+
      def test_eland(self):
          path = '/root/42BW9AAXX/C1-38'
          name = 's_4_eland_extended.txt.bz2'
@@ -134,7 +180,7 @@ class SequenceFileTests(unittest.TestCase):
          db = sqlite3.connect(":memory:")
          c = db.cursor()
          sequences.create_sequence_table(c)
-        
+
          data = [('/root/42BW9AAXX/C1-152',
                  'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
                  ('/root/42BW9AAXX/C1-152',
@@ -151,7 +197,7 @@ class SequenceFileTests(unittest.TestCase):
          count = c.execute("select count(*) from sequences")
          row = count.fetchone()
          self.failUnlessEqual(row[0], 4)
- 
+
  
  def suite():
      return unittest.makeSuite(SequenceFileTests,'test')
author	Diane Trout <diane@caltech.edu>
	Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)
committer	Diane Trout <diane@caltech.edu>
	Fri, 2 Mar 2012 01:26:57 +0000 (17:26 -0800)
htsworkflow/pipelines/runfolder.py		patch \| blob \| history
htsworkflow/pipelines/sequences.py		patch \| blob \| history
htsworkflow/pipelines/srf.py		patch \| blob \| history
htsworkflow/pipelines/test/test_sequences.py		patch \| blob \| history