From: Diane Trout Date: Thu, 10 Jun 2010 00:55:10 +0000 (+0000) Subject: Extend htsworkflow.pipelines.sequences to also try to figure out the cycle count. X-Git-Tag: 0.4.3~6 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=1e43073b3a6031ff8677b9da4ddbdf75ac9dc380 Extend htsworkflow.pipelines.sequences to also try to figure out the cycle count. In addition there is experimental code to shove the found sequences into a sql database. I also needed to bug fix the sequence patterns to catch the fake flowcell ilmn200901 which wasn't matching my regexp for detecting flowcell ids. --- diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py index cbdb433..627dfe3 100644 --- a/htsworkflow/pipelines/sequences.py +++ b/htsworkflow/pipelines/sequences.py @@ -6,8 +6,26 @@ import os import re eland_re = re.compile('s_(?P\d)(_(?P\d))?_eland_') -raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+') -qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d].tar.bz2') +raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+') +qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d].tar.bz2') + +SEQUENCE_TABLE_NAME = "sequences" +def create_sequence_table(cursor): + """ + Create a SQL table to hold SequenceFile entries + """ + sql = """ +CREATE TABLE %(table)s ( + filetype CHAR(8), + path TEXT, + flowcell CHAR(8), + lane INTEGER, + read INTEGER, + pf BOOLEAN, + cycle CHAR(8) +); +""" %( {'table': SEQUENCE_TABLE_NAME} ) + return cursor.execute(sql) class SequenceFile(object): """ @@ -63,25 +81,82 @@ class SequenceFile(object): # all the other file types have names that include flowcell/lane # information and thus are unique so we don't have to do anything return os.path.join(root, basename) + + def save(self, cursor): + """ + Add this entry to a DB2.0 database. + """ + #FIXME: NEEDS SQL ESCAPING + header_macro = {'table': SEQUENCE_TABLE_NAME } + sql_header = "insert into %(table)s (" % header_macro + sql_columns = ['filetype','path','flowcell','lane'] + sql_middle = ") values (" + sql_values = [self.filetype, self.path, self.flowcell, self.lane] + sql_footer = ");" + for name in ['read', 'pf', 'cycle']: + value = getattr(self, name) + if value is not None: + sql_columns.append(name) + sql_values.append(value) + + sql = " ".join([sql_header, + ", ".join(sql_columns), + sql_middle, + # note the following makes a string like ?,?,? + ",".join(["?"] * len(sql_values)), + sql_footer]) + + return cursor.execute(sql, sql_values) + +def get_flowcell_cycle(path): + """ + Extract flowcell, cycle from pathname + """ + rest, cycle = os.path.split(path) + rest, flowcell = os.path.split(rest) + cycle_match = re.match("C(?P[0-9]+)-(?P[0-9]+)", cycle) + if cycle_match is None: + raise ValueError("Expected .../flowcell/cycle/ directory structure") + start = cycle_match.group('start') + if start is not None: + start = int(start) + stop = cycle_match.group('stop') + if stop is not None: + stop = int(stop) + + return flowcell, start, stop def parse_srf(path, filename): + flowcell_dir, start, stop = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') flowcell = records[4] lane = int(records[5][0]) fullpath = os.path.join(path, filename) - return SequenceFile('srf', fullpath, flowcell, lane) + + if flowcell_dir != flowcell: + logging.warn("flowcell %s found in wrong directory %s" % \ + (flowcell, path)) + + return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop) def parse_qseq(path, filename): + flowcell_dir, start, stop = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') fullpath = os.path.join(path, filename) flowcell = records[4] lane = int(records[5][1]) read = int(records[6][1]) - return SequenceFile('qseq', fullpath, flowcell, lane, read) + + if flowcell_dir != flowcell: + logging.warn("flowcell %s found in wrong directory %s" % \ + (flowcell, path)) + + return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop) def parse_fastq(path, filename): + flowcell_dir, start, stop = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') fullpath = os.path.join(path, filename) @@ -95,14 +170,17 @@ def parse_fastq(path, filename): else: raise ValueError("Unrecognized fastq name") - return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf) + if flowcell_dir != flowcell: + logging.warn("flowcell %s found in wrong directory %s" % \ + (flowcell, path)) + + return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop) def parse_eland(path, filename, eland_match=None): if eland_match is None: eland_match = eland_re.match(filename) fullpath = os.path.join(path, filename) - rest, cycle = os.path.split(path) - rest, flowcell = os.path.split(rest) + flowcell, start, stop = get_flowcell_cycle(path) if eland_match.group('lane'): lane = int(eland_match.group('lane')) else: @@ -111,16 +189,12 @@ def parse_eland(path, filename, eland_match=None): read = int(eland_match.group('read')) else: read = None - return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle) + return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop) def scan_for_sequences(dirs): """ Scan through a list of directories for sequence like files """ - # be forgiving if someone just gives us a string - if type(dirs) != type([]): - dirs = [dirs] - sequences = [] for d in dirs: logging.info("Scanning %s for sequences" % (d,)) diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py index 765df31..3587e65 100644 --- a/htsworkflow/pipelines/test/test_sequences.py +++ b/htsworkflow/pipelines/test/test_sequences.py @@ -8,6 +8,21 @@ class SequenceFileTests(unittest.TestCase): """ Make sure the sequence archive class works """ + def test_flowcell_cycle(self): + """ + Make sure code to parse directory heirarchy works + """ + path = '/root/42BW9AAXX/C1-152' + flowcell, start, stop = sequences.get_flowcell_cycle(path) + + self.failUnlessEqual(flowcell, '42BW9AAXX') + self.failUnlessEqual(start, 1) + self.failUnlessEqual(stop, 152) + + path = '/root/42BW9AAXX/other' + self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path) + + def test_srf(self): path = '/root/42BW9AAXX/C1-38' name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf' @@ -20,7 +35,7 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, None) self.failUnlessEqual(f.pf, None) - self.failUnlessEqual(f.cycle, None) + self.failUnlessEqual(f.cycle, 38) def test_qseq(self): path = '/root/42BW9AAXX/C1-36' @@ -34,7 +49,20 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, 1) self.failUnlessEqual(f.pf, None) - self.failUnlessEqual(f.cycle, None) + self.failUnlessEqual(f.cycle, 36) + + + path = '/root/ilmn200901/C1-202' + name = 'woldlab_090125_HWI-EAS_0000_ilmn200901_l1_r1.tar.bz2' + pathname = os.path.join(path, name) + f = sequences.parse_qseq(path, name) + + self.failUnlessEqual(f.filetype, 'qseq') + self.failUnlessEqual(f.path, pathname) + self.failUnlessEqual(f.lane, 1) + self.failUnlessEqual(f.read, 1) + self.failUnlessEqual(f.pf, None) + self.failUnlessEqual(f.cycle, 202) def test_fastq(self): path = '/root/42BW9AAXX/C1-38' @@ -48,7 +76,7 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, 1) self.failUnlessEqual(f.pf, True) - self.failUnlessEqual(f.cycle, None) + self.failUnlessEqual(f.cycle, 38) name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l4_r2_nopass.fastq.bz2' pathname = os.path.join(path,name) @@ -60,7 +88,7 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, 2) self.failUnlessEqual(f.pf, False) - self.failUnlessEqual(f.cycle, None) + self.failUnlessEqual(f.cycle, 38) def test_eland(self): path = '/root/42BW9AAXX/C1-38' @@ -74,7 +102,7 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, None) self.failUnlessEqual(f.pf, None) - self.failUnlessEqual(f.cycle, 'C1-38') + self.failUnlessEqual(f.cycle, 38) path = '/root/42BW9AAXX/C1-152' name = 's_4_1_eland_extended.txt.bz2' @@ -87,7 +115,7 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.lane, 4) self.failUnlessEqual(f.read, 1) self.failUnlessEqual(f.pf, None) - self.failUnlessEqual(f.cycle, 'C1-152') + self.failUnlessEqual(f.cycle, 152) def test_sequence_file_equality(self): path = '/root/42BW9AAXX/C1-38' @@ -98,6 +126,33 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f1_qseq, f2_qseq) + def test_sql(self): + """ + Make sure that the quick and dirty sql interface in sequences works + """ + import sqlite3 + db = sqlite3.connect(":memory:") + c = db.cursor() + sequences.create_sequence_table(c) + + data = [('/root/42BW9AAXX/C1-152', + 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'), + ('/root/42BW9AAXX/C1-152', + 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r2.tar.bz2'), + ('/root/42BW9AAXX/C1-152', + 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r1.tar.bz2'), + ('/root/42BW9AAXX/C1-152', + 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l2_r21.tar.bz2'),] + + for path, name in data: + seq = sequences.parse_qseq(path, name) + seq.save(c) + + count = c.execute("select count(*) from sequences") + row = count.fetchone() + self.failUnlessEqual(row[0], 4) + + def suite(): return unittest.makeSuite(SequenceFileTests,'test')