2 Utilities to work with the various eras of sequence archive files
# Matches eland result names of the form "s_<lane>_eland_..." or
# "s_<lane>_<read>_eland_..."; the "read" group is None when the optional
# read digit is absent.
eland_re = re.compile(r's_(?P<lane>\d)(_(?P<read>\d))?_eland_')
# Common prefix shared by the archive names:
# woldlab_<6 digits>_<field without underscores>_<digits>_<alphanumerics>
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+')
# qseq archives extend the common prefix with "_l<digit>_r<digit>.tar.bz2".
# The dots are escaped so that they only match a literal '.'.
qseq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d]\.tar\.bz2')

# Name of the SQL table that holds SequenceFile entries.
SEQUENCE_TABLE_NAME = "sequences"
13 def create_sequence_table(cursor):
15 Create a SQL table to hold SequenceFile entries
18 CREATE TABLE %(table)s (
27 """ %( {'table': SEQUENCE_TABLE_NAME} )
28 return cursor.execute(sql)
class SequenceFile(object):
    """
    Simple container class that holds the path to a sequence archive
    and basic descriptive information.

    Instances store the constructor arguments as attributes of the same
    name.  Two instances compare equal when every attribute except
    ``path`` matches, and they hash on ``(flowcell, lane)``.
    """
    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        self.filetype = filetype
        self.path = path
        self.flowcell = flowcell
        self.lane = lane
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        # key() only uses attributes that also take part in __eq__, so
        # equal objects always hash alike.
        return hash(self.key())

    def key(self):
        """Return the (flowcell, lane) tuple used for hashing."""
        return (self.flowcell, self.lane)

    def __unicode__(self):
        # Formatting through a unicode literal avoids the ``unicode``
        # builtin, which does not exist on Python 3.
        return u"%s" % (self.path,)

    def __eq__(self, other):
        """
        Equality is defined if everything but the path matches

        Objects lacking one of the compared attributes (``None``, strings,
        ...) are treated as "not comparable" instead of raising
        AttributeError.
        """
        attributes = ['filetype', 'flowcell', 'lane', 'read', 'pf', 'cycle']
        try:
            for a in attributes:
                if getattr(self, a) != getattr(other, a):
                    return False
        except AttributeError:
            return NotImplemented
        return True

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__, so spell it out to keep
        # the two operators consistent.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __repr__(self):
        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)

    def make_target_name(self, root):
        """
        Create the target name for where we need to link this sequence to.

        Returns ``root`` joined with the file's base name; for eland files
        the base name is prefixed with the flowcell and cycle.
        """
        path, basename = os.path.split(self.path)
        # Because the names aren't unique we include the flowcell name;
        # because there were different eland files for different length
        # analyses, we include the cycle length in the name.
        if self.filetype == 'eland':
            template = "%(flowcell)s_%(cycle)s_%(eland)s"
            basename = template % {'flowcell': self.flowcell,
                                   'cycle': self.cycle,
                                   'eland': basename}
        # all the other file types have names that include flowcell/lane
        # information and thus are unique so we don't have to do anything
        return os.path.join(root, basename)

    def save(self, cursor):
        """
        Add this entry to the sequence table through ``cursor``.

        ``cursor.execute`` is called with a statement using ``?``
        placeholders and the list of values to bind; its result is
        returned.  Optional attributes (read, pf, cycle) that are None are
        left out of the insert.
        """
        # Values are passed as bound parameters, so they need no manual
        # escaping; column and table names come from fixed strings.
        header_macro = {'table': SEQUENCE_TABLE_NAME}
        sql_header = "insert into %(table)s (" % header_macro
        sql_columns = ['filetype', 'path', 'flowcell', 'lane']
        sql_middle = ") values ("
        sql_values = [self.filetype, self.path, self.flowcell, self.lane]
        sql_footer = ")"
        for name in ('read', 'pf', 'cycle'):
            value = getattr(self, name)
            if value is not None:
                sql_columns.append(name)
                sql_values.append(value)

        sql = " ".join([sql_header,
                        ", ".join(sql_columns),
                        sql_middle,
                        # note the following makes a string like ?,?,?
                        ",".join(["?"] * len(sql_values)),
                        sql_footer])

        return cursor.execute(sql, sql_values)
def get_flowcell_cycle(path):
    """
    Extract flowcell, cycle from pathname

    ``path`` is expected to end in ``.../<flowcell>/C<start>-<stop>``; a
    trailing path separator is tolerated.

    Returns a ``(flowcell, start, stop)`` tuple where ``flowcell`` is the
    name of the parent directory and ``start``/``stop`` are ints.

    Raises ValueError when the last path component does not begin with
    ``C<digits>-<digits>``.
    """
    # Strip trailing separators first: os.path.split would otherwise yield
    # an empty last component for ".../flowcell/cycle/".
    separators = os.sep + (os.altsep or '')
    rest, cycle = os.path.split(path.rstrip(separators))
    rest, flowcell = os.path.split(rest)
    cycle_match = re.match(r"C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
    if cycle_match is None:
        raise ValueError("Expected .../flowcell/cycle/ directory structure")
    # Both groups are mandatory in the pattern, so a match always has them.
    start = int(cycle_match.group('start'))
    stop = int(cycle_match.group('stop'))

    return flowcell, start, stop
def parse_srf(path, filename):
    """
    Build a SequenceFile of type 'srf' for ``filename`` found in ``path``.

    The flowcell is the fifth underscore-separated field of the file name
    and the lane is the first character of the sixth field.  The cycle is
    the stop value that get_flowcell_cycle extracts from ``path``.
    """
    flowcell_dir, _start, stop = get_flowcell_cycle(path)
    basename = os.path.splitext(filename)[0]
    records = basename.split('_')
    flowcell = records[4]
    lane = int(records[5][0])
    fullpath = os.path.join(path, filename)

    # A mismatch is only reported; the flowcell from the file name wins.
    if flowcell_dir != flowcell:
        logging.warning("flowcell %s found in wrong directory %s",
                        flowcell, path)

    return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
def parse_qseq(path, filename):
    """
    Build a SequenceFile of type 'qseq' for ``filename`` found in ``path``.

    The flowcell is the fifth underscore-separated field of the file name;
    lane and read are the second character of the sixth and seventh
    fields (the digit after the leading letter, as in ``l1`` / ``r1``).
    The cycle is the stop value that get_flowcell_cycle extracts from
    ``path``.
    """
    flowcell_dir, _start, stop = get_flowcell_cycle(path)
    basename = os.path.splitext(filename)[0]
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])

    # A mismatch is only reported; the flowcell from the file name wins.
    if flowcell_dir != flowcell:
        logging.warning("flowcell %s found in wrong directory %s",
                        flowcell, path)

    return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop)
def parse_fastq(path, filename):
    """
    Build a SequenceFile of type 'fastq' for ``filename`` found in ``path``.

    The flowcell is the fifth underscore-separated field of the file name;
    lane and read are the second character of the sixth and seventh
    fields.  The last field decides ``pf``: True when it starts with
    'pass', False when it starts with 'nopass'.  The cycle is the stop
    value that get_flowcell_cycle extracts from ``path``.

    Raises ValueError when the last field starts with neither marker.
    """
    flowcell_dir, _start, stop = get_flowcell_cycle(path)
    basename = os.path.splitext(filename)[0]
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    if records[-1].startswith('pass'):
        pf = True
    elif records[-1].startswith('nopass'):
        pf = False
    else:
        raise ValueError("Unrecognized fastq name")

    # A mismatch is only reported; the flowcell from the file name wins.
    if flowcell_dir != flowcell:
        logging.warning("flowcell %s found in wrong directory %s",
                        flowcell, path)

    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
def parse_eland(path, filename, eland_match=None):
    """
    Build a SequenceFile of type 'eland' for ``filename`` found in ``path``.

    ``eland_match`` may carry an existing ``eland_re`` match for
    ``filename``; when omitted the file name is matched here.  Lane and
    read come from the match groups and are None when a group is empty.
    Flowcell and cycle (the stop value) come from get_flowcell_cycle.

    Raises ValueError when ``filename`` does not match ``eland_re``.
    """
    if eland_match is None:
        eland_match = eland_re.match(filename)
    if eland_match is None:
        # Fail with a clear error instead of AttributeError on None.group.
        raise ValueError("Unrecognized eland name")
    fullpath = os.path.join(path, filename)
    flowcell, _start, stop = get_flowcell_cycle(path)

    lane_text = eland_match.group('lane')
    lane = int(lane_text) if lane_text else None
    read_text = eland_match.group('read')
    read = int(read_text) if read_text else None

    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop)
194 def scan_for_sequences(dirs):
196 Scan through a list of directories for sequence like files
200 logging.info("Scanning %s for sequences" % (d,))
201 for path, dirname, filenames in os.walk(d):
204 # find sequence files
205 if raw_seq_re.match(f):
206 if f.endswith('.md5'):
208 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
209 seq = parse_srf(path, f)
210 elif qseq_re.match(f):
211 seq = parse_qseq(path, f)
212 elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
213 seq = parse_fastq(path, f)
214 eland_match = eland_re.match(f)
216 if f.endswith('.md5'):
218 seq = parse_eland(path, f, eland_match)
220 sequences.append(seq)
221 logging.debug("Found sequence at %s" % (f,))