2 Utilities to work with the various eras of sequence archive files
8 LOGGER = logging.getLogger(__name__)
10 eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
11 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+')
12 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d].tar.bz2')
14 SEQUENCE_TABLE_NAME = "sequences"
15 def create_sequence_table(cursor):
17 Create a SQL table to hold SequenceFile entries
20 CREATE TABLE %(table)s (
29 """ %( {'table': SEQUENCE_TABLE_NAME} )
30 return cursor.execute(sql)
32 class SequenceFile(object):
34 Simple container class that holds the path to a sequence archive
35 and basic descriptive information.
37 def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None,
40 self.filetype = filetype
42 self.flowcell = flowcell
47 self.project = project
51 return hash(self.key())
54 return (self.flowcell, self.lane)
57 return unicode(self.path)
59 def __eq__(self, other):
61 Equality is defined if everything but the path matches
63 attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index']
65 if getattr(self, a) != getattr(other, a):
71 return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
73 def make_target_name(self, root):
75 Create target name for where we need to link this sequence too
77 path, basename = os.path.split(self.path)
78 # Because the names aren't unque we include the flowcel name
79 # because there were different eland files for different length
80 # analyses, we include the cycle length in the name.
81 if self.filetype == 'eland':
82 template = "%(flowcell)s_%(cycle)s_%(eland)s"
83 basename = template % { 'flowcell': self.flowcell,
87 # all the other file types have names that include flowcell/lane
88 # information and thus are unique so we don't have to do anything
89 return os.path.join(root, basename)
91 def save(self, cursor):
93 Add this entry to a DB2.0 database.
95 #FIXME: NEEDS SQL ESCAPING
96 header_macro = {'table': SEQUENCE_TABLE_NAME }
97 sql_header = "insert into %(table)s (" % header_macro
98 sql_columns = ['filetype','path','flowcell','lane']
99 sql_middle = ") values ("
100 sql_values = [self.filetype, self.path, self.flowcell, self.lane]
102 for name in ['read', 'pf', 'cycle']:
103 value = getattr(self, name)
104 if value is not None:
105 sql_columns.append(name)
106 sql_values.append(value)
108 sql = " ".join([sql_header,
109 ", ".join(sql_columns),
111 # note the following makes a string like ?,?,?
112 ",".join(["?"] * len(sql_values)),
115 return cursor.execute(sql, sql_values)
117 def get_flowcell_cycle(path):
119 Extract flowcell, cycle from pathname
122 rest, tail = os.path.split(path)
123 if tail.startswith('Project_'):
124 # we're in a multiplexed sample
126 rest, cycle = os.path.split(rest)
130 rest, flowcell = os.path.split(rest)
131 cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
132 if cycle_match is None:
134 "Expected .../flowcell/cycle/ directory structure in %s" % \
136 start = cycle_match.group('start')
137 if start is not None:
139 stop = cycle_match.group('stop')
143 return flowcell, start, stop, project
145 def parse_srf(path, filename):
146 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
147 basename, ext = os.path.splitext(filename)
148 records = basename.split('_')
149 flowcell = records[4]
150 lane = int(records[5][0])
151 fullpath = os.path.join(path, filename)
153 if flowcell_dir != flowcell:
154 LOGGER.warn("flowcell %s found in wrong directory %s" % \
157 return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
159 def parse_qseq(path, filename):
160 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
161 basename, ext = os.path.splitext(filename)
162 records = basename.split('_')
163 fullpath = os.path.join(path, filename)
164 flowcell = records[4]
165 lane = int(records[5][1])
166 read = int(records[6][1])
168 if flowcell_dir != flowcell:
169 LOGGER.warn("flowcell %s found in wrong directory %s" % \
172 return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop)
174 def parse_fastq(path, filename):
177 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
178 basename, ext = os.path.splitext(filename)
179 records = basename.split('_')
180 fullpath = os.path.join(path, filename)
181 if project is not None:
182 # demultiplexed sample!
183 flowcell = flowcell_dir
184 lane = int(records[2][-1])
185 read = int(records[3][-1])
186 pf = True # as I understand it hiseq runs toss the ones that fail filter
188 project_id = records[0]
190 flowcell = records[4]
191 lane = int(records[5][1])
192 read = int(records[6][1])
193 pf = parse_fastq_pf_flag(records)
197 if flowcell_dir != flowcell:
198 LOGGER.warn("flowcell %s found in wrong directory %s" % \
201 return SequenceFile('fastq', fullpath, flowcell, lane, read,
207 def parse_fastq_pf_flag(records):
208 """Take a fastq filename split on _ and look for the pass-filter flag
213 fastq_type = records[-1].lower()
214 if fastq_type.startswith('pass'):
216 elif fastq_type.startswith('nopass'):
218 elif fastq_type.startswith('all'):
221 raise ValueError("Unrecognized fastq name %s at %s" % \
222 (records[-1], os.path.join(path,filename)))
226 def parse_eland(path, filename, eland_match=None):
227 if eland_match is None:
228 eland_match = eland_re.match(filename)
229 fullpath = os.path.join(path, filename)
230 flowcell, start, stop, project = get_flowcell_cycle(path)
231 if eland_match.group('lane'):
232 lane = int(eland_match.group('lane'))
235 if eland_match.group('read'):
236 read = int(eland_match.group('read'))
239 return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop)
241 def scan_for_sequences(dirs):
243 Scan through a list of directories for sequence like files
247 LOGGER.info("Scanning %s for sequences" % (d,))
248 if not os.path.exists(d):
249 LOGGER.warn("Flowcell directory %s does not exist" % (d,))
252 for path, dirname, filenames in os.walk(d):
255 # find sequence files
256 if raw_seq_re.match(f):
257 if f.endswith('.md5'):
259 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
260 seq = parse_srf(path, f)
261 elif qseq_re.match(f):
262 seq = parse_qseq(path, f)
263 elif f.endswith('.fastq') or \
264 f.endswith('.fastq.bz2') or \
265 f.endswith('.fastq.gz'):
266 seq = parse_fastq(path, f)
267 eland_match = eland_re.match(f)
269 if f.endswith('.md5'):
271 seq = parse_eland(path, f, eland_match)
273 sequences.append(seq)
274 LOGGER.debug("Found sequence at %s" % (f,))