2 Utilities to work with the various eras of sequence archive files
10 LOGGER = logging.getLogger(__name__)
12 eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
13 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+')
14 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Za-z]+_l[\d]_r[\d].tar.bz2')
16 SEQUENCE_TABLE_NAME = "sequences"
17 def create_sequence_table(cursor):
19 Create a SQL table to hold SequenceFile entries
22 CREATE TABLE %(table)s (
31 """ %( {'table': SEQUENCE_TABLE_NAME} )
32 return cursor.execute(sql)
34 FlowcellPath = collections.namedtuple('FlowcellPath',
35 'flowcell start stop project')
37 class SequenceFile(object):
39 Simple container class that holds the path to a sequence archive
40 and basic descriptive information.
42 def __init__(self, filetype, path, flowcell, lane,
49 """Store various fields used in our sequence files
51 filetype is one of 'qseq', 'srf', 'fastq'
52 path = location of file
53 flowcell = files flowcell id
55 read = which sequencer read, usually 1 or 2
56 pf = did it pass filter
57 cycle = cycle dir name e.g. C1-202
58 project = projed name from HiSeq, probably library ID
59 index = HiSeq barcode index sequence
60 split = file fragment from HiSeq (Since one file is split into many)
62 self.filetype = filetype
64 self.flowcell = flowcell
69 self.project = project
74 return hash(self.key())
77 return (self.flowcell, self.lane, self.read, self.project, self.split)
80 return unicode(self.path)
82 def __eq__(self, other):
84 Equality is defined if everything but the path matches
86 attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index']
88 if getattr(self, a) != getattr(other, a):
94 return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
96 def make_target_name(self, root):
98 Create target name for where we need to link this sequence too
100 path, basename = os.path.split(self.path)
101 # Because the names aren't unque we include the flowcel name
102 # because there were different eland files for different length
103 # analyses, we include the cycle length in the name.
104 if self.filetype == 'eland':
105 template = "%(flowcell)s_%(cycle)s_%(eland)s"
106 basename = template % { 'flowcell': self.flowcell,
110 # all the other file types have names that include flowcell/lane
111 # information and thus are unique so we don't have to do anything
112 return os.path.join(root, basename)
114 def save(self, cursor):
116 Add this entry to a DB2.0 database.
118 #FIXME: NEEDS SQL ESCAPING
119 header_macro = {'table': SEQUENCE_TABLE_NAME }
120 sql_header = "insert into %(table)s (" % header_macro
121 sql_columns = ['filetype','path','flowcell','lane']
122 sql_middle = ") values ("
123 sql_values = [self.filetype, self.path, self.flowcell, self.lane]
125 for name in ['read', 'pf', 'cycle']:
126 value = getattr(self, name)
127 if value is not None:
128 sql_columns.append(name)
129 sql_values.append(value)
131 sql = " ".join([sql_header,
132 ", ".join(sql_columns),
134 # note the following makes a string like ?,?,?
135 ",".join(["?"] * len(sql_values)),
138 return cursor.execute(sql, sql_values)
140 def get_flowcell_cycle(path):
142 Extract flowcell, cycle from pathname
144 path = os.path.normpath(path)
146 rest, tail = os.path.split(path)
147 if tail.startswith('Project_'):
148 # we're in a multiplexed sample
150 rest, cycle = os.path.split(rest)
154 rest, flowcell = os.path.split(rest)
155 cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
156 if cycle_match is None:
158 "Expected .../flowcell/cycle/ directory structure in %s" % \
160 start = cycle_match.group('start')
161 if start is not None:
163 stop = cycle_match.group('stop')
167 return FlowcellPath(flowcell, start, stop, project)
169 def parse_srf(path, filename):
170 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
171 basename, ext = os.path.splitext(filename)
172 records = basename.split('_')
173 flowcell = records[4]
174 lane = int(records[5][0])
175 fullpath = os.path.join(path, filename)
177 if flowcell_dir != flowcell:
178 LOGGER.warn("flowcell %s found in wrong directory %s" % \
181 return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
183 def parse_qseq(path, filename):
184 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
185 basename, ext = os.path.splitext(filename)
186 records = basename.split('_')
187 fullpath = os.path.join(path, filename)
188 flowcell = records[4]
189 lane = int(records[5][1])
190 read = int(records[6][1])
192 if flowcell_dir != flowcell:
193 LOGGER.warn("flowcell %s found in wrong directory %s" % \
196 return SequenceFile('qseq', fullpath, flowcell, lane, read, cycle=stop)
198 def parse_fastq(path, filename):
201 flowcell_dir, start, stop, project = get_flowcell_cycle(path)
202 basename = re.sub('\.fastq(\.gz|\.bz2)?$', '', filename)
203 records = basename.split('_')
204 fullpath = os.path.join(path, filename)
205 if project is not None:
206 # demultiplexed sample!
207 flowcell = flowcell_dir
208 lane = int(records[2][-1])
209 read = int(records[3][-1])
210 pf = True # as I understand it hiseq runs toss the ones that fail filter
212 project_id = records[0]
214 sequence_type = 'split_fastq'
216 flowcell = records[4]
217 lane = int(records[5][1])
218 read = int(records[6][1])
219 pf = parse_fastq_pf_flag(records)
223 sequence_type = 'fastq'
225 if flowcell_dir != flowcell:
226 LOGGER.warn("flowcell %s found in wrong directory %s" % \
229 return SequenceFile(sequence_type, fullpath, flowcell, lane, read,
236 def parse_fastq_pf_flag(records):
237 """Take a fastq filename split on _ and look for the pass-filter flag
242 fastq_type = records[-1].lower()
243 if fastq_type.startswith('pass'):
245 elif fastq_type.startswith('nopass'):
247 elif fastq_type.startswith('all'):
250 raise ValueError("Unrecognized fastq name %s at %s" % \
251 (records[-1], os.path.join(path,filename)))
255 def parse_eland(path, filename, eland_match=None):
256 if eland_match is None:
257 eland_match = eland_re.match(filename)
258 fullpath = os.path.join(path, filename)
259 flowcell, start, stop, project = get_flowcell_cycle(path)
260 if eland_match.group('lane'):
261 lane = int(eland_match.group('lane'))
264 if eland_match.group('read'):
265 read = int(eland_match.group('read'))
268 return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=stop)
270 def scan_for_sequences(dirs):
272 Scan through a list of directories for sequence like files
275 if type(dirs) in types.StringTypes:
276 raise ValueError("You probably want a list or set, not a string")
279 LOGGER.info("Scanning %s for sequences" % (d,))
280 if not os.path.exists(d):
281 LOGGER.warn("Flowcell directory %s does not exist" % (d,))
284 for path, dirname, filenames in os.walk(d):
287 # find sequence files
288 if f.endswith('.md5'):
290 elif f.endswith('.srf') or f.endswith('.srf.bz2'):
291 seq = parse_srf(path, f)
292 elif qseq_re.match(f):
293 seq = parse_qseq(path, f)
294 elif f.endswith('.fastq') or \
295 f.endswith('.fastq.bz2') or \
296 f.endswith('.fastq.gz'):
297 seq = parse_fastq(path, f)
298 eland_match = eland_re.match(f)
300 if f.endswith('.md5'):
302 seq = parse_eland(path, f, eland_match)
304 sequences.append(seq)
305 LOGGER.debug("Found sequence at %s" % (f,))