2 Utilities to work with the various eras of sequence archive files
# Filename patterns for the archive naming schemes handled by this module.
# Raw strings keep the regex escapes (\d, \.) away from Python's own
# string-escape processing.

# Eland result files: s_<lane>[_<read>]_eland_...
eland_re = re.compile(r's_(?P<lane>\d)(_(?P<read>\d))?_eland_')
# Any woldlab archive: woldlab_<6 digits>_<field>_<digits>_<digits/uppercase>
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
# qseq tarballs add _l<digit>_r<digit> plus a literal ".tar.bz2" suffix.
qseq_re = re.compile(
    r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
class SequenceFile(object):
    """
    Simple container class that holds the path to a sequence archive
    and basic descriptive information.

    Two instances compare equal when everything except ``path`` matches,
    and they hash on ``(flowcell, lane)``.
    """

    # Attributes that take part in equality; path is deliberately left out.
    _compared_attributes = ('filetype', 'flowcell', 'lane',
                            'read', 'pf', 'cycle')

    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        """
        Record the archive location and its descriptive fields.

        The parsers in this module pass 'srf', 'qseq', 'fastq' or 'eland'
        as filetype; read, pf and cycle stay None when not supplied.
        """
        self.filetype = filetype
        self.path = path
        self.flowcell = flowcell
        self.lane = lane
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        return hash(self.key())

    def key(self):
        """Return the (flowcell, lane) tuple used for hashing."""
        return (self.flowcell, self.lane)

    def __unicode__(self):
        """Return the archive path as text."""
        # u"%s" formatting yields unicode on Python 2 and str on Python 3,
        # so this does not depend on the Python 2-only unicode() builtin.
        return u"%s" % (self.path,)

    # NOTE(review): the original name of this accessor could not be
    # confirmed; both spellings are kept so either kind of caller works.
    unicode = __unicode__

    def __eq__(self, other):
        """
        Equality is defined if everything but the path matches
        """
        # A private sentinel makes objects lacking one of the attributes
        # compare unequal instead of raising AttributeError.
        missing = object()
        for name in self._compared_attributes:
            if getattr(self, name) != getattr(other, name, missing):
                return False
        return True

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        return not self.__eq__(other)

    def __repr__(self):
        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)

    def make_target_name(self, root):
        """
        Create target name for where we need to link this sequence to

        Returns root joined with the file's base name; eland files get a
        <flowcell>_<cycle>_ prefix on the base name.
        """
        basename = os.path.basename(self.path)
        # Because the eland names aren't unique we include the flowcell
        # name, and because there were different eland files for different
        # length analyses, we include the cycle length in the name.
        if self.filetype == 'eland':
            template = "%(flowcell)s_%(cycle)s_%(eland)s"
            basename = template % {
                'flowcell': self.flowcell,
                'cycle': self.cycle,
                'eland': basename,
            }
        # all the other file types have names that include flowcell/lane
        # information and thus are unique so we don't have to do anything
        return os.path.join(root, basename)
def parse_srf(path, filename):
    """
    Build a SequenceFile of type 'srf' from an srf archive name.

    The name is split on underscores after removing its last extension.
    Field 4 is taken as the flowcell (the last field matched by
    raw_seq_re) and the first character of field 5 as the lane number.

    Raises IndexError if the name has fewer than six fields and
    ValueError if the lane character is not a digit.
    """
    stem = os.path.splitext(filename)[0]
    records = stem.split('_')
    flowcell = records[4]
    # Only the first character is used, so a leftover ".srf" from a
    # ".srf.bz2" name does not get in the way.
    lane = int(records[5][0])
    fullpath = os.path.join(path, filename)
    return SequenceFile('srf', fullpath, flowcell, lane)
def parse_qseq(path, filename):
    """
    Build a SequenceFile of type 'qseq' from a qseq tarball name.

    The name is split on underscores after removing its last extension.
    Field 4 is taken as the flowcell; fields 5 and 6 look like
    ``l<digit>`` and ``r<digit>`` (see qseq_re), so the second character
    of each gives the lane and read numbers.

    Raises IndexError if a field is missing or too short and ValueError
    if the lane or read character is not a digit.
    """
    stem = os.path.splitext(filename)[0]
    records = stem.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    return SequenceFile('qseq', fullpath, flowcell, lane, read)
def parse_fastq(path, filename):
    """
    Build a SequenceFile of type 'fastq' from a fastq archive name.

    The name is split on underscores after removing its last extension.
    Field 4 is taken as the flowcell, the second character of fields 5
    and 6 as the lane and read numbers, and the final field selects the
    filter flag: a 'pass' prefix gives pf=True, 'nopass' gives pf=False.

    Raises ValueError("Unrecognized fastq name") when the final field is
    neither, ValueError when lane or read is not a digit, and IndexError
    when a field is missing or too short.
    """
    stem = os.path.splitext(filename)[0]
    records = stem.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])

    pf_field = records[-1]
    if pf_field.startswith('pass'):
        pf = True
    elif pf_field.startswith('nopass'):
        pf = False
    else:
        raise ValueError("Unrecognized fastq name")

    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
def parse_eland(path, filename, eland_match=None):
    """
    Build a SequenceFile of type 'eland' from an eland result file.

    eland_match is an optional pre-computed eland_re match for filename;
    when omitted the filename is matched here. The last component of
    path is used as the cycle and the component above it as the flowcell.
    Lane and read come from the match groups and are None when a group
    is empty.

    Raises ValueError if no match is supplied and filename does not
    match eland_re.
    """
    if eland_match is None:
        eland_match = eland_re.match(filename)
    if eland_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("Unrecognized eland name")

    fullpath = os.path.join(path, filename)
    rest, cycle = os.path.split(path)
    flowcell = os.path.split(rest)[1]

    lane_text = eland_match.group('lane')
    lane = int(lane_text) if lane_text else None

    # The read group is optional in eland_re, so it is often None.
    read_text = eland_match.group('read')
    read = int(read_text) if read_text else None

    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
def scan_for_sequences(dirs):
    """
    Scan through a list of directories for sequence like files

    dirs may be a list, tuple or set of directory paths; any other
    value (for example a single path string) is treated as one
    directory. Each directory is walked recursively and every file
    recognised by raw_seq_re or eland_re is parsed.

    Returns a list of the SequenceFile objects found.
    """
    # be forgiving if someone just gives us a string
    if not isinstance(dirs, (list, tuple, set, frozenset)):
        dirs = [dirs]

    sequences = []
    for d in dirs:
        logging.info("Scanning %s for sequences", d)
        for path, dirnames, filenames in os.walk(d):
            for f in filenames:
                # checksum files sit next to the archives but are never
                # sequences themselves
                if f.endswith('.md5'):
                    continue

                seq = None
                # find sequence files
                if raw_seq_re.match(f):
                    if f.endswith(('.srf', '.srf.bz2')):
                        seq = parse_srf(path, f)
                    elif qseq_re.match(f):
                        seq = parse_qseq(path, f)
                    elif f.endswith(('fastq', '.fastq.bz2')):
                        seq = parse_fastq(path, f)

                # an eland match takes precedence over anything found above
                eland_match = eland_re.match(f)
                if eland_match:
                    seq = parse_eland(path, f, eland_match)

                if seq is not None:
                    sequences.append(seq)
                    logging.debug("Found sequence at %s", f)

    return sequences