cbdb433e4d1f60ceb022e3dfd469350cc4cb83a2
[htsworkflow.git] / htsworkflow / pipelines / sequences.py
1 """
2 Utilities to work with the various eras of sequence archive files
3 """
4 import logging
5 import os
6 import re
7
8 eland_re = re.compile('s_(?P<lane>\d)(_(?P<read>\d))?_eland_')
9 raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
10 qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d].tar.bz2')
11
12 class SequenceFile(object):
13     """
14     Simple container class that holds the path to a sequence archive
15     and basic descriptive information.     
16     """
17     def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
18         self.filetype = filetype
19         self.path = path
20         self.flowcell = flowcell
21         self.lane = lane
22         self.read = read
23         self.pf = pf
24         self.cycle = cycle
25
26     def __hash__(self):
27         return hash(self.key())
28
29     def key(self):
30         return (self.flowcell, self.lane)
31
32     def unicode(self):
33         return unicode(self.path)
34
35     def __eq__(self, other):
36         """
37         Equality is defined if everything but the path matches
38         """
39         attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
40         for a in attributes:
41             if getattr(self, a) != getattr(other, a):
42                 return False
43
44         return True
45
46     def __repr__(self):
47         return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
48
49     def make_target_name(self, root):
50         """
51         Create target name for where we need to link this sequence too
52         """
53         path, basename = os.path.split(self.path)
54         # Because the names aren't unque we include the flowcel name
55         # because there were different eland files for different length
56         # analyses, we include the cycle length in the name.
57         if self.filetype == 'eland':
58             template = "%(flowcell)s_%(cycle)s_%(eland)s"
59             basename = template % { 'flowcell': self.flowcell,
60                                     'cycle': self.cycle,
61                                     'eland': basename }
62         # else:
63         # all the other file types have names that include flowcell/lane
64         # information and thus are unique so we don't have to do anything
65         return os.path.join(root, basename)
66         
67 def parse_srf(path, filename):
68     basename, ext = os.path.splitext(filename)
69     records = basename.split('_')
70     flowcell = records[4]
71     lane = int(records[5][0])
72     fullpath = os.path.join(path, filename)
73     return SequenceFile('srf', fullpath, flowcell, lane)
74
75 def parse_qseq(path, filename):
76     basename, ext = os.path.splitext(filename)
77     records = basename.split('_')
78     fullpath = os.path.join(path, filename)
79     flowcell = records[4]
80     lane = int(records[5][1])
81     read = int(records[6][1])
82     return SequenceFile('qseq', fullpath, flowcell, lane, read)
83
84 def parse_fastq(path, filename):
85     basename, ext = os.path.splitext(filename)
86     records = basename.split('_')
87     fullpath = os.path.join(path, filename)
88     flowcell = records[4]
89     lane = int(records[5][1])
90     read = int(records[6][1])
91     if records[-1].startswith('pass'):
92         pf = True
93     elif records[-1].startswith('nopass'):
94         pf = False
95     else:
96         raise ValueError("Unrecognized fastq name")
97         
98     return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
99
100 def parse_eland(path, filename, eland_match=None):
101     if eland_match is None:
102         eland_match = eland_re.match(filename)
103     fullpath = os.path.join(path, filename)
104     rest, cycle = os.path.split(path)
105     rest, flowcell = os.path.split(rest)
106     if eland_match.group('lane'):
107         lane = int(eland_match.group('lane'))
108     else:
109         lane = None
110     if eland_match.group('read'):
111         read = int(eland_match.group('read'))
112     else:
113         read = None
114     return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)
115     
116 def scan_for_sequences(dirs):
117     """
118     Scan through a list of directories for sequence like files
119     """
120     # be forgiving if someone just gives us a string
121     if type(dirs) != type([]):
122         dirs = [dirs]
123
124     sequences = []
125     for d in dirs:
126         logging.info("Scanning %s for sequences" % (d,))
127         for path, dirname, filenames in os.walk(d):
128             for f in filenames:
129                 seq = None
130                 # find sequence files
131                 if raw_seq_re.match(f):
132                     if f.endswith('.md5'):
133                         continue
134                     elif f.endswith('.srf') or f.endswith('.srf.bz2'):
135                         seq = parse_srf(path, f)
136                     elif qseq_re.match(f):
137                         seq = parse_qseq(path, f)
138                     elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
139                         seq = parse_fastq(path, f)
140                 eland_match = eland_re.match(f)
141                 if eland_match:
142                     if f.endswith('.md5'):
143                         continue
144                     seq = parse_eland(path, f, eland_match)
145                 if seq:
146                     sequences.append(seq)
147                     logging.debug("Found sequence at %s" % (f,))
148                     
149     return sequences