Move the knowledge of the urls for the REST API to one new file
[htsworkflow.git] / scripts / make-library-tree
1 #!/usr/bin/env python
2
3 from ConfigParser import SafeConfigParser
4
5 import logging
6 import os
7 from optparse import OptionParser
8 import stat
9 import re
10 import shelve
11
12 from htsworkflow.util import api
13
# File name patterns used by scan_for_sequences; all are applied with
# .match(), so they anchor at the start of the file name only.
# Raw strings keep the regex escapes (\d, \.) out of Python's own string
# escape processing.

# ELAND output: "s_<lane>_eland_..." or "s_<lane>_<read>_eland_...".
# The optional "read" group keeps its leading underscore (e.g. "_1").
eland_re = re.compile(r's_(?P<lane>\d)(?P<read>_\d)?_eland_')
# Common prefix of the archived raw sequence files (srf, qseq, fastq).
raw_seq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
# qseq tarballs: the raw prefix followed by "_l<lane>_r<read>.tar.bz2".
# The dots are escaped so only a literal ".tar.bz2" is accepted.
qseq_re = re.compile(r'woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
17
class SequenceFile(object):
    """
    Description of one sequence-related file found in the archive.

    Attributes:
      filetype -- short tag for the kind of file (this script uses
                  'srf', 'qseq', 'fastq' and 'eland')
      path     -- location of the file on disk
      flowcell -- flowcell identifier the file belongs to
      lane     -- lane identifier within the flowcell
      read     -- read identifier, or None
      pf       -- pass-filter flag, or None
      cycle    -- cycle label, or None
    """
    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        # identity of the file
        self.filetype = filetype
        self.path = path
        # where it came from
        self.flowcell = flowcell
        self.lane = lane
        # optional details, left as None when not known
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        # Hash on the (flowcell, lane) key only; no __eq__ is defined, so
        # equality is still object identity.
        return hash(self.key())

    def key(self):
        """
        Return the (flowcell, lane) pair identifying where this file came from
        """
        return (self.flowcell, self.lane)

    def unicode(self):
        """
        Return the file path as a unicode string
        """
        return unicode(self.path)

    def __repr__(self):
        details = (self.filetype, self.flowcell, self.lane, self.path)
        return u"<%s %s %s %s>" % details

    def make_target_name(self, root):
        """
        Return the path under root that this file should be linked to
        """
        directory, filename = os.path.split(self.path)
        if self.filetype != 'eland':
            # All the other file types have names that already include
            # flowcell/lane information, so they are unique as they are.
            return os.path.join(root, filename)

        # Eland file names are not unique, so the flowcell name is added.
        # There were also different eland files for different length
        # analyses, so the cycle label is added as well.
        template = "%(flowcell)s_%(cycle)s_%(eland)s"
        unique_name = template % dict(flowcell=self.flowcell,
                                      cycle=self.cycle,
                                      eland=filename)
        return os.path.join(root, unique_name)
57         
def parse_srf(path, filename):
    """
    Build a SequenceFile of type 'srf' from an srf file name

    The name, minus its last extension, is split on underscores: field 4
    is taken as the flowcell and the first character of field 5 as the
    lane number.
    """
    stem = os.path.splitext(filename)[0]
    fields = stem.split('_')
    flowcell_id = fields[4]
    lane_number = int(fields[5][0])
    return SequenceFile('srf',
                        os.path.join(path, filename),
                        flowcell_id,
                        lane_number)
65
def parse_qseq(path, filename):
    """
    Build a SequenceFile of type 'qseq' from a qseq archive file name

    The name, minus its last extension, is split on underscores: field 4
    is the flowcell, and the second characters of fields 5 and 6 are the
    lane and read numbers (the first character of each is skipped).
    """
    stem = os.path.splitext(filename)[0]
    fields = stem.split('_')
    location = os.path.join(path, filename)
    flowcell_id = fields[4]
    # index 1 skips the one-character prefix of the "l<lane>"/"r<read>" fields
    lane_number = int(fields[5][1])
    read_number = int(fields[6][1])
    return SequenceFile('qseq', location, flowcell_id, lane_number, read_number)
74
def parse_fastq(path, filename):
    """
    Build a SequenceFile of type 'fastq' from a fastq file name

    The name, minus its extension(s), is split on underscores: field 4
    is the flowcell, and the second characters of fields 5 and 6 are the
    lane and read numbers. pf is True when the last field is 'pass'.

    A trailing '.bz2' is removed before the regular extension so that
    compressed files ('*_pass.fastq.bz2') get the same pf flag as their
    uncompressed counterparts; os.path.splitext alone would only strip
    '.bz2' and leave 'pass.fastq' as the last field.
    """
    compression_ext = '.bz2'
    stem = filename
    if stem.endswith(compression_ext):
        stem = stem[:-len(compression_ext)]
    stem, ext = os.path.splitext(stem)

    records = stem.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    # index 1 skips the one-character prefix of the lane and read fields
    lane = int(records[5][1])
    read = int(records[6][1])
    pf = (records[-1] == 'pass')
    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)
87
def parse_eland(path, filename, eland_match):
    """
    Build a SequenceFile of type 'eland' from an eland result file

    The cycle label is the name of the directory holding the file and
    the flowcell is the name of that directory's parent. Lane and read
    come from the 'lane' and 'read' groups of eland_match and are kept
    as the matched strings (read is None when the group did not match).
    """
    parent_dir, cycle_label = os.path.split(path)
    flowcell_id = os.path.split(parent_dir)[1]
    return SequenceFile('eland',
                        os.path.join(path, filename),
                        flowcell_id,
                        eland_match.group('lane'),
                        eland_match.group('read'),
                        cycle=cycle_label)
95     
def scan_for_sequences(dirs):
    """
    Walk each directory in dirs and return a list of SequenceFile objects

    Files whose names start with the raw sequence pattern are classified
    as srf, qseq or fastq; files whose names start with the eland
    pattern are classified as eland. '.md5' files matching either
    pattern are skipped, and anything else is ignored.
    """
    found = []
    for top in dirs:
        logging.info("Scanning %s for sequences" % (top,))
        for path, dirname, filenames in os.walk(top):
            for name in filenames:
                is_checksum = name.endswith('.md5')
                seq = None

                # raw sequence files: srf, qseq archives and fastq
                if raw_seq_re.match(name):
                    if is_checksum:
                        continue
                    if name.endswith(('.srf', '.srf.bz2')):
                        seq = parse_srf(path, name)
                    elif qseq_re.match(name):
                        seq = parse_qseq(path, name)
                    elif name.endswith(('fastq', '.fastq.bz2')):
                        seq = parse_fastq(path, name)

                # eland results; checked last, so an eland match wins
                eland_match = eland_re.match(name)
                if eland_match:
                    if is_checksum:
                        continue
                    seq = parse_eland(path, name, eland_match)

                if not seq:
                    continue
                found.append(seq)
                logging.debug("Found sequence at %s" % (name,))

    return found
123
124
def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Look up the flowcell of every sequence and group sequences by library

    Flowcell information is read from the shelf stored at fcdb_filename
    when it is already there; otherwise it is requested from the server
    at baseurl (using apiid/apikey) and, if something came back, saved
    in the shelf.

    Returns (fcdb, libdb): the still-open shelf of flowcell information
    and a dictionary mapping library id to the list of sequences from
    that library. Sequences whose flowcell information could not be
    retrieved are left out of libdb.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}

    for seq in sequences:
        flowcell = seq.flowcell

        # use the cached copy when we have one, otherwise ask the server
        if flowcell in fcdb:
            info = fcdb[flowcell]
        else:
            url = api.flowcell_url(baseurl, flowcell)
            info = api.retrieve_info(url, apidata)
            if info is not None:
                fcdb[flowcell] = info

        if info is None:
            continue

        # lanes are looked up by their unicode text in the lane_set mapping
        library_id = info['lane_set'][unicode(seq.lane)]['library_id']
        libdb.setdefault(library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb
153
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Hard link source to destination unless something is already there

    Nothing is done, and 0 is returned, when:
      - source does not exist (a warning is logged)
      - destination already is the same file as source
      - destination exists but is a different file (an error is logged;
        no exception is raised)

    Otherwise the link is made, the destination is set read-only for
    user, group and other, and 1 is returned. With dry_run the
    filesystem is left alone but 1 is still returned, as a link would
    have been made.
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        # either way there is nothing for us to update
        if os.path.samefile(source, destination):
            logging.debug('SAME: %s -> %s' % (source, destination))
        else:
            logging.error('%s and %s are different files, skipping' % \
                          (source, destination))
        return 0

    logging.debug('Linking: %s -> %s' % (source, destination))

    # from here on we count as an update, even if only pretending
    if not dry_run:
        read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
        os.link(source, destination)
        # NOTE(review): a hard link shares its permissions with the
        # source, so this presumably makes the source read-only too.
        os.chmod(destination, read_only)
    return 1
188     
def make_library_links(root, library_db, dry_run=False):
    """
    Populate a tree of per-library directories with links to sequences

    root is the top of the library tree
    library_db is a dictionary mapping library id to a list of SequenceFiles

    A directory named after each library id is created under root when
    missing (only logged with dry_run), and every sequence of the
    library is linked into it. Returns the number of links that were
    made, or would have been made with dry_run.
    """
    root = os.path.abspath(root)
    linked = 0
    for lib_id, sequences in library_db.items():
        library_dir = os.path.join(root, lib_id)
        if not os.path.exists(library_dir):
            logging.info("mkdir %s" % (library_dir,))
            if not dry_run:
                os.mkdir(library_dir)

        # each call reports 1 for a new link and 0 otherwise
        linked += sum(
            carefully_make_hardlink(seq.path,
                                    seq.make_target_name(library_dir),
                                    dry_run=dry_run)
            for seq in sequences)
    return linked
210
def configure_logging(opts):
    """
    Set up the logging level from the command line flags

    opts.debug selects DEBUG and wins over opts.verbose, which selects
    INFO; with neither set only warnings and above are reported.
    """
    if opts.debug:
        chosen = logging.DEBUG
    elif opts.verbose:
        chosen = logging.INFO
    else:
        chosen = logging.WARN
    logging.basicConfig(level=chosen)
219     
220
def configure_opts(opts):
    """
    Fill in options missing from the command line using the config file

    The config file is opts.config when given, otherwise
    ~/.htsworkflow.ini and /etc/htsworkflow.ini. Values are read from
    its [sequence_archive] section and only used for options that are
    still None:

      cache            -- falls back to ~/.flowcelldb.shelve when the
                          config file has no value either
      sequence_archive -- '~' is expanded for values from the config file
      host, apiid, apikey

    The derived attributes opts.library_tree, opts.flowcells and
    opts.srfs are set to the 'libraries', 'flowcells' and 'srfs'
    directories under the absolute sequence archive path. When no
    sequence archive is known they are set to None, and
    opts.sequence_archive is left as None, so the caller can report the
    missing option.

    Returns the updated opts.
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

    # figure out what config file to read
    if opts.config is not None:
        config_path = [opts.config]
    else:
        config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                       '/etc/htsworkflow.ini']
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            # read from the same section that has_option just checked
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)
        opts.sequence_archive = os.path.expanduser(opts.sequence_archive)

    if opts.sequence_archive is None:
        # nothing to derive paths from; leave it to the caller to complain
        opts.library_tree = None
        opts.flowcells = None
        opts.srfs = None
    else:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    # these options share their name between opts and the config file
    for option in (HOST_OPT, APIID_OPT, APIKEY_OPT):
        if getattr(opts, option) is None and \
           config_file.has_option(SECTION_NAME, option):
            setattr(opts, option, config_file.get(SECTION_NAME, option))

    return opts
268
def make_parser():
    """
    Build the command line option parser

    Every value option defaults to None and every flag to False.
    """
    parser = OptionParser()
    add = parser.add_option

    # where to find settings and cached flowcell information
    add('-c', '--config', default=None,
        help='path to a configuration file containing a '
             'sequence archive section')
    add('--cache', default=None,
        help="default flowcell cache")

    # how to reach the server holding flowcell information
    add('--host', default=None,
        help="specify http://host for quering flowcell information")
    add('--apiid', default=None,
        help="API ID to use when retriving information")
    add("--apikey", default=None,
        help="API Key for when retriving information")

    # where the sequences are
    add('-a', '--sequence-archive', default=None,
        help='path to where the sequence archive lives')

    # how much to report
    add('-v', '--verbose', action='store_true', default=False,
        help='be more verbose')
    add('-d', '--debug', action='store_true', default=False,
        help='report everything')

    add("--dry-run", dest="dry_run", action="store_true",
        default=False,
        help="Don't modify the filesystem")
    return parser
299
def main(cmdline=None):
    """
    Scan for sequence files and link them into the library tree

    cmdline is the list of command line arguments to parse; None is
    passed through to the option parser. Positional arguments replace
    the default flowcells and srfs directories as the places to scan.
    A summary of what was found and linked is logged as warnings.

    Returns 0.
    """
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')
    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    # directories named on the command line override the defaults
    seq_dirs = [opts.flowcells, opts.srfs]
    if args:
        seq_dirs = [os.path.abspath(name) for name in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs,
                                    opts.host, opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb,
                                 dry_run=opts.dry_run)

    summary = (
        ("%s flowcells in database", len(fcdb)),
        ("found %s sequence files", len(seqs)),
        ("%s libraries being checked", len(libdb)),
        ("%s sequence files were linked", updates),
    )
    for template, value in summary:
        logging.warn(template % (value,))

    return 0
328     
# Run the tool only when this file is executed as a script, not when it
# is imported; main() then parses the process command line.
if __name__ == "__main__":
    main()