Always return a count from carefully_make_hardlinks
[htsworkflow.git] / scripts / make-library-tree
#!/usr/bin/env python
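#
# Build a tree of hardlinks to sequence files organized by library id.
# Illustrative invocations (paths and config locations are placeholders):
#
#   make-library-tree -c ~/.htsworkflow.ini --dry-run
#   make-library-tree -v /path/to/flowcells /path/to/srfs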

from ConfigParser import SafeConfigParser
# try to deal with python <2.6
try:
  import json
except ImportError:
  import simplejson as json

import logging
import os
from optparse import OptionParser
import stat
import re
import shelve
import urllib
import urllib2
import urlparse

eland_re = re.compile('s_(?P<lane>\d)(?P<read>_\d)?_eland_')
raw_seq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+')
qseq_re = re.compile('woldlab_[0-9]{6}_[^_]+_[\d]+_[\dA-Z]+_l[\d]_r[\d]\.tar\.bz2')
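
# Rough shape of the filenames these patterns target (field names are a best
# guess from the parse_* helpers below; placeholders, not real filenames):
#   raw srf/fastq: woldlab_<yymmdd>_<machine>_<run>_<flowcell>...
#   qseq tarball:  woldlab_<yymmdd>_<machine>_<run>_<flowcell>_l<lane>_r<read>.tar.bz2
#   eland:         s_<lane>[_<read>]_eland_...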

class SequenceFile(object):
    def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
        self.filetype = filetype
        self.path = path
        self.flowcell = flowcell
        self.lane = lane
        self.read = read
        self.pf = pf
        self.cycle = cycle

    def __hash__(self):
        return hash(self.key())

    def key(self):
        return (self.flowcell, self.lane)

    def unicode(self):
        return unicode(self.path)

    def __repr__(self):
        return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)

    def make_target_name(self, root):
        """
        Create the target name for where we need to link this sequence to.
        """
        path, basename = os.path.split(self.path)
        # Because the eland names aren't unique we include the flowcell name,
        # and because there were different eland files for different length
        # analyses, we also include the cycle length in the name.
        if self.filetype == 'eland':
            template = "%(flowcell)s_%(cycle)s_%(eland)s"
            basename = template % { 'flowcell': self.flowcell,
                                    'cycle': self.cycle,
                                    'eland': basename }
        # all the other file types have names that include flowcell/lane
        # information and thus are unique, so we don't have to do anything
        return os.path.join(root, basename)

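# The parse_* helpers below split the underscore-delimited filename and pick
# the flowcell, lane, and read fields out by position; parse_eland instead
# recovers the flowcell and cycle length from the last two directory
# components of the file's path.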
def parse_srf(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    flowcell = records[4]
    lane = int(records[5][0])
    fullpath = os.path.join(path, filename)
    return SequenceFile('srf', fullpath, flowcell, lane)

def parse_qseq(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    return SequenceFile('qseq', fullpath, flowcell, lane, read)

def parse_fastq(path, filename):
    basename, ext = os.path.splitext(filename)
    records = basename.split('_')
    fullpath = os.path.join(path, filename)
    flowcell = records[4]
    lane = int(records[5][1])
    read = int(records[6][1])
    if records[-1] == 'pass':
        pf = True
    else:
        pf = False
    return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf)

def parse_eland(path, filename, eland_match):
    fullpath = os.path.join(path, filename)
    rest, cycle = os.path.split(path)
    rest, flowcell = os.path.split(rest)
    lane = eland_match.group('lane')
    read = eland_match.group('read')
    return SequenceFile('eland', fullpath, flowcell, lane, read, cycle=cycle)

def scan_for_sequences(dirs):
    sequences = []
    for d in dirs:
        logging.info("Scanning %s for sequences" % (d,))
        for path, dirnames, filenames in os.walk(d):
            for f in filenames:
                seq = None
                # find sequence files
                if raw_seq_re.match(f):
                    if f.endswith('.md5'):
                        continue
                    elif f.endswith('.srf') or f.endswith('.srf.bz2'):
                        seq = parse_srf(path, f)
                    elif qseq_re.match(f):
                        seq = parse_qseq(path, f)
                    elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
                        seq = parse_fastq(path, f)
                eland_match = eland_re.match(f)
                if eland_match:
                    if f.endswith('.md5'):
                        continue
                    seq = parse_eland(path, f, eland_match)
                if seq:
                    sequences.append(seq)
                    logging.debug("Found sequence at %s" % (f,))

    return sequences

def retrieve_info(url, apidata):
    """
    Return a dictionary from the HTSworkflow API
    """
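    # passing apipayload as the data argument below makes urllib2 issue a
    # POST rather than a GET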
    try:
        apipayload = urllib.urlencode(apidata)
        web = urllib2.urlopen(url, apipayload)
    except urllib2.HTTPError, e:
        if e.code == 404:
            logging.info("%s was not found" % (url,))
            return None
        else:
            errmsg = 'HTTPError: %d %s' % (e.code, e.msg)
            raise IOError(errmsg)

    contents = web.read()
    headers = web.info()

    return json.loads(contents)

def build_flowcell_db(fcdb_filename, sequences, baseurl, apiid, apikey):
    """
    Compare our flowcell database with our list of sequences, query the
    HTSworkflow server for any flowcells we haven't seen yet, and return
    the populated flowcell database along with a dictionary mapping
    library id to its SequenceFiles.
    """
    fcdb = shelve.open(fcdb_filename)
    libdb = {}
    apidata = {'apiid': apiid, 'apikey': apikey}
    for seq in sequences:
        flowcell = seq.flowcell
        flowcell_info = None

        # get info about flowcell from server or shelf
        if not fcdb.has_key(flowcell):
            url = urlparse.urljoin(baseurl, 'experiments/config/%s/json' % (flowcell,))
            flowcell_info = retrieve_info(url, apidata)
            if flowcell_info is not None:
                fcdb[flowcell] = flowcell_info
        else:
            flowcell_info = fcdb[flowcell]

        # make library id db
        if flowcell_info is not None:
            seq_library_id = flowcell_info['lane_set'][unicode(seq.lane)]['library_id']
            libdb.setdefault(seq_library_id, []).append(seq)

    fcdb.sync()
    return fcdb, libdb

def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link from source to destination.

    If the destination already exists and is the same file as the source,
    there is nothing to do. If it exists but is a different file, log an
    error and skip it.

    Return 1 if a link was made (or would have been made in a dry run),
    otherwise return 0.
    """
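    # note: os.link can only create hard links within a single filesystem;
    # a cross-device link attempt raises OSError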
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return 0

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            logging.debug('SAME: %s -> %s' % (source, destination))
            return 0
        else:
            logging.error('%s and %s are different files, skipping' % \
                          (source, destination))
            return 0
    logging.debug('Linking: %s -> %s' % (source, destination))

    # past this point we would actually modify the filesystem
    if dry_run: return 1

    os.link(source, destination)
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH )
    return 1

def make_library_links(root, library_db, dry_run=False):
    """
    Make a tree of hard links to sequence files organized by library id

    root is the root of the library tree
    library_db is a dictionary of SequenceFiles organized by library id
    """
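    # The resulting layout is roughly:
    #   <root>/<library_id>/<hardlinks named by SequenceFile.make_target_name>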
    count = 0
    root = os.path.abspath(root)
    for lib_id, sequences in library_db.items():
        target_dir = os.path.join(root, lib_id)
        if not os.path.exists(target_dir):
            logging.info("mkdir %s" % (target_dir,))
            if not dry_run:
                os.mkdir(target_dir)

        for s in sequences:
            count += carefully_make_hardlink(s.path,
                                             s.make_target_name(target_dir),
                                             dry_run=dry_run)
    return count

def configure_logging(opts):
    # setup logging
    level = logging.WARN
    if opts.verbose:
        level = logging.INFO
    if opts.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)


def configure_opts(opts):
    """
    Load in options from config file
    """
    SECTION_NAME = 'sequence_archive'
    ARCHIVE_OPT = 'sequence_archive'
    CACHE_OPT = 'cache'
    HOST_OPT = 'host'
    APIID_OPT = 'apiid'
    APIKEY_OPT = 'apikey'

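    # The config file is expected to provide a [sequence_archive] section;
    # an illustrative layout (all values below are placeholders):
    #
    #   [sequence_archive]
    #   sequence_archive = /path/to/sequence_archive
    #   cache = /path/to/flowcelldb.shelve
    #   host = http://localhost/
    #   apiid = 0
    #   apikey = changeme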
    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opts.config is not None:
        config_path = [opts.config]
    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opts.cache is None:
        if config_file.has_option(SECTION_NAME, CACHE_OPT):
            opts.cache = config_file.get(SECTION_NAME, CACHE_OPT)
        else:
            opts.cache = os.path.expanduser('~/.flowcelldb.shelve')

    if opts.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opts.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # only derive the archive sub-directories if we actually have an archive
    # root; main() reports the error if we don't
    if opts.sequence_archive is not None:
        opts.sequence_archive = os.path.abspath(opts.sequence_archive)
        opts.library_tree = os.path.join(opts.sequence_archive, 'libraries')
        opts.flowcells = os.path.join(opts.sequence_archive, 'flowcells')
        opts.srfs = os.path.join(opts.sequence_archive, 'srfs')

    if opts.host is None and config_file.has_option(SECTION_NAME, HOST_OPT):
        opts.host = config_file.get(SECTION_NAME, HOST_OPT)

    if opts.apiid is None and config_file.has_option(SECTION_NAME, APIID_OPT):
        opts.apiid = config_file.get(SECTION_NAME, APIID_OPT)

    if opts.apikey is None and config_file.has_option(SECTION_NAME, APIKEY_OPT):
        opts.apikey = config_file.get(SECTION_NAME, APIKEY_OPT)

    return opts

def make_parser():
    """
    Make parser
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')
    parser.add_option('--cache', default=None,
                      help="default flowcell cache")

    parser.add_option('--host', default=None,
                      help="specify http://host for querying flowcell information")
    parser.add_option('--apiid', default=None,
                      help="API ID to use when retrieving information")
    parser.add_option("--apikey", default=None,
                      help="API key to use when retrieving information")

    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser

def main(cmdline=None):
    parser = make_parser()
    opts, args = parser.parse_args(cmdline)

    configure_logging(opts)
    opts = configure_opts(opts)

    # complain if critical things are missing
    if opts.cache is None:
        parser.error('Need location of htsworkflow frontend database')

    if opts.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    seq_dirs = [ opts.flowcells, opts.srfs ]
    if len(args) > 0:
        seq_dirs = [os.path.abspath(f) for f in args]

    seqs = scan_for_sequences(seq_dirs)
    fcdb, libdb = build_flowcell_db(opts.cache, seqs, opts.host, opts.apiid, opts.apikey)
    updates = make_library_links(opts.library_tree, libdb, dry_run=opts.dry_run)

    logging.warn("%s flowcells in database" % (len(fcdb),))
    logging.warn("found %s sequence files" % (len(seqs),))
    logging.warn("%s libraries being checked" % (len(libdb),))
    logging.warn("%s sequence files were linked" % (updates,))

    return 0

if __name__ == "__main__":
    main()