1c3772efb9f6050086f79138dfae8e4c03ebb967
[htsworkflow.git] / scripts / make-library-tree
1 #!/usr/bin/python
2 """
Make a tree of hard links organized by library id.
4 """
5 from ConfigParser import SafeConfigParser
6 from glob import glob
7 import logging
8 from optparse import OptionParser
9 import logging
10 import os
11 import stat
12 import sys
13
14 from htsworkflow.util import fctracker
15
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Return the eland result files for one lane of a flowcell.

    Globs for <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_*, so only
    files exactly one directory below the flowcell directory are found.

    Returns a (possibly empty) list of matching pathnames.
    """
    return glob(os.path.join(flowcell_dir,
                             flowcell_id,
                             "*",
                             "s_%s_eland_*" % lane))
21
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Build a flat file name from an eland result pathname.

    The leading flowcell_dir is removed from lane_pathname and every
    path separator in the remainder is replaced by an underscore.

    Returns None when lane_pathname does not start with flowcell_dir.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None

    relative_name = lane_pathname[len(flowcell_dir):]
    return "_".join(relative_name.split(os.path.sep))
32     
def parse_srf_directory(srf_dir):
    """
    Search srf_dir for *.srf files.

    The srf files must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    Returns a dictionary indexed by flowcell id. Each value is the file
    name with the trailing _<lane>.srf removed, i.e.
    <site>_<date>_<machine>_<run>_<flowcellid>.
    If several files share a flowcell id, the last one globbed wins.

    Raises ValueError if an srf file name does not consist of exactly
    six underscore separated fields.
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        record = basename.split('_')
        # Explicit check rather than assert: asserts are removed by
        # python -O, which would let misnamed files be mis-parsed.
        if len(record) != 6:
            raise ValueError(
                "unexpected srf file name %r: expected "
                "<site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf"
                % (pathname,))

        flowcellid = record[4]
        # everything except the trailing lane field
        flowcells[flowcellid] = "_".join(record[:5])
    return flowcells
57
58
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Hard link source to destination without clobbering anything.

    - A missing source is logged as a warning and skipped.
    - A destination that already is the same file as source is left
      alone.
    - A destination that exists but is a different file raises IOError.

    When dry_run is true the link is only logged, never created.
    A newly created link is made read-only for user, group and other.
    """
    logging.debug("%s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if not os.path.samefile(source, destination):
            raise IOError('%s and %s are different files' %
                          (source, destination))
        # the link we want is already in place
        return

    logging.info('Linking: %s -> %s', source, destination)
    if dry_run:
        return

    os.link(source, destination)
    # NOTE: destination is a hard link to source, so this mode change
    # applies to the file reachable through source as well.
    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    os.chmod(destination, read_only)
86
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    Hard link every eland result file for a lane into a library directory.

    A lane may have eland files at different alignment lengths; each
    file returned by find_lanes is linked into library_path under the
    flattened name produced by make_long_lane_name.
    """
    for eland_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        link_name = make_long_lane_name(flowcell_dir, eland_pathname)
        link_pathname = os.path.join(library_path, link_name)
        carefully_make_hardlink(eland_pathname, link_pathname, dry_run)
100
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Hard link the srf file for one flowcell lane into a library directory.

    srf_names maps a flowcell id to the srf file name prefix
    <site>_<date>_<machine>_<run>_<flowcellid>; the file linked is
    <prefix>_<lane>.srf from srf_dir.

    A flowcell id missing from srf_names is logged at info level and
    nothing is linked.
    """
    flowcell_prefix = srf_names.get(flowcell_id)
    if flowcell_prefix is None:
        logging.info("srf file for %s was not found", flowcell_id)
        return

    srf_filename = "%s_%s.srf" % (flowcell_prefix, lane)
    carefully_make_hardlink(os.path.join(srf_dir, srf_filename),
                            os.path.join(library_path, srf_filename),
                            dry_run)
116     
117
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Build a directory of hard links for every library known to fcdb.

    For each (library id, record) pair in fcdb.library a directory
    <library_dir>/<library id> is created if needed. For every
    (flowcell id, lane) pair in the record's 'lanes' entry, the
    matching eland result files and srf file are then linked into it.

    fcdb         -- object whose .library attribute maps library ids
                    to dict-like records
    library_dir  -- directory that receives one subdirectory per library
    flowcell_dir -- directory searched for eland result files
    srfs_dir     -- directory searched for srf files
    dry_run      -- when true, report what would be done without
                    creating directories or links
    """
    # A trailing separator is appended so that stripping flowcell_dir
    # from a lane pathname leaves no leading separator behind.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        if not os.path.exists(library_path):
            if dry_run:
                # a dry run must not touch the filesystem
                logging.info('Would create directory: %s', library_path)
            else:
                os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
147
def make_parser():
    """
    Build the command line option parser for this script.

    Options are registered in the order listed below, which is also
    the order they appear in the --help output.
    """
    option_specs = [
        (('-c', '--config'),
         dict(default=None,
              help='path to a configuration file containing a '
                   'sequence archive section')),
        (('--database',),
         dict(dest='database', default=None,
              help='path to the fctracker.db')),
        (('-a', '--sequence-archive'),
         dict(default=None,
              help='path to where the sequence archive lives')),
        (('-w', '--where'),
         dict(dest='where', default=None,
              help='add a where clause')),
        (('-v', '--verbose'),
         dict(action='store_true', default=False,
              help='be more verbose')),
        (('-d', '--debug'),
         dict(action='store_true', default=False,
              help='report everything')),
        (('--dry-run',),
         dict(dest='dry_run', action='store_true', default=False,
              help="Don't modify the filesystem")),
    ]

    parser = OptionParser()
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)
    return parser
175
def main(argv=None):
    """
    Command line entry point: build the library tree.

    argv is the list of command line arguments without the program
    name. None is treated as an empty list; it does not fall back to
    sys.argv.

    The database path and the sequence archive root come from the
    command line, or failing that from the configuration file
    (--config, otherwise ~/.htsworkflow.ini and /etc/htsworkflow.ini).
    If either is still missing, parser.error() reports the problem and
    exits.

    Returns 0 when the tree has been processed.
    """
    FRONTEND_NAME = 'frontend'
    SECTION_NAME = 'sequence_archive'
    DATABASE_OPT = 'database_name'
    ARCHIVE_OPT = 'archive_path'

    if argv is None:
        argv = []
    parser = make_parser()

    # parse command line arguments
    opt, args = parser.parse_args(argv)

    # setup logging
    level = logging.WARN
    if opt.verbose:
        level = logging.INFO
    if opt.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    logging.debug("database from command line: %s", opt.database)
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # NOTE(review): the return value is unused; the call is kept because
    # it presumably loads lane information into fcdb -- confirm against
    # htsworkflow.util.fctracker.
    fcdb._get_flowcells(opt.where)

    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0
235
if __name__ == "__main__":
    # Run as a script: pass everything after the program name to main().
    rv = main(sys.argv[1:])
    # NOTE: main()'s return value is not used as the process exit
    # status; the sys.exit call below is left disabled.
    # sys.exit(rv)