Update htsworkflow.util.fctracker module to work with the newer django db
[htsworkflow.git] / scripts / make-library-tree
1 #!/usr/bin/python
2 """
3 Make a tree of symlinks organized by library id.
4 """
5 from ConfigParser import SafeConfigParser
6 from glob import glob
7 import logging
8 from optparse import OptionParser
9 import logging
10 import os
11 import stat
12 import sys
13
14 from htsworkflow.util import fctracker
15
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Return the eland result files found for one lane of a flowcell.

    Globs for <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_* and returns
    the list of matching pathnames (an empty list when nothing matches).
    """
    return glob(os.path.join(flowcell_dir,
                             flowcell_id,
                             "*",
                             "s_%s_eland_*" % lane))
21
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Flatten an eland result pathname into a single file name.

    When lane_pathname begins with flowcell_dir, that prefix is removed
    and every path separator in the remainder is replaced with "_".
    Returns None when lane_pathname does not begin with flowcell_dir.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None

    relative = lane_pathname[len(flowcell_dir):]
    return "_".join(relative.split(os.path.sep))
32     
def parse_srf_directory(srf_dir):
    """
    Scan srf_dir for *.srf files and index them by flowcell id.

    Each file name (without extension) must split on "_" into exactly
    six fields: site, date, machine, run id, flowcell id and lane.
    Files that do not are reported with logging.error and skipped.

    Returns a dictionary mapping flowcell id to the first five fields
    joined back together with "_" (i.e. the file name minus the lane).
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        stem = os.path.splitext(os.path.basename(pathname))[0]
        fields = stem.split('_')
        if len(fields) != 6:
            logging.error("Unrecognized srf file: %s expected 6 fields got %d" % (pathname, len(fields)))
            continue

        # fields[4] is the flowcell id; the trailing lane field is dropped
        # so that one entry describes every lane of the flowcell.
        flowcells[fields[4]] = "_".join(fields[:5])
    return flowcells
59
60
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Hard link source to destination unless a conflicting file exists.

    Nothing is done (beyond a warning) when source is missing, and
    nothing is done when destination already refers to the same file
    as source. If destination exists but is a different file an
    IOError is raised. With dry_run set the link is only logged.
    A newly created link is made read-only for user, group and other.
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if not os.path.samefile(source, destination):
            raise IOError('%s and %s are different files' % \
                           (source, destination))
        logging.debug('SAME: %s -> %s' % (source, destination))
        return

    logging.info('Linking: %s -> %s' % (source, destination))

    if not dry_run:
        read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
        os.link(source, destination)
        os.chmod(destination, read_only)
89
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    Hard link every eland result for a lane into a library directory.

    A lane may have eland files at several alignment lengths; each one
    found by find_lanes is linked into library_path under a flattened
    name built by make_long_lane_name.

    library_path -- directory that receives the links
    flowcell_dir -- root directory holding the flowcell directories
    flowcell_id  -- name of the flowcell directory to search
    lane         -- lane number used to build the eland file pattern
    dry_run      -- when true, only log what would be linked

    Raises IOError (from carefully_make_hardlink) when a destination
    already exists and is a different file.
    """
    for lane_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        long_name = make_long_lane_name(flowcell_dir, lane_pathname)
        if long_name is None:
            # make_long_lane_name returns None when the pathname is not
            # under flowcell_dir; os.path.join cannot take None, so skip
            # the entry instead of crashing the whole run.
            logging.warning("%s is not under %s, skipping",
                            lane_pathname, flowcell_dir)
            continue

        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname, long_pathname, dry_run)
103
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Hard link the srf file for one lane into a library directory.

    srf_names maps a flowcell id to the srf file name without its lane
    suffix; the srf files themselves must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    When srf_names has no entry for flowcell_id this is only logged.
    """
    srf_basename = srf_names.get(flowcell_id)
    if srf_basename is None:
        logging.info("srf file for %s was not found", flowcell_id)
        return

    srf_filename = "%s_%s.srf" % (srf_basename, lane)
    carefully_make_hardlink(os.path.join(srf_dir, srf_filename),
                            os.path.join(library_path, srf_filename),
                            dry_run)
119     
120
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Build one directory per library and link its lane files into it.

    For every entry in fcdb.library a directory named after the library
    id is created under library_dir, then the eland result files and
    the srf file of each (flowcell_id, lane) pair listed under the
    library's 'lanes' key are hard linked into that directory.

    fcdb         -- object whose .library maps library id to a dict-like
                    record with an optional 'lanes' list
    library_dir  -- directory that receives the per-library directories
    flowcell_dir -- root directory holding the flowcell directories
    srfs_dir     -- directory holding the *.srf files
    dry_run      -- when true nothing on the filesystem is modified;
                    directories are not created and links are only logged
    """
    # Normalize and force a trailing separator: make_long_lane_name
    # strips flowcell_dir as a plain string prefix.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        if not os.path.exists(library_path):
            if dry_run:
                # A dry run must leave the filesystem untouched.
                logging.info("Would create directory: %s", library_path)
            else:
                os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
150
def make_parser():
    """
    Build the command line option parser for this script.

    Returns an OptionParser that understands the configuration file,
    database, sequence archive, where clause, verbosity and dry-run
    options.
    """
    parser = OptionParser()

    # where to find settings and data
    parser.add_option(
        '-c', '--config',
        default=None,
        help='path to a configuration file containing a sequence archive section')
    parser.add_option(
        "--database",
        dest="database",
        default=None,
        help="path to the fctracker.db")
    parser.add_option(
        '-a', '--sequence-archive',
        default=None,
        help='path to where the sequence archive lives')
    parser.add_option(
        "-w", "--where",
        dest="where",
        default=None,
        help="add a where clause")

    # how much to report
    parser.add_option(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='be more verbose')
    parser.add_option(
        '-d', '--debug',
        action='store_true',
        default=False,
        help='report everything')

    parser.add_option(
        "--dry-run",
        dest="dry_run",
        action="store_true",
        default=False,
        help="Don't modify the filesystem")
    return parser
178
def main(argv=None):
    """
    Command line entry point: build the tree of per-library links.

    argv is the list of command line arguments without the program
    name; None is treated as an empty list.

    The database location and sequence archive root come from the
    command line or, failing that, from the configuration file
    (~/.htsworkflow.ini then /etc/htsworkflow.ini, or only the file
    named with --config). If either is still missing the problem is
    reported through parser.error.

    Returns 0 once the tree has been processed.
    """
    FRONTEND_NAME = 'frontend'
    SECTION_NAME = 'sequence_archive'
    DATABASE_OPT = 'database_name'
    ARCHIVE_OPT = 'archive_path'

    if argv is None:
        argv = []
    parser = make_parser()

    # parse command line arguments
    opt, args = parser.parse_args(argv)

    # setup logging; --debug wins over --verbose
    level = logging.WARN
    if opt.verbose:
        level = logging.INFO
    if opt.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    logging.debug("database given on the command line: %s", opt.database)
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # The return value is not needed here. NOTE(review): the call is kept
    # because it presumably loads lane information into fcdb -- confirm
    # against htsworkflow.util.fctracker.
    fcdb._get_flowcells(opt.where)

    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0
238
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status
    # instead of computing it and throwing it away.
    sys.exit(main(sys.argv[1:]))