"""Make a tree of hard links organized by library id."""
from ConfigParser import SafeConfigParser
from glob import glob
import logging
from optparse import OptionParser
import os
import stat
import sys

from htsworkflow.util import fctracker
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Find the eland result files for one lane of a flowcell.

    Looks in every directory directly below <flowcell_dir>/<flowcell_id>
    for entries whose name starts with "s_<lane>_eland_".

    Returns a sorted list of the matching pathnames; the list is empty
    when nothing matches.
    """
    # (lane,) rather than (lane) so the value is always formatted as a
    # single item, whatever its type.
    lane_name = "s_%s_eland_*" % (lane,)
    pattern = os.path.join(flowcell_dir, flowcell_id, "*", lane_name)
    # link_all_eland_lanes iterates over the result, so return the glob
    # matches; sorting keeps the order stable from run to run.
    return sorted(glob(pattern))
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    make a name from the eland result file name

    The leading flowcell_dir is removed from lane_pathname and every
    remaining path separator is replaced with an underscore, so the
    directory components become part of a single flat file name.

    Returns the flattened name, or None when lane_pathname does not
    start with flowcell_dir.

    flowcell_dir should end with a path separator; otherwise the
    result starts with an underscore.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None
    subpath = lane_pathname[len(flowcell_dir):]
    return subpath.replace(os.path.sep, "_")
def parse_srf_directory(srf_dir):
    """
    search srf_dir for *.srf files

    builds a dictionary indexed by flowcell name.

    File names are expected to have six underscore separated fields:
      <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf
    Files with a different number of fields are reported with
    logging.error and skipped.

    Returns a dictionary mapping each flowcell id to the first five
    fields joined with underscores (the file name without the lane
    and extension).
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        filename = os.path.basename(pathname)
        basename, _ext = os.path.splitext(filename)
        record = basename.split('_')
        if len(record) != 6:
            logging.error("Unrecognized srf file: %s expected 6 fields got %d",
                          pathname, len(record))
            continue

        site, date, machine, runid, flowcellid, _laneid = record

        # every lane of a flowcell shares this prefix, so one entry per
        # flowcell is enough
        desc = "_".join([site, date, machine, runid, flowcellid])
        flowcells[flowcellid] = desc
    return flowcells
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists

    Checking to see if the link already exists and is
    the same as the link we want to make.
    If the link already exists and is different, throw an error.

    Nothing is done (apart from a logged warning) when source does not
    exist. When dry_run is true the link that would be made is logged
    but the filesystem is left untouched.

    A newly made link is set to read-only for user, group and other.

    Raises IOError if destination exists and is not the same file as
    source.
    """
    logging.debug("CHECKING: %s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # already linked; nothing left to do
            logging.debug('SAME: %s -> %s', source, destination)
            return
        raise IOError('%s and %s are different files' %
                      (source, destination))

    logging.info('Linking: %s -> %s', source, destination)
    if dry_run:
        return

    os.link(source, destination)
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    find eland files at different alignment lengths
    and link each of those into the library directory

    Every file reported by find_lanes for the flowcell and lane is hard
    linked into library_path under the flattened name produced by
    make_long_lane_name. dry_run is passed on to
    carefully_make_hardlink.
    """
    for lane_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        long_name = make_long_lane_name(flowcell_dir, lane_pathname)
        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname, long_pathname, dry_run)
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Link srf files into our library directories.

    the srf files must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    srf_names maps a flowcell id to the file name prefix in front of
    the lane number (see parse_srf_directory). When the flowcell has
    no entry this is logged and nothing is linked. dry_run is passed
    on to carefully_make_hardlink.
    """
    srf_basename = srf_names.get(flowcell_id, None)
    if srf_basename is None:
        logging.info("srf file for %s was not found", flowcell_id)
        # without a prefix there is no file name to build
        return

    srf_filename = "%s_%s.srf" % (srf_basename, lane)
    source = os.path.join(srf_dir, srf_filename)
    destination = os.path.join(library_path, srf_filename)
    carefully_make_hardlink(source, destination, dry_run)
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Iterate over the library

    For every library in fcdb.library a directory named after the
    library id is made below library_dir, and the eland and srf files
    of each (flowcell id, lane) pair listed under the library's
    'lanes' key are hard linked into it.

    When dry_run is true nothing is created; the links and directories
    that would be made are only logged.
    """
    # make_long_lane_name strips flowcell_dir as a prefix, so the
    # directories are normalised to end in exactly one separator
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        if not os.path.exists(library_path):
            if dry_run:
                # a dry run must not touch the filesystem
                logging.info("Would create %s", library_path)
            else:
                os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
def make_parser():
    """
    Make the command line option parser.

    Returns an OptionParser that understands the config file, database,
    sequence archive, where clause, verbosity and dry-run options.
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')

    # database and sequence-archive default to None so that main can
    # tell "not given" apart and fall back to the config file
    parser.add_option("--database", dest="database",
                      help="path to the fctracker.db",
                      default=None)
    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')
    parser.add_option("-w", "--where", dest="where",
                      help="add a where clause",
                      default=None)

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='be more verbose')
    parser.add_option('-d', '--debug', action='store_true', default=False,
                      help='report everything')

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser
# config file section and option holding the frontend database location
FRONTEND_NAME = 'frontend'
DATABASE_OPT = 'database_name'
# config file section and option holding the sequence archive root
SECTION_NAME = 'sequence_archive'
ARCHIVE_OPT = 'archive_path'


def main(argv=None):
    """
    Build the library tree described by the command line and config file.

    argv is the list of command line arguments without the program
    name; when it is None optparse falls back to sys.argv[1:].

    The database and sequence archive locations come from the command
    line, or failing that from the config file; the parser reports an
    error and exits when either is still missing.

    Returns 0 when the tree was built.
    """
    parser = make_parser()

    # parse command line arguments
    opt, args = parser.parse_args(argv)

    # setup logging
    level = logging.WARNING
    if opt.verbose:
        level = logging.INFO
    if opt.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level)

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # NOTE(review): the result is not used here; presumably the call
    # loads the flowcells selected by --where into fcdb -- confirm
    # against fctracker before removing it.
    fcdb._get_flowcells(opt.where)

    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0
if __name__ == "__main__":
    # pass main's result back to the shell as the exit status
    rv = main(sys.argv[1:])
    sys.exit(rv)