Make a tree of hard links organized by library id.
4 from ConfigParser import SafeConfigParser
7 from optparse import OptionParser
12 from htsworkflow.util import fctracker
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Find the eland result files for one lane of a flowcell.

    Globs for <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_result*

    :param flowcell_dir: directory holding one sub-directory per flowcell
    :param flowcell_id: name of the flowcell sub-directory to search
    :param lane: lane identifier substituted into the file name pattern
    :return: sorted list of matching pathnames, empty if nothing matches
    """
    # (lane,) rather than (lane) so a tuple-valued lane cannot be
    # mistaken for the whole format argument list.
    lane_name = "s_%s_eland_result*" % (lane,)
    pattern = os.path.join(flowcell_dir, flowcell_id, "*", lane_name)
    # glob makes no promise about ordering; sort so repeated runs walk the
    # files in the same order.
    return sorted(glob(pattern))
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    make a name from the eland result file name

    The flowcell_dir prefix is removed from lane_pathname and every path
    separator in the remainder is replaced with an underscore.

    :param flowcell_dir: prefix to strip; callers should include the
        trailing separator, otherwise the result starts with "_"
    :param lane_pathname: full path of an eland result file
    :return: the flattened name, or None when lane_pathname does not
        start with flowcell_dir
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None
    subpath = lane_pathname[len(flowcell_dir):]
    return subpath.replace(os.path.sep, "_")
def parse_srf_directory(srf_dir):
    """
    search srf_dir for *.srf files

    builds a dictionary indexed by flowcell name.

    File names are expected to look like
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    :param srf_dir: directory to scan (not recursive)
    :return: dict mapping flowcell id to the file name with the lane
        and extension removed, i.e. the first five fields re-joined
        with "_"
    :raises ValueError: if a .srf file name does not have exactly six
        underscore separated fields
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        filename = os.path.basename(pathname)
        basename = os.path.splitext(filename)[0]
        record = basename.split('_')
        # An explicit check instead of assert: asserts vanish under
        # "python -O", which would let malformed names be mis-parsed.
        if len(record) != 6:
            raise ValueError(
                "%s is not named <site>_<date>_<machine>_<run>_"
                "<flowcellid>_<lane>.srf" % (pathname,))
        flowcellid = record[4]
        # Everything except the trailing lane field.
        desc = "_".join(record[:5])
        flowcells[flowcellid] = desc
    return flowcells
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Make a hard link, failing if a different link already exists

    Checking to see if the link already exists and is
    the same as the link we want to make.
    If the link already exists and is different, throw an error.

    :param source: existing file to link to; if it is missing a warning
        is logged and nothing else happens
    :param destination: pathname of the link to create
    :param dry_run: when true, do all the checks but skip creating the link
    :raises IOError: if destination exists and is not the same file as
        source
    """
    logging.debug("%s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if os.path.samefile(source, destination):
            # Already linked the way we want; nothing to do.
            return
        raise IOError('%s and %s are different files' %
                      (source, destination))

    if dry_run:
        return

    os.link(source, destination)
    # NOTE(review): only the mode mask is visible in SOURCE; applying it
    # with os.chmod to the new link is assumed - confirm. A hard link
    # shares its inode with source, so source becomes read-only as well.
    os.chmod(destination,
             stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    find eland files at different alignment lengths
    and put each of those in the file

    Every eland result file that find_lanes reports for the flowcell and
    lane is hard linked into library_path under the flattened name built
    by make_long_lane_name.

    :param library_path: directory that receives the links
    :param flowcell_dir: root directory of the flowcell tree
    :param flowcell_id: flowcell whose results should be linked
    :param lane: lane identifier within the flowcell
    :param dry_run: passed through to carefully_make_hardlink
    """
    lanes = find_lanes(flowcell_dir, flowcell_id, lane)
    for lane_pathname in lanes:
        long_name = make_long_lane_name(flowcell_dir,
                                        lane_pathname)
        if long_name is None:
            # make_long_lane_name could not strip flowcell_dir from the
            # path; os.path.join would blow up on None, so skip the file.
            logging.warning("%s is not under %s, skipping",
                            lane_pathname, flowcell_dir)
            continue
        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname,
                                long_pathname,
                                dry_run)
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Link srf files into our library directories.

    the srf files must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    :param srf_names: dict mapping flowcell id to the srf file name
        without its lane field and extension
    :param library_path: directory that receives the link
    :param srf_dir: directory holding the srf files
    :param flowcell_id: flowcell to look up in srf_names
    :param lane: lane identifier appended to the srf base name
    :param dry_run: passed through to carefully_make_hardlink
    """
    srf_basename = srf_names.get(flowcell_id)
    if srf_basename is None:
        logging.info("srf file for %s was not found", flowcell_id)
        # Nothing to link for this flowcell.
        return

    srf_filename = "%s_%s.srf" % (srf_basename, lane)
    source = os.path.join(srf_dir, srf_filename)
    destination = os.path.join(library_path, srf_filename)
    carefully_make_hardlink(source, destination, dry_run)
# NOTE(review): the tail of this signature is not visible in SOURCE;
# a trailing dry_run=False parameter is assumed - confirm.
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Iterate over the library

    For every library in fcdb a directory named after the library id is
    created under library_dir, then the eland result files and srf file
    of each of the library's lanes are hard linked into it.

    :param fcdb: object whose ``library`` attribute maps library id to a
        dict-like record; the record's optional 'lanes' entry yields
        (flowcell_id, lane) pairs
    :param library_dir: directory that receives one sub-directory per library
    :param flowcell_dir: root directory of the flowcell tree
    :param srfs_dir: directory holding the srf files
    :param dry_run: when true, leave the filesystem untouched
    """
    # Normalise and force a trailing separator so that prefix stripping
    # in make_long_lane_name leaves no leading separator behind.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        # A dry run must not create directories either.
        if not dry_run and not os.path.exists(library_path):
            os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
# NOTE(review): the def line, the closing arguments of several add_option
# calls and the return statement are not visible in SOURCE; they are
# reconstructed from the make_parser() call site and from the
# "is None" checks on the parsed options - confirm.
def make_parser():
    """
    Build the command line parser.

    :return: an OptionParser providing --config, --database,
        --sequence-archive, --where and --dry-run
    """
    parser = OptionParser()
    parser.add_option('-c', '--config', default=None,
                      help='path to a configuration file containing a '
                           'sequence archive section')

    parser.add_option("-d", "--database", dest="database",
                      help="path to the fctracker.db",
                      default=None)
    parser.add_option('-a', '--sequence-archive', default=None,
                      help='path to where the sequence archive lives')
    parser.add_option("-w", "--where", dest="where",
                      help="add a where clause",
                      default=None)

    parser.add_option("--dry-run", dest="dry_run", action="store_true",
                      default=False,
                      help="Don't modify the filesystem")
    return parser
# NOTE(review): the def line, the last argument of the make_library_tree
# call and the return statement are not visible in SOURCE; "argv" is taken
# from its use in parse_args, the rest is assumed - confirm.
def main(argv=None):
    """
    Build the library tree described by the command line and config file.

    Options missing from the command line are filled in from the
    configuration file before the flowcell database is opened and the
    link tree is built under <sequence archive>/libraries.

    :param argv: list of command line arguments without the program
        name; None lets optparse fall back to sys.argv[1:]
    :return: 0 on success; option errors exit through parser.error
    """
    logging.basicConfig(level=logging.INFO)

    FRONTEND_NAME = 'frontend'
    SECTION_NAME = 'sequence_archive'
    DATABASE_OPT = 'database_name'
    ARCHIVE_OPT = 'archive_path'

    parser = make_parser()

    # parse command line arguments
    opt, args = parser.parse_args(argv)

    # figure out what config file to read
    # NOTE(review): ConfigParser.read lets later files win, so values in
    # /etc/htsworkflow.ini override ~/.htsworkflow.ini - confirm that
    # this precedence is intended.
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # The return value is not used here; the call is kept because it
    # presumably loads the flowcell/lane data into fcdb - confirm.
    fcdb._get_flowcells(opt.where)

    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0
if __name__ == "__main__":
    # Hand main()'s result to the shell as the exit status instead of
    # dropping it; a None result still exits with status 0.
    sys.exit(main(sys.argv[1:]))