use the compression handling auto-opener for our eland files
[htsworkflow.git] / scripts / make-library-tree
1 """
2 Make a tree of symlinks organized by library id.
3 """
4 from ConfigParser import SafeConfigParser
5 from glob import glob
6 import logging
7 from optparse import OptionParser
8 import os
9 import stat
10 import sys
11
12 from htsworkflow.util import fctracker
13
14
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Find the eland result files for one lane of a flowcell.

    Globs for <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_result*
    and returns the list of matching pathnames, which is empty when
    nothing matches.
    """
    eland_glob = os.path.join(flowcell_dir,
                              flowcell_id,
                              "*",
                              "s_%s_eland_result*" % lane)
    return glob(eland_glob)
20
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Build a flat file name from the path of an eland result file.

    When lane_pathname begins with flowcell_dir, the rest of the path is
    returned with every path separator replaced by an underscore.
    Returns None when lane_pathname does not begin with flowcell_dir.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None

    prefix_length = len(flowcell_dir)
    relative_path = lane_pathname[prefix_length:]
    return "_".join(relative_path.split(os.path.sep))
31     
def parse_srf_directory(srf_dir):
    """
    Search srf_dir for *.srf files.

    Each file name (without the .srf extension) must consist of six
    underscore separated fields:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>

    Returns a dictionary indexed by flowcell id.  Each value is the
    first five fields joined with underscores, i.e. the file name
    without its trailing _<lane>.srf.  When several files share a
    flowcell id the entry is simply overwritten, so one entry per
    flowcell remains.

    Raises ValueError if a .srf file name does not have six fields.
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        filename = os.path.basename(pathname)
        basename = os.path.splitext(filename)[0]
        record = basename.split('_')
        # Checked explicitly rather than with assert so the validation
        # still happens under "python -O" and the error names the file.
        if len(record) != 6:
            raise ValueError(
                "%s is not named "
                "<site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf"
                % (pathname,))

        flowcellid = record[4]
        # site, date, machine, runid and flowcellid; the lane is dropped
        flowcells[flowcellid] = "_".join(record[:5])
    return flowcells
56
57
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Hard link source to destination without clobbering another file.

    Nothing is done (apart from a logged warning) when source does not
    exist, and nothing is done when destination already is the same
    file as source.  If destination exists but is a different file an
    IOError is raised.  With dry_run set all of the checks are still
    performed but the link is not created.

    A newly created link is made read-only for user, group and other.
    """
    logging.debug("%s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    destination_exists = os.path.exists(destination)
    if destination_exists and not os.path.samefile(source, destination):
        raise IOError('%s and %s are different files' % (source, destination))

    # Either the link is already in place or we were asked not to
    # touch the filesystem.
    if destination_exists or dry_run:
        return

    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    os.link(source, destination)
    os.chmod(destination, read_only)
84
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    Hard link every eland result file of one lane into library_path.

    Finds the eland files at different alignment lengths for the given
    flowcell and lane, and links each of them into library_path under a
    name built from its path relative to flowcell_dir.

    A result file whose path does not begin with flowcell_dir cannot be
    given a name; it is reported with a warning and skipped.
    """
    for lane_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        long_name = make_long_lane_name(flowcell_dir, lane_pathname)
        if long_name is None:
            # os.path.join cannot take None, so skip this file rather
            # than abort the whole run with an unrelated looking error.
            logging.warning("%s is not under %s, skipping",
                            lane_pathname, flowcell_dir)
            continue

        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname, long_pathname, dry_run)
98
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Link the srf file of one flowcell lane into a library directory.

    srf_names maps a flowcell id to the srf file name without its
    trailing _<lane>.srf.  When flowcell_id has no entry this is logged
    and nothing is linked.

    the srf files must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf
    """
    srf_basename = srf_names.get(flowcell_id, None)
    if srf_basename is None:
        logging.info("srf file for %s was not found", flowcell_id)
        return

    srf_filename = "%s_%s.srf" % (srf_basename, lane)
    carefully_make_hardlink(os.path.join(srf_dir, srf_filename),
                            os.path.join(library_path, srf_filename),
                            dry_run)
114     
115
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Iterate over the libraries, linking each one's files into its directory.

    For every (library id, library) pair in fcdb.library a directory
    named after the library id is created below library_dir.  Then, for
    each (flowcell id, lane) pair listed under the library's 'lanes'
    key, the eland result files found below flowcell_dir and the srf
    file found in srfs_dir are hard linked into that directory.

    With dry_run set nothing is written to the filesystem: neither the
    library directories nor the links are created.
    """
    # Normalized with a trailing separator; make_long_lane_name strips
    # flowcell_dir as a plain string prefix.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        # A dry run must leave the filesystem alone, so the directory is
        # only created for a real run.
        if not dry_run and not os.path.exists(library_path):
            os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
145
def make_parser():
    """
    Build the command line parser.

    Returns an OptionParser that understands -c/--config,
    -d/--database, -a/--sequence-archive, -w/--where and --dry-run.
    """
    # (flags, keyword arguments) in the order they appear in --help
    option_table = [
        (('-c', '--config'),
         {'default': None,
          'help': 'path to a configuration file containing a '
                  'sequence archive section'}),
        (("-d", "--database"),
         {'dest': "database",
          'help': "path to the fctracker.db",
          'default': None}),
        (('-a', '--sequence-archive'),
         {'default': None,
          'help': 'path to where the sequence archive lives'}),
        (("-w", "--where"),
         {'dest': "where",
          'help': "add a where clause",
          'default': None}),
        (("--dry-run",),
         {'dest': "dry_run",
          'action': "store_true",
          'default': False,
          'help': "Don't modify the filesystem"}),
    ]

    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    return parser
168
def main(argv=None):
    """
    Command line entry point: build the library tree of hard links.

    argv is the list of command line arguments without the program
    name; None is treated as an empty list.

    The database location and the sequence archive root are taken from
    the command line, falling back to the configuration file (the file
    given with --config, otherwise ~/.htsworkflow.ini and
    /etc/htsworkflow.ini).  If either is still missing the parser
    reports an error and exits.

    Returns 0 on success.
    """
    logging.basicConfig(level=logging.INFO)

    FRONTEND_NAME = 'frontend'
    SECTION_NAME = 'sequence_archive'
    DATABASE_OPT = 'database_name'
    ARCHIVE_OPT = 'archive_path'

    if argv is None:
        argv = []
    parser = make_parser()

    # parse command line arguments
    opt, args = parser.parse_args(argv)

    # figure out what config file to read
    config_path = [os.path.expanduser('~/.htsworkflow.ini'),
                   '/etc/htsworkflow.ini']
    if opt.config is not None:
        config_path = [opt.config]

    # parse options from config file
    config_file = SafeConfigParser()
    config_file.read(config_path)

    # load defaults from config file if not overridden by the command line
    logging.debug("database from command line: %s", opt.database)
    if opt.database is None and \
       config_file.has_option(FRONTEND_NAME, DATABASE_OPT):
        opt.database = config_file.get(FRONTEND_NAME, DATABASE_OPT)

    if opt.sequence_archive is None and \
       config_file.has_option(SECTION_NAME, ARCHIVE_OPT):
        opt.sequence_archive = config_file.get(SECTION_NAME, ARCHIVE_OPT)

    # complain if critical things are missing
    if opt.database is None:
        parser.error('Need location of htsworkflow frontend database')

    if opt.sequence_archive is None:
        parser.error('Need the root path for the sequence archive')

    fcdb = fctracker.fctracker(opt.database)
    # NOTE(review): the return value is not used here; presumably this
    # call is what loads the flowcells (restricted by the optional
    # where clause) into fcdb -- confirm against fctracker.
    fcdb._get_flowcells(opt.where)

    library_dir = os.path.join(opt.sequence_archive, 'libraries')
    flowcell_dir = os.path.join(opt.sequence_archive, 'flowcells')
    srfs_dir = os.path.join(opt.sequence_archive, 'srfs')
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      opt.dry_run)

    return 0
222
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status.
    sys.exit(main(sys.argv[1:]))