clean up the logic for deciding the output filename when using stdin
[htsworkflow.git] / scripts / make-library-tree
1 """
Make a tree of hard links organized by library id.
3 """
4 from glob import glob
5 import logging
6 from optparse import OptionParser
7 import os
8 import stat
9 import sys
10
11 from gaworkflow.util import fctracker
12
13
def find_lanes(flowcell_dir, flowcell_id, lane):
    """
    Find the eland result files for one lane of a flowcell.

    Globs for <flowcell_dir>/<flowcell_id>/*/s_<lane>_eland_result*
    and returns the list of matching pathnames (empty if none match).
    """
    return glob(os.path.join(flowcell_dir,
                             flowcell_id,
                             "*",
                             "s_%s_eland_result*" % lane))
19
def make_long_lane_name(flowcell_dir, lane_pathname):
    """
    Build a flat name for an eland result file from its pathname.

    When lane_pathname begins with flowcell_dir, the rest of the path is
    returned with every path separator replaced by "_".  Otherwise
    None is returned.
    """
    if not lane_pathname.startswith(flowcell_dir):
        return None

    relative = lane_pathname[len(flowcell_dir):]
    return relative.replace(os.path.sep, "_")
30     
def parse_srf_directory(srf_dir):
    """
    Search srf_dir for *.srf files.

    The srf files are expected to be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    Returns a dictionary indexed by flowcell id.  Each value is the file
    name with the trailing _<lane>.srf removed, i.e.
    <site>_<date>_<machine>_<run>_<flowcellid>

    A *.srf file whose name does not have exactly six "_" separated
    fields is skipped with a warning, so that one stray file in the
    directory does not abort the whole scan.
    """
    flowcells = {}
    for pathname in glob(os.path.join(srf_dir, '*.srf')):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        record = basename.split('_')
        if len(record) != 6:
            logging.warning("%s is not named like an srf lane file, skipping",
                            pathname)
            continue

        # record is [site, date, machine, runid, flowcellid, laneid];
        # every lane of a flowcell shares the same first five fields.
        flowcellid = record[4]
        flowcells[flowcellid] = "_".join(record[:5])
    return flowcells
55
56
def carefully_make_hardlink(source, destination, dry_run=False):
    """
    Hard link source to destination without clobbering anything.

    - If source does not exist, a warning is logged and nothing is done.
    - If destination already exists and is the same file as source,
      nothing is done.
    - If destination already exists and is a different file, IOError
      is raised.
    - Otherwise the link is created and its mode is set to read-only for
      user, group and other; when dry_run is true this last step is
      skipped.
    """
    logging.debug("%s -> %s", source, destination)

    if not os.path.exists(source):
        logging.warning("%s doesn't exist", source)
        return

    if os.path.exists(destination):
        if not os.path.samefile(source, destination):
            raise IOError('%s and %s are different files' %
                          (source, destination))
        # already linked the way we want
        return

    if dry_run:
        return

    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    os.link(source, destination)
    os.chmod(destination, read_only)
83
def link_all_eland_lanes(library_path, flowcell_dir, flowcell_id, lane, dry_run):
    """
    Hard link every eland result file for a lane into a library directory.

    A lane can have eland result files at different alignment lengths;
    each file returned by find_lanes() is linked into library_path under
    the flattened name produced by make_long_lane_name().

    A result file whose pathname does not start with flowcell_dir (for
    which make_long_lane_name() returns None) is skipped with a warning.
    """
    for lane_pathname in find_lanes(flowcell_dir, flowcell_id, lane):
        long_name = make_long_lane_name(flowcell_dir, lane_pathname)
        if long_name is None:
            # os.path.join() would raise TypeError if handed None
            logging.warning("%s is not under %s, skipping",
                            lane_pathname, flowcell_dir)
            continue

        long_pathname = os.path.join(library_path, long_name)
        carefully_make_hardlink(lane_pathname, long_pathname, dry_run)
97
def link_srf_lanes(srf_names, library_path, srf_dir, flowcell_id, lane, dry_run):
    """
    Hard link the srf file of one flowcell lane into a library directory.

    srf_names maps a flowcell id to the srf file name without its
    trailing _<lane>.srf; the srf files must be named:
    <site>_<date>_<machine>_<run>_<flowcellid>_<lane>.srf

    When flowcell_id has no entry in srf_names this is logged and
    nothing is linked.
    """
    name_prefix = srf_names.get(flowcell_id)
    if name_prefix is None:
        logging.info("srf file for %s was not found", flowcell_id)
        return

    srf_filename = "%s_%s.srf" % (name_prefix, lane)
    carefully_make_hardlink(os.path.join(srf_dir, srf_filename),
                            os.path.join(library_path, srf_filename),
                            dry_run)
113     
114
def make_library_tree(fcdb, library_dir, flowcell_dir, srfs_dir,
                      dry_run=False):
    """
    Build one directory per library and link its lanes' files into it.

    For every entry in fcdb.library a directory named after the library
    id is created under library_dir if it is missing.  Then, for each
    (flowcell_id, lane) pair in the library's 'lanes' list, the eland
    result files found under flowcell_dir and the srf file found under
    srfs_dir are hard linked into that directory.

    When dry_run is true the filesystem is not modified: no directories
    are created and no links are made.
    """
    # Normalize and force a trailing separator; make_long_lane_name()
    # strips flowcell_dir off the front of each result pathname.
    library_dir = os.path.normpath(library_dir) + os.path.sep
    flowcell_dir = os.path.normpath(flowcell_dir) + os.path.sep
    srfs_dir = os.path.normpath(srfs_dir) + os.path.sep

    srf_names = parse_srf_directory(srfs_dir)

    for lib_id, lib in fcdb.library.items():
        library_path = os.path.join(library_dir, str(lib_id))
        # creating the directory is a filesystem change, so honor dry_run
        if not dry_run and not os.path.exists(library_path):
            os.mkdir(library_path)

        for flowcell_id, lane in lib.get('lanes', []):
            link_all_eland_lanes(library_path,
                                 flowcell_dir,
                                 flowcell_id,
                                 lane,
                                 dry_run)

            link_srf_lanes(srf_names,
                           library_path,
                           srfs_dir,
                           flowcell_id,
                           lane,
                           dry_run)
144
def make_parser():
    """
    Build the command line option parser.

    Recognized options:
      -d / --database  path to the fctracker.db (default None)
      -w / --where     where clause (default None)
      --dry-run        flag, default False
    """
    option_parser = OptionParser()
    option_parser.add_option(
        "-d", "--database",
        dest="database", default=None,
        help="path to the fctracker.db")
    option_parser.add_option(
        "-w", "--where",
        dest="where", default=None,
        help="add a where clause")
    option_parser.add_option(
        "--dry-run",
        dest="dry_run", action="store_true", default=False,
        help="Don't modify the filesystem")
    return option_parser
160
def main(argv=None):
    """
    Command line entry point.

    Parses argv (treated as an empty argument list when None), opens the
    fctracker database given by --database and builds the library tree
    below /woldlab/mus/solexa-sequence.  Always returns 0.
    """
    logging.basicConfig(level=logging.INFO)

    options, _ = make_parser().parse_args([] if argv is None else argv)

    fcdb = fctracker.fctracker(options.database)
    # NOTE(review): the return value is not used here; presumably the
    # call is kept for its effect on fcdb -- confirm against fctracker.
    fcdb._get_flowcells(options.where)

    root_dir = '/woldlab/mus/solexa-sequence'
    library_dir, flowcell_dir, srfs_dir = [
        os.path.join(root_dir, subdir)
        for subdir in ('libraries', 'flowcells', 'srfs')
    ]
    make_library_tree(fcdb,
                      library_dir, flowcell_dir, srfs_dir,
                      options.dry_run)

    return 0
182
if __name__ == "__main__":
    # Run as a script: hand main() the command line arguments without the
    # program name.  The status it returns is kept in rv, but the
    # sys.exit() call below is commented out, so it is not propagated as
    # the process exit code.
    rv = main(sys.argv[1:])
    # sys.exit(rv)