Provide cross referencing information to the libraries to help find
[htsworkflow.git] / gaworkflow / util / fctracker.py
1 """
2 Provide some quick and dirty access and reporting for the fctracker database.
3
4 The advantage to this code is that it doesn't depend on django being
5 installed, so it can run on machines other than the webserver.
6 """
7 import datetime
8 import os
9 import re
10 import sys
11 import time
12
13 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
14   # we're python 2.5
15   import sqlite3
16 else:
17   import pysqlite2.dbapi2 as sqlite3
18
19
20 class fctracker:
21     """
22     provide a simple way to interact with the flowcell data in fctracker.db
23     """
24     def __init__(self, database):
25         # default to the current directory
26         if database is None: 
27             self.database = self._guess_fctracker_path()
28         else:
29             self.database = database
30         self.conn = sqlite3.connect(self.database)
31         self._get_library()
32         self._get_species()
33
34     def _guess_fctracker_path(self):
35         """
36         Guess a few obvious places for the database
37         """
38         fctracker = 'fctracker.db'
39         name = fctracker
40         # is it in the current dir?
41         if os.path.exists(name): 
42             return name
43         name = os.path.expanduser(os.path.join('~', fctracker))
44         if os.path.exists(name):
45             return name
46         raise RuntimeError("Can't find fctracker")
47
48     def _make_dict_from_table(self, table_name, pkey_name):
49         """
50         Convert a django table into a dictionary indexed by the primary key.
51         Yes, it really does just load everything into memory, hopefully
52         we stay under a few tens of thousands of runs for a while.
53         """
54         table = {}
55         c = self.conn.cursor()
56         c.execute('select * from %s;' % (table_name))
57         # extract just the field name
58         description = [ f[0] for f in c.description]
59         for row in c:
60             row_dict = dict(zip(description, row))
61             table[row_dict[pkey_name]] = row_dict
62         c.close()
63         return table
64
65     def _add_lanes_to_libraries(self):
66         """
67         add flowcell/lane ids to new attribute 'lanes' in the library dictionary
68         """
69         library_id_re = re.compile('lane_\d_library_id')
70
71         for fc_id, fc in self.flowcells.items():
72             lane_library = [ (x[0][5], x[1]) for x in fc.items() 
73                                              if library_id_re.match(x[0]) ]
74             for lane, library_id in lane_library:
75                 if not self.library[library_id].has_key('lanes'):
76                     self.library[library_id]['lanes'] = []
77                 self.library[library_id]['lanes'].append((fc_id, lane))
78
79     def _get_library(self):
80         """
81         attach the library dictionary to the instance
82         """
83         self.library = self._make_dict_from_table(
84                          'fctracker_library', 
85                          'library_id')
86                                                   
87         
88     def _get_species(self):
89         """
90         attach the species dictionary to the instance
91         """
92         self.species = self._make_dict_from_table(
93                          'fctracker_species',
94                          'id'
95                        )
96         
97     def _get_flowcells(self, where=None):
98         """
99         attach the flowcell dictionary to the instance
100
101         where is a sql where clause. (eg "where run_date > '2008-1-1'")
102         that can be used to limit what flowcells we select
103         FIXME: please add sanitization code
104         """
105         if where is None:
106             where = ""
107         self.flowcells = {}
108         c = self.conn.cursor()
109         c.execute('select * from fctracker_flowcell %s;' % (where))
110         # extract just the field name
111         description = [ f[0] for f in c.description ]
112         for row in c:
113             row_dict = dict(zip(description, row))
114             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
115                 lane_library = self.library[row_dict[lane+"_id"]]
116                 species_id = lane_library['library_species_id']
117                 lane_library['library_species'] = self.species[species_id]
118                 row_dict[lane] = lane_library
119             # some useful parsing
120             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
121             run_date = datetime.datetime(*run_date[:6])
122             row_dict['run_date'] = run_date
123             self.flowcells[row_dict['flowcell_id']] = row_dict
124
125         self._add_lanes_to_libraries()
126         return self.flowcells
127
128 def recoverable_drive_report(flowcells):
129     """
130     Attempt to report what flowcells are still on a hard drive
131     """
132     def flowcell_gone(cell):
133         """
134         Use a variety of heuristics to determine if the flowcell drive
135         has been deleted.
136         """
137         name = cell['flowcell_id']
138         if 'failed' in name:
139             return True
140         if 'deleted' in name:
141             return True
142         if 'not run' in name:
143             return True
144         return False
145
146     # sort flowcells by run date
147     flowcell_list = []
148     for key, cell in flowcells.items():
149         flowcell_list.append( (cell['run_date'], key) )
150     flowcell_list.sort()
151
152     report = []
153     line = "%(date)s %(id)s %(lane)s %(library_name)s (%(library_id)s) "
154     line += "%(species)s"
155     for run_date, flowcell_id in flowcell_list:
156         cell = flowcells[flowcell_id]
157         if flowcell_gone(cell):
158             continue
159         for l in range(1,9):
160             lane = 'lane_%d' % (l)
161             cell_library = cell['%s_library'%(lane)]
162             fields = {
163               'date': cell['run_date'].strftime('%y-%b-%d'),
164               'id': cell['flowcell_id'],
165               'lane': l,
166               'library_name': cell_library['library_name'],
167               'library_id': cell['%s_library_id'%(lane)],
168               'species': cell_library['library_species']['scientific_name'],
169             }
170             report.append(line % (fields))
171     return os.linesep.join(report)
172