Update htsworkflow.util.fctracker module to work with the newer django db
[htsworkflow.git] / htsworkflow / util / fctracker.py
1 """
2 Provide some quick and dirty access and reporting for the fctracker database.
3
4 The advantage to this code is that it doesn't depend on django being
5 installed, so it can run on machines other than the webserver.
6 """
7 import datetime
8 import os
9 import re
10 import sys
11 import time
12
13 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
14   # we're python 2.5
15   import sqlite3
16 else:
17   import pysqlite2.dbapi2 as sqlite3
18
19
20 class fctracker:
21     """
22     provide a simple way to interact with the flowcell data in fctracker.db
23     """
24     def __init__(self, database):
25         # default to the current directory
26         if database is None: 
27             self.database = self._guess_fctracker_path()
28         else:
29             self.database = database
30         self.conn = sqlite3.connect(self.database)
31         self._get_library()
32         self._get_species()
33
34     def _guess_fctracker_path(self):
35         """
36         Guess a few obvious places for the database
37         """
38         fctracker = 'fctracker.db'
39         name = fctracker
40         # is it in the current dir?
41         if os.path.exists(name): 
42             return name
43         name = os.path.expanduser(os.path.join('~', fctracker))
44         if os.path.exists(name):
45             return name
46         raise RuntimeError("Can't find fctracker")
47
48     def _make_dict_from_table(self, table_name, pkey_name):
49         """
50         Convert a django table into a dictionary indexed by the primary key.
51         Yes, it really does just load everything into memory, hopefully
52         we stay under a few tens of thousands of runs for a while.
53         """
54         table = {}
55         c = self.conn.cursor()
56         c.execute('select * from %s;' % (table_name))
57         # extract just the field name
58         description = [ f[0] for f in c.description]
59         for row in c:
60             row_dict = dict(zip(description, row))
61             table[row_dict[pkey_name]] = row_dict
62         c.close()
63         return table
64
65     def _add_lanes_to_libraries(self):
66         """
67         add flowcell/lane ids to new attribute 'lanes' in the library dictionary
68         """
69         library_id_re = re.compile('lane_\d_library_id')
70
71         for fc_id, fc in self.flowcells.items():
72             lane_library = [ (x[0][5], x[1]) for x in fc.items() 
73                                              if library_id_re.match(x[0]) ]
74             for lane, library_id in lane_library:
75                 if not self.library[library_id].has_key('lanes'):
76                     self.library[library_id]['lanes'] = []
77                 self.library[library_id]['lanes'].append((fc_id, lane))
78
79     def _get_library(self):
80         """
81         attach the library dictionary to the instance
82         """
83         self.library = self._make_dict_from_table(
84                          'samples_library', 
85                          'id')
86                                                   
87         
88     def _get_species(self):
89         """
90         attach the species dictionary to the instance
91         """
92         self.species = self._make_dict_from_table(
93                          'samples_species',
94                          'id'
95                        )
96         
97     def _get_flowcells(self, where=None):
98         """
99         attach the flowcell dictionary to the instance
100
101         where is a sql where clause. (eg "where run_date > '2008-1-1'")
102         that can be used to limit what flowcells we select
103         FIXME: please add sanitization code
104         """
105         if where is None:
106             where = ""
107         self.flowcells = {}
108         c = self.conn.cursor()
109         c.execute('select * from experiments_flowcell %s;' % (where))
110         # extract just the field name
111         description = [ f[0] for f in c.description ]
112         for row in c:
113             row_dict = dict(zip(description, row))
114             fcid, status = self._parse_flowcell_id(row_dict)
115             row_dict['flowcell_id'] = fcid
116             row_dict['flowcell_status'] = status
117
118             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
119                 lane_library = self.library[row_dict[lane+"_id"]]
120                 species_id = lane_library['library_species_id']
121                 lane_library['library_species'] = self.species[species_id]
122                 row_dict[lane] = lane_library
123             # some useful parsing
124             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
125             run_date = datetime.datetime(*run_date[:6])
126             row_dict['run_date'] = run_date
127             self.flowcells[row_dict['flowcell_id']] = row_dict
128
129         self._add_lanes_to_libraries()
130         return self.flowcells
131
132     def _parse_flowcell_id(self, flowcell_row):
133       """
134       Return flowcell id and status
135       
136       We stored the status information in the flowcell id name.
137       this was dumb, but database schemas are hard to update.
138       """
139       fields = flowcell_row['flowcell_id'].split()
140       fcid = None
141       status = None
142       if len(fields) > 0:
143         fcid = fields[0]
144       if len(fields) > 1:
145         status = fields[1]
146       return fcid, status
147       
148
149 def flowcell_gone(cell):
150     """
151     Use a variety of heuristics to determine if the flowcell drive
152     has been deleted.
153     """
154     status = cell['flowcell_status']
155     if status is None:
156         return False
157     failures = ['failed', 'deleted', 'not run']
158     for f in failures:
159       if re.search(f, status):
160         return True
161     else:
162       return False
163
164 def recoverable_drive_report(flowcells):
165     """
166     Attempt to report what flowcells are still on a hard drive
167     """
168     def format_status(status):
169       if status is None:
170         return ""
171       else:
172         return status+" "
173
174     # sort flowcells by run date
175     flowcell_list = []
176     for key, cell in flowcells.items():
177         flowcell_list.append( (cell['run_date'], key) )
178     flowcell_list.sort()
179
180     report = []
181     line = "%(date)s %(id)s %(status)s%(lane)s %(library_name)s (%(library_id)s) "
182     line += "%(species)s"
183     for run_date, flowcell_id in flowcell_list:
184         cell = flowcells[flowcell_id]
185         if flowcell_gone(cell):
186             continue
187         for l in range(1,9):
188             lane = 'lane_%d' % (l)
189             cell_library = cell['%s_library'%(lane)]
190             fields = {
191               'date': cell['run_date'].strftime('%y-%b-%d'),
192               'id': cell['flowcell_id'],
193               'lane': l,
194               'library_name': cell_library['library_name'],
195               'library_id': cell['%s_library_id'%(lane)],
196               'species': cell_library['library_species']['scientific_name'],
197               'status': format_status(cell['flowcell_status']),
198             }
199             report.append(line % (fields))
200     return os.linesep.join(report)
201