17fefb6855eb399eb166c50ed2e1ced47b4dc56a
[htsworkflow.git] / gaworkflow / util / fctracker.py
1 """
2 Provide some quick and dirty access and reporting for the fctracker database.
3
4 The advantage to this code is that it doesn't depend on django being
5 installed, so it can run on machines other than the webserver.
6 """
7 import datetime
8 import os
9 import sys
10 import time
11
12 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
13   # we're python 2.5
14   import sqlite3
15 else:
16   import pysqlite2.dbapi2 as sqlite3
17
18
19 class fctracker:
20     """
21     provide a simple way to interact with the flowcell data in fctracker.db
22     """
23     def __init__(self, database):
24         # default to the current directory
25         if database is None: 
26             self.database = self._guess_fctracker_path()
27         else:
28             self.database = database
29         self.conn = sqlite3.connect(self.database)
30         self._get_library()
31         self._get_species()
32
33     def _guess_fctracker_path(self):
34         """
35         Guess a few obvious places for the database
36         """
37         fctracker = 'fctracker.db'
38         name = fctracker
39         # is it in the current dir?
40         if os.path.exists(name): 
41             return name
42         name = os.path.expanduser(os.path.join('~', fctracker))
43         if os.path.exists(name):
44             return name
45         raise RuntimeError("Can't find fctracker")
46
47     def _make_dict_from_table(self, table_name, pkey_name):
48         """
49         Convert a django table into a dictionary indexed by the primary key.
50         Yes, it really does just load everything into memory, hopefully
51         we stay under a few tens of thousands of runs for a while.
52         """
53         table = {}
54         c = self.conn.cursor()
55         c.execute('select * from %s;' % (table_name))
56         # extract just the field name
57         description = [ f[0] for f in c.description]
58         for row in c:
59             row_dict = dict(zip(description, row))
60             table[row_dict[pkey_name]] = row_dict
61         c.close()
62         return table
63
64     def _get_library(self):
65         """
66         attach the library dictionary to the instance
67         """
68         self.library = self._make_dict_from_table(
69                          'fctracker_library', 
70                          'library_id')
71                                                   
72         
73     def _get_species(self):
74         """
75         attach the species dictionary to the instance
76         """
77         self.species = self._make_dict_from_table(
78                          'fctracker_species',
79                          'id'
80                        )
81         
82     def _get_flowcells(self, where=None):
83         """
84         attach the flowcell dictionary to the instance
85
86         where is a sql where clause. (eg "where run_date > '2008-1-1'")
87         that can be used to limit what flowcells we select
88         FIXME: please add sanitization code
89         """
90         if where is None:
91             where = ""
92         self.flowcells = {}
93         c = self.conn.cursor()
94         c.execute('select * from fctracker_flowcell %s;' % (where))
95         # extract just the field name
96         description = [ f[0] for f in c.description ]
97         for row in c:
98             row_dict = dict(zip(description, row))
99             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
100                 lane_library = self.library[row_dict[lane+"_id"]]
101                 species_id = lane_library['library_species_id']
102                 lane_library['library_species'] = self.species[species_id]
103                 row_dict[lane] = lane_library
104             # some useful parsing
105             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
106             run_date = datetime.datetime(*run_date[:6])
107             row_dict['run_date'] = run_date
108             self.flowcells[row_dict['flowcell_id']] = row_dict
109         return self.flowcells
110
111 def recoverable_drive_report(flowcells):
112     """
113     Attempt to report what flowcells are still on a hard drive
114     """
115     def flowcell_gone(cell):
116         """
117         Use a variety of heuristics to determine if the flowcell drive
118         has been deleted.
119         """
120         name = cell['flowcell_id']
121         if 'failed' in name:
122             return True
123         if 'deleted' in name:
124             return True
125         if 'not run' in name:
126             return True
127         return False
128
129     # sort flowcells by run date
130     flowcell_list = []
131     for key, cell in flowcells.items():
132         flowcell_list.append( (cell['run_date'], key) )
133     flowcell_list.sort()
134
135     report = []
136     line = "%(date)s %(id)s %(lane)s %(library_name)s (%(library_id)s) "
137     line += "%(species)s"
138     for run_date, flowcell_id in flowcell_list:
139         cell = flowcells[flowcell_id]
140         if flowcell_gone(cell):
141             continue
142         for l in range(1,9):
143             lane = 'lane_%d' % (l)
144             cell_library = cell['%s_library'%(lane)]
145             fields = {
146               'date': cell['run_date'].strftime('%y-%b-%d'),
147               'id': cell['flowcell_id'],
148               'lane': l,
149               'library_name': cell_library['library_name'],
150               'library_id': cell['%s_library_id'%(lane)],
151               'species': cell_library['library_species']['scientific_name'],
152             }
153             report.append(line % (fields))
154     return os.linesep.join(report)