htsworkflow/util/fctracker.py

   1 """
   2 Provide some quick and dirty access and reporting for the fctracker database.
   3
   4 The advantage to this code is that it doesn't depend on django being
   5 installed, so it can run on machines other than the webserver.
   6 """
   7 import datetime
   8 import os
   9 import re
  10 import sys
  11 import time
  12
  13 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
  14   # we're python 2.5
  15   import sqlite3
  16 else:
  17   import pysqlite2.dbapi2 as sqlite3
  18
  19
  20 class fctracker:
  21     """
  22     provide a simple way to interact with the flowcell data in fctracker.db
  23     """
  24     def __init__(self, database):
  25         # default to the current directory
  26         if database is None:
  27             self.database = self._guess_fctracker_path()
  28         else:
  29             self.database = database
  30         self.conn = sqlite3.connect(self.database)
  31         self._get_library()
  32         self._get_species()
  33
  34     def _guess_fctracker_path(self):
  35         """
  36         Guess a few obvious places for the database
  37         """
  38         fctracker = 'fctracker.db'
  39         name = fctracker
  40         # is it in the current dir?
  41         if os.path.exists(name):
  42             return name
  43         name = os.path.expanduser(os.path.join('~', fctracker))
  44         if os.path.exists(name):
  45             return name
  46         raise RuntimeError("Can't find fctracker")
  47
  48     def _make_dict_from_table(self, table_name, pkey_name):
  49         """
  50         Convert a django table into a dictionary indexed by the primary key.
  51         Yes, it really does just load everything into memory, hopefully
  52         we stay under a few tens of thousands of runs for a while.
  53         """
  54         table = {}
  55         c = self.conn.cursor()
  56         c.execute('select * from %s;' % (table_name))
  57         # extract just the field name
  58         description = [ f[0] for f in c.description]
  59         for row in c:
  60             row_dict = dict(zip(description, row))
  61             table[row_dict[pkey_name]] = row_dict
  62         c.close()
  63         return table
  64
  65     def _add_lanes_to_libraries(self):
  66         """
  67         add flowcell/lane ids to new attribute 'lanes' in the library dictionary
  68         """
  69         library_id_re = re.compile('lane_\d_library_id')
  70
  71         for fc_id, fc in self.flowcells.items():
  72             lane_library = [ (x[0][5], x[1]) for x in fc.items()
  73                                              if library_id_re.match(x[0]) ]
  74             for lane, library_id in lane_library:
  75                 if not self.library[library_id].has_key('lanes'):
  76                     self.library[library_id]['lanes'] = []
  77                 self.library[library_id]['lanes'].append((fc_id, lane))
  78
  79     def _get_library(self):
  80         """
  81         attach the library dictionary to the instance
  82         """
  83         self.library = self._make_dict_from_table(
  84                          'samples_library',
  85                          'id')
  86
  87
  88     def _get_species(self):
  89         """
  90         attach the species dictionary to the instance
  91         """
  92         self.species = self._make_dict_from_table(
  93                          'samples_species',
  94                          'id'
  95                        )
  96
  97     def _get_flowcells(self, where=None):
  98         """
  99         attach the flowcell dictionary to the instance
 100
 101         where is a sql where clause. (eg "where run_date > '2008-1-1'")
 102         that can be used to limit what flowcells we select
 103         FIXME: please add sanitization code
 104         """
 105         if where is None:
 106             where = ""
 107         self.flowcells = {}
 108         c = self.conn.cursor()
 109         c.execute('select * from experiments_flowcell %s;' % (where))
 110         # extract just the field name
 111         description = [ f[0] for f in c.description ]
 112         for row in c:
 113             row_dict = dict(zip(description, row))
 114             fcid, status = self._parse_flowcell_id(row_dict)
 115             row_dict['flowcell_id'] = fcid
 116             row_dict['flowcell_status'] = status
 117
 118             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
 119                 lane_library = self.library[row_dict[lane+"_id"]]
 120                 species_id = lane_library['library_species_id']
 121                 lane_library['library_species'] = self.species[species_id]
 122                 row_dict[lane] = lane_library
 123             # some useful parsing
 124             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
 125             run_date = datetime.datetime(*run_date[:6])
 126             row_dict['run_date'] = run_date
 127             self.flowcells[row_dict['flowcell_id']] = row_dict
 128
 129         self._add_lanes_to_libraries()
 130         return self.flowcells
 131
 132     def _parse_flowcell_id(self, flowcell_row):
 133       """
 134       Return flowcell id and status
 135
 136       We stored the status information in the flowcell id name.
 137       this was dumb, but database schemas are hard to update.
 138       """
 139       fields = flowcell_row['flowcell_id'].split()
 140       fcid = None
 141       status = None
 142       if len(fields) > 0:
 143         fcid = fields[0]
 144       if len(fields) > 1:
 145         status = fields[1]
 146       return fcid, status
 147
 148
 149 def flowcell_gone(cell):
 150     """
 151     Use a variety of heuristics to determine if the flowcell drive
 152     has been deleted.
 153     """
 154     status = cell['flowcell_status']
 155     if status is None:
 156         return False
 157     failures = ['failed', 'deleted', 'not run']
 158     for f in failures:
 159       if re.search(f, status):
 160         return True
 161     else:
 162       return False
 163
 164 def recoverable_drive_report(flowcells):
 165     """
 166     Attempt to report what flowcells are still on a hard drive
 167     """
 168     def format_status(status):
 169       if status is None:
 170         return ""
 171       else:
 172         return status+" "
 173
 174     # sort flowcells by run date
 175     flowcell_list = []
 176     for key, cell in flowcells.items():
 177         flowcell_list.append( (cell['run_date'], key) )
 178     flowcell_list.sort()
 179
 180     report = []
 181     line = "%(date)s %(id)s %(status)s%(lane)s %(library_name)s (%(library_id)s) "
 182     line += "%(species)s"
 183     for run_date, flowcell_id in flowcell_list:
 184         cell = flowcells[flowcell_id]
 185         if flowcell_gone(cell):
 186             continue
 187         for l in range(1,9):
 188             lane = 'lane_%d' % (l)
 189             cell_library = cell['%s_library'%(lane)]
 190             fields = {
 191               'date': cell['run_date'].strftime('%y-%b-%d'),
 192               'id': cell['flowcell_id'],
 193               'lane': l,
 194               'library_name': cell_library['library_name'],
 195               'library_id': cell['%s_library_id'%(lane)],
 196               'species': cell_library['library_species']['scientific_name'],
 197               'status': format_status(cell['flowcell_status']),
 198             }
 199             report.append(line % (fields))
 200     return os.linesep.join(report)
 201