gaworkflow/util/fctracker.py

   1 """
   2 Provide some quick and dirty access and reporting for the fctracker database.
   3
   4 The advantage to this code is that it doesn't depend on django being
   5 installed, so it can run on machines other than the webserver.
   6 """
   7 import datetime
   8 import os
   9 import sys
  10 import time
  11
  12 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
  13   # we're python 2.5
  14   import sqlite3
  15 else:
  16   import pysqlite2.dbapi2 as sqlite3
  17
  18
  19 class fctracker:
  20     """
  21     provide a simple way to interact with the flowcell data in fctracker.db
  22     """
  23     def __init__(self, database):
  24         # default to the current directory
  25         if database is None:
  26             self.database = self._guess_fctracker_path()
  27         else:
  28             self.database = database
  29         self.conn = sqlite3.connect(self.database)
  30         self._get_library()
  31         self._get_species()
  32
  33     def _guess_fctracker_path(self):
  34         """
  35         Guess a few obvious places for the database
  36         """
  37         fctracker = 'fctracker.db'
  38         name = fctracker
  39         # is it in the current dir?
  40         if os.path.exists(name):
  41             return name
  42         name = os.path.expanduser(os.path.join('~', fctracker))
  43         if os.path.exists(name):
  44             return name
  45         raise RuntimeError("Can't find fctracker")
  46
  47     def _make_dict_from_table(self, table_name, pkey_name):
  48         """
  49         Convert a django table into a dictionary indexed by the primary key.
  50         Yes, it really does just load everything into memory, hopefully
  51         we stay under a few tens of thousands of runs for a while.
  52         """
  53         table = {}
  54         c = self.conn.cursor()
  55         c.execute('select * from %s;' % (table_name))
  56         # extract just the field name
  57         description = [ f[0] for f in c.description]
  58         for row in c:
  59             row_dict = dict(zip(description, row))
  60             table[row_dict[pkey_name]] = row_dict
  61         c.close()
  62         return table
  63
  64     def _get_library(self):
  65         """
  66         attach the library dictionary to the instance
  67         """
  68         self.library = self._make_dict_from_table(
  69                          'fctracker_library',
  70                          'library_id')
  71
  72
  73     def _get_species(self):
  74         """
  75         attach the species dictionary to the instance
  76         """
  77         self.species = self._make_dict_from_table(
  78                          'fctracker_species',
  79                          'id'
  80                        )
  81
  82     def _get_flowcells(self, where=None):
  83         """
  84         attach the flowcell dictionary to the instance
  85
  86         where is a sql where clause. (eg "where run_date > '2008-1-1'")
  87         that can be used to limit what flowcells we select
  88         FIXME: please add sanitization code
  89         """
  90         if where is None:
  91             where = ""
  92         self.flowcells = {}
  93         c = self.conn.cursor()
  94         c.execute('select * from fctracker_flowcell %s;' % (where))
  95         # extract just the field name
  96         description = [ f[0] for f in c.description ]
  97         for row in c:
  98             row_dict = dict(zip(description, row))
  99             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
 100                 lane_library = self.library[row_dict[lane+"_id"]]
 101                 species_id = lane_library['library_species_id']
 102                 lane_library['library_species'] = self.species[species_id]
 103                 row_dict[lane] = lane_library
 104             # some useful parsing
 105             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
 106             run_date = datetime.datetime(*run_date[:6])
 107             row_dict['run_date'] = run_date
 108             self.flowcells[row_dict['flowcell_id']] = row_dict
 109         return self.flowcells
 110
 111 def recoverable_drive_report(flowcells):
 112     """
 113     Attempt to report what flowcells are still on a hard drive
 114     """
 115     def flowcell_gone(cell):
 116         """
 117         Use a variety of heuristics to determine if the flowcell drive
 118         has been deleted.
 119         """
 120         name = cell['flowcell_id']
 121         if 'failed' in name:
 122             return True
 123         if 'deleted' in name:
 124             return True
 125         if 'not run' in name:
 126             return True
 127         return False
 128
 129     # sort flowcells by run date
 130     flowcell_list = []
 131     for key, cell in flowcells.items():
 132         flowcell_list.append( (cell['run_date'], key) )
 133     flowcell_list.sort()
 134
 135     report = []
 136     line = "%(date)s %(id)s %(lane)s %(library_name)s (%(library_id)s) "
 137     line += "%(species)s"
 138     for run_date, flowcell_id in flowcell_list:
 139         cell = flowcells[flowcell_id]
 140         if flowcell_gone(cell):
 141             continue
 142         for l in range(1,9):
 143             lane = 'lane_%d' % (l)
 144             cell_library = cell['%s_library'%(lane)]
 145             fields = {
 146               'date': cell['run_date'].strftime('%y-%b-%d'),
 147               'id': cell['flowcell_id'],
 148               'lane': l,
 149               'library_name': cell_library['library_name'],
 150               'library_id': cell['%s_library_id'%(lane)],
 151               'species': cell_library['library_species']['scientific_name'],
 152             }
 153             report.append(line % (fields))
 154     return os.linesep.join(report)