gaworkflow/util/fctracker.py

   1 """
   2 Provide some quick and dirty access and reporting for the fctracker database.
   3
   4 The advantage to this code is that it doesn't depend on django being
   5 installed, so it can run on machines other than the webserver.
   6 """
   7 import datetime
   8 import os
   9 import re
  10 import sys
  11 import time
  12
  13 if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5:
  14   # we're python 2.5
  15   import sqlite3
  16 else:
  17   import pysqlite2.dbapi2 as sqlite3
  18
  19
  20 class fctracker:
  21     """
  22     provide a simple way to interact with the flowcell data in fctracker.db
  23     """
  24     def __init__(self, database):
  25         # default to the current directory
  26         if database is None:
  27             self.database = self._guess_fctracker_path()
  28         else:
  29             self.database = database
  30         self.conn = sqlite3.connect(self.database)
  31         self._get_library()
  32         self._get_species()
  33
  34     def _guess_fctracker_path(self):
  35         """
  36         Guess a few obvious places for the database
  37         """
  38         fctracker = 'fctracker.db'
  39         name = fctracker
  40         # is it in the current dir?
  41         if os.path.exists(name):
  42             return name
  43         name = os.path.expanduser(os.path.join('~', fctracker))
  44         if os.path.exists(name):
  45             return name
  46         raise RuntimeError("Can't find fctracker")
  47
  48     def _make_dict_from_table(self, table_name, pkey_name):
  49         """
  50         Convert a django table into a dictionary indexed by the primary key.
  51         Yes, it really does just load everything into memory, hopefully
  52         we stay under a few tens of thousands of runs for a while.
  53         """
  54         table = {}
  55         c = self.conn.cursor()
  56         c.execute('select * from %s;' % (table_name))
  57         # extract just the field name
  58         description = [ f[0] for f in c.description]
  59         for row in c:
  60             row_dict = dict(zip(description, row))
  61             table[row_dict[pkey_name]] = row_dict
  62         c.close()
  63         return table
  64
  65     def _add_lanes_to_libraries(self):
  66         """
  67         add flowcell/lane ids to new attribute 'lanes' in the library dictionary
  68         """
  69         library_id_re = re.compile('lane_\d_library_id')
  70
  71         for fc_id, fc in self.flowcells.items():
  72             lane_library = [ (x[0][5], x[1]) for x in fc.items()
  73                                              if library_id_re.match(x[0]) ]
  74             for lane, library_id in lane_library:
  75                 if not self.library[library_id].has_key('lanes'):
  76                     self.library[library_id]['lanes'] = []
  77                 self.library[library_id]['lanes'].append((fc_id, lane))
  78
  79     def _get_library(self):
  80         """
  81         attach the library dictionary to the instance
  82         """
  83         self.library = self._make_dict_from_table(
  84                          'fctracker_library',
  85                          'library_id')
  86
  87
  88     def _get_species(self):
  89         """
  90         attach the species dictionary to the instance
  91         """
  92         self.species = self._make_dict_from_table(
  93                          'fctracker_species',
  94                          'id'
  95                        )
  96
  97     def _get_flowcells(self, where=None):
  98         """
  99         attach the flowcell dictionary to the instance
 100
 101         where is a sql where clause. (eg "where run_date > '2008-1-1'")
 102         that can be used to limit what flowcells we select
 103         FIXME: please add sanitization code
 104         """
 105         if where is None:
 106             where = ""
 107         self.flowcells = {}
 108         c = self.conn.cursor()
 109         c.execute('select * from fctracker_flowcell %s;' % (where))
 110         # extract just the field name
 111         description = [ f[0] for f in c.description ]
 112         for row in c:
 113             row_dict = dict(zip(description, row))
 114             for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]:
 115                 lane_library = self.library[row_dict[lane+"_id"]]
 116                 species_id = lane_library['library_species_id']
 117                 lane_library['library_species'] = self.species[species_id]
 118                 row_dict[lane] = lane_library
 119             # some useful parsing
 120             run_date = time.strptime(row_dict['run_date'],  '%Y-%m-%d %H:%M:%S')
 121             run_date = datetime.datetime(*run_date[:6])
 122             row_dict['run_date'] = run_date
 123             self.flowcells[row_dict['flowcell_id']] = row_dict
 124
 125         self._add_lanes_to_libraries()
 126         return self.flowcells
 127
 128 def recoverable_drive_report(flowcells):
 129     """
 130     Attempt to report what flowcells are still on a hard drive
 131     """
 132     def flowcell_gone(cell):
 133         """
 134         Use a variety of heuristics to determine if the flowcell drive
 135         has been deleted.
 136         """
 137         name = cell['flowcell_id']
 138         if 'failed' in name:
 139             return True
 140         if 'deleted' in name:
 141             return True
 142         if 'not run' in name:
 143             return True
 144         return False
 145
 146     # sort flowcells by run date
 147     flowcell_list = []
 148     for key, cell in flowcells.items():
 149         flowcell_list.append( (cell['run_date'], key) )
 150     flowcell_list.sort()
 151
 152     report = []
 153     line = "%(date)s %(id)s %(lane)s %(library_name)s (%(library_id)s) "
 154     line += "%(species)s"
 155     for run_date, flowcell_id in flowcell_list:
 156         cell = flowcells[flowcell_id]
 157         if flowcell_gone(cell):
 158             continue
 159         for l in range(1,9):
 160             lane = 'lane_%d' % (l)
 161             cell_library = cell['%s_library'%(lane)]
 162             fields = {
 163               'date': cell['run_date'].strftime('%y-%b-%d'),
 164               'id': cell['flowcell_id'],
 165               'lane': l,
 166               'library_name': cell_library['library_name'],
 167               'library_id': cell['%s_library_id'%(lane)],
 168               'species': cell_library['library_species']['scientific_name'],
 169             }
 170             report.append(line % (fields))
 171     return os.linesep.join(report)
 172