1 from __future__ import absolute_import, print_function
11 from django.conf import settings
12 from django.core.exceptions import ObjectDoesNotExist
13 from django.core import urlresolvers
14 from django.utils import timezone
15 from django.db import models
16 from django.db.models.signals import post_init, pre_save
18 from samples.models import Library
19 from htsworkflow.util.conversion import parse_flowcell_id
20 from htsworkflow.pipelines import runfolder
24 LOGGER = logging.getLogger(__name__)
# Module-level defaults read from Django settings: default_pM comes from
# settings.DEFAULT_PM and RESCAN_DELAY from settings.RESCAN_DELAY, both
# coerced with int().
# NOTE(review): the `try:` lines and any fallback assignments belonging to
# these `except` clauses are not visible in this view.
# NOTE(review): `except AttributeError, e` is Python 2-only syntax, and this
# first handler does not catch the ValueError that int() raises on a bad
# value, although its log message says "invalid value"; the RESCAN_DELAY
# handler catches both ValueError and AttributeError.
27 default_pM = int(settings.DEFAULT_PM)
28 except AttributeError, e:
29 LOGGER.error("invalid value for default_pm")
31 # how many days to wait before trying to re-import a runfolder
34 RESCAN_DELAY = int(settings.RESCAN_DELAY)
35 except (ValueError, AttributeError):
36 LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
37 "defaulting to %s" % (RESCAN_DELAY,))
# (code, label) pairs used as the `choices` of DataRun.run_status, plus the
# inverse label -> code lookup built from them.
# NOTE(review): the end of the tuple (entries after code 5 and the closing
# parenthesis) is not visible in this view; import_data_run looks up the
# label 'DONE' in RUN_STATUS_REVERSE_MAP, so such an entry presumably exists.
39 RUN_STATUS_CHOICES = (
40 (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'),
41 (1, 'Data Pipeline Started'),
42 (2, 'Data Pipeline Interrupted'),
43 (3, 'Data Pipeline Finished'),
44 (4, 'Collect Results Started'),
45 (5, 'Collect Results Finished'),
50 RUN_STATUS_REVERSE_MAP = dict(((v, k) for k, v in RUN_STATUS_CHOICES))
# Django model for a cluster station: a unique name plus an `isdefault`
# flag. The ordering lists rows with isdefault set first, then by name.
# NOTE(review): the `class Meta:` line that would own `ordering` is not
# visible in this view.
53 class ClusterStation(models.Model):
54 """List of cluster stations"""
55 name = models.CharField(max_length=50, unique=True)
56 isdefault = models.BooleanField(default=False, null=False)
59 ordering = ["-isdefault", "name"]
def __unicode__(self):
    """Return the station name as unicode text."""
    station_name = self.name
    return unicode(station_name)
# Fragments of ClusterStation.default and ClusterStation.update_isdefault,
# followed by the pre_save registration of update_isdefault for
# ClusterStation. The `def`/decorator lines of `default`, the statements
# between its two queries, and the body of the inner `if` in
# update_isdefault are not visible in this view.
# Visible logic of `default`: one query selects stations flagged
# isdefault=True, another selects all stations ordered by descending id.
66 d = cls.objects.filter(isdefault=True).all()
69 d = cls.objects.order_by('-id').all()
# Signal handler: only acts when the instance being saved is flagged as
# default; it then visits every other station currently flagged as default.
75 def update_isdefault(sender, instance, **kwargs):
76 """Clear default if needed
78 if instance.isdefault:
79 for c in ClusterStation.objects.filter(isdefault=True).all():
80 if c.id != instance.id:
84 pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
# Django model describing a sequencer: indexed name, instrument name and
# serial number, a model string, `active` and `isdefault` flags, and a
# comment. The ordering lists default rows first, then active rows, then by
# name.
# NOTE(review): `comment` has no blank=True, so forms will require it --
# confirm that is intended.
# NOTE(review): the docstring terminator and the `class Meta:` line are not
# visible in this view.
86 class Sequencer(models.Model):
87 """Sequencers we've owned
89 name = models.CharField(max_length=50, db_index=True)
90 instrument_name = models.CharField(max_length=50, db_index=True)
91 serial_number = models.CharField(max_length=50, db_index=True)
92 model = models.CharField(max_length=255)
93 active = models.BooleanField(default=True, null=False)
94 isdefault = models.BooleanField(default=False, null=False)
95 comment = models.CharField(max_length=255)
98 ordering = ["-isdefault", "-active", "name"]
def __unicode__(self):
    """Return the sequencer name, followed by "(<instrument_name>)" unless
    instrument_name is None."""
    label = unicode(self.name)
    if self.instrument_name is None:
        return label
    instrument = "(%s)" % (unicode(self.instrument_name),)
    return " ".join([label, instrument])
# Fragments of Sequencer.get_absolute_url, Sequencer.default and
# Sequencer.update_isdefault, followed by the pre_save registration of
# update_isdefault for Sequencer. The remaining elements of the URL tuple,
# the `def`/decorator lines of `default`, the statements between its two
# queries, and the body of the inner `if` in update_isdefault are not
# visible in this view.
# get_absolute_url starts a tuple naming the 'experiments.views.sequencer'
# view. `default` queries sequencers flagged isdefault=True and, separately,
# all sequencers ordered by `active` then descending id. update_isdefault
# only acts when the saved instance is flagged default and then visits every
# other sequencer currently flagged default.
107 def get_absolute_url(self):
108 return ('experiments.views.sequencer',
113 d = cls.objects.filter(isdefault=True).all()
116 d = cls.objects.order_by('active', '-id').all()
122 def update_isdefault(sender, instance, **kwargs):
123 """Clear default if needed
125 if instance.isdefault:
126 for s in Sequencer.objects.filter(isdefault=True).all():
127 if s.id != instance.id:
131 pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
# Django model for a flowcell: unique flowcell id, run date, advanced-run
# and paired-end flags, read length, control lane, the cluster station and
# sequencer used, and free-form notes.
# cluster_station and sequencer take their defaults from the callables
# ClusterStation.default and Sequencer.default.
# NOTE(review): the rest of the control_lane choices list and any fields
# declared between control_lane and cluster_station are not visible in this
# view.
134 class FlowCell(models.Model):
135 flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
136 run_date = models.DateTimeField()
137 advanced_run = models.BooleanField(default=False)
138 paired_end = models.BooleanField(default=False)
139 read_length = models.IntegerField(default=32) # Stanford is currently 25
140 control_lane = models.IntegerField(choices=[(1, 1),
152 cluster_station = models.ForeignKey(ClusterStation, default=ClusterStation.default)
153 sequencer = models.ForeignKey(Sequencer, default=Sequencer.default)
155 notes = models.TextField(blank=True)
def __unicode__(self):
    """Return the flowcell id as unicode text."""
    flowcell_label = self.flowcell_id
    return unicode(flowcell_label)
# Fragment of FlowCell.Lanes (the name is established by the
# `Lanes.allow_tags = True` line), followed by the model's Meta ordering
# (newest run_date first). The `def` line, the creation of the `html` list,
# an `else:` line and part of the row arguments are not visible in this
# view.
# Visible behaviour: lanes are visited in lane_number order; a non-None
# cluster estimate is rendered in thousands as "<n> k", otherwise as the
# text 'None'; each lane contributes one table row linking to the library's
# admin URL; the rows are joined with newlines and returned.
# NOTE(review): `int(cluster_estimate) / 1000` is integer division only
# under Python 2; under Python 3 it would produce a float.
# NOTE(review): values are interpolated into HTML with no escaping visible
# here while allow_tags is set -- confirm the inputs are trusted.
162 for lane in self.lane_set.order_by('lane_number'):
163 cluster_estimate = lane.cluster_estimate
164 if cluster_estimate is not None:
165 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
167 cluster_estimate = 'None'
168 library_id = lane.library_id
169 library = lane.library
170 element = '<tr><td>%d</td>'\
171 '<td><a href="%s">%s</a></td><td>%s</td></tr>'
172 html.append(element % (lane.lane_number,
173 library.get_admin_url(),
176 html.append('</table>')
177 return "\n".join(html)
178 Lanes.allow_tags = True
181 ordering = ["-run_date"]
# Fragments of three FlowCell methods; the argument lines closing each
# return expression and the body of flowcell_type are not visible in this
# view.
# get_admin_url: reverses the 'admin:experiments_flowcell_change' URL name.
# flowcell_type: per its docstring, turns the boolean paired flag into a
# name.
# get_absolute_url: parses self.flowcell_id with parse_flowcell_id and
# starts a tuple naming the 'experiments.views.flowcell_detail' view.
183 def get_admin_url(self):
184 # that's the django way... except it didn't work
185 return urlresolvers.reverse('admin:experiments_flowcell_change',
188 def flowcell_type(self):
189 """Convert our boolean 'is paired' flag to a name
197 def get_absolute_url(self):
198 flowcell_id, status = parse_flowcell_id(self.flowcell_id)
199 return ('experiments.views.flowcell_detail',
def get_raw_data_directory(self):
    """Return location of where the raw data is stored

    The directory is settings.RESULT_HOME_DIR joined with the first element
    returned by parse_flowcell_id for this flowcell's id.
    """
    # parse_flowcell_id returns a pair; only the first element is used here.
    bare_flowcell_id, _status = parse_flowcell_id(self.flowcell_id)
    results_root = settings.RESULT_HOME_DIR
    return os.path.join(results_root, bare_flowcell_id)
# Walk this flowcell's raw data directory and call import_data_run for every
# file whose name matches the glob 'run*.xml', passing the containing
# directory (made relative to the results home) and the file name.
# NOTE(review): the statement under `if result_root is None:` and two lines
# before the os.walk loop are not visible in this view.
# NOTE(review): result_home_dir is not used by any visible line of this
# method.
208 def update_data_runs(self):
209 result_root = self.get_raw_data_directory()
210 LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
211 if result_root is None:
214 result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
215 run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
218 for dirpath, dirnames, filenames in os.walk(result_root):
219 for filename in filenames:
220 if run_xml_re.match(filename):
221 # we have a run directory
222 relative_pathname = get_relative_pathname(dirpath)
223 self.import_data_run(relative_pathname, filename)
# Import (or refresh) the DataRun stored for one result directory.
#   relative_pathname -- result directory relative to the results home
#   run_xml_name      -- name of the run xml file inside that directory
#   force             -- re-import even when the run was updated recently
# Raises RuntimeError when too many DataRun rows match the result directory
# (the exact condition is not visible in this view).
# A run is (re)imported when it was just created, when `force` is set, or
# when more than RESCAN_DELAY days have passed since its last_update_time.
# The run xml is loaded with runfolder.load_pipeline_run_xml and its image
# analysis, base calling and (when present) alignment details are copied
# onto the run before update_result_files() is called.
# NOTE(review): the lines that create or fetch `run` and set `created` and
# `now`, and the lines between the field assignments and
# update_result_files(), are not visible in this view.
# NOTE(review): this assigns `run.status`, but the DataRun model visible in
# this file declares the field as `run_status` -- confirm which is intended.
# NOTE(review): run_start_time is built from the date's ordinal, so it
# carries no time of day before being localized to settings.TIME_ZONE.
225 def import_data_run(self, relative_pathname, run_xml_name, force=False):
226 """Given a result directory import files"""
228 run_dir = get_absolute_pathname(relative_pathname)
229 run_xml_path = os.path.join(run_dir, run_xml_name)
231 runs = DataRun.objects.filter(result_dir = relative_pathname)
236 raise RuntimeError("Too many data runs for %s" % (
242 if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
243 LOGGER.debug("Importing run from %s" % (relative_pathname,))
244 run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
246 run.status = RUN_STATUS_REVERSE_MAP['DONE']
247 run.result_dir = relative_pathname
248 run.runfolder_name = run_xml_data.runfolder_name
249 run.cycle_start = run_xml_data.image_analysis.start
250 run.cycle_stop = run_xml_data.image_analysis.stop
251 naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
252 run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
253 run.image_software = run_xml_data.image_analysis.software
254 run.image_version = run_xml_data.image_analysis.version
255 run.basecall_software = run_xml_data.bustard.software
256 run.basecall_version = run_xml_data.bustard.version
257 # we're frequently not running alignments
258 if run_xml_data.gerald:
259 run.alignment_software = run_xml_data.gerald.software
260 run.alignment_version = run_xml_data.gerald.version
262 run.last_update_time = timezone.now()
265 run.update_result_files()
268 # FIXME: should we automatically update dataruns?
269 # Or should we expect someone to call update_data_runs?
270 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
271 # """Update our dataruns
273 # if not os.path.exists(settings.RESULT_HOME_DIR):
276 # instance.update_data_runs()
277 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
# (code, label) pairs used as the `choices` of Lane.status, and a
# code -> label map built from them; the map also gives the key None the
# label "Unknown".
# NOTE(review): the entries after (0, 'Failed') and the closing bracket are
# not visible in this view.
280 LANE_STATUS_CODES = [(0, 'Failed'),
283 LANE_STATUS_MAP = dict((int(k), v) for k, v in LANE_STATUS_CODES)
284 LANE_STATUS_MAP[None] = "Unknown"
# Check whether `value` lies in the inclusive range 1..8.
# NOTE(review): the statements executed for each outcome of the test are
# not visible in this view.
287 def is_valid_lane(value):
288 if value >= 1 and value <= 8:
# Django model for one lane of a flowcell: the owning flowcell, the lane
# number, the library loaded on it, the loading concentration `pM`, an
# optional cluster estimate, a status drawn from LANE_STATUS_CODES and an
# optional comment.
# get_absolute_url starts a tuple naming the
# 'experiments.views.flowcell_lane_detail' view.
# NOTE(review): the remaining arguments of the pM and status fields and the
# rest of the get_absolute_url tuple are not visible in this view.
294 class Lane(models.Model):
295 flowcell = models.ForeignKey(FlowCell)
296 lane_number = models.IntegerField()
297 library = models.ForeignKey(Library)
298 pM = models.DecimalField(max_digits=5,
303 cluster_estimate = models.IntegerField(blank=True, null=True)
304 status = models.IntegerField(choices=LANE_STATUS_CODES,
307 comment = models.TextField(null=True, blank=True)
310 def get_absolute_url(self):
311 return ('experiments.views.flowcell_lane_detail',
def __unicode__(self):
    """Return "<flowcell id>:<lane number>" for this lane."""
    lane_label = unicode(self.lane_number)
    return ':'.join([self.flowcell.flowcell_id, lane_label])
# Django model recording one processing run of a flowcell's data: the run
# folder name, the result directory (compared against paths relative to the
# results home in FlowCell.import_data_run), update and start timestamps,
# the cycle range, a status drawn from RUN_STATUS_CHOICES, and the
# name/version of the image analysis, base calling and alignment software.
# NOTE(review): alignment_software and alignment_version have no blank=True
# although import_data_run only fills them in when alignment data exists --
# confirm this is intended.
318 class DataRun(models.Model):
319 flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
320 runfolder_name = models.CharField(max_length=50)
321 result_dir = models.CharField(max_length=255)
322 last_update_time = models.DateTimeField()
323 run_start_time = models.DateTimeField()
324 cycle_start = models.IntegerField(null=True, blank=True)
325 cycle_stop = models.IntegerField(null=True, blank=True)
326 run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
327 null=True, blank=True)
328 image_software = models.CharField(max_length=50)
329 image_version = models.CharField(max_length=50)
330 basecall_software = models.CharField(max_length=50)
331 basecall_version = models.CharField(max_length=50)
332 alignment_software = models.CharField(max_length=50)
333 alignment_version = models.CharField(max_length=50)
334 comment = models.TextField(blank=True)
# Scan this run's result directory and attach a data file record for each
# file whose name is recognised by find_file_type_metadata_from_filename.
# For every file under the directory the path relative to the results home
# is computed and existing records with that relative path are looked up.
# When the file type metadata names a lane, the new record's library is
# taken from the flowcell lane with that number. last_update_time is set to
# the current time at the end.
# NOTE(review): the statement run when a record already exists, the
# creation of `newfile` and several other lines are not visible in this
# view.
# NOTE(review): lane_set.get(...) raises if the flowcell has no lane (or
# more than one) with that number; no handling is visible here.
336 def update_result_files(self):
337 abs_result_dir = get_absolute_pathname(self.result_dir)
339 for dirname, dirnames, filenames in os.walk(abs_result_dir):
340 for filename in filenames:
341 pathname = os.path.join(dirname, filename)
342 relative_pathname = get_relative_pathname(pathname)
343 datafiles = self.datafile_set.filter(
345 relative_pathname=relative_pathname)
346 if len(datafiles) > 0:
349 metadata = find_file_type_metadata_from_filename(filename)
350 if metadata is not None:
351 metadata['filename'] = filename
353 newfile.data_run = self
354 newfile.file_type = metadata['file_type']
355 newfile.relative_pathname = relative_pathname
357 lane_number = metadata.get('lane', None)
358 if lane_number is not None:
359 lane = self.flowcell.lane_set.get(
360 lane_number=lane_number)
361 newfile.library = lane.library
363 self.datafile_set.add(newfile)
365 self.last_update_time = timezone.now()
# Group this run's data files by lane: for each data file whose attributes
# are not None, the file is stored under its lane in a per-lane dictionary
# keyed by the file type's normalized name.
# NOTE(review): the creation of `lanes`, one line after the lane lookup and
# the return statement are not visible in this view.
367 def lane_files(self):
370 for datafile in self.datafile_set.all():
371 metadata = datafile.attributes
372 if metadata is not None:
373 lane = metadata.get('lane', None)
375 lane_file_set = lanes.setdefault(lane, {})
376 normalized_name = datafile.file_type.normalized_name
377 lane_file_set[normalized_name] = datafile
# Collect the IVC plot files of this run, keyed by file type name, keeping
# only the five IVC file type names listed in ivc_name.
# NOTE(review): the creation of `plots` and the return statement are not
# visible in this view, and the `lane` argument is not used by any visible
# line.
# NOTE(review): self.get_result_files() is not defined in the part of the
# file visible here -- confirm it exists.
380 def ivc_plots(self, lane):
381 ivc_name = ['IVC All', 'IVC Call',
382 'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
385 for rel_filename, metadata in self.get_result_files():
386 if metadata.file_type.name in ivc_name:
387 plots[metadata.file_type.name] = (rel_filename, metadata)
# Django model describing a kind of result file: a display name, an
# optional mimetype and an optional regular expression that
# parse_filename matches against file names.
# NOTE(review): the docstring terminator is not visible in this view.
390 class FileType(models.Model):
391 """Represent potential file types
393 regex is a pattern used to detect if a filename matches this type
394 data run currently assumes that there may be a (?P<lane>) and
395 (?P<end>) pattern in the regular expression.
397 name = models.CharField(max_length=50)
398 mimetype = models.CharField(max_length=50, null=True, blank=True)
399 # regular expression from glob.fnmatch.translate
400 regex = models.TextField(null=True, blank=True)
# Match the final path component of `pathname` against this type's regex
# (anchored at the start, via re.match). On a match the named groups are
# collected and the 'lane' and 'end' groups, when present and not None, are
# converted to int.
# NOTE(review): the docstring terminator and the return statements are not
# visible in this view.
# NOTE(review): `regex` is declared with null=True, and len(self.regex)
# raises TypeError when it is None -- confirm NULL regexes cannot reach
# this method.
402 def parse_filename(self, pathname):
403 """Does filename match our pattern?
405 Returns None if not, or dictionary of match variables if we do.
407 path, filename = os.path.split(pathname)
408 if len(self.regex) > 0:
409 match = re.match(self.regex, filename)
410 if match is not None:
411 # These are (?P<>) names we know about from our
413 results = match.groupdict()
415 # convert int parameters
416 for attribute_name in ['lane', 'end']:
417 value = results.get(attribute_name, None)
418 if value is not None:
419 results[attribute_name] = int(value)
def _get_normalized_name(self):
    """Return the type name lower-cased, with each space replaced by an
    underscore, so it can be used as an identifier-like key."""
    underscored = '_'.join(self.name.split(' '))
    return underscored.lower()
normalized_name = property(_get_normalized_name)
# Unicode representation of the file type.
# NOTE(review): the return statement actually used is not visible in this
# view; only a commented-out alternative is shown.
428 def __unicode__(self):
429 #return u"<FileType: %s>" % (self.name,)
# Body of the helper that produces a fresh uuid.uuid1() value as a string.
# NOTE(review): the helper's `def` line is not visible in this view.
# NOTE(review): uuid1 is derived from the host identity and the clock, not
# from random data -- confirm that is acceptable for a key named
# random_key.
434 """Helper function to set default UUID in DataFile"""
435 return str(uuid.uuid1())
# Django model for one result file of a data run: a key string
# (random_key), the owning data run, an optional library, the file type and
# the indexed file path stored in relative_pathname.
# NOTE(review): the remaining arguments of random_key are not visible in
# this view.
438 class DataFile(models.Model):
439 """Store map from random ID to filename"""
440 random_key = models.CharField(max_length=64,
443 data_run = models.ForeignKey(DataRun, db_index=True)
444 library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
445 file_type = models.ForeignKey(FileType)
446 relative_pathname = models.CharField(max_length=255, db_index=True)
def _get_attributes(self):
    """Return whatever this file's type reports when its parse_filename is
    given this file's relative pathname."""
    file_type = self.file_type
    return file_type.parse_filename(self.relative_pathname)
attributes = property(_get_attributes)
def _get_pathname(self):
    """Return this file's path as produced by get_absolute_pathname from
    its relative pathname."""
    relative = self.relative_pathname
    return get_absolute_pathname(relative)
pathname = property(_get_pathname)
def get_absolute_url(self):
    """Return the (view name, positional args, keyword args) triple naming
    the 'experiments.views.read_result_file' view for this file's key."""
    view_name = 'experiments.views.read_result_file'
    view_kwargs = {'key': self.random_key}
    return (view_name, (), view_kwargs)
# Try every FileType in the database against the final path component of
# `pathname`; when a type's parse_filename returns a non-None result, that
# result is tagged with the matching type under the 'file_type' key.
# NOTE(review): the return statements are not visible in this view.
# NOTE(review): this loads all FileType rows on every call, and
# DataRun.update_result_files calls it once per file found.
462 def find_file_type_metadata_from_filename(pathname):
463 path, filename = os.path.split(pathname)
465 for file_type in FileType.objects.all():
466 result = file_type.parse_filename(filename)
467 if result is not None:
468 result['file_type'] = file_type
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR prefix (with its trailing path
    separator) is removed. A path that does not start with that prefix is
    returned unchanged.
    """
    # Joining with '' guarantees the prefix ends in a path separator, so
    # "/results2/x" is not mistaken for something under "/results".
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # str.replace would also delete later occurrences of the directory
    # string inside the path; only the leading prefix should be dropped.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
def get_absolute_pathname(relative_pathname):
    """Attach relative path to results home directory"""
    results_root = settings.RESULT_HOME_DIR
    absolute_pathname = os.path.join(results_root, relative_pathname)
    return absolute_pathname