1 from __future__ import absolute_import, print_function, unicode_literals
11 from django.conf import settings
12 from django.core.exceptions import ObjectDoesNotExist
13 from django.core import urlresolvers
14 from django.utils import timezone
15 from django.db import models
16 from django.db.models.signals import post_init, pre_save
18 from samples.models import Library
19 from htsworkflow.util.conversion import parse_flowcell_id
20 from htsworkflow.pipelines import runfolder
24 LOGGER = logging.getLogger(__name__)
27 default_pM = int(settings.DEFAULT_PM)
28 except AttributeError as e:
29 LOGGER.error("invalid value for default_pm")
31 # how many days to wait before trying to re-import a runfolder
34 RESCAN_DELAY = int(settings.RESCAN_DELAY)
35 except (ValueError, AttributeError):
36 LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
37 "defaulting to %s" % (RESCAN_DELAY,))
39 RUN_STATUS_CHOICES = (
40 (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'),
41 (1, 'Data Pipeline Started'),
42 (2, 'Data Pipeline Interrupted'),
43 (3, 'Data Pipeline Finished'),
44 (4, 'Collect Results Started'),
45 (5, 'Collect Results Finished'),
# Reverse lookup built from RUN_STATUS_CHOICES: status label -> numeric code.
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
53 class ClusterStation(models.Model):
54 """List of cluster stations"""
55 name = models.CharField(max_length=50, unique=True)
56 isdefault = models.BooleanField(default=False, null=False)
59 ordering = ["-isdefault", "name"]
65 def update_isdefault(sender, instance, **kwargs):
66 """Clear default if needed
68 if instance.isdefault:
69 for c in ClusterStation.objects.filter(isdefault=True).all():
70 if c.id != instance.id:
74 def cluster_station_default():
75 d = ClusterStation.objects.filter(isdefault=True).all()
78 d = ClusterStation.objects.order_by('-id').all()
83 pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
85 class Sequencer(models.Model):
86 """Sequencers we've owned
88 name = models.CharField(max_length=50, db_index=True)
89 instrument_name = models.CharField(max_length=50, db_index=True)
90 serial_number = models.CharField(max_length=50, db_index=True)
91 model = models.CharField(max_length=255)
92 active = models.BooleanField(default=True, null=False)
93 isdefault = models.BooleanField(default=False, null=False)
94 comment = models.CharField(max_length=255)
97 ordering = ["-isdefault", "-active", "name"]
100 name = [str(self.name)]
101 if self.instrument_name is not None:
102 name.append("(%s)" % (str(self.instrument_name),))
103 return " ".join(name)
106 def get_absolute_url(self):
107 return ('experiments.views.sequencer',
111 def update_isdefault(sender, instance, **kwargs):
112 """Clear default if needed
114 if instance.isdefault:
115 for s in Sequencer.objects.filter(isdefault=True).all():
116 if s.id != instance.id:
120 pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
122 def sequencer_default():
123 d = Sequencer.objects.filter(isdefault=True).all()
126 d = Sequencer.objects.order_by('active', '-id').all()
132 class FlowCell(models.Model):
133 flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
134 run_date = models.DateTimeField()
135 advanced_run = models.BooleanField(default=False)
136 paired_end = models.BooleanField(default=False)
137 read_length = models.IntegerField(default=32) # Stanford is currenlty 25
138 control_lane = models.IntegerField(choices=[(1, 1),
150 cluster_station = models.ForeignKey(ClusterStation,
151 default=cluster_station_default)
152 sequencer = models.ForeignKey(Sequencer, default=sequencer_default)
154 notes = models.TextField(blank=True)
157 return str(self.flowcell_id)
161 for lane in self.lane_set.order_by('lane_number'):
162 cluster_estimate = lane.cluster_estimate
163 if cluster_estimate is not None:
164 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
166 cluster_estimate = 'None'
167 library_id = lane.library_id
168 library = lane.library
169 element = '<tr><td>%d</td>'\
170 '<td><a href="%s">%s</a></td><td>%s</td></tr>'
171 html.append(element % (lane.lane_number,
172 library.get_admin_url(),
175 html.append('</table>')
176 return "\n".join(html)
177 Lanes.allow_tags = True
180 ordering = ["-run_date"]
182 def get_admin_url(self):
183 # that's the django way... except it didn't work
184 return urlresolvers.reverse('admin:experiments_flowcell_change',
187 def flowcell_type(self):
188 """Convert our boolean 'is paired' flag to a name
196 def get_absolute_url(self):
197 flowcell_id, status = parse_flowcell_id(self.flowcell_id)
198 return ('experiments.views.flowcell_detail',
def get_raw_data_directory(self):
    """Return the directory where this flowcell's raw data is stored.

    The result is settings.RESULT_HOME_DIR joined with the flowcell id
    component returned by parse_flowcell_id; the status component is
    not used.
    """
    parsed_id, _status = parse_flowcell_id(self.flowcell_id)
    return os.path.join(settings.RESULT_HOME_DIR, parsed_id)
207 def update_data_runs(self):
208 result_root = self.get_raw_data_directory()
209 LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
210 if result_root is None:
213 result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
214 run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
217 for dirpath, dirnames, filenames in os.walk(result_root):
218 for filename in filenames:
219 if run_xml_re.match(filename):
220 # we have a run directory
221 relative_pathname = get_relative_pathname(dirpath)
222 self.import_data_run(relative_pathname, filename)
224 def import_data_run(self, relative_pathname, run_xml_name, force=False):
225 """Given a result directory import files"""
227 run_dir = get_absolute_pathname(relative_pathname)
228 run_xml_path = os.path.join(run_dir, run_xml_name)
230 runs = DataRun.objects.filter(result_dir = relative_pathname)
235 raise RuntimeError("Too many data runs for %s" % (
241 if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
242 LOGGER.debug("Importing run from %s" % (relative_pathname,))
243 run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
245 run.status = RUN_STATUS_REVERSE_MAP['DONE']
246 run.result_dir = relative_pathname
247 run.runfolder_name = run_xml_data.runfolder_name
248 run.cycle_start = run_xml_data.image_analysis.start
249 run.cycle_stop = run_xml_data.image_analysis.stop
250 naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
251 run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
252 run.image_software = run_xml_data.image_analysis.software
253 run.image_version = run_xml_data.image_analysis.version
254 run.basecall_software = run_xml_data.bustard.software
255 run.basecall_version = run_xml_data.bustard.version
256 # we're frequently not running alignments
257 if run_xml_data.gerald:
258 run.alignment_software = run_xml_data.gerald.software
259 run.alignment_version = run_xml_data.gerald.version
261 run.last_update_time = timezone.now()
264 run.update_result_files()
267 # FIXME: should we automatically update dataruns?
268 # Or should we expect someone to call update_data_runs?
269 #def update_flowcell_dataruns(sender, instance, *args, **kwargs):
270 # """Update our dataruns
272 # if not os.path.exists(settings.RESULT_HOME_DIR):
275 # instance.update_data_runs()
276 #post_init.connect(update_flowcell_dataruns, sender=FlowCell)
279 LANE_STATUS_CODES = [(0, 'Failed'),
# Map each lane status code (coerced to int) to its label; a missing
# status (None) is labelled "Unknown".
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
LANE_STATUS_MAP[None] = "Unknown"
def is_valid_lane(value):
    """Return True when value lies in the lane range 1 through 8 inclusive.

    Any value outside that range yields False.
    """
    return 1 <= value <= 8
293 class Lane(models.Model):
294 flowcell = models.ForeignKey(FlowCell)
295 lane_number = models.IntegerField()
296 library = models.ForeignKey(Library)
297 pM = models.DecimalField(max_digits=5,
302 cluster_estimate = models.IntegerField(blank=True, null=True)
303 status = models.IntegerField(choices=LANE_STATUS_CODES,
306 comment = models.TextField(null=True, blank=True)
309 def get_absolute_url(self):
310 return ('experiments.views.flowcell_lane_detail',
314 return self.flowcell.flowcell_id + ':' + str(self.lane_number)
317 class DataRun(models.Model):
318 flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
319 runfolder_name = models.CharField(max_length=50)
320 result_dir = models.CharField(max_length=255)
321 last_update_time = models.DateTimeField()
322 run_start_time = models.DateTimeField()
323 cycle_start = models.IntegerField(null=True, blank=True)
324 cycle_stop = models.IntegerField(null=True, blank=True)
325 run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
326 null=True, blank=True)
327 image_software = models.CharField(max_length=50)
328 image_version = models.CharField(max_length=50)
329 basecall_software = models.CharField(max_length=50)
330 basecall_version = models.CharField(max_length=50)
331 alignment_software = models.CharField(max_length=50)
332 alignment_version = models.CharField(max_length=50)
333 comment = models.TextField(blank=True)
335 def update_result_files(self):
336 abs_result_dir = get_absolute_pathname(self.result_dir)
338 for dirname, dirnames, filenames in os.walk(abs_result_dir):
339 for filename in filenames:
340 pathname = os.path.join(dirname, filename)
341 relative_pathname = get_relative_pathname(pathname)
342 datafiles = self.datafile_set.filter(
344 relative_pathname=relative_pathname)
345 if len(datafiles) > 0:
348 metadata = find_file_type_metadata_from_filename(filename)
349 if metadata is not None:
350 metadata['filename'] = filename
352 newfile.data_run = self
353 newfile.file_type = metadata['file_type']
354 newfile.relative_pathname = relative_pathname
356 lane_number = metadata.get('lane', None)
357 if lane_number is not None:
358 lane = self.flowcell.lane_set.get(
359 lane_number=lane_number)
360 newfile.library = lane.library
362 self.datafile_set.add(newfile)
364 self.last_update_time = timezone.now()
366 def lane_files(self):
369 for datafile in self.datafile_set.all():
370 metadata = datafile.attributes
371 if metadata is not None:
372 lane = metadata.get('lane', None)
374 lane_file_set = lanes.setdefault(lane, {})
375 normalized_name = datafile.file_type.normalized_name
376 lane_file_set[normalized_name] = datafile
379 def ivc_plots(self, lane):
380 ivc_name = ['IVC All', 'IVC Call',
381 'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
384 for rel_filename, metadata in self.get_result_files():
385 if metadata.file_type.name in ivc_name:
386 plots[metadata.file_type.name] = (rel_filename, metadata)
389 class FileType(models.Model):
390 """Represent potential file types
392 regex is a pattern used to detect if a filename matches this type
393 data run currently assumes that there may be a (?P<lane>) and
394 (?P<end>) pattern in the regular expression.
396 name = models.CharField(max_length=50)
397 mimetype = models.CharField(max_length=50, null=True, blank=True)
398 # regular expression from glob.fnmatch.translate
399 regex = models.TextField(null=True, blank=True)
def parse_filename(self, pathname):
    """Does filename match our pattern?

    Only the final path component of pathname is tested against
    self.regex, using re.match.

    Returns None if not, or dictionary of match variables if we do.
    A file type whose regex is None or empty never matches. Captured
    'lane' and 'end' groups are converted to int.
    """
    filename = os.path.basename(pathname)

    # regex is a nullable field, so guard against None as well as ''.
    if not self.regex:
        return None

    match = re.match(self.regex, filename)
    if match is None:
        return None

    # These are the (?P<>) names captured by our pattern.
    results = match.groupdict()

    # convert int parameters
    for attribute_name in ('lane', 'end'):
        value = results.get(attribute_name)
        if value is not None:
            results[attribute_name] = int(value)

    return results
def _get_normalized_name(self):
    """Return self.name with spaces turned into '_' and lower-cased."""
    underscored = '_'.join(self.name.split(' '))
    return underscored.lower()
normalized_name = property(_get_normalized_name)
428 #return "<FileType: %s>" % (self.name,)
433 """Helper function to set default UUID in DataFile"""
434 return str(uuid.uuid1())
437 class DataFile(models.Model):
438 """Store map from random ID to filename"""
439 random_key = models.CharField(max_length=64,
442 data_run = models.ForeignKey(DataRun, db_index=True)
443 library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
444 file_type = models.ForeignKey(FileType)
445 relative_pathname = models.CharField(max_length=255, db_index=True)
def _get_attributes(self):
    """Return what our file type parses out of relative_pathname.

    This is the result of self.file_type.parse_filename for the path.
    """
    file_type = self.file_type
    return file_type.parse_filename(self.relative_pathname)
attributes = property(_get_attributes)
def _get_pathname(self):
    """Return the absolute path for this file.

    Delegates to get_absolute_pathname with our relative_pathname.
    """
    return get_absolute_pathname(relative_pathname=self.relative_pathname)
pathname = property(_get_pathname)
def get_absolute_url(self):
    """Return the (view name, args, kwargs) triple for this file.

    The kwargs carry our random_key under 'key'.
    """
    view_name = 'experiments.views.read_result_file'
    view_kwargs = {'key': self.random_key}
    return (view_name, (), view_kwargs)
def find_file_type_metadata_from_filename(pathname):
    """Find a FileType whose pattern matches the given filename.

    Only the final path component of pathname is tested. Each FileType
    is tried in turn through its parse_filename method.

    Returns the dictionary produced by the first FileType tried that
    matches, with that FileType stored under 'file_type', or None when
    no FileType matches.
    """
    filename = os.path.basename(pathname)

    for file_type in FileType.objects.all():
        result = file_type.parse_filename(filename)
        if result is not None:
            result['file_type'] = file_type
            return result

    return None
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR (including its trailing
    path separator) is removed. A path that does not start with it is
    returned unchanged.
    """
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # Strip the prefix only: str.replace would also delete any later
    # occurrence of the home directory text inside the path.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto settings.RESULT_HOME_DIR."""
    results_root = settings.RESULT_HOME_DIR
    return os.path.join(results_root, relative_pathname)