1 from __future__ import absolute_import, print_function, unicode_literals
10 from django.conf import settings
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import pre_save
16 from samples.models import Library, HTSUser
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
# Module-level logger named after this module.
22 LOGGER = logging.getLogger(__name__)
# Value of settings.DEFAULT_PM coerced to int. Only AttributeError is handled.
# NOTE(review): int() raises ValueError for a non-numeric setting, which this
# handler would not catch -- confirm whether that is intended.
25 default_pM = int(settings.DEFAULT_PM)
26 except AttributeError as e:
27 LOGGER.error("invalid value for default_pm")
29 # how many days to wait before trying to re-import a runfolder
# Read from settings.RESCAN_DELAY and coerced to int; a missing attribute or a
# value int() rejects is logged together with the value used instead.
32 RESCAN_DELAY = int(settings.RESCAN_DELAY)
33 except (ValueError, AttributeError):
34 LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
35 "defaulting to %s" % (RESCAN_DELAY,))
# Status of a sequencing run as (integer code, label) pairs; used as the
# choices of SequencingRun.run_status.
37 RUN_STATUS_CHOICES = (
38 (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'),
39 (1, 'Data Pipeline Started'),
40 (2, 'Data Pipeline Interrupted'),
41 (3, 'Data Pipeline Finished'),
42 (4, 'Collect Results Started'),
43 (5, 'Collect Results Finished'),
# Reverse lookup: status label -> integer status code.
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
# Django model for a cluster station. update_isdefault below is connected to
# this model's pre_save signal; when the row being saved is flagged isdefault
# it visits every other row that is still flagged (presumably to clear the
# flag, per its docstring).
51 class ClusterStation(models.Model):
52 """List of cluster stations"""
# Unique name, at most 50 characters.
53 name = models.CharField(max_length=50, unique=True)
# Marks the default station; the ordering below lists "-isdefault" first.
54 isdefault = models.BooleanField(default=False, null=False)
57 ordering = ["-isdefault", "name"]
# pre_save handler: instance is the row about to be saved.
63 def update_isdefault(sender, instance, **kwargs):
64 """Clear default if needed
66 if instance.isdefault:
67 for c in ClusterStation.objects.filter(isdefault=True).all():
68 if c.id != instance.id:
# Callable default for FlowCell.cluster_station. Queries stations flagged
# isdefault first, then all stations ordered by descending id.
72 def cluster_station_default():
73 d = ClusterStation.objects.filter(isdefault=True).all()
76 d = ClusterStation.objects.order_by('-id').all()
# Run ClusterStation.update_isdefault before each ClusterStation save.
81 pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
# Django model describing a sequencer. Rows are ordered by "-isdefault",
# "-active", then "name".
# The statements after the ordering build a display string: the name followed
# by "(instrument_name)" when instrument_name is not None.
# NOTE(review): instrument_name is a CharField declared without null=True, so
# the "is not None" test may always be true -- confirm.
83 class Sequencer(models.Model):
84 """Sequencers we've owned
86 name = models.CharField(max_length=50, db_index=True)
87 instrument_name = models.CharField(max_length=50, db_index=True)
88 serial_number = models.CharField(max_length=50, db_index=True)
89 model = models.CharField(max_length=255)
90 active = models.BooleanField(default=True, null=False)
91 isdefault = models.BooleanField(default=False, null=False)
92 comment = models.CharField(max_length=255)
95 ordering = ["-isdefault", "-active", "name"]
98 name = [str(self.name)]
99 if self.instrument_name is not None:
100 name.append("(%s)" % (str(self.instrument_name),))
101 return " ".join(name)
# Returns a tuple whose first element is the view name
# 'experiments.views.sequencer'.
# NOTE(review): a tuple return presumably relies on a URL-reversing decorator
# that is not visible here -- confirm.
104 def get_absolute_url(self):
105 return ('experiments.views.sequencer',
# pre_save handler for Sequencer (connected via pre_save.connect in this
# module). When the row being saved is flagged isdefault, it visits every
# other Sequencer still flagged as default.
109 def update_isdefault(sender, instance, **kwargs):
110 """Clear default if needed
112 if instance.isdefault:
113 for s in Sequencer.objects.filter(isdefault=True).all():
114 if s.id != instance.id:
# Run Sequencer.update_isdefault before each Sequencer save.
118 pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
# Callable default for FlowCell.sequencer. Queries sequencers flagged
# isdefault first, then all sequencers ordered by 'active' and descending id.
# NOTE(review): 'active' is an ascending sort on a boolean, which would
# typically list inactive sequencers first; '-active' may have been intended
# -- confirm.
120 def sequencer_default():
121 d = Sequencer.objects.filter(isdefault=True).all()
124 d = Sequencer.objects.order_by('active', '-id').all()
# Django model for one flowcell: its identifier, run date, read
# configuration, the cluster station and sequencer it ran on, and notes.
130 class FlowCell(models.Model):
131 flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
132 run_date = models.DateTimeField()
133 advanced_run = models.BooleanField(default=False)
134 paired_end = models.BooleanField(default=False)
135 read_length = models.IntegerField(default=32) # Stanford is currently 25
136 control_lane = models.IntegerField(choices=[(1, 1),
# Both foreign keys take their default from a module-level callable.
148 cluster_station = models.ForeignKey(ClusterStation,
149 default=cluster_station_default)
150 sequencer = models.ForeignKey(Sequencer, default=sequencer_default)
152 notes = models.TextField(blank=True)
# String form of a flowcell is its flowcell_id.
155 return str(self.flowcell_id)
# Build one HTML table row per lane, in lane_number order, linking each
# library to its admin page.
# NOTE(review): values are interpolated with % formatting; no HTML escaping is
# visible in these lines -- confirm the inputs are trusted.
159 for lane in self.lane_set.order_by('lane_number'):
160 cluster_estimate = lane.cluster_estimate
161 if cluster_estimate is not None:
# NOTE(review): this module does not import division from __future__, so "/"
# is floor division on Python 2 but true division (a float) on Python 3.
162 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
164 cluster_estimate = 'None'
165 library_id = lane.library_id
166 library = lane.library
167 element = '<tr><td>%d</td>'\
168 '<td><a href="%s">%s</a></td><td>%s</td></tr>'
169 html.append(element % (lane.lane_number,
170 library.get_admin_url(),
173 html.append('</table>')
174 return "\n".join(html)
# Marks the Lanes output as HTML that should not be escaped when displayed
# (presumably for the Django admin -- confirm).
175 Lanes.allow_tags = True
# Newest runs first.
178 ordering = ["-run_date"]
# Resolve this flowcell's admin change page by reversing the URL name
# 'admin:experiments_flowcell_change'.
180 def get_admin_url(self):
181 # that's the django way... except it didn't work
182 return urlresolvers.reverse('admin:experiments_flowcell_change',
# Per its docstring, maps the boolean paired-end flag to a display name; the
# body is not visible in this listing.
185 def flowcell_type(self):
186 """Convert our boolean 'is paired' flag to a name
# Returns a tuple starting with the view name
# 'experiments.views.flowcell_detail'. The flowcell id is first split by
# parse_flowcell_id into an id and a status part.
194 def get_absolute_url(self):
195 flowcell_id, status = parse_flowcell_id(self.flowcell_id)
196 return ('experiments.views.flowcell_detail',
# The directory is settings.RESULT_HOME_DIR joined with the id part returned
# by parse_flowcell_id; the status part is not used.
199 def get_raw_data_directory(self):
200 """Return location of where the raw data is stored"""
201 flowcell_id, status = parse_flowcell_id(self.flowcell_id)
203 return os.path.join(settings.RESULT_HOME_DIR, flowcell_id)
# Walk this flowcell's raw data directory and call import_sequencing_run for
# every file whose name matches the glob 'run*.xml'.
205 def update_sequencing_runs(self):
206 result_root = self.get_raw_data_directory()
207 LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
# NOTE(review): get_raw_data_directory() returns an os.path.join() result, so
# this None check may never be true -- confirm.
208 if result_root is None:
# NOTE(review): result_home_dir is not used in the lines visible here.
211 result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
# fnmatch.translate converts the glob into a regular expression.
212 run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
215 for dirpath, dirnames, filenames in os.walk(result_root):
216 for filename in filenames:
217 if run_xml_re.match(filename):
218 # we have a run directory
# Runs are recorded by their path relative to settings.RESULT_HOME_DIR.
219 relative_pathname = get_relative_pathname(dirpath)
220 self.import_sequencing_run(relative_pathname, filename)
# Create or refresh the SequencingRun recorded for one result directory.
#
# relative_pathname -- run directory, relative to settings.RESULT_HOME_DIR
# run_xml_name      -- name of the run xml file inside that directory
# force             -- import even if the rescan delay has not elapsed
#
# Raises RuntimeError ("Too many data runs ...") -- apparently when the
# result_dir lookup matches more than one run; the condition itself is not
# visible in this listing.
222 def import_sequencing_run(self, relative_pathname, run_xml_name, force=False):
223 """Given a result directory import files"""
225 run_dir = get_absolute_pathname(relative_pathname)
226 run_xml_path = os.path.join(run_dir, run_xml_name)
# Look up runs already stored for this directory.
228 runs = SequencingRun.objects.filter(result_dir = relative_pathname)
230 run = SequencingRun()
233 raise RuntimeError("Too many data runs for %s" % (
# Import when the run is new, when forced, or when the last update is more
# than RESCAN_DELAY days old.
239 if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
240 LOGGER.debug("Importing run from %s" % (relative_pathname,))
241 run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
# NOTE(review): SequencingRun declares a field named run_status, not status,
# so this assignment may never reach the database. 'DONE' is also not among
# the RUN_STATUS_CHOICES labels visible in this listing -- confirm both.
243 run.status = RUN_STATUS_REVERSE_MAP['DONE']
244 run.result_dir = relative_pathname
245 run.runfolder_name = run_xml_data.runfolder_name
246 run.cycle_start = run_xml_data.image_analysis.start
247 run.cycle_stop = run_xml_data.image_analysis.stop
# The analysis date is rebuilt from its ordinal, giving midnight of that day,
# and then localized to settings.TIME_ZONE.
248 naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
249 run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
250 run.image_software = run_xml_data.image_analysis.software
251 run.image_version = run_xml_data.image_analysis.version
252 run.basecall_software = run_xml_data.bustard.software
253 run.basecall_version = run_xml_data.bustard.version
254 # we're frequently not running alignments
255 if run_xml_data.gerald:
256 run.alignment_software = run_xml_data.gerald.software
257 run.alignment_version = run_xml_data.gerald.version
259 run.last_update_time = timezone.now()
# Register the files found under the run's result directory.
262 run.update_result_files()
265 # FIXME: should we automatically update sequencing run?
266 # Or should we expect someone to call update_sequencing_runs?
267 #def update_flowcell_sequencingruns(sender, instance, *args, **kwargs):
268 # """Update our sequencing runs
270 # if not os.path.exists(settings.RESULT_HOME_DIR):
273 # instance.update_sequencing_runs()
274 #post_init.connect(update_flowcell_sequencingruns, sender=FlowCell)
# Lane status as (integer code, label) pairs; used as the choices of
# Lane.status.
276 LANE_STATUS_CODES = [(0, 'Failed'),
# Code -> label lookup, with None mapped to "Unknown".
280 LANE_STATUS_MAP = dict((int(k), v) for k, v in LANE_STATUS_CODES)
281 LANE_STATUS_MAP[None] = "Unknown"
# Range test for a lane number: the visible condition accepts 1 through 8
# inclusive. The returned values are not visible in this listing.
284 def is_valid_lane(value):
285 if value >= 1 and value <= 8:
# Django model for one lane of a flowcell: which library was loaded, the pM
# value, the cluster estimate, a status code and a free-text comment.
291 class Lane(models.Model):
292 flowcell = models.ForeignKey(FlowCell)
293 lane_number = models.IntegerField()
294 library = models.ForeignKey(Library)
295 pM = models.DecimalField(max_digits=5,
300 cluster_estimate = models.IntegerField(blank=True, null=True)
# Integer code; labels come from LANE_STATUS_CODES.
301 status = models.IntegerField(choices=LANE_STATUS_CODES,
304 comment = models.TextField(null=True, blank=True)
# Returns a tuple starting with the view name
# 'experiments.views.flowcell_lane_detail'.
307 def get_absolute_url(self):
308 return ('experiments.views.flowcell_lane_detail',
# Display form is "<flowcell_id>:<lane_number>".
312 return self.flowcell.flowcell_id + ':' + str(self.lane_number)
# Django model for one processed run of a flowcell: where its results live,
# when it started and was last scanned, and which software versions were used.
315 class SequencingRun(models.Model):
316 flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
317 runfolder_name = models.CharField(max_length=50)
# Result directory; FlowCell.import_sequencing_run stores the path relative to
# settings.RESULT_HOME_DIR here.
318 result_dir = models.CharField(max_length=255)
319 last_update_time = models.DateTimeField()
320 run_start_time = models.DateTimeField()
321 cycle_start = models.IntegerField(null=True, blank=True)
322 cycle_stop = models.IntegerField(null=True, blank=True)
# Integer code; labels come from RUN_STATUS_CHOICES.
323 run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
324 null=True, blank=True)
325 image_software = models.CharField(max_length=50)
326 image_version = models.CharField(max_length=50)
327 basecall_software = models.CharField(max_length=50)
328 basecall_version = models.CharField(max_length=50)
329 alignment_software = models.CharField(max_length=50)
330 alignment_version = models.CharField(max_length=50)
331 comment = models.TextField(blank=True)
# Walk the run's result directory and attach a DataFile for each file whose
# name is recognised by a FileType. Finally stamps last_update_time with the
# current time.
333 def update_result_files(self):
334 abs_result_dir = get_absolute_pathname(self.result_dir)
336 for dirname, dirnames, filenames in os.walk(abs_result_dir):
337 for filename in filenames:
338 pathname = os.path.join(dirname, filename)
339 relative_pathname = get_relative_pathname(pathname)
# Check whether this run already has a DataFile for the same relative path.
340 datafiles = self.datafile_set.filter(
342 relative_pathname=relative_pathname)
343 if len(datafiles) > 0:
# metadata is None when no FileType pattern matches the file name.
346 metadata = find_file_type_metadata_from_filename(filename)
347 if metadata is not None:
348 metadata['filename'] = filename
350 newfile.sequencing_run = self
351 newfile.file_type = metadata['file_type']
352 newfile.relative_pathname = relative_pathname
# When the file name encodes a lane, link the file to that lane's library.
# NOTE(review): lane_set.get() presumably raises if the flowcell has no such
# lane, or several -- confirm that is acceptable here.
354 lane_number = metadata.get('lane', None)
355 if lane_number is not None:
356 lane = self.flowcell.lane_set.get(
357 lane_number=lane_number)
358 newfile.library = lane.library
360 self.datafile_set.add(newfile)
362 self.last_update_time = timezone.now()
# Group this run's data files by lane: lanes[lane] is a dict mapping each
# file type's normalized_name to the DataFile. The lane comes from the 'lane'
# entry of the file's parsed attributes.
364 def lane_files(self):
367 for datafile in self.datafile_set.all():
368 metadata = datafile.attributes
369 if metadata is not None:
370 lane = metadata.get('lane', None)
372 lane_file_set = lanes.setdefault(lane, {})
373 normalized_name = datafile.file_type.normalized_name
# A later file of the same type in the same lane replaces the earlier one.
374 lane_file_set[normalized_name] = datafile
# Collect the IVC plot files of this run, keyed by file type name.
# NOTE(review): the lane argument is not used in the visible lines, and
# get_result_files() is not defined in the visible part of this class --
# confirm both.
377 def ivc_plots(self, lane):
378 ivc_name = ['IVC All', 'IVC Call',
379 'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
382 for rel_filename, metadata in self.get_result_files():
383 if metadata.file_type.name in ivc_name:
384 plots[metadata.file_type.name] = (rel_filename, metadata)
# Django model describing a kind of result file. name is the display name,
# mimetype is optional, and regex is an optional pattern matched against file
# names by parse_filename.
387 class FileType(models.Model):
388 """Represent potential file types
390 regex is a pattern used to detect if a filename matches this type
391 data run currently assumes that there may be a (?P<lane>) and
392 (?P<end>) pattern in the regular expression.
394 name = models.CharField(max_length=50)
395 mimetype = models.CharField(max_length=50, null=True, blank=True)
396 # regular expression from glob.fnmatch.translate
397 regex = models.TextField(null=True, blank=True)
# Match the file-name part of pathname against self.regex with re.match and
# return the named groups, with 'lane' and 'end' converted to int when
# present.
# NOTE(review): regex is declared null=True, and len(None) raises TypeError --
# confirm rows with a NULL regex cannot reach this method.
399 def parse_filename(self, pathname):
400 """Does filename match our pattern?
402 Returns None if not, or dictionary of match variables if we do.
404 path, filename = os.path.split(pathname)
405 if len(self.regex) > 0:
406 match = re.match(self.regex, filename)
407 if match is not None:
408 # These are (?P<>) names we know about from our
410 results = match.groupdict()
412 # convert int parameters
413 for attribute_name in ['lane', 'end']:
414 value = results.get(attribute_name, None)
415 if value is not None:
416 results[attribute_name] = int(value)
def _get_normalized_name(self):
    """Return the type name in an identifier-friendly form.

    Every space becomes an underscore and the result is lower-cased.
    """
    underscored = '_'.join(self.name.split(' '))
    return underscored.lower()
# Read-only attribute exposing the normalized form of ``name``.
normalized_name = property(_get_normalized_name)
# Body of the helper that supplies a default key for DataFile; its def line is
# not visible in this listing.
# NOTE(review): uuid.uuid1() derives its value from the host id and current
# time rather than from a random source -- confirm that is acceptable for a
# key named "random".
430 """Helper function to set default UUID in DataFile"""
431 return str(uuid.uuid1())
# Django model for one result file. random_key identifies the file in URLs
# (see get_absolute_url), and relative_pathname locates it under
# settings.RESULT_HOME_DIR.
434 class DataFile(models.Model):
435 """Store map from random ID to filename"""
436 random_key = models.CharField(max_length=64,
# Optional links to the run and library the file belongs to.
439 sequencing_run = models.ForeignKey(SequencingRun, db_index=True, null=True)
440 library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
441 file_type = models.ForeignKey(FileType)
442 relative_pathname = models.CharField(max_length=255, db_index=True)
def _get_attributes(self):
    """Return what this file's type extracts from its relative pathname.

    The result is whatever ``file_type.parse_filename`` returns.
    """
    parse = self.file_type.parse_filename
    return parse(self.relative_pathname)
# Read-only attribute: parsed file-name attributes.
attributes = property(_get_attributes)
def _get_pathname(self):
    """Return the absolute location of this file on disk."""
    relative = self.relative_pathname
    return get_absolute_pathname(relative)
# Read-only attribute: absolute path built from relative_pathname.
pathname = property(_get_pathname)
def get_absolute_url(self):
    """Return the (view name, args, kwargs) triple used to link this file.

    The file is addressed by its ``random_key``.
    """
    view_name = 'experiments.views.read_result_file'
    view_kwargs = {'key': self.random_key}
    return (view_name, (), view_kwargs)
# Try every FileType against the file-name part of pathname. When a type's
# parse_filename returns a result, the matching FileType is stored in it under
# the 'file_type' key. The return statements are not visible in this listing.
458 def find_file_type_metadata_from_filename(pathname):
459 path, filename = os.path.split(pathname)
461 for file_type in FileType.objects.all():
462 result = file_type.parse_filename(filename)
463 if result is not None:
464 result['file_type'] = file_type
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading occurrence of settings.RESULT_HOME_DIR is removed. Any
    later occurrence of the same text inside the path is left untouched.

    :param abspath: path expected to live under settings.RESULT_HOME_DIR
    :returns: the path relative to the result home directory, or abspath
        unchanged when it does not start with that directory
    """
    # Joining with '' makes the prefix end in a path separator, so a sibling
    # directory that merely shares the same leading characters is not matched.
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto settings.RESULT_HOME_DIR."""
    home = settings.RESULT_HOME_DIR
    return os.path.join(home, relative_pathname)