1 from __future__ import absolute_import, print_function, unicode_literals
10 from django.conf import settings
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import pre_save
16 from samples.models import Library, HTSUser
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
22 LOGGER = logging.getLogger(__name__)
25 default_pM = int(settings.DEFAULT_PM)
26 except AttributeError as e:
27 LOGGER.error("invalid value for default_pm")
29 # how many days to wait before trying to re-import a runfolder
32 RESCAN_DELAY = int(settings.RESCAN_DELAY)
33 except (ValueError, AttributeError):
34 LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
35 "defaulting to %s" % (RESCAN_DELAY,))
37 RUN_STATUS_CHOICES = (
38 (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'),
39 (1, 'Data Pipeline Started'),
40 (2, 'Data Pipeline Interrupted'),
41 (3, 'Data Pipeline Finished'),
42 (4, 'Collect Results Started'),
43 (5, 'Collect Results Finished'),
# Reverse lookup table: status label -> numeric status code.
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
51 class ClusterStation(models.Model):
52 """List of cluster stations"""
53 name = models.CharField(max_length=50, unique=True)
54 isdefault = models.BooleanField(default=False, null=False)
57 ordering = ["-isdefault", "name"]
63 def update_isdefault(sender, instance, **kwargs):
64 """Clear default if needed
66 if instance.isdefault:
67 for c in ClusterStation.objects.filter(isdefault=True).all():
68 if c.id != instance.id:
72 def cluster_station_default():
73 d = ClusterStation.objects.filter(isdefault=True).all()
76 d = ClusterStation.objects.order_by('-id').all()
# Register ClusterStation.update_isdefault as a pre_save receiver, limited to
# signals sent by the ClusterStation model.
pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
83 class Sequencer(models.Model):
84 """Sequencers we've owned
86 name = models.CharField(max_length=50, db_index=True)
87 instrument_name = models.CharField(max_length=50, db_index=True)
88 serial_number = models.CharField(max_length=50, db_index=True)
89 model = models.CharField(max_length=255)
90 active = models.BooleanField(default=True, null=False)
91 isdefault = models.BooleanField(default=False, null=False)
92 comment = models.CharField(max_length=255)
95 ordering = ["-isdefault", "-active", "name"]
98 name = [str(self.name)]
99 if self.instrument_name is not None:
100 name.append("(%s)" % (str(self.instrument_name),))
101 return " ".join(name)
def get_absolute_url(self):
    """Return the URL of the 'sequencer' view for this sequencer."""
    url_kwargs = {'sequencer_id': self.id}
    return urlresolvers.reverse('sequencer', kwargs=url_kwargs)
108 def update_isdefault(sender, instance, **kwargs):
109 """Clear default if needed
111 if instance.isdefault:
112 for s in Sequencer.objects.filter(isdefault=True).all():
113 if s.id != instance.id:
# Register Sequencer.update_isdefault as a pre_save receiver, limited to
# signals sent by the Sequencer model.
pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
119 def sequencer_default():
120 d = Sequencer.objects.filter(isdefault=True).all()
123 d = Sequencer.objects.order_by('active', '-id').all()
129 class FlowCell(models.Model):
130 flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
131 run_date = models.DateTimeField()
132 advanced_run = models.BooleanField(default=False)
133 paired_end = models.BooleanField(default=False)
134 read_length = models.IntegerField(default=32) # Stanford is currenlty 25
135 control_lane = models.IntegerField(choices=[(1, 1),
147 cluster_station = models.ForeignKey(ClusterStation,
148 default=cluster_station_default)
149 sequencer = models.ForeignKey(Sequencer, default=sequencer_default)
151 notes = models.TextField(blank=True)
154 return str(self.flowcell_id)
158 for lane in self.lane_set.order_by('lane_number'):
159 cluster_estimate = lane.cluster_estimate
160 if cluster_estimate is not None:
161 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
163 cluster_estimate = 'None'
164 library_id = lane.library_id
165 library = lane.library
166 element = '<tr><td>%d</td>'\
167 '<td><a href="%s">%s</a></td><td>%s</td></tr>'
168 html.append(element % (lane.lane_number,
169 library.get_admin_url(),
172 html.append('</table>')
173 return "\n".join(html)
174 Lanes.allow_tags = True
177 ordering = ["-run_date"]
179 def get_admin_url(self):
180 # that's the django way... except it didn't work
181 return urlresolvers.reverse('admin:experiments_flowcell_change',
184 def flowcell_type(self):
185 """Convert our boolean 'is paired' flag to a name
def get_absolute_url(self):
    """Return the URL of the 'flowcell_detail' view for this flowcell.

    Only the first value returned by parse_flowcell_id is used in the URL;
    the second value is ignored.
    """
    parsed_id, _status = parse_flowcell_id(self.flowcell_id)
    url_args = [str(parsed_id)]
    return urlresolvers.reverse('flowcell_detail', args=url_args)
def get_raw_data_directory(self):
    """Return the directory under settings.RESULT_HOME_DIR for this flowcell.

    The directory name is the first value returned by parse_flowcell_id;
    the second value is not used here.
    """
    parsed_id, _status = parse_flowcell_id(self.flowcell_id)
    result_home = settings.RESULT_HOME_DIR
    return os.path.join(result_home, parsed_id)
202 def update_sequencing_runs(self):
203 result_root = self.get_raw_data_directory()
204 LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
205 if result_root is None:
208 run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
210 for dirpath, dirnames, filenames in os.walk(result_root):
211 for filename in filenames:
212 if run_xml_re.match(filename):
213 # we have a run directory
214 relative_pathname = get_relative_pathname(dirpath)
215 self.import_sequencing_run(relative_pathname, filename)
217 def import_sequencing_run(self, relative_pathname, run_xml_name, force=False):
218 """Given a result directory import files"""
220 run_dir = get_absolute_pathname(relative_pathname)
221 run_xml_path = os.path.join(run_dir, run_xml_name)
223 runs = SequencingRun.objects.filter(result_dir = relative_pathname)
225 run = SequencingRun()
228 raise RuntimeError("Too many data runs for %s" % (
234 if created or force or (now-run.last_update_time).days > RESCAN_DELAY:
235 LOGGER.debug("Importing run from %s" % (relative_pathname,))
236 run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
238 run.status = RUN_STATUS_REVERSE_MAP['DONE']
239 run.result_dir = relative_pathname
240 run.runfolder_name = run_xml_data.runfolder_name
241 run.cycle_start = run_xml_data.image_analysis.start
242 run.cycle_stop = run_xml_data.image_analysis.stop
243 naive_run_start_time = datetime.datetime.fromordinal(run_xml_data.image_analysis.date.toordinal())
244 run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(naive_run_start_time)
245 run.image_software = run_xml_data.image_analysis.software
246 run.image_version = run_xml_data.image_analysis.version
247 run.basecall_software = run_xml_data.bustard.software
248 run.basecall_version = run_xml_data.bustard.version
249 # we're frequently not running alignments
250 if run_xml_data.gerald:
251 run.alignment_software = run_xml_data.gerald.software
252 run.alignment_version = run_xml_data.gerald.version
254 run.last_update_time = timezone.now()
257 run.update_result_files()
260 # FIXME: should we automatically update sequencing run?
261 # Or should we expect someone to call update_sequencing_runs?
262 #def update_flowcell_sequencingruns(sender, instance, *args, **kwargs):
263 # """Update our sequencing runs
265 # if not os.path.exists(settings.RESULT_HOME_DIR):
268 # instance.update_sequencing_runs()
269 #post_init.connect(update_flowcell_sequencingruns, sender=FlowCell)
271 LANE_STATUS_CODES = [(0, 'Failed'),
# Lookup table: integer lane status code -> label.
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
# A status of None is reported as "Unknown".
LANE_STATUS_MAP[None] = "Unknown"
279 def is_valid_lane(value):
280 if value >= 1 and value <= 8:
286 class Lane(models.Model):
288 ordering = ['-flowcell__run_date', '-library__id']
290 flowcell = models.ForeignKey(FlowCell)
291 lane_number = models.IntegerField()
292 library = models.ForeignKey(Library)
293 pM = models.DecimalField(max_digits=5,
298 cluster_estimate = models.IntegerField(blank=True, null=True)
299 status = models.IntegerField(choices=LANE_STATUS_CODES,
302 comment = models.TextField(null=True, blank=True)
def get_absolute_url(self):
    """Return the URL of the 'flowcell_lane_detail' view for this lane."""
    lane_key = str(self.id)
    return urlresolvers.reverse('flowcell_lane_detail',
                                kwargs={'lane_pk': lane_key})
309 return self.flowcell.flowcell_id + ':' + str(self.lane_number)
312 class SequencingRun(models.Model):
313 flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
314 runfolder_name = models.CharField(max_length=50)
315 result_dir = models.CharField(max_length=255)
316 last_update_time = models.DateTimeField()
317 run_start_time = models.DateTimeField()
318 cycle_start = models.IntegerField(null=True, blank=True)
319 cycle_stop = models.IntegerField(null=True, blank=True)
320 run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
321 null=True, blank=True)
322 image_software = models.CharField(max_length=50)
323 image_version = models.CharField(max_length=50)
324 basecall_software = models.CharField(max_length=50)
325 basecall_version = models.CharField(max_length=50)
326 alignment_software = models.CharField(max_length=50)
327 alignment_version = models.CharField(max_length=50)
328 comment = models.TextField(blank=True)
330 def update_result_files(self):
331 abs_result_dir = get_absolute_pathname(self.result_dir)
333 for dirname, dirnames, filenames in os.walk(abs_result_dir):
334 for filename in filenames:
335 pathname = os.path.join(dirname, filename)
336 relative_pathname = get_relative_pathname(pathname)
337 datafiles = self.datafile_set.filter(
339 relative_pathname=relative_pathname)
340 if len(datafiles) > 0:
343 metadata = find_file_type_metadata_from_filename(filename)
344 if metadata is not None:
345 metadata['filename'] = filename
347 newfile.sequencing_run = self
348 newfile.file_type = metadata['file_type']
349 newfile.relative_pathname = relative_pathname
351 lane_number = metadata.get('lane', None)
352 if lane_number is not None:
353 lane = self.flowcell.lane_set.get(
354 lane_number=lane_number)
355 newfile.library = lane.library
358 self.datafile_set.add(newfile)
360 self.last_update_time = timezone.now()
362 def lane_files(self):
365 for datafile in self.datafile_set.all():
366 metadata = datafile.attributes
367 if metadata is not None:
368 lane = metadata.get('lane', None)
370 lane_file_set = lanes.setdefault(lane, {})
371 normalized_name = datafile.file_type.normalized_name
372 lane_file_set[normalized_name] = datafile
375 def ivc_plots(self, lane):
376 ivc_name = ['IVC All', 'IVC Call',
377 'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
380 for rel_filename, metadata in self.get_result_files():
381 if metadata.file_type.name in ivc_name:
382 plots[metadata.file_type.name] = (rel_filename, metadata)
385 class FileType(models.Model):
386 """Represent potential file types
388 regex is a pattern used to detect if a filename matches this type
389 data run currently assumes that there may be a (?P<lane>) and
390 (?P<end>) pattern in the regular expression.
392 name = models.CharField(max_length=50)
393 mimetype = models.CharField(max_length=50, null=True, blank=True)
394 # regular expression from glob.fnmatch.translate
395 regex = models.TextField(null=True, blank=True)
397 def parse_filename(self, pathname):
398 """Does filename match our pattern?
400 Returns None if not, or dictionary of match variables if we do.
402 path, filename = os.path.split(pathname)
403 if len(self.regex) > 0:
404 match = re.match(self.regex, filename)
405 if match is not None:
406 # These are (?P<>) names we know about from our
408 results = match.groupdict()
410 # convert int parameters
411 for attribute_name in ['lane', 'end']:
412 value = results.get(attribute_name, None)
413 if value is not None:
414 results[attribute_name] = int(value)
def _get_normalized_name(self):
    """Return the type name with spaces replaced by '_' and lowercased."""
    underscored = self.name.replace(' ', '_')
    return underscored.lower()
# Read-only attribute access to the normalized name.
normalized_name = property(_get_normalized_name)
428 """Helper function to set default UUID in DataFile"""
429 return str(uuid.uuid1())
432 class DataFile(models.Model):
433 """Store map from random ID to filename"""
434 random_key = models.CharField(max_length=64,
437 sequencing_run = models.ForeignKey(SequencingRun, db_index=True, null=True)
438 library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
439 file_type = models.ForeignKey(FileType)
440 relative_pathname = models.CharField(max_length=255, db_index=True)
def _get_attributes(self):
    """Return what this file's FileType parses out of its relative pathname."""
    file_type = self.file_type
    return file_type.parse_filename(self.relative_pathname)
# Read-only attribute access to the parsed filename attributes.
attributes = property(_get_attributes)
def _get_pathname(self):
    """Return the absolute path for this file's relative pathname."""
    relative = self.relative_pathname
    return get_absolute_pathname(relative)
# Read-only attribute access to the absolute pathname.
pathname = property(_get_pathname)
def get_absolute_url(self):
    """Return the URL of the 'read_result_file' view for this data file.

    The file's random_key is passed as the 'key' keyword argument of the
    URL pattern.
    """
    # reverse() takes (viewname, urlconf, args, kwargs); the URL parameters
    # must be passed by keyword, otherwise an empty tuple is taken as the
    # urlconf and the dictionary as the positional args.
    return urlresolvers.reverse('read_result_file',
                                kwargs={'key': self.random_key})
454 def find_file_type_metadata_from_filename(pathname):
455 path, filename = os.path.split(pathname)
457 for file_type in FileType.objects.all():
458 result = file_type.parse_filename(filename)
459 if result is not None:
460 result['file_type'] = file_type
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR (with its trailing separator)
    is removed.  A path that does not start with the result home directory
    is returned unchanged.
    """
    # Joining with '' guarantees the prefix ends with a path separator.
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # Remove the prefix only; str.replace would also delete any later
    # occurrence of the same text inside the path.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
def get_absolute_pathname(relative_pathname):
    """Join relative_pathname onto settings.RESULT_HOME_DIR and return it."""
    result_home = settings.RESULT_HOME_DIR
    return os.path.join(result_home, relative_pathname)