1 from __future__ import absolute_import, print_function, unicode_literals
10 from django.conf import settings
11 from django.core import urlresolvers
12 from django.utils import timezone
13 from django.db import models
14 from django.db.models.signals import pre_save
16 from samples.models import Library, HTSUser
17 from htsworkflow.util.conversion import parse_flowcell_id
18 from htsworkflow.pipelines import runfolder
22 LOGGER = logging.getLogger(__name__)
25 default_pM = int(settings.DEFAULT_PM)
26 except AttributeError as e:
27 LOGGER.error("invalid value for default_pm")
29 # how many days to wait before trying to re-import a runfolder
32 RESCAN_DELAY = int(settings.RESCAN_DELAY)
33 except (ValueError, AttributeError):
34 LOGGER.error("Missing or invalid settings.RESCAN_DELAY, "\
35 "defaulting to %s" % (RESCAN_DELAY,))
37 RUN_STATUS_CHOICES = (
38 (0, 'Sequencer running'), # Solexa Data Pipeline Not Yet Started'),
39 (1, 'Data Pipeline Started'),
40 (2, 'Data Pipeline Interrupted'),
41 (3, 'Data Pipeline Finished'),
42 (4, 'Collect Results Started'),
43 (5, 'Collect Results Finished'),
# Reverse lookup table: status label -> integer status code.
RUN_STATUS_REVERSE_MAP = {label: code for code, label in RUN_STATUS_CHOICES}
51 class ClusterStation(models.Model):
52 """List of cluster stations"""
53 name = models.CharField(max_length=50, unique=True)
54 isdefault = models.BooleanField(default=False, null=False)
57 ordering = ["-isdefault", "name"]
63 def update_isdefault(sender, instance, **kwargs):
64 """Clear default if needed
66 if instance.isdefault:
67 for c in ClusterStation.objects.filter(isdefault=True).all():
68 if c.id != instance.id:
72 def cluster_station_default():
73 d = ClusterStation.objects.filter(isdefault=True).all()
76 d = ClusterStation.objects.order_by('-id').all()
81 pre_save.connect(ClusterStation.update_isdefault, sender=ClusterStation)
83 class Sequencer(models.Model):
84 """Sequencers we've owned
86 name = models.CharField(max_length=50, db_index=True)
87 instrument_name = models.CharField(max_length=50, db_index=True)
88 serial_number = models.CharField(max_length=50, db_index=True)
89 model = models.CharField(max_length=255)
90 active = models.BooleanField(default=True, null=False)
91 isdefault = models.BooleanField(default=False, null=False)
92 comment = models.CharField(max_length=255)
95 ordering = ["-isdefault", "-active", "name"]
98 name = [str(self.name)]
99 if self.instrument_name is not None:
100 name.append("(%s)" % (str(self.instrument_name),))
101 return " ".join(name)
def get_absolute_url(self):
    """Return the URL produced by reversing the 'sequencer' route for this record."""
    url_kwargs = {'sequencer_id': self.id}
    return urlresolvers.reverse('sequencer', kwargs=url_kwargs)
108 def update_isdefault(sender, instance, **kwargs):
109 """Clear default if needed
111 if instance.isdefault:
112 for s in Sequencer.objects.filter(isdefault=True).all():
113 if s.id != instance.id:
117 pre_save.connect(Sequencer.update_isdefault, sender=Sequencer)
119 def sequencer_default():
120 d = Sequencer.objects.filter(isdefault=True).all()
123 d = Sequencer.objects.order_by('active', '-id').all()
129 class FlowCell(models.Model):
130 flowcell_id = models.CharField(max_length=20, unique=True, db_index=True)
131 run_date = models.DateTimeField()
132 advanced_run = models.BooleanField(default=False)
133 paired_end = models.BooleanField(default=False)
134 read_length = models.IntegerField(default=32) # Stanford is currenlty 25
135 control_lane = models.IntegerField(choices=[(1, 1),
147 cluster_station = models.ForeignKey(ClusterStation,
148 default=cluster_station_default)
149 sequencer = models.ForeignKey(Sequencer, default=sequencer_default)
151 notes = models.TextField(blank=True)
154 return str(self.flowcell_id)
158 for lane in self.lane_set.order_by('lane_number'):
159 cluster_estimate = lane.cluster_estimate
160 if cluster_estimate is not None:
161 cluster_estimate = "%s k" % ((int(cluster_estimate) / 1000), )
163 cluster_estimate = 'None'
164 library_id = lane.library_id
165 library = lane.library
166 element = '<tr><td>%d</td>'\
167 '<td><a href="%s">%s</a></td><td>%s</td></tr>'
168 html.append(element % (lane.lane_number,
169 library.get_admin_url(),
172 html.append('</table>')
173 return "\n".join(html)
174 Lanes.allow_tags = True
177 ordering = ["-run_date"]
179 def get_admin_url(self):
180 # that's the django way... except it didn't work
181 return urlresolvers.reverse('admin:experiments_flowcell_change',
184 def flowcell_type(self):
185 """Convert our boolean 'is paired' flag to a name
def get_absolute_url(self):
    """Return the URL produced by reversing 'flowcell_detail' for this flowcell.

    Only the id component returned by parse_flowcell_id is used; the
    second element of its result is ignored.
    """
    bare_id, _status = parse_flowcell_id(self.flowcell_id)
    url_args = [str(bare_id)]
    return urlresolvers.reverse('flowcell_detail', args=url_args)
def get_raw_data_directory(self):
    """Return the directory under settings.RESULT_HOME_DIR for this flowcell.

    The directory name is the id component that parse_flowcell_id
    extracts from flowcell_id.
    """
    bare_id, _status = parse_flowcell_id(self.flowcell_id)
    results_root = settings.RESULT_HOME_DIR
    return os.path.join(results_root, bare_id)
202 def update_sequencing_runs(self):
203 result_root = self.get_raw_data_directory()
204 LOGGER.debug("Update data runs flowcell root: %s" % (result_root,))
205 if result_root is None:
208 result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
209 run_xml_re = re.compile(glob.fnmatch.translate('run*.xml'))
212 for dirpath, dirnames, filenames in os.walk(result_root):
213 for filename in filenames:
214 if run_xml_re.match(filename):
215 # we have a run directory
216 relative_pathname = get_relative_pathname(dirpath)
217 self.import_sequencing_run(relative_pathname, filename)
def import_sequencing_run(self, relative_pathname, run_xml_name, force=False):
    """Given a result directory import files

    Creates the SequencingRun for ``relative_pathname`` if none exists,
    otherwise refreshes the existing one from its run xml file.

    :param relative_pathname: run directory, relative to
        settings.RESULT_HOME_DIR.
    :param run_xml_name: name of the run xml file inside that directory.
    :param force: re-import even if the run was updated recently.
    :raises RuntimeError: if more than one SequencingRun already refers
        to ``relative_pathname``.

    An existing run is only re-read when ``force`` is set or its last
    update is more than RESCAN_DELAY days old.
    """
    now = timezone.now()
    run_dir = get_absolute_pathname(relative_pathname)
    run_xml_path = os.path.join(run_dir, run_xml_name)

    runs = SequencingRun.objects.filter(result_dir=relative_pathname)
    if len(runs) > 1:
        raise RuntimeError("Too many data runs for %s" % (
            relative_pathname,))
    if len(runs) == 0:
        run = SequencingRun()
        created = True
    else:
        run = runs[0]
        created = False

    # "created" is tested first: a new run has no last_update_time yet.
    stale = created or force or \
        (now - run.last_update_time).days > RESCAN_DELAY
    if not stale:
        return

    LOGGER.debug("Importing run from %s" % (relative_pathname,))
    run_xml_data = runfolder.load_pipeline_run_xml(run_xml_path)
    run.flowcell = self
    # The model field is named run_status; assigning to run.status only
    # created an unsaved instance attribute, so the status was lost.
    run.run_status = RUN_STATUS_REVERSE_MAP['DONE']
    run.result_dir = relative_pathname
    run.runfolder_name = run_xml_data.runfolder_name
    run.cycle_start = run_xml_data.image_analysis.start
    run.cycle_stop = run_xml_data.image_analysis.stop
    # The run xml only supplies a date, so build a naive midnight
    # datetime and localize it to the configured time zone.
    naive_run_start_time = datetime.datetime.fromordinal(
        run_xml_data.image_analysis.date.toordinal())
    run.run_start_time = pytz.timezone(settings.TIME_ZONE).localize(
        naive_run_start_time)
    run.image_software = run_xml_data.image_analysis.software
    run.image_version = run_xml_data.image_analysis.version
    run.basecall_software = run_xml_data.bustard.software
    run.basecall_version = run_xml_data.bustard.version
    # we're frequently not running alignments
    if run_xml_data.gerald:
        run.alignment_software = run_xml_data.gerald.software
        run.alignment_version = run_xml_data.gerald.version

    run.last_update_time = timezone.now()
    run.save()

    run.update_result_files()
262 # FIXME: should we automatically update sequencing run?
263 # Or should we expect someone to call update_sequencing_runs?
264 #def update_flowcell_sequencingruns(sender, instance, *args, **kwargs):
265 # """Update our sequencing rungs
267 # if not os.path.exists(settings.RESULT_HOME_DIR):
270 # instance.update_sequencing_runs()
271 #post_init.connect(update_flowcell_sequencingruns, sender=FlowCell)
273 LANE_STATUS_CODES = [(0, 'Failed'),
# Map each integer lane status code to its label; a lane with no
# status recorded (None) is reported as "Unknown".
LANE_STATUS_MAP = {int(code): label for code, label in LANE_STATUS_CODES}
LANE_STATUS_MAP[None] = "Unknown"
def is_valid_lane(value):
    """Return True when value lies in the range 1 through 8, else False."""
    return 1 <= value <= 8
288 class Lane(models.Model):
289 flowcell = models.ForeignKey(FlowCell)
290 lane_number = models.IntegerField()
291 library = models.ForeignKey(Library)
292 pM = models.DecimalField(max_digits=5,
297 cluster_estimate = models.IntegerField(blank=True, null=True)
298 status = models.IntegerField(choices=LANE_STATUS_CODES,
301 comment = models.TextField(null=True, blank=True)
def get_absolute_url(self):
    """Return the URL produced by reversing 'flowcell_lane_detail' for this lane."""
    url_kwargs = {'lane_pk': str(self.id)}
    return urlresolvers.reverse('flowcell_lane_detail', kwargs=url_kwargs)
308 return self.flowcell.flowcell_id + ':' + str(self.lane_number)
311 class SequencingRun(models.Model):
312 flowcell = models.ForeignKey(FlowCell, verbose_name="Flowcell Id")
313 runfolder_name = models.CharField(max_length=50)
314 result_dir = models.CharField(max_length=255)
315 last_update_time = models.DateTimeField()
316 run_start_time = models.DateTimeField()
317 cycle_start = models.IntegerField(null=True, blank=True)
318 cycle_stop = models.IntegerField(null=True, blank=True)
319 run_status = models.IntegerField(choices=RUN_STATUS_CHOICES,
320 null=True, blank=True)
321 image_software = models.CharField(max_length=50)
322 image_version = models.CharField(max_length=50)
323 basecall_software = models.CharField(max_length=50)
324 basecall_version = models.CharField(max_length=50)
325 alignment_software = models.CharField(max_length=50)
326 alignment_version = models.CharField(max_length=50)
327 comment = models.TextField(blank=True)
329 def update_result_files(self):
330 abs_result_dir = get_absolute_pathname(self.result_dir)
332 for dirname, dirnames, filenames in os.walk(abs_result_dir):
333 for filename in filenames:
334 pathname = os.path.join(dirname, filename)
335 relative_pathname = get_relative_pathname(pathname)
336 datafiles = self.datafile_set.filter(
338 relative_pathname=relative_pathname)
339 if len(datafiles) > 0:
342 metadata = find_file_type_metadata_from_filename(filename)
343 if metadata is not None:
344 metadata['filename'] = filename
346 newfile.sequencing_run = self
347 newfile.file_type = metadata['file_type']
348 newfile.relative_pathname = relative_pathname
350 lane_number = metadata.get('lane', None)
351 if lane_number is not None:
352 lane = self.flowcell.lane_set.get(
353 lane_number=lane_number)
354 newfile.library = lane.library
356 self.datafile_set.add(newfile)
358 self.last_update_time = timezone.now()
360 def lane_files(self):
363 for datafile in self.datafile_set.all():
364 metadata = datafile.attributes
365 if metadata is not None:
366 lane = metadata.get('lane', None)
368 lane_file_set = lanes.setdefault(lane, {})
369 normalized_name = datafile.file_type.normalized_name
370 lane_file_set[normalized_name] = datafile
373 def ivc_plots(self, lane):
374 ivc_name = ['IVC All', 'IVC Call',
375 'IVC Percent Base', 'IVC Percent All', 'IVC Percent Call']
378 for rel_filename, metadata in self.get_result_files():
379 if metadata.file_type.name in ivc_name:
380 plots[metadata.file_type.name] = (rel_filename, metadata)
class FileType(models.Model):
    """Represent potential file types

    regex is a pattern used to detect if a filename matches this type
    data run currently assumes that there may be a (?P<lane>) and
    (?P<end>) pattern in the regular expression.
    """
    name = models.CharField(max_length=50)
    mimetype = models.CharField(max_length=50, null=True, blank=True)
    # regular expression from glob.fnmatch.translate
    regex = models.TextField(null=True, blank=True)

    def parse_filename(self, pathname):
        """Does filename match our pattern?

        Only the final path component of pathname is matched.

        Returns None if not, or dictionary of match variables if we do.
        A file type whose regex is unset (None) or empty never matches.
        """
        # regex is nullable, so test truthiness instead of len(), which
        # raises TypeError on None.
        if not self.regex:
            return None

        filename = os.path.basename(pathname)
        match = re.match(self.regex, filename)
        if match is None:
            return None

        # These are the (?P<>) group names captured by the pattern.
        results = match.groupdict()

        # convert int parameters
        for attribute_name in ('lane', 'end'):
            value = results.get(attribute_name)
            if value is not None:
                results[attribute_name] = int(value)

        return results

    def _get_normalized_name(self):
        """Crush data file name into identifier friendly name"""
        return self.name.replace(' ', '_').lower()
    normalized_name = property(_get_normalized_name)
426 """Helper function to set default UUID in DataFile"""
427 return str(uuid.uuid1())
430 class DataFile(models.Model):
431 """Store map from random ID to filename"""
432 random_key = models.CharField(max_length=64,
435 sequencing_run = models.ForeignKey(SequencingRun, db_index=True, null=True)
436 library = models.ForeignKey(Library, db_index=True, null=True, blank=True)
437 file_type = models.ForeignKey(FileType)
438 relative_pathname = models.CharField(max_length=255, db_index=True)
def _get_attributes(self):
    """Return what this file's FileType parses out of relative_pathname."""
    file_type = self.file_type
    return file_type.parse_filename(self.relative_pathname)
attributes = property(_get_attributes)
def _get_pathname(self):
    """Return relative_pathname resolved through get_absolute_pathname."""
    relative = self.relative_pathname
    return get_absolute_pathname(relative)
pathname = property(_get_pathname)
def get_absolute_url(self):
    """Return the URL produced by reversing 'read_result_file' for this file.

    The file is identified in the URL by its random_key.
    """
    # reverse() takes (viewname, urlconf, args, kwargs): passing the
    # dict positionally after an empty tuple put it in ``args`` and the
    # tuple in ``urlconf``. Pass it by keyword, as the other
    # get_absolute_url methods in this module do.
    return urlresolvers.reverse('read_result_file',
                                kwargs={'key': self.random_key})
def find_file_type_metadata_from_filename(pathname):
    """Return parse results from the first FileType matching pathname.

    Each FileType is asked to parse the final component of pathname.
    The first non-None result is returned with the matching FileType
    stored under the 'file_type' key; None is returned if none match.
    """
    filename = os.path.split(pathname)[1]

    for candidate in FileType.objects.all():
        metadata = candidate.parse_filename(filename)
        if metadata is None:
            continue
        metadata['file_type'] = candidate
        return metadata

    return None
def get_relative_pathname(abspath):
    """Strip off the result home directory from a path

    Only a leading settings.RESULT_HOME_DIR prefix is removed. A path
    that does not start with that directory is returned unchanged.
    """
    # Joining with '' guarantees a trailing separator, so a sibling
    # directory sharing the same name prefix is not treated as a match.
    result_home_dir = os.path.join(settings.RESULT_HOME_DIR, '')
    # str.replace would also delete the directory name wherever else it
    # occurs inside the path; strip it from the front only.
    if abspath.startswith(result_home_dir):
        return abspath[len(result_home_dir):]
    return abspath
def get_absolute_pathname(relative_pathname):
    """Return relative_pathname joined onto settings.RESULT_HOME_DIR."""
    results_root = settings.RESULT_HOME_DIR
    return os.path.join(results_root, relative_pathname)