Merge lp:~codehelp/lava-scheduler/reserved-boards into lp:lava-scheduler

Proposed by Neil Williams
Status: Merged
Approved by: Neil Williams
Approved revision: 260
Merged at revision: 257
Proposed branch: lp:~codehelp/lava-scheduler/reserved-boards
Merge into: lp:lava-scheduler
Prerequisite: lp:~stylesen/lava-scheduler/fix-worker-multinode-error
Diff against target: 237 lines (+53/-22)
6 files modified
lava_scheduler_app/admin.py (+1/-1)
lava_scheduler_app/api.py (+2/-3)
lava_scheduler_app/models.py (+25/-8)
lava_scheduler_app/views.py (+4/-3)
lava_scheduler_daemon/dbjobsource.py (+20/-6)
lava_scheduler_daemon/service.py (+1/-1)
To merge this branch: bzr merge lp:~codehelp/lava-scheduler/reserved-boards
Reviewer: Antonio Terceiro
Status: Approve
Review via email: mp+183496@code.launchpad.net

Description of the change

Add a new RESERVED device status and stop _fix_device from overloading RUNNING during the window after submission but before a MultiNode job actually starts running.

There is a separate fix for the regression in #1202285, but that needs to go in after this change, as the #1202285 fix makes this bug harder to reproduce without actually fixing it.
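
For anyone reading this proposal without the scheduler code to hand, the sketch below summarises the intended status flow. It is not part of the diff: the integer values mirror the Device model in lava_scheduler_app/models.py (RESERVED = 5 is the value this branch adds), and the three helper functions are illustrative stand-ins for _fix_device, getJobDetails_impl and Device.cancel_reserved_status, not code from the branch.

# Illustrative sketch only -- status values follow lava_scheduler_app.models.Device;
# the helpers are hypothetical stand-ins for the real scheduler methods.
OFFLINE, IDLE, RUNNING, OFFLINING, RETIRED, RESERVED = 0, 1, 2, 3, 4, 5


def reserve(device_status):
    """_fix_device: a submitted (MultiNode) job grabs a board."""
    if device_status == RUNNING:
        # The board is busy with another job; do not overload RUNNING,
        # leave the submitted job waiting instead.
        return None
    return RESERVED


def start(device_status):
    """getJobDetails_impl: the job (single node or multi node) starts to run."""
    return RUNNING if device_status == RESERVED else device_status


def cancel_before_start(device_status):
    """Device.cancel_reserved_status: a cancelled SUBMITTED job frees the board."""
    return IDLE if device_status == RESERVED else device_status


if __name__ == "__main__":
    status = reserve(IDLE)                         # IDLE -> RESERVED at submission
    assert status == RESERVED
    assert start(status) == RUNNING                # RESERVED -> RUNNING when the job starts
    assert cancel_before_start(RESERVED) == IDLE   # cancel before start releases the board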

259. By Neil Williams

Ensure the device transitions to Running once the job
(single node or multi node) starts to run.

260. By Neil Williams

Ensure there is an actual device before trying to check it.
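
Taken together, these two revisions amount to the checks sketched below. This is an illustrative condensation of the lava_scheduler_daemon/dbjobsource.py and lava_scheduler_daemon/service.py hunks in the preview diff, not the literal daemon code, and the helper names here are mine; see the diff for the real context.

# Condensed, illustrative form of the checks added in revisions 259 and 260.
from lava_scheduler_app.models import Device, DeviceStateTransition


def mark_device_running(job):
    """Rev 259: move a RESERVED board to RUNNING once its job actually starts."""
    device = job.actual_device
    if device.status == Device.RESERVED:
        DeviceStateTransition.objects.create(
            created_by=None, device=device, old_state=device.status,
            new_state=Device.RUNNING, message=None, job=job)
        device.status = Device.RUNNING
        device.current_job = job
        device.save()


def board_is_schedulable(job, configured_boards):
    """Rev 260: only dispatch a job which already has a device assigned."""
    return (job.actual_device is not None
            and job.actual_device.hostname in configured_boards)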

Revision history for this message
Antonio Terceiro (terceiro) wrote:

Looks good

review: Approve

Preview Diff

=== modified file 'lava_scheduler_app/admin.py'
--- lava_scheduler_app/admin.py 2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/admin.py 2013-09-02 18:15:38 +0000
@@ -8,7 +8,7 @@
 
 
 def offline_action(modeladmin, request, queryset):
-    for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING]):
+    for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING, Device.RESERVED]):
         if device.can_admin(request.user):
             device.put_into_maintenance_mode(request.user, "admin action")
 offline_action.short_description = "take offline"
=== modified file 'lava_scheduler_app/api.py'
--- lava_scheduler_app/api.py 2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/api.py 2013-09-02 18:15:38 +0000
@@ -2,7 +2,6 @@
 from simplejson import JSONDecodeError
 from django.db.models import Count
 from linaro_django_xmlrpc.models import ExposedAPI
-from lava_scheduler_app import utils
 from lava_scheduler_app.models import (
     Device,
     DeviceType,
@@ -165,8 +164,8 @@
             .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
                       offline=SumIf('device', condition='status in (%s,%s)'
                                     % (Device.OFFLINE, Device.OFFLINING)),
-                      busy=SumIf('device', condition='status=%s'
-                                 % Device.RUNNING), ).order_by('name')
+                      busy=SumIf('device', condition='status in (%s,%s)'
+                                 % (Device.RUNNING, Device.RESERVED)), ).order_by('name')
 
         for dev_type in device_types:
             device_type = {}
=== modified file 'lava_scheduler_app/models.py'
--- lava_scheduler_app/models.py 2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/models.py 2013-09-02 18:15:38 +0000
@@ -51,19 +51,20 @@
 
 
 def check_device_availability(requested_devices):
-    """Checks whether the number of devices requested is available.
+    """Checks whether the number of devices requested is available for a multinode job.
 
     See utils.requested_device_count() for details of REQUESTED_DEVICES
     dictionary format.
 
-    Returns True if the requested number of devices are available, else
-    raises DevicesUnavailableException.
+    Returns True for singlenode or if the requested number of devices are available
+    for the multinode job, else raises DevicesUnavailableException.
     """
     device_types = DeviceType.objects.values_list('name').filter(
-        models.Q(device__status=Device.IDLE) | \
-        models.Q(device__status=Device.RUNNING)
+        models.Q(device__status=Device.IDLE) |
+        models.Q(device__status=Device.RUNNING) |
+        models.Q(device__status=Device.RESERVED)
     ).annotate(
         num_count=models.Count('name')
     ).order_by('name')
 
     if requested_devices:
@@ -115,6 +116,7 @@
     RUNNING = 2
     OFFLINING = 3
     RETIRED = 4
+    RESERVED = 5
 
     STATUS_CHOICES = (
         (OFFLINE, 'Offline'),
@@ -122,6 +124,7 @@
         (RUNNING, 'Running'),
         (OFFLINING, 'Going offline'),
         (RETIRED, 'Retired'),
+        (RESERVED, 'Reserved')
     )
 
     # A device health shows a device is ready to test or not
@@ -201,7 +204,7 @@
         return user.has_perm('lava_scheduler_app.change_device')
 
     def put_into_maintenance_mode(self, user, reason):
-        if self.status in [self.RUNNING, self.OFFLINING]:
+        if self.status in [self.RUNNING, self.RESERVED, self.OFFLINING]:
             new_status = self.OFFLINING
         else:
             new_status = self.OFFLINE
@@ -236,6 +239,16 @@
         self.health_status = Device.HEALTH_LOOPING
         self.save()
 
+    def cancel_reserved_status(self, user, reason):
+        if self.status != Device.RESERVED:
+            return
+        new_status = self.IDLE
+        DeviceStateTransition.objects.create(
+            created_by=user, device=self, old_state=self.status,
+            new_state=new_status, message=reason, job=None).save()
+        self.status = new_status
+        self.save()
+
 
 class JobFailureTag(models.Model):
     """
@@ -324,7 +337,7 @@
 
     tags = models.ManyToManyField(Tag, blank=True)
 
-    # This is set once the job starts.
+    # This is set once the job starts or is reserved.
     actual_device = models.ForeignKey(
         Device, null=True, default=None, related_name='+', blank=True)
 
@@ -598,6 +611,10 @@
         return self._can_admin(user) and self.status in states
 
     def cancel(self):
+        # if SUBMITTED with actual_device - clear the actual_device back to idle.
+        if self.status == TestJob.SUBMITTED and self.actual_device is not None:
+            device = Device.objects.get(hostname=self.actual_device)
+            device.cancel_reserved_status(self.submitter, "multinode-cancel")
         if self.status == TestJob.RUNNING:
             self.status = TestJob.CANCELING
         else:
=== modified file 'lava_scheduler_app/views.py'
--- lava_scheduler_app/views.py 2013-08-28 15:13:07 +0000
+++ lava_scheduler_app/views.py 2013-09-02 18:15:38 +0000
@@ -371,7 +371,8 @@
         .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
                   offline=SumIf('device', condition='status in (%s,%s)' %
                                 (Device.OFFLINE, Device.OFFLINING)),
-                  busy=SumIf('device', condition='status=%s' % Device.RUNNING),).order_by('name')
+                  busy=SumIf('device', condition='status in (%s,%s)' %
+                             (Device.RUNNING, Device.RESERVED)),).order_by('name')
 
     def render_status(self, record):
         return "%s idle, %s offline, %s busy" % (record.idle,
@@ -535,7 +536,7 @@
                 'health_jobs', reverse(health_jobs_json, kwargs=dict(pk=pk)),
                 params=(device,)),
             'show_maintenance': device.can_admin(request.user) and
-                device.status in [Device.IDLE, Device.RUNNING],
+                device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
             'show_online': device.can_admin(request.user) and
                 device.status in [Device.OFFLINE, Device.OFFLINING],
             'bread_crumb_trail': BreadCrumbTrail.leading_to(health_job_list, pk=pk),
@@ -993,7 +994,7 @@
                 'jobs', reverse(recent_jobs_json, kwargs=dict(pk=device.pk)),
                 params=(device,)),
             'show_maintenance': device.can_admin(request.user) and
-                device.status in [Device.IDLE, Device.RUNNING],
+                device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
             'show_online': device.can_admin(request.user) and
                 device.status in [Device.OFFLINE, Device.OFFLINING],
             'bread_crumb_trail': BreadCrumbTrail.leading_to(device_detail, pk=pk),
=== modified file 'lava_scheduler_daemon/dbjobsource.py'
--- lava_scheduler_daemon/dbjobsource.py 2013-08-28 15:13:07 +0000
+++ lava_scheduler_daemon/dbjobsource.py 2013-09-02 18:15:38 +0000
@@ -129,14 +129,18 @@
     def _fix_device(self, device, job):
         """Associate an available/idle DEVICE to the given JOB.
 
+        If the MultiNode job is waiting as Submitted, the device
+        could be running a different job.
         Returns the job with actual_device set to DEVICE.
 
         If we are unable to grab the DEVICE then we return None.
         """
+        if device.status == Device.RUNNING:
+            return None
         DeviceStateTransition.objects.create(
             created_by=None, device=device, old_state=device.status,
-            new_state=Device.RUNNING, message=None, job=job).save()
-        device.status = Device.RUNNING
+            new_state=Device.RESERVED, message=None, job=job).save()
+        device.status = Device.RESERVED
         device.current_job = job
         try:
             # The unique constraint on current_job may cause this to
@@ -190,10 +194,10 @@
             for d in devices:
                 self.logger.debug("Checking %s" % d.hostname)
                 if d.hostname in configured_boards:
                     if job:
                         job = self._fix_device(d, job)
-                if job:
-                    job_list.add(job)
+                        if job:
+                            job_list.add(job)
 
             # Remove scheduling multinode jobs until all the jobs in the
             # target_group are assigned devices.
@@ -288,6 +292,14 @@
 
     def getJobDetails_impl(self, job):
         job.status = TestJob.RUNNING
+        # need to set the device RUNNING if device was RESERVED
+        if job.actual_device.status == Device.RESERVED:
+            DeviceStateTransition.objects.create(
+                created_by=None, device=job.actual_device, old_state=job.actual_device.status,
+                new_state=Device.RUNNING, message=None, job=job).save()
+            job.actual_device.status = Device.RUNNING
+            job.actual_device.current_job = job
+            job.actual_device.save()
         job.start_time = datetime.datetime.utcnow()
         shutil.rmtree(job.output_dir, ignore_errors=True)
         job.log_file.save('job-%s.log' % job.id, ContentFile(''), save=False)
@@ -316,6 +328,8 @@
             device.status = Device.IDLE
         elif device.status == Device.OFFLINING:
             device.status = Device.OFFLINE
+        elif device.status == Device.RESERVED:
+            device.status = Device.IDLE
         else:
             self.logger.error(
                 "Unexpected device state in jobCompleted: %s" % device.status)
=== modified file 'lava_scheduler_daemon/service.py'
--- lava_scheduler_daemon/service.py 2013-09-02 18:15:38 +0000
+++ lava_scheduler_daemon/service.py 2013-09-02 18:15:38 +0000
@@ -47,7 +47,7 @@
             x.hostname for x in dispatcher_config.get_devices()]
 
         for job in job_list:
-            if job.actual_device.hostname in configured_boards:
+            if job.actual_device and job.actual_device.hostname in configured_boards:
                 new_job = JobRunner(self.source, job, self.dispatcher,
                                     self.reactor, self.daemon_options)
                 self.logger.info("Starting Job: %d " % job.id)
