Merge lp:~codehelp/lava-scheduler/reserved-boards into lp:lava-scheduler

Proposed by Neil Williams
Status: Merged
Approved by: Neil Williams
Approved revision: 260
Merged at revision: 257
Proposed branch: lp:~codehelp/lava-scheduler/reserved-boards
Merge into: lp:lava-scheduler
Prerequisite: lp:~stylesen/lava-scheduler/fix-worker-multinode-error
Diff against target: 237 lines (+53/-22)
6 files modified
lava_scheduler_app/admin.py (+1/-1)
lava_scheduler_app/api.py (+2/-3)
lava_scheduler_app/models.py (+25/-8)
lava_scheduler_app/views.py (+4/-3)
lava_scheduler_daemon/dbjobsource.py (+20/-6)
lava_scheduler_daemon/service.py (+1/-1)
To merge this branch: bzr merge lp:~codehelp/lava-scheduler/reserved-boards
Reviewer Review Type Date Requested Status
Antonio Terceiro Approve
Review via email: mp+183496@code.launchpad.net

Description of the change

Add a new RESERVED device status and stop _fix_device from overloading RUNNING for the period after submission but before a MultiNode job is running.

There is a separate fix for the regression in #1202285, but that fix needs to go in after this one, because the fix for #1202285 makes this bug harder to reproduce without actually fixing it.

To post a comment you must log in.
259. By Neil Williams

Ensure the device transitions to Running once the job
(single node or multi node) starts to run.

260. By Neil Williams

Ensure there is an actual device before trying to check it.

Revision history for this message
Antonio Terceiro (terceiro) wrote :

Looks good

 review approve

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lava_scheduler_app/admin.py'
2--- lava_scheduler_app/admin.py 2013-08-28 15:13:07 +0000
3+++ lava_scheduler_app/admin.py 2013-09-02 18:15:38 +0000
4@@ -8,7 +8,7 @@
5
6
7 def offline_action(modeladmin, request, queryset):
8- for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING]):
9+ for device in queryset.filter(status__in=[Device.IDLE, Device.RUNNING, Device.RESERVED]):
10 if device.can_admin(request.user):
11 device.put_into_maintenance_mode(request.user, "admin action")
12 offline_action.short_description = "take offline"
13
14=== modified file 'lava_scheduler_app/api.py'
15--- lava_scheduler_app/api.py 2013-08-28 15:13:07 +0000
16+++ lava_scheduler_app/api.py 2013-09-02 18:15:38 +0000
17@@ -2,7 +2,6 @@
18 from simplejson import JSONDecodeError
19 from django.db.models import Count
20 from linaro_django_xmlrpc.models import ExposedAPI
21-from lava_scheduler_app import utils
22 from lava_scheduler_app.models import (
23 Device,
24 DeviceType,
25@@ -165,8 +164,8 @@
26 .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
27 offline=SumIf('device', condition='status in (%s,%s)'
28 % (Device.OFFLINE, Device.OFFLINING)),
29- busy=SumIf('device', condition='status=%s'
30- % Device.RUNNING), ).order_by('name')
31+ busy=SumIf('device', condition='status in (%s,%s)'
32+ % (Device.RUNNING, Device.RESERVED)), ).order_by('name')
33
34 for dev_type in device_types:
35 device_type = {}
36
37=== modified file 'lava_scheduler_app/models.py'
38--- lava_scheduler_app/models.py 2013-08-28 15:13:07 +0000
39+++ lava_scheduler_app/models.py 2013-09-02 18:15:38 +0000
40@@ -51,19 +51,20 @@
41
42
43 def check_device_availability(requested_devices):
44- """Checks whether the number of devices requested is available.
45+ """Checks whether the number of devices requested is available for a multinode job.
46
47 See utils.requested_device_count() for details of REQUESTED_DEVICES
48 dictionary format.
49
50- Returns True if the requested number of devices are available, else
51- raises DevicesUnavailableException.
52+ Returns True for singlenode or if the requested number of devices are available
53+ for the multinode job, else raises DevicesUnavailableException.
54 """
55 device_types = DeviceType.objects.values_list('name').filter(
56- models.Q(device__status=Device.IDLE) | \
57- models.Q(device__status=Device.RUNNING)
58+ models.Q(device__status=Device.IDLE) |
59+ models.Q(device__status=Device.RUNNING) |
60+ models.Q(device__status=Device.RESERVED)
61 ).annotate(
62- num_count=models.Count('name')
63+ num_count=models.Count('name')
64 ).order_by('name')
65
66 if requested_devices:
67@@ -115,6 +116,7 @@
68 RUNNING = 2
69 OFFLINING = 3
70 RETIRED = 4
71+ RESERVED = 5
72
73 STATUS_CHOICES = (
74 (OFFLINE, 'Offline'),
75@@ -122,6 +124,7 @@
76 (RUNNING, 'Running'),
77 (OFFLINING, 'Going offline'),
78 (RETIRED, 'Retired'),
79+ (RESERVED, 'Reserved')
80 )
81
82 # A device health shows a device is ready to test or not
83@@ -201,7 +204,7 @@
84 return user.has_perm('lava_scheduler_app.change_device')
85
86 def put_into_maintenance_mode(self, user, reason):
87- if self.status in [self.RUNNING, self.OFFLINING]:
88+ if self.status in [self.RUNNING, self.RESERVED, self.OFFLINING]:
89 new_status = self.OFFLINING
90 else:
91 new_status = self.OFFLINE
92@@ -236,6 +239,16 @@
93 self.health_status = Device.HEALTH_LOOPING
94 self.save()
95
96+ def cancel_reserved_status(self, user, reason):
97+ if self.status != Device.RESERVED:
98+ return
99+ new_status = self.IDLE
100+ DeviceStateTransition.objects.create(
101+ created_by=user, device=self, old_state=self.status,
102+ new_state=new_status, message=reason, job=None).save()
103+ self.status = new_status
104+ self.save()
105+
106
107 class JobFailureTag(models.Model):
108 """
109@@ -324,7 +337,7 @@
110
111 tags = models.ManyToManyField(Tag, blank=True)
112
113- # This is set once the job starts.
114+ # This is set once the job starts or is reserved.
115 actual_device = models.ForeignKey(
116 Device, null=True, default=None, related_name='+', blank=True)
117
118@@ -598,6 +611,10 @@
119 return self._can_admin(user) and self.status in states
120
121 def cancel(self):
122+ # if SUBMITTED with actual_device - clear the actual_device back to idle.
123+ if self.status == TestJob.SUBMITTED and self.actual_device is not None:
124+ device = Device.objects.get(hostname=self.actual_device)
125+ device.cancel_reserved_status(self.submitter, "multinode-cancel")
126 if self.status == TestJob.RUNNING:
127 self.status = TestJob.CANCELING
128 else:
129
130=== modified file 'lava_scheduler_app/views.py'
131--- lava_scheduler_app/views.py 2013-08-28 15:13:07 +0000
132+++ lava_scheduler_app/views.py 2013-09-02 18:15:38 +0000
133@@ -371,7 +371,8 @@
134 .annotate(idle=SumIf('device', condition='status=%s' % Device.IDLE),
135 offline=SumIf('device', condition='status in (%s,%s)' %
136 (Device.OFFLINE, Device.OFFLINING)),
137- busy=SumIf('device', condition='status=%s' % Device.RUNNING),).order_by('name')
138+ busy=SumIf('device', condition='status in (%s,%s)' %
139+ (Device.RUNNING, Device.RESERVED)),).order_by('name')
140
141 def render_status(self, record):
142 return "%s idle, %s offline, %s busy" % (record.idle,
143@@ -535,7 +536,7 @@
144 'health_jobs', reverse(health_jobs_json, kwargs=dict(pk=pk)),
145 params=(device,)),
146 'show_maintenance': device.can_admin(request.user) and
147- device.status in [Device.IDLE, Device.RUNNING],
148+ device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
149 'show_online': device.can_admin(request.user) and
150 device.status in [Device.OFFLINE, Device.OFFLINING],
151 'bread_crumb_trail': BreadCrumbTrail.leading_to(health_job_list, pk=pk),
152@@ -993,7 +994,7 @@
153 'jobs', reverse(recent_jobs_json, kwargs=dict(pk=device.pk)),
154 params=(device,)),
155 'show_maintenance': device.can_admin(request.user) and
156- device.status in [Device.IDLE, Device.RUNNING],
157+ device.status in [Device.IDLE, Device.RUNNING, Device.RESERVED],
158 'show_online': device.can_admin(request.user) and
159 device.status in [Device.OFFLINE, Device.OFFLINING],
160 'bread_crumb_trail': BreadCrumbTrail.leading_to(device_detail, pk=pk),
161
162=== modified file 'lava_scheduler_daemon/dbjobsource.py'
163--- lava_scheduler_daemon/dbjobsource.py 2013-08-28 15:13:07 +0000
164+++ lava_scheduler_daemon/dbjobsource.py 2013-09-02 18:15:38 +0000
165@@ -129,14 +129,18 @@
166 def _fix_device(self, device, job):
167 """Associate an available/idle DEVICE to the given JOB.
168
169+ If the MultiNode job is waiting as Submitted, the device
170+ could be running a different job.
171 Returns the job with actual_device set to DEVICE.
172
173 If we are unable to grab the DEVICE then we return None.
174 """
175+ if device.status == Device.RUNNING:
176+ return None
177 DeviceStateTransition.objects.create(
178 created_by=None, device=device, old_state=device.status,
179- new_state=Device.RUNNING, message=None, job=job).save()
180- device.status = Device.RUNNING
181+ new_state=Device.RESERVED, message=None, job=job).save()
182+ device.status = Device.RESERVED
183 device.current_job = job
184 try:
185 # The unique constraint on current_job may cause this to
186@@ -190,10 +194,10 @@
187 for d in devices:
188 self.logger.debug("Checking %s" % d.hostname)
189 if d.hostname in configured_boards:
190- if job:
191- job = self._fix_device(d, job)
192- if job:
193- job_list.add(job)
194+ if job:
195+ job = self._fix_device(d, job)
196+ if job:
197+ job_list.add(job)
198
199 # Remove scheduling multinode jobs until all the jobs in the
200 # target_group are assigned devices.
201@@ -288,6 +292,14 @@
202
203 def getJobDetails_impl(self, job):
204 job.status = TestJob.RUNNING
205+ # need to set the device RUNNING if device was RESERVED
206+ if job.actual_device.status == Device.RESERVED:
207+ DeviceStateTransition.objects.create(
208+ created_by=None, device=job.actual_device, old_state=job.actual_device.status,
209+ new_state=Device.RUNNING, message=None, job=job).save()
210+ job.actual_device.status = Device.RUNNING
211+ job.actual_device.current_job = job
212+ job.actual_device.save()
213 job.start_time = datetime.datetime.utcnow()
214 shutil.rmtree(job.output_dir, ignore_errors=True)
215 job.log_file.save('job-%s.log' % job.id, ContentFile(''), save=False)
216@@ -316,6 +328,8 @@
217 device.status = Device.IDLE
218 elif device.status == Device.OFFLINING:
219 device.status = Device.OFFLINE
220+ elif device.status == Device.RESERVED:
221+ device.status = Device.IDLE
222 else:
223 self.logger.error(
224 "Unexpected device state in jobCompleted: %s" % device.status)
225
226=== modified file 'lava_scheduler_daemon/service.py'
227--- lava_scheduler_daemon/service.py 2013-09-02 18:15:38 +0000
228+++ lava_scheduler_daemon/service.py 2013-09-02 18:15:38 +0000
229@@ -47,7 +47,7 @@
230 x.hostname for x in dispatcher_config.get_devices()]
231
232 for job in job_list:
233- if job.actual_device.hostname in configured_boards:
234+ if job.actual_device and job.actual_device.hostname in configured_boards:
235 new_job = JobRunner(self.source, job, self.dispatcher,
236 self.reactor, self.daemon_options)
237 self.logger.info("Starting Job: %d " % job.id)

Subscribers

People subscribed via source and target branches