Merge ~sylvain-pineau/checkbox-ng:remote-auto-retry into checkbox-ng:master

Proposed by Sylvain Pineau
Status: Merged
Approved by: Sylvain Pineau
Approved revision: a354f92851528d9a68f45ccb6f70f1468e4b7124
Merged at revision: 304e2bf4fc293f19e9dec111169c601835fe591d
Proposed branch: ~sylvain-pineau/checkbox-ng:remote-auto-retry
Merge into: checkbox-ng:master
Diff against target: 194 lines (+112/-45)
2 files modified
checkbox_ng/launcher/remote.py (+67/-44)
plainbox/impl/session/remote_assistant.py (+45/-1)
Reviewer | Review Type | Date Requested | Status
Maciej Kisielewski (community) | | | Approve
Review via email: mp+362077@code.launchpad.net

Description of the change

Support for auto-retry of failed jobs, as set in launchers:

[ui]
auto_retry = yes
max_attempts = 5
delay_before_retry = 30

Note: Since delay_before_retry is handled on the master side, disconnecting the master while jobs are being rerun prevents them from running on the slave. The master has to reconnect for the retries to continue.

To post a comment you must log in.
Revision history for this message
Maciej Kisielewski (kissiel) wrote :

Code looks good. I'm +1 on landing it. It'll get tested heavily on my extended ctrl+c support branch, so we can iterate on it quickly if something goes bad.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
diff --git a/checkbox_ng/launcher/remote.py b/checkbox_ng/launcher/remote.py
index 70bdc60..945068d 100644
--- a/checkbox_ng/launcher/remote.py
+++ b/checkbox_ng/launcher/remote.py
@@ -376,50 +376,11 @@ class RemoteMaster(Command, ReportsStage, MainLoopStage):
376 if any([x['user'] is not None for x in jobs_repr]):376 if any([x['user'] is not None for x in jobs_repr]):
377 self.password_query()377 self.password_query()
378378
379 for job in jobs_repr:379 self._run_jobs(jobs_repr, total_num)
380 SimpleUI.header(380 if self.launcher.auto_retry:
381 _('Running job {} / {}').format(381 while True:
382 job['num'], total_num,382 if not self._maybe_auto_retry_jobs():
383 fill='-'))383 break
384 SimpleUI.header(job['name'])
385 print(_("ID: {0}").format(job['id']))
386 print(_("Category: {0}").format(job['category_name']))
387 SimpleUI.horiz_line()
388 next_job = False
389 for interaction in self.sa.run_job(job['id']):
390 if interaction.kind == 'sudo_input':
391 self.sa.save_password(
392 self._sudo_provider.encrypted_password)
393 if interaction.kind == 'purpose':
394 SimpleUI.description(_('Purpose:'), interaction.message)
395 elif interaction.kind in ['description', 'steps']:
396 SimpleUI.description(_('Steps:'), interaction.message)
397 if job['command'] is None:
398 cmd = 'run'
399 else:
400 cmd = SimpleUI(None).wait_for_interaction_prompt(None)
401 if cmd == 'skip':
402 next_job = True
403 self.sa.remember_users_response(cmd)
404 elif interaction.kind == 'verification':
405 self.wait_for_job(dont_finish=True)
406 if interaction.message:
407 SimpleUI.description(
408 _('Verification:'), interaction.message)
409 JobAdapter = namedtuple('job_adapter', ['command'])
410 job = JobAdapter(job['command'])
411 cmd = SimpleUI(None)._interaction_callback(
412 job, interaction.extra)
413 self.sa.remember_users_response(cmd)
414 self.finish_job(interaction.extra.get_result())
415 next_job = True
416 elif interaction.kind == 'comment':
417 new_comment = input(SimpleUI.C.BLUE(
418 _('Please enter your comments:') + '\n'))
419 self.sa.remember_users_response(new_comment + '\n')
420 if next_job:
421 continue
422 self.wait_for_job()
423 self.finish_session()384 self.finish_session()
424385
425 def resume_interacting(self, interaction):386 def resume_interacting(self, interaction):
@@ -473,3 +434,65 @@ class RemoteMaster(Command, ReportsStage, MainLoopStage):
473 exported_stream.seek(0)434 exported_stream.seek(0)
474 result = transport.send(exported_stream)435 result = transport.send(exported_stream)
475 return result436 return result
437
438 def _maybe_auto_retry_jobs(self):
439 # create a list of jobs that qualify for rerunning
440 retry_candidates = self.sa.get_auto_retry_candidates()
441 # bail-out early if no job qualifies for rerunning
442 if not retry_candidates:
443 return False
444 # we wait before retrying
445 delay = self.launcher.delay_before_retry
446 _logger.info(_("Waiting {} seconds before retrying failed"
447 " jobs...".format(delay)))
448 time.sleep(delay)
449 # include resource jobs that jobs to retry depend on
450 candidates = self.sa.prepare_auto_retry_candidates(retry_candidates)
451 self._run_jobs(self.sa.get_jobs_repr(candidates), len(candidates))
452 return True
453
454 def _run_jobs(self, jobs_repr, total_num=0):
455 for job in jobs_repr:
456 SimpleUI.header(
457 _('Running job {} / {}').format(
458 job['num'], total_num,
459 fill='-'))
460 SimpleUI.header(job['name'])
461 print(_("ID: {0}").format(job['id']))
462 print(_("Category: {0}").format(job['category_name']))
463 SimpleUI.horiz_line()
464 next_job = False
465 for interaction in self.sa.run_job(job['id']):
466 if interaction.kind == 'sudo_input':
467 self.sa.save_password(
468 self._sudo_provider.encrypted_password)
469 if interaction.kind == 'purpose':
470 SimpleUI.description(_('Purpose:'), interaction.message)
471 elif interaction.kind in ['description', 'steps']:
472 SimpleUI.description(_('Steps:'), interaction.message)
473 if job['command'] is None:
474 cmd = 'run'
475 else:
476 cmd = SimpleUI(None).wait_for_interaction_prompt(None)
477 if cmd == 'skip':
478 next_job = True
479 self.sa.remember_users_response(cmd)
480 elif interaction.kind == 'verification':
481 self.wait_for_job(dont_finish=True)
482 if interaction.message:
483 SimpleUI.description(
484 _('Verification:'), interaction.message)
485 JobAdapter = namedtuple('job_adapter', ['command'])
486 job = JobAdapter(job['command'])
487 cmd = SimpleUI(None)._interaction_callback(
488 job, interaction.extra)
489 self.sa.remember_users_response(cmd)
490 self.finish_job(interaction.extra.get_result())
491 next_job = True
492 elif interaction.kind == 'comment':
493 new_comment = input(SimpleUI.C.BLUE(
494 _('Please enter your comments:') + '\n'))
495 self.sa.remember_users_response(new_comment + '\n')
496 if next_job:
497 continue
498 self.wait_for_job()
diff --git a/plainbox/impl/session/remote_assistant.py b/plainbox/impl/session/remote_assistant.py
index e1a9930..6390b9e 100644
--- a/plainbox/impl/session/remote_assistant.py
+++ b/plainbox/impl/session/remote_assistant.py
@@ -262,6 +262,10 @@ class RemoteSessionAssistant():
262 def finish_bootstrap(self):262 def finish_bootstrap(self):
263 self._sa.finish_bootstrap()263 self._sa.finish_bootstrap()
264 self._state = Bootstrapped264 self._state = Bootstrapped
265 if self._launcher.auto_retry:
266 for job_id in self._sa.get_static_todo_list():
267 job_state = self._sa.get_job_state(job_id)
268 job_state.attempts = self._launcher.max_attempts
265 return self._sa.get_static_todo_list()269 return self._sa.get_static_todo_list()
266270
267 def save_todo_list(self, chosen_jobs):271 def save_todo_list(self, chosen_jobs):
@@ -431,11 +435,51 @@ class RemoteSessionAssistant():
431 self._sa.use_job_result(self._currently_running_job, result)435 self._sa.use_job_result(self._currently_running_job, result)
432 if self._state != Bootstrapping:436 if self._state != Bootstrapping:
433 if not self._sa.get_dynamic_todo_list():437 if not self._sa.get_dynamic_todo_list():
434 self._state = Idle438 if (
439 self._launcher.auto_retry and
440 self.get_auto_retry_candidates()
441 ):
442 self._state = TestsSelected
443 else:
444 self._state = Idle
435 else:445 else:
436 self._state = TestsSelected446 self._state = TestsSelected
437 return result447 return result
438448
449 def get_auto_retry_candidates(self):
450 """Get all the tests that might be selected for an automatic retry."""
451 def retry_predicate(job_state):
452 return job_state.result.outcome in (IJobResult.OUTCOME_FAIL,) \
453 and job_state.effective_auto_retry != 'no'
454 retry_candidates = []
455 todo_list = self._sa.get_static_todo_list()
456 job_states = {job_id: self._sa.get_job_state(job_id) for job_id
457 in todo_list}
458 for job_id, job_state in job_states.items():
459 if retry_predicate(job_state) and job_state.attempts > 0:
460 retry_candidates.append(self._sa.get_job(job_id))
461 return retry_candidates
462
463 def prepare_auto_retry_candidates(self, retry_candidates):
464 """Include resource jobs that jobs to retry depend on."""
465 candidates = []
466 resources_to_rerun = []
467 for job in retry_candidates:
468 job_state = self._sa.get_job_state(job.id)
469 for inhibitor in job_state.readiness_inhibitor_list:
470 if inhibitor.cause == InhibitionCause.FAILED_DEP:
471 resources_to_rerun.append(inhibitor.related_job)
472 # reset outcome of jobs that are selected for re-running
473 for job in retry_candidates + resources_to_rerun:
474 self._sa.get_job_state(job.id).result = MemoryJobResult({})
475 candidates.append(job.id)
476 _logger.info("{}: {} attempts".format(
477 job.id,
478 self._sa.get_job_state(job.id).attempts
479 ))
480 self._state = TestsSelected
481 return candidates
482
439 def get_jobs_repr(self, job_ids, offset=0):483 def get_jobs_repr(self, job_ids, offset=0):
440 """484 """
441 Translate jobs into a {'field': 'val'} representations.485 Translate jobs into a {'field': 'val'} representations.

Subscribers

People subscribed via source and target branches