Merge ~sylvain-pineau/checkbox-ng:remote-auto-retry into checkbox-ng:master

Proposed by Sylvain Pineau
Status: Merged
Approved by: Sylvain Pineau
Approved revision: a354f92851528d9a68f45ccb6f70f1468e4b7124
Merged at revision: 304e2bf4fc293f19e9dec111169c601835fe591d
Proposed branch: ~sylvain-pineau/checkbox-ng:remote-auto-retry
Merge into: checkbox-ng:master
Diff against target: 194 lines (+112/-45)
2 files modified
checkbox_ng/launcher/remote.py (+67/-44)
plainbox/impl/session/remote_assistant.py (+45/-1)
Reviewer Review Type Date Requested Status
Maciej Kisielewski (community) Approve
Review via email: mp+362077@code.launchpad.net

Description of the change

Support for auto-retry of failed jobs, as set in launchers:

[ui]
auto_retry = yes
max_attempts = 5
delay_before_retry = 30

Note: Since delay_before_retry is handled on the master side, disconnecting the master while jobs are being rerun prevents them from running on the slave. The master has to reconnect for the retries to continue.

To post a comment you must log in.
Revision history for this message
Maciej Kisielewski (kissiel) wrote :

Code looks good. I'm +1 on landing it. It'll get tested heavily on my extended ctrl+c support branch, so we can iterate on it quickly if something goes bad.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/checkbox_ng/launcher/remote.py b/checkbox_ng/launcher/remote.py
2index 70bdc60..945068d 100644
3--- a/checkbox_ng/launcher/remote.py
4+++ b/checkbox_ng/launcher/remote.py
5@@ -376,50 +376,11 @@ class RemoteMaster(Command, ReportsStage, MainLoopStage):
6 if any([x['user'] is not None for x in jobs_repr]):
7 self.password_query()
8
9- for job in jobs_repr:
10- SimpleUI.header(
11- _('Running job {} / {}').format(
12- job['num'], total_num,
13- fill='-'))
14- SimpleUI.header(job['name'])
15- print(_("ID: {0}").format(job['id']))
16- print(_("Category: {0}").format(job['category_name']))
17- SimpleUI.horiz_line()
18- next_job = False
19- for interaction in self.sa.run_job(job['id']):
20- if interaction.kind == 'sudo_input':
21- self.sa.save_password(
22- self._sudo_provider.encrypted_password)
23- if interaction.kind == 'purpose':
24- SimpleUI.description(_('Purpose:'), interaction.message)
25- elif interaction.kind in ['description', 'steps']:
26- SimpleUI.description(_('Steps:'), interaction.message)
27- if job['command'] is None:
28- cmd = 'run'
29- else:
30- cmd = SimpleUI(None).wait_for_interaction_prompt(None)
31- if cmd == 'skip':
32- next_job = True
33- self.sa.remember_users_response(cmd)
34- elif interaction.kind == 'verification':
35- self.wait_for_job(dont_finish=True)
36- if interaction.message:
37- SimpleUI.description(
38- _('Verification:'), interaction.message)
39- JobAdapter = namedtuple('job_adapter', ['command'])
40- job = JobAdapter(job['command'])
41- cmd = SimpleUI(None)._interaction_callback(
42- job, interaction.extra)
43- self.sa.remember_users_response(cmd)
44- self.finish_job(interaction.extra.get_result())
45- next_job = True
46- elif interaction.kind == 'comment':
47- new_comment = input(SimpleUI.C.BLUE(
48- _('Please enter your comments:') + '\n'))
49- self.sa.remember_users_response(new_comment + '\n')
50- if next_job:
51- continue
52- self.wait_for_job()
53+ self._run_jobs(jobs_repr, total_num)
54+ if self.launcher.auto_retry:
55+ while True:
56+ if not self._maybe_auto_retry_jobs():
57+ break
58 self.finish_session()
59
60 def resume_interacting(self, interaction):
61@@ -473,3 +434,65 @@ class RemoteMaster(Command, ReportsStage, MainLoopStage):
62 exported_stream.seek(0)
63 result = transport.send(exported_stream)
64 return result
65+
66+ def _maybe_auto_retry_jobs(self):
67+ # create a list of jobs that qualify for rerunning
68+ retry_candidates = self.sa.get_auto_retry_candidates()
69+ # bail-out early if no job qualifies for rerunning
70+ if not retry_candidates:
71+ return False
72+ # we wait before retrying
73+ delay = self.launcher.delay_before_retry
74+ _logger.info(_("Waiting {} seconds before retrying failed"
75+ " jobs...".format(delay)))
76+ time.sleep(delay)
77+ # include resource jobs that jobs to retry depend on
78+ candidates = self.sa.prepare_auto_retry_candidates(retry_candidates)
79+ self._run_jobs(self.sa.get_jobs_repr(candidates), len(candidates))
80+ return True
81+
82+ def _run_jobs(self, jobs_repr, total_num=0):
83+ for job in jobs_repr:
84+ SimpleUI.header(
85+ _('Running job {} / {}').format(
86+ job['num'], total_num,
87+ fill='-'))
88+ SimpleUI.header(job['name'])
89+ print(_("ID: {0}").format(job['id']))
90+ print(_("Category: {0}").format(job['category_name']))
91+ SimpleUI.horiz_line()
92+ next_job = False
93+ for interaction in self.sa.run_job(job['id']):
94+ if interaction.kind == 'sudo_input':
95+ self.sa.save_password(
96+ self._sudo_provider.encrypted_password)
97+ if interaction.kind == 'purpose':
98+ SimpleUI.description(_('Purpose:'), interaction.message)
99+ elif interaction.kind in ['description', 'steps']:
100+ SimpleUI.description(_('Steps:'), interaction.message)
101+ if job['command'] is None:
102+ cmd = 'run'
103+ else:
104+ cmd = SimpleUI(None).wait_for_interaction_prompt(None)
105+ if cmd == 'skip':
106+ next_job = True
107+ self.sa.remember_users_response(cmd)
108+ elif interaction.kind == 'verification':
109+ self.wait_for_job(dont_finish=True)
110+ if interaction.message:
111+ SimpleUI.description(
112+ _('Verification:'), interaction.message)
113+ JobAdapter = namedtuple('job_adapter', ['command'])
114+ job = JobAdapter(job['command'])
115+ cmd = SimpleUI(None)._interaction_callback(
116+ job, interaction.extra)
117+ self.sa.remember_users_response(cmd)
118+ self.finish_job(interaction.extra.get_result())
119+ next_job = True
120+ elif interaction.kind == 'comment':
121+ new_comment = input(SimpleUI.C.BLUE(
122+ _('Please enter your comments:') + '\n'))
123+ self.sa.remember_users_response(new_comment + '\n')
124+ if next_job:
125+ continue
126+ self.wait_for_job()
127diff --git a/plainbox/impl/session/remote_assistant.py b/plainbox/impl/session/remote_assistant.py
128index e1a9930..6390b9e 100644
129--- a/plainbox/impl/session/remote_assistant.py
130+++ b/plainbox/impl/session/remote_assistant.py
131@@ -262,6 +262,10 @@ class RemoteSessionAssistant():
132 def finish_bootstrap(self):
133 self._sa.finish_bootstrap()
134 self._state = Bootstrapped
135+ if self._launcher.auto_retry:
136+ for job_id in self._sa.get_static_todo_list():
137+ job_state = self._sa.get_job_state(job_id)
138+ job_state.attempts = self._launcher.max_attempts
139 return self._sa.get_static_todo_list()
140
141 def save_todo_list(self, chosen_jobs):
142@@ -431,11 +435,51 @@ class RemoteSessionAssistant():
143 self._sa.use_job_result(self._currently_running_job, result)
144 if self._state != Bootstrapping:
145 if not self._sa.get_dynamic_todo_list():
146- self._state = Idle
147+ if (
148+ self._launcher.auto_retry and
149+ self.get_auto_retry_candidates()
150+ ):
151+ self._state = TestsSelected
152+ else:
153+ self._state = Idle
154 else:
155 self._state = TestsSelected
156 return result
157
158+ def get_auto_retry_candidates(self):
159+ """Get all the tests that might be selected for an automatic retry."""
160+ def retry_predicate(job_state):
161+ return job_state.result.outcome in (IJobResult.OUTCOME_FAIL,) \
162+ and job_state.effective_auto_retry != 'no'
163+ retry_candidates = []
164+ todo_list = self._sa.get_static_todo_list()
165+ job_states = {job_id: self._sa.get_job_state(job_id) for job_id
166+ in todo_list}
167+ for job_id, job_state in job_states.items():
168+ if retry_predicate(job_state) and job_state.attempts > 0:
169+ retry_candidates.append(self._sa.get_job(job_id))
170+ return retry_candidates
171+
172+ def prepare_auto_retry_candidates(self, retry_candidates):
173+ """Include resource jobs that jobs to retry depend on."""
174+ candidates = []
175+ resources_to_rerun = []
176+ for job in retry_candidates:
177+ job_state = self._sa.get_job_state(job.id)
178+ for inhibitor in job_state.readiness_inhibitor_list:
179+ if inhibitor.cause == InhibitionCause.FAILED_DEP:
180+ resources_to_rerun.append(inhibitor.related_job)
181+ # reset outcome of jobs that are selected for re-running
182+ for job in retry_candidates + resources_to_rerun:
183+ self._sa.get_job_state(job.id).result = MemoryJobResult({})
184+ candidates.append(job.id)
185+ _logger.info("{}: {} attempts".format(
186+ job.id,
187+ self._sa.get_job_state(job.id).attempts
188+ ))
189+ self._state = TestsSelected
190+ return candidates
191+
192 def get_jobs_repr(self, job_ids, offset=0):
193 """
194 Translate jobs into a {'field': 'val'} representations.

Subscribers

People subscribed via source and target branches