Merge ~adam-collard/maas-ci/+git/system-tests:retry-boot into ~maas-committers/maas-ci/+git/system-tests:master

Proposed by Adam Collard
Status: Merged
Approved by: Adam Collard
Approved revision: 642b7f6f3f0a3e3f2c98cfcc3538a11db9e878dc
Merge reported by: MAAS Lander
Merged at revision: not available
Proposed branch: ~adam-collard/maas-ci/+git/system-tests:retry-boot
Merge into: ~maas-committers/maas-ci/+git/system-tests:master
Diff against target: 234 lines (+72/-71)
3 files modified
systemtests/env_builder/test_basic.py (+66/-46)
systemtests/state.py (+1/-0)
systemtests/utils.py (+5/-25)
Reviewer Review Type Date Requested Status
Thorsten Merten Approve
MAAS Lander Approve
Review via email: mp+436109@code.launchpad.net

Commit message

[lxd vm] re-attempt boot after 5m of not seeing machine, use ready_remote_maas

Description of the change

This is needed to cope with the issues we see in the maas-integration-ci environment, where LXD is being spammed by Juju and sometimes needs two attempts to get the VM to net boot successfully at this stage of the tests.

Enlistment via booting is tested extensively by the other machine-specific tests later in the run.

To post a comment you must log in.
Revision history for this message
MAAS Lander (maas-lander) wrote :

UNIT TESTS
-b retry-boot lp:~adam-collard/maas-ci/+git/system-tests into -b master lp:~maas-committers/maas-ci/+git/system-tests

STATUS: SUCCESS
COMMIT: 3cd9f45d4c6663d6e93c39b4c751621a1ee95dac

review: Approve
Revision history for this message
Thorsten Merten (thorsten-merten) wrote :

Some minor ideas below. I am really excited to see this running on CI :)

review: Approve
642b7f6... by Adam Collard

Address Thorsten's review comments

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/systemtests/env_builder/test_basic.py b/systemtests/env_builder/test_basic.py
2index 7ecfbd4..7e74c31 100644
3--- a/systemtests/env_builder/test_basic.py
4+++ b/systemtests/env_builder/test_basic.py
5@@ -9,6 +9,11 @@ from urllib.request import urlopen
6 import pytest
7 from retry import retry
8
9+from systemtests.api import (
10+ AuthenticatedAPIClient,
11+ Machine,
12+ UnauthenticatedMAASAPIClient,
13+)
14 from systemtests.lxd import Instance, get_lxd
15 from systemtests.utils import (
16 UnexpectedMachineStatus,
17@@ -17,17 +22,59 @@ from systemtests.utils import (
18 retries,
19 wait_for_machine,
20 wait_for_new_machine,
21- wait_for_ready_controllers,
22 )
23
24 if TYPE_CHECKING:
25 from logging import Logger
26
27- from systemtests.api import AuthenticatedAPIClient, UnauthenticatedMAASAPIClient
28 from systemtests.machine_config import MachineConfig
29 from systemtests.region import MAASRegion
30
31
32+@retry(tries=3)
33+def _ensure_machine_enlisted(
34+ maas_api_client: AuthenticatedAPIClient,
35+ mac_address: str,
36+ instance: Instance,
37+) -> Machine:
38+ instance_log = instance.logger.getChild(instance.name)
39+ # Find the VM in MAAS by MAC
40+ machines_with_matching_mac = maas_api_client.list_machines(mac_address=mac_address)
41+ if machines_with_matching_mac:
42+ # Yay, it exists
43+ return machines_with_matching_mac[0]
44+
45+ @retry(tries=5, delay=5, backoff=1.2, logger=instance_log)
46+ def _boot_vm(vm: Instance) -> None:
47+ status = instance.status()
48+ if status == "RUNNING":
49+ instance_log.debug("already running, restarting")
50+ instance.restart()
51+ elif status == "STOPPED":
52+ instance_log.debug("is stopped, starting")
53+ try:
54+ instance.start()
55+ except CalledProcessError:
56+ debug_lxd_vm(instance.name, instance_log)
57+ raise
58+ else:
59+ assert False, f"Don't know how to handle lxd_vm status: {status}"
60+
61+ # Machine not registered, let's boot it up
62+ _boot_vm(instance)
63+ try:
64+ vm_status = instance.status()
65+ except ValueError:
66+ vm_status = "not available"
67+ instance_log.debug(f"is {vm_status}")
68+
69+ machine = wait_for_new_machine(
70+ maas_api_client, mac_address, instance.name, timeout=(5 * 60, 20)
71+ )
72+ instance_log.debug(f"found machine {machine['hostname']}")
73+ return machine
74+
75+
76 class TestSetup:
77 @pytest.mark.skip_if_installed_from_snap("Prometheus is installed in the snap")
78 def test_setup_prometheus(
79@@ -114,67 +161,40 @@ class TestSetup:
80
81 def test_ensure_ready_vm_for_hardware_sync(
82 self,
83+ ready_remote_maas: None,
84 instance_config: MachineConfig,
85 maas_api_client: AuthenticatedAPIClient,
86 testlog: Logger,
87 ) -> None:
88 """Ensure that we have a Ready VM at the end."""
89- lxd = get_lxd(logger=testlog)
90 vm_name = instance_config.name
91+ lxd = get_lxd(logger=testlog)
92 instance = Instance(lxd, vm_name)
93 if instance.exists():
94 # Force delete the VM so we know we're starting clean
95 instance.delete()
96
97- # Ensure that the Region Controller is ready
98- wait_for_ready_controllers(maas_api_client)
99+ mac_address = instance_config.mac_address
100 # Need to create a network device with a hwaddr
101- config: dict[str, str] = {"security.secureboot": "false"}
102+ config: dict[str, str] = {
103+ "security.secureboot": "false",
104+ "volatile.eth0.hwaddr": mac_address,
105+ }
106 if instance_config.lxd_profile:
107 config["profile"] = instance_config.lxd_profile
108- if instance_config.mac_address:
109- config["volatile.eth0.hwaddr"] = instance_config.mac_address
110
111 instance = lxd.create_vm(vm_name, config)
112
113- mac_address = instance_config.mac_address
114-
115- # Find the VM in MAAS by MAC
116- maybe_machine = maas_api_client.list_machines(mac_address=mac_address)
117- if maybe_machine:
118- # Yay, it exists
119- machine = maybe_machine[0]
120- else:
121- # Machine not registered, let's boot it up
122- @retry(tries=5, delay=5, backoff=1.2, logger=testlog)
123- def _boot_vm(vm: Instance) -> None:
124- status = instance.status()
125- if status == "RUNNING":
126- testlog.debug(f"{instance.name} is already running, restarting")
127- instance.restart()
128- elif status == "STOPPED":
129- testlog.debug(f"{instance.name} is stopped, starting")
130- try:
131- instance.start()
132- except CalledProcessError:
133- debug_lxd_vm(vm_name, testlog)
134- raise
135- else:
136- assert False, f"Don't know how to handle lxd_vm status: {status}"
137-
138- _boot_vm(instance)
139- try:
140- vm_status = instance.status()
141- except ValueError:
142- vm_status = "not available"
143- testlog.debug(f"{vm_name} is {vm_status}")
144-
145- try:
146- machine = wait_for_new_machine(maas_api_client, mac_address, vm_name)
147- except UnexpectedMachineStatus as err:
148- # We know that this is a LXD VM - so debug it
149- err.debug_info.extend(debug_lxd_vm(vm_name, testlog))
150- raise
151+ maas_api_client.logger = testlog.getChild(vm_name)
152+ # Try 3 times to boot the LXD VM and get it enlisted
153+ try:
154+ machine = _ensure_machine_enlisted(maas_api_client, mac_address, instance)
155+ except UnexpectedMachineStatus as err:
156+ # We know that this is a LXD VM - so debug it
157+ err.debug_info.extend(
158+ debug_lxd_vm(instance.name, testlog.getChild(vm_name))
159+ )
160+ assert False, err
161
162 # Make sure we have power parameters set
163 if not machine["power_type"]:
164diff --git a/systemtests/state.py b/systemtests/state.py
165index 57f0f7c..38494d0 100644
166--- a/systemtests/state.py
167+++ b/systemtests/state.py
168@@ -164,6 +164,7 @@ def configured_maas(
169 def all_rack_controllers_commissioned(
170 logger: Logger, admin: AuthenticatedAPIClient
171 ) -> bool:
172+ """Check if all rack controllers have passed commissioning."""
173 for rack in get_rack_controllers(admin):
174 status = rack["commissioning_status"]
175 status_name = rack["commissioning_status_name"]
176diff --git a/systemtests/utils.py b/systemtests/utils.py
177index 4b0e63d..1edd11a 100644
178--- a/systemtests/utils.py
179+++ b/systemtests/utils.py
180@@ -6,7 +6,6 @@ import random
181 import re
182 import string
183 import time
184-from collections import Counter
185 from dataclasses import dataclass
186 from logging import Logger
187 from typing import Iterator, Optional, TypedDict, Union
188@@ -153,28 +152,6 @@ def debug_last_events(
189 return events
190
191
192-def wait_for_ready_controllers(
193- api_client: api.AuthenticatedAPIClient, timeout: float = 10 * 60, delay: float = 30
194-) -> None:
195- """Wait for all region controllers to have passed commissioning."""
196- quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
197- for retry_info in retries(timeout, delay):
198- region_controllers = quiet_client.list_region_controllers()
199- commissioning_statuses = Counter(
200- rc["commissioning_status_name"] for rc in region_controllers
201- )
202- passed_count = commissioning_statuses["Passed"]
203- total_count = sum(commissioning_statuses.values())
204- if passed_count == total_count:
205- api_client.logger.debug("All region controllers have passed commissioning!")
206- return
207- else:
208- api_client.logger.debug(
209- "Not all region controllers have passed commissioning ("
210- f"{passed_count}/{total_count}), sleeping for {delay} seconds"
211- )
212-
213-
214 # XXX: Move to api.py
215 def wait_for_machine(
216 api_client: api.AuthenticatedAPIClient,
217@@ -222,12 +199,15 @@ def debug_lxd_vm(machine_name: str, logger: Logger) -> list[str]:
218
219 # XXX: Move to api.py
220 def wait_for_new_machine(
221- api_client: api.AuthenticatedAPIClient, mac_address: str, machine_name: str
222+ api_client: api.AuthenticatedAPIClient,
223+ mac_address: str,
224+ machine_name: str,
225+ timeout: tuple[float, float] = (30 * 60, 30),
226 ) -> api.Machine:
227 """Blocks execution until a machine with the given mac_address appears as New."""
228 __tracebackhide__ = True
229 quiet_client = api.QuietAuthenticatedAPIClient.from_api_client(api_client)
230- for retry_info in retries(50 * 60, 30):
231+ for retry_info in retries(*timeout):
232 machines = quiet_client.list_machines(mac_address=mac_address, status="new")
233 if machines:
234 return machines[0]

Subscribers

People subscribed via source and target branches

to all changes: