Merge ~canonical-bootstack/charm-openstack-service-checks:feature/contrail-service-checks into ~canonical-bootstack/charm-openstack-service-checks:master

Proposed by Jeremy Lounder
Status: Merged
Approved by: Alvaro Uria
Approved revision: fcb5547f0dfaf983d0186af1388ff7871ce0d950
Merged at revision: 3bc2b50c4960827288234d14d30fdcb02eb45af8
Proposed branch: ~canonical-bootstack/charm-openstack-service-checks:feature/contrail-service-checks
Merge into: ~canonical-bootstack/charm-openstack-service-checks:master
Diff against target: 220 lines (+180/-0)
4 files modified
config.yaml (+4/-0)
files/plugins/check_contrail_analytics_alarms.py (+130/-0)
lib/lib_openstack_service_checks.py (+15/-0)
tests/functional/test_deploy.py (+31/-0)
Reviewer Review Type Date Requested Status
Jeremy Lounder (community) Approve
Review via email: mp+374819@code.launchpad.net

Commit message

Add Nagios check for Juniper Contrail Analytics alarms

Based on a new charm originally proposed by npochet into Juniper's contrail-charms: https://github.com/Juniper/contrail-charms/pull/70/files
The check also parses the response from the /analytics/alarms URL into Nagios alert format.

To post a comment you must log in.
Revision history for this message
🤖 Canonical IS Merge Bot (canonical-is-mergebot) wrote :

This merge proposal is being monitored by mergebot. Change the status to Approved to merge.

Revision history for this message
Alvaro Uria (aluria) wrote :

keystone.yaml will be removed per conversation, so use the already existing nagios.novarc config file. I've also suggested a minor change to the contrail_vip property, following a recommendation the OpenStack charmers gave me in the past.

We may want to validate that the contrail_vip is valid and make "check_contrail_alarms.py -H <vip>" work.

Revision history for this message
Alvaro Uria (aluria) wrote :

Sample of the output generated by the check: https://pastebin.canonical.com/p/RqG7KbGZgg/

Revision history for this message
Jeremy Lounder (jldev) :
review: Approve
Revision history for this message
🤖 Canonical IS Merge Bot (canonical-is-mergebot) wrote :

Change cannot be self approved, setting status to needs review.

Revision history for this message
🤖 Canonical IS Merge Bot (canonical-is-mergebot) wrote :

Change successfully merged at revision 3bc2b50c4960827288234d14d30fdcb02eb45af8

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/config.yaml b/config.yaml
2index 86ade7d..e571370 100644
3--- a/config.yaml
4+++ b/config.yaml
5@@ -104,3 +104,7 @@ options:
6 URL to use with check_http if there is a Swift endpoint. Default is '/', but it's possible to add extra params,
7 e.g. '/v3 -e Unauthorized -d x-openstack-request-id' or a different url, e.g. '/healthcheck'. Mitaka Swift
8 typically needs '/healthcheck'.
9+ contrail_analytics_vip:
10+ type: string
11+ default: ''
12+ description: The VIP used for Contrail Analytics. Leave blank to disable Contrail monitoring.
13diff --git a/files/plugins/check_contrail_analytics_alarms.py b/files/plugins/check_contrail_analytics_alarms.py
14new file mode 100755
15index 0000000..b76eb64
16--- /dev/null
17+++ b/files/plugins/check_contrail_analytics_alarms.py
18@@ -0,0 +1,130 @@
19+#!/usr/bin/env python3
20+
21+import argparse
22+import collections
23+import datetime
24+import ipaddress
25+import os
26+import os_client_config
27+import requests
28+import subprocess
29+
30+import nagios_plugin3
31+
32+
def parse_contrail_alarms(data):
    """Summarise Contrail Analytics alarms as Nagios plugin output.

    :param data: dict mapping a node type (analytics-node, database-node,
        vrouter, ...) to a list of items shaped like
        ``{"name": <hostname>, "value": {"UVEAlarms": {"alarms": [...]}}}``
    :returns: str
    :raises KeyError: if an item or an alarm lacks one of the keys read below

    The first line of the returned str is a summary (it is what Nagios
    displays in alerts). Each following line describes one alarm; those lines
    are sorted by timestamp (ts). An alarm is reported as CRITICAL when it is
    unacknowledged or its severity is greater than 0, otherwise as WARNING.
    When there are no alarms at all, 'OK: no alarms' is returned.
    """
    counter = crit_counter = 0
    # Alarm lines grouped by their raw timestamp, so that they can be emitted
    # in chronological order once everything has been collected.
    msgs_by_ts = collections.defaultdict(list)
    for node_type, items in data.items():
        # node_type: analytics-node, database-node, vrouter, ...
        for item in items:
            # KVM, LXD or physical node hostname
            hostname = item["name"]
            for alarm in item["value"]["UVEAlarms"]["alarms"]:
                counter += 1
                is_critical = not alarm["ack"] or alarm["severity"] > 0
                if is_critical:
                    crit_counter += 1

                # The raw timestamp is divided by 1e6 to obtain the seconds
                # fromtimestamp() expects. The tzinfo is stripped so the value
                # prints as a naive UTC datetime, exactly like the deprecated
                # datetime.utcfromtimestamp() did.
                ts = datetime.datetime.fromtimestamp(
                    alarm["timestamp"] / 1e6, tz=datetime.timezone.utc
                ).replace(tzinfo=None)

                alarm_msg = ('{nagios_status}: {node_type}{{{hostname}, sev={sev},'
                             ' ts[{ts}]}} {desc}'.format(
                                 nagios_status='CRITICAL' if is_critical else 'WARNING',
                                 node_type=node_type,
                                 hostname=hostname,
                                 sev=alarm["severity"],
                                 ts=ts,
                                 desc=alarm["description"]))
                msgs_by_ts[alarm["timestamp"]].append(alarm_msg)

    if not msgs_by_ts:
        return 'OK: no alarms'

    status = 'CRITICAL' if crit_counter > 0 else 'WARNING'
    details = '\n'.join(
        '\n'.join(msgs_by_ts[key]) for key in sorted(msgs_by_ts))
    return '{}: total_alarms[{}], unacked_or_sev_gt_0[{}]\n{}'.format(
        status, counter, crit_counter, details)
79+
80+
def check_contrail_alarms(contrail_vip, token):
    """Check the alarms in Contrail Analytics.

    @param str contrail_vip: VIP of Contrail Analytics
    @param str token: Token for the authentication (sent as X-Auth-Token)
    @returns: None (an OK line is printed when no alarm is reported)
    @raises nagios_plugin3.CriticalError: the API cannot be reached, answers
        with a status other than 200, or reports a CRITICAL summary
    @raises nagios_plugin3.WarnError: the alarm summary is a WARNING
    """
    url = 'http://{}:8081/analytics/alarms'.format(contrail_vip)
    headers = {'X-Auth-Token': token}
    try:
        r = requests.get(url=url, headers=headers)
    except requests.exceptions.ConnectionError as error:
        raise nagios_plugin3.CriticalError(
            'CRITICAL: contrail analytics API error: {}'.format(error))

    # requests.Response exposes the HTTP status as "status_code"; it has no
    # "code" attribute, so reading r.code raised AttributeError.
    if r.status_code != 200:
        raise nagios_plugin3.CriticalError(
            'CRITICAL: contrail analytics API return code is {}'.format(
                r.status_code))

    result = r.json()
    msg = parse_contrail_alarms(result)

    # The summary prefix produced by parse_contrail_alarms() selects the
    # Nagios state to report.
    if msg.startswith('CRITICAL: '):
        raise nagios_plugin3.CriticalError(msg)
    elif msg.startswith('WARNING: '):
        raise nagios_plugin3.WarnError(msg)
    print('OK: no unacknowledged or sev>0 contrail analytics alarms')
108+
109+
def load_os_envvars(env_file=None):
    """Import the variables exported by a novarc file into os.environ.

    The file is sourced in a bash subprocess and the output of ``env`` is
    copied into this process' environment.

    :param env_file: path of the novarc file; when omitted, the ``--env``
        command line value (module-level ``args.env``) is used
    :returns: None
    """
    import shlex

    if env_file is None:
        env_file = args.env

    # Quote the path so that spaces or shell metacharacters in it cannot
    # break (or alter) the command line handed to bash.
    command = ['/bin/bash', '-c',
               'source {} && env'.format(shlex.quote(env_file))]
    output = subprocess.run(command, stdout=subprocess.PIPE).stdout
    for line in output.splitlines():
        key, separator, value = line.partition(b'=')
        if not separator or not key:
            # Not a KEY=VALUE line (e.g. the continuation of a multi-line
            # value): skip it instead of creating a bogus variable.
            continue
        os.environ[key.decode('utf-8')] = value.rstrip().decode('utf-8')
118+
119+
def validate_ipv4(ipv4_addr):
    """Ensure *ipv4_addr* can be parsed as an IPv4 address.

    :param ipv4_addr: value to validate
    :returns: None
    :raises nagios_plugin3.UnknownError: if ipaddress rejects the value
    """
    try:
        ipaddress.IPv4Address(ipv4_addr)
    except ipaddress.AddressValueError:
        reason = 'UNKNOWN: invalid contrail IPv4 address {}'.format(ipv4_addr)
        raise nagios_plugin3.UnknownError(reason)
126+
127+
if __name__ == '__main__':
    # Command line: novarc file with the credentials, and the VIP to query.
    cli = argparse.ArgumentParser(description='Check Contrail alarms')
    cli.add_argument(
        '--env',
        dest='env',
        default='/var/lib/nagios/nagios.novarc',
        help='Novarc file to use for this check',
    )
    cli.add_argument(
        '--host', '-H',
        dest='host',
        nargs=1,
        help='Contrail Analytics Virtual IP',
    )
    # NOTE: "args" has to stay a module-level name, load_os_envvars() reads it.
    args = cli.parse_args()

    # With nargs=1 a given --host arrives as a one-element list; when the
    # option is missing the VIP stays None and the validation reports UNKNOWN.
    contrail_analytics_vip = args.host[0] if isinstance(args.host, list) else None
    nagios_plugin3.try_check(validate_ipv4, contrail_analytics_vip)

    # Load the credentials into the environment, then ask Keystone for a token.
    load_os_envvars()
    token = os_client_config.session_client('identity', cloud='envvars').get_token()

    nagios_plugin3.try_check(check_contrail_alarms, contrail_analytics_vip, token)
149diff --git a/lib/lib_openstack_service_checks.py b/lib/lib_openstack_service_checks.py
150index fd12f51..8d654ef 100644
151--- a/lib/lib_openstack_service_checks.py
152+++ b/lib/lib_openstack_service_checks.py
153@@ -41,6 +41,10 @@ class OSCHelper():
154 return '/var/lib/nagios/nagios.novarc'
155
@property
def contrail_analytics_vip(self):
    """Return the value of the 'contrail_analytics_vip' charm config option."""
    configured_vip = self.charm_config['contrail_analytics_vip']
    return configured_vip
159+
@property
def plugins_dir(self):
    """Return the path (with trailing slash) that plugin file names are joined to."""
    nagios_plugins_path = '/usr/local/lib/nagios/plugins/'
    return nagios_plugins_path
163
164@@ -175,6 +179,17 @@ class OSCHelper():
165 'check_neutron_agents.sh'),
166 )
167
168+ if self.contrail_analytics_vip:
169+ contrail_check_command = '{} --host {}'.format(
170+ os.path.join(self.plugins_dir, 'check_contrail_analytics_alarms.py'),
171+ self.contrail_analytics_vip)
172+ nrpe.add_check(shortname='contrail_analytics_alarms',
173+ description='Check Contrail Analytics alarms',
174+ check_cmd=contrail_check_command,
175+ )
176+ else:
177+ nrpe.remove_check(shortname='contrail_analytics_alarms')
178+
179 if len(self.check_dns):
180 nrpe.add_check(shortname='dns_multi',
181 description='Check DNS names are resolvable',
182diff --git a/tests/functional/test_deploy.py b/tests/functional/test_deploy.py
183index 30ca92d..5ef53d8 100644
184--- a/tests/functional/test_deploy.py
185+++ b/tests/functional/test_deploy.py
186@@ -188,3 +188,34 @@ async def test_openstackservicechecks_enable_rally(deploy_app, model, file_stat)
187 for filename in filenames:
188 test_stat = await file_stat(filename, unit)
189 assert test_stat['size'] > 0
190+
191+
async def test_openstackservicechecks_enable_contrail_analytics_vip(deploy_app, model, file_stat):
    """Check that the Contrail NRPE check file follows 'contrail_analytics_vip'.

    The check file must be absent while the option is empty and must exist
    (with a non-zero size) once a VIP has been configured.
    """
    units = [unit for unit in model.units.values() if unit.entity_id.startswith(deploy_app.name)]
    assert len(units) == 1, 'expected exactly one {} unit, found {}'.format(
        deploy_app.name, len(units))

    unit = units[0]
    filename = '/etc/nagios/nrpe.d/check_contrail_analytics_alarms.cfg'

    def settled():
        # The application is active and the unit agent has gone idle.
        return deploy_app.status == 'active' and unit.agent_status == 'idle'

    # disable the contrail nrpe check if it was enabled (ie. from a previous run of functests)
    config = await deploy_app.get_config()
    if config['contrail_analytics_vip']['value']:
        await deploy_app.set_config({'contrail_analytics_vip': ''})
        # Wait until the nrpe check is removed
        await model.block_until(settled, timeout=600)

    # Check BEFORE enabling contrail_analytics_vip
    # raises exception because filename does not exist
    with pytest.raises(json.decoder.JSONDecodeError):
        await file_stat(filename, unit)

    await deploy_app.set_config({'contrail_analytics_vip': '127.0.0.1'})
    # Wait until nrpe check is set
    await model.block_until(settled, timeout=600)

    # Check AFTER enabling contrail_analytics_vip
    test_stat = await file_stat(filename, unit)
    assert test_stat['size'] > 0

Subscribers

People subscribed via source and target branches

to all changes: