Merge lp:~brad-marshall/charms/trusty/ceph-osd/add-nrpe-checks into lp:~openstack-charmers-archive/charms/trusty/ceph-osd/trunk

Proposed by Brad Marshall
Status: Merged
Merged at revision: 34
Proposed branch: lp:~brad-marshall/charms/trusty/ceph-osd/add-nrpe-checks
Merge into: lp:~openstack-charmers-archive/charms/trusty/ceph-osd/trunk
Diff against target: 566 lines (+488/-0)
8 files modified
charm-helpers-hooks.yaml (+1/-0)
config.yaml (+11/-0)
files/nagios/check_ceph_status.py (+44/-0)
files/nagios/collect_ceph_status.sh (+18/-0)
hooks/charmhelpers/contrib/charmsupport/nrpe.py (+222/-0)
hooks/charmhelpers/contrib/charmsupport/volumes.py (+156/-0)
hooks/hooks.py (+32/-0)
metadata.yaml (+4/-0)
To merge this branch: bzr merge lp:~brad-marshall/charms/trusty/ceph-osd/add-nrpe-checks
Reviewer Review Type Date Requested Status
Liam Young (community) Disapprove
Review via email: mp+241496@code.launchpad.net

Description of the change

Adds nrpe-external-master interface and adds basic nrpe checks.

To post a comment you must log in.
35. By Brad Marshall

[bradm] Fixes from pep8 run

36. By Brad Marshall

[bradm] Removed nagios check files that were moved to nrpe-external-master charm

Revision history for this message
Liam Young (gnuoy) wrote :

Thanks for the MP. The new nrpe support is very gratefully received!

I've taken this branch and centralised the common code between this and the other nrpe branches and moved it to charm-helpers. To land it I created a new branch from this one which has now been merged into the 'next' charm. The 'next' charms will overwrite the stable ones in a couple of weeks.

review: Disapprove

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'charm-helpers-hooks.yaml'
--- charm-helpers-hooks.yaml 2014-09-27 02:28:51 +0000
+++ charm-helpers-hooks.yaml 2014-11-18 01:06:55 +0000
@@ -7,3 +7,4 @@
7 - utils7 - utils
8 - contrib.openstack.alternatives8 - contrib.openstack.alternatives
9 - contrib.network.ip9 - contrib.network.ip
10 - contrib.charmsupport
1011
=== modified file 'config.yaml'
--- config.yaml 2014-10-06 22:11:14 +0000
+++ config.yaml 2014-11-18 01:06:55 +0000
@@ -121,3 +121,14 @@
121 order for this charm to function correctly, the privacy extension must be121 order for this charm to function correctly, the privacy extension must be
122 disabled and a non-temporary address must be configured/available on122 disabled and a non-temporary address must be configured/available on
123 your network interface.123 your network interface.
124 nagios_context:
125 default: "juju"
126 type: string
127 description: |
128 Used by the nrpe-external-master subordinate charm.
129 A string that will be prepended to instance name to set the host name
130 in nagios. So for instance the hostname would be something like:
131 juju-myservice-0
132 If you're running multiple environments with the same services in them
133 this allows you to differentiate between them.
134
124135
=== added directory 'files/nagios'
=== added file 'files/nagios/check_ceph_status.py'
--- files/nagios/check_ceph_status.py 1970-01-01 00:00:00 +0000
+++ files/nagios/check_ceph_status.py 2014-11-18 01:06:55 +0000
@@ -0,0 +1,44 @@
1#!/usr/bin/env python
2
3# Copyright (C) 2014 Canonical
4# All Rights Reserved
5# Author: Jacek Nykis <jacek.nykis@canonical.com>
6
7import re
8import argparse
9import subprocess
10import nagios_plugin
11
12
13def check_ceph_status(args):
14 if args.status_file:
15 nagios_plugin.check_file_freshness(args.status_file, 3600)
16 with open(args.status_file, "r") as f:
17 lines = f.readlines()
18 status_data = dict(l.strip().split(' ', 1) for l in lines if len(l) > 1)
19 else:
20 lines = subprocess.check_output(["ceph", "status"]).split('\n')
21 status_data = dict(l.strip().split(' ', 1) for l in lines if len(l) > 1)
22
23 if ('health' not in status_data
24 or 'monmap' not in status_data
25 or 'osdmap'not in status_data):
26 raise nagios_plugin.UnknownError('UNKNOWN: status data is incomplete')
27
28 if status_data['health'] != 'HEALTH_OK':
29 msg = 'CRITICAL: ceph health status: "{}"'.format(status_data['health'])
30 raise nagios_plugin.CriticalError(msg)
31 osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap'])
32 if osds.group(1) > osds.group(2): # not all OSDs are "up"
33 msg = 'CRITICAL: Some OSDs are not up. Total: {}, up: {}'.format(
34 osds.group(1), osds.group(2))
35 raise nagios_plugin.CriticalError(msg)
36 print "All OK"
37
38
39if __name__ == '__main__':
40 parser = argparse.ArgumentParser(description='Check ceph status')
41 parser.add_argument('-f', '--file', dest='status_file',
42 default=False, help='Optional file with "ceph status" output')
43 args = parser.parse_args()
44 nagios_plugin.try_check(check_ceph_status, args)
045
=== added file 'files/nagios/collect_ceph_status.sh'
--- files/nagios/collect_ceph_status.sh 1970-01-01 00:00:00 +0000
+++ files/nagios/collect_ceph_status.sh 2014-11-18 01:06:55 +0000
@@ -0,0 +1,18 @@
1#!/bin/bash
2# Copyright (C) 2014 Canonical
3# All Rights Reserved
4# Author: Jacek Nykis <jacek.nykis@canonical.com>
5
6LOCK=/var/lock/ceph-status.lock
7lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1
8if [ $? -ne 0 ]; then
9 exit 1
10fi
11trap "rm -f $LOCK > /dev/null 2>&1" exit
12
13DATA_DIR="/var/lib/nagios"
14if [ ! -d $DATA_DIR ]; then
15 mkdir -p $DATA_DIR
16fi
17
18ceph status >${DATA_DIR}/cat-ceph-status.txt
019
=== added directory 'hooks/charmhelpers/contrib/charmsupport'
=== added file 'hooks/charmhelpers/contrib/charmsupport/__init__.py'
=== added file 'hooks/charmhelpers/contrib/charmsupport/nrpe.py'
--- hooks/charmhelpers/contrib/charmsupport/nrpe.py 1970-01-01 00:00:00 +0000
+++ hooks/charmhelpers/contrib/charmsupport/nrpe.py 2014-11-18 01:06:55 +0000
@@ -0,0 +1,222 @@
1"""Compatibility with the nrpe-external-master charm"""
2# Copyright 2012 Canonical Ltd.
3#
4# Authors:
5# Matthew Wedgwood <matthew.wedgwood@canonical.com>
6
7import subprocess
8import pwd
9import grp
10import os
11import re
12import shlex
13import yaml
14
15from charmhelpers.core.hookenv import (
16 config,
17 local_unit,
18 log,
19 relation_ids,
20 relation_set,
21)
22
23from charmhelpers.core.host import service
24
25# This module adds compatibility with the nrpe-external-master and plain nrpe
26# subordinate charms. To use it in your charm:
27#
28# 1. Update metadata.yaml
29#
30# provides:
31# (...)
32# nrpe-external-master:
33# interface: nrpe-external-master
34# scope: container
35#
36# and/or
37#
38# provides:
39# (...)
40# local-monitors:
41# interface: local-monitors
42# scope: container
43
44#
45# 2. Add the following to config.yaml
46#
47# nagios_context:
48# default: "juju"
49# type: string
50# description: |
51# Used by the nrpe subordinate charms.
52# A string that will be prepended to instance name to set the host name
53# in nagios. So for instance the hostname would be something like:
54# juju-myservice-0
55# If you're running multiple environments with the same services in them
56# this allows you to differentiate between them.
57#
58# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master
59#
60# 4. Update your hooks.py with something like this:
61#
62# from charmsupport.nrpe import NRPE
63# (...)
64# def update_nrpe_config():
65# nrpe_compat = NRPE()
66# nrpe_compat.add_check(
67# shortname = "myservice",
68# description = "Check MyService",
69# check_cmd = "check_http -w 2 -c 10 http://localhost"
70# )
71# nrpe_compat.add_check(
72# "myservice_other",
73# "Check for widget failures",
74# check_cmd = "/srv/myapp/scripts/widget_check"
75# )
76# nrpe_compat.write()
77#
78# def config_changed():
79# (...)
80# update_nrpe_config()
81#
82# def nrpe_external_master_relation_changed():
83# update_nrpe_config()
84#
85# def local_monitors_relation_changed():
86# update_nrpe_config()
87#
88# 5. ln -s hooks.py nrpe-external-master-relation-changed
89# ln -s hooks.py local-monitors-relation-changed
90
91
92class CheckException(Exception):
93 pass
94
95
96class Check(object):
97 shortname_re = '[A-Za-z0-9-_]+$'
98 service_template = ("""
99#---------------------------------------------------
100# This file is Juju managed
101#---------------------------------------------------
102define service {{
103 use active-service
104 host_name {nagios_hostname}
105 service_description {nagios_hostname}[{shortname}] """
106 """{description}
107 check_command check_nrpe!{command}
108 servicegroups {nagios_servicegroup}
109}}
110""")
111
112 def __init__(self, shortname, description, check_cmd):
113 super(Check, self).__init__()
114 # XXX: could be better to calculate this from the service name
115 if not re.match(self.shortname_re, shortname):
116 raise CheckException("shortname must match {}".format(
117 Check.shortname_re))
118 self.shortname = shortname
119 self.command = "check_{}".format(shortname)
120 # Note: a set of invalid characters is defined by the
121 # Nagios server config
122 # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
123 self.description = description
124 self.check_cmd = self._locate_cmd(check_cmd)
125
126 def _locate_cmd(self, check_cmd):
127 search_path = (
128 '/',
129 os.path.join(os.environ['CHARM_DIR'],
130 'files/nrpe-external-master'),
131 '/usr/lib/nagios/plugins',
132 '/usr/local/lib/nagios/plugins',
133 )
134 parts = shlex.split(check_cmd)
135 for path in search_path:
136 if os.path.exists(os.path.join(path, parts[0])):
137 command = os.path.join(path, parts[0])
138 if len(parts) > 1:
139 command += " " + " ".join(parts[1:])
140 return command
141 log('Check command not found: {}'.format(parts[0]))
142 return ''
143
144 def write(self, nagios_context, hostname):
145 nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format(
146 self.command)
147 with open(nrpe_check_file, 'w') as nrpe_check_config:
148 nrpe_check_config.write("# check {}\n".format(self.shortname))
149 nrpe_check_config.write("command[{}]={}\n".format(
150 self.command, self.check_cmd))
151
152 if not os.path.exists(NRPE.nagios_exportdir):
153 log('Not writing service config as {} is not accessible'.format(
154 NRPE.nagios_exportdir))
155 else:
156 self.write_service_config(nagios_context, hostname)
157
158 def write_service_config(self, nagios_context, hostname):
159 for f in os.listdir(NRPE.nagios_exportdir):
160 if re.search('.*{}.cfg'.format(self.command), f):
161 os.remove(os.path.join(NRPE.nagios_exportdir, f))
162
163 templ_vars = {
164 'nagios_hostname': hostname,
165 'nagios_servicegroup': nagios_context,
166 'description': self.description,
167 'shortname': self.shortname,
168 'command': self.command,
169 }
170 nrpe_service_text = Check.service_template.format(**templ_vars)
171 nrpe_service_file = '{}/service__{}_{}.cfg'.format(
172 NRPE.nagios_exportdir, hostname, self.command)
173 with open(nrpe_service_file, 'w') as nrpe_service_config:
174 nrpe_service_config.write(str(nrpe_service_text))
175
176 def run(self):
177 subprocess.call(self.check_cmd)
178
179
180class NRPE(object):
181 nagios_logdir = '/var/log/nagios'
182 nagios_exportdir = '/var/lib/nagios/export'
183 nrpe_confdir = '/etc/nagios/nrpe.d'
184
185 def __init__(self, hostname=None):
186 super(NRPE, self).__init__()
187 self.config = config()
188 self.nagios_context = self.config['nagios_context']
189 self.unit_name = local_unit().replace('/', '-')
190 if hostname:
191 self.hostname = hostname
192 else:
193 self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
194 self.checks = []
195
196 def add_check(self, *args, **kwargs):
197 self.checks.append(Check(*args, **kwargs))
198
199 def write(self):
200 try:
201 nagios_uid = pwd.getpwnam('nagios').pw_uid
202 nagios_gid = grp.getgrnam('nagios').gr_gid
203 except:
204 log("Nagios user not set up, nrpe checks not updated")
205 return
206
207 if not os.path.exists(NRPE.nagios_logdir):
208 os.mkdir(NRPE.nagios_logdir)
209 os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)
210
211 nrpe_monitors = {}
212 monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
213 for nrpecheck in self.checks:
214 nrpecheck.write(self.nagios_context, self.hostname)
215 nrpe_monitors[nrpecheck.shortname] = {
216 "command": nrpecheck.command,
217 }
218
219 service('restart', 'nagios-nrpe-server')
220
221 for rid in relation_ids("local-monitors"):
222 relation_set(relation_id=rid, monitors=yaml.dump(monitors))
0223
=== added file 'hooks/charmhelpers/contrib/charmsupport/volumes.py'
--- hooks/charmhelpers/contrib/charmsupport/volumes.py 1970-01-01 00:00:00 +0000
+++ hooks/charmhelpers/contrib/charmsupport/volumes.py 2014-11-18 01:06:55 +0000
@@ -0,0 +1,156 @@
1'''
2Functions for managing volumes in juju units. One volume is supported per unit.
3Subordinates may have their own storage, provided it is on its own partition.
4
5Configuration stanzas:
6 volume-ephemeral:
7 type: boolean
8 default: true
9 description: >
10 If false, a volume is mounted as sepecified in "volume-map"
11 If true, ephemeral storage will be used, meaning that log data
12 will only exist as long as the machine. YOU HAVE BEEN WARNED.
13 volume-map:
14 type: string
15 default: {}
16 description: >
17 YAML map of units to device names, e.g:
18 "{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }"
19 Service units will raise a configure-error if volume-ephemeral
20 is 'false' and no volume-map value is set. Use 'juju set' to set a
21 value and 'juju resolved' to complete configuration.
22
23Usage:
24 from charmsupport.volumes import configure_volume, VolumeConfigurationError
25 from charmsupport.hookenv import log, ERROR
26 def pre_mount_hook():
27 stop_service('myservice')
28 def post_mount_hook():
29 start_service('myservice')
30
31 if __name__ == '__main__':
32 try:
33 configure_volume(before_change=pre_mount_hook,
34 after_change=post_mount_hook)
35 except VolumeConfigurationError:
36 log('Storage could not be configured', ERROR)
37'''
38
39# XXX: Known limitations
40# - fstab is neither consulted nor updated
41
42import os
43from charmhelpers.core import hookenv
44from charmhelpers.core import host
45import yaml
46
47
48MOUNT_BASE = '/srv/juju/volumes'
49
50
51class VolumeConfigurationError(Exception):
52 '''Volume configuration data is missing or invalid'''
53 pass
54
55
56def get_config():
57 '''Gather and sanity-check volume configuration data'''
58 volume_config = {}
59 config = hookenv.config()
60
61 errors = False
62
63 if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'):
64 volume_config['ephemeral'] = True
65 else:
66 volume_config['ephemeral'] = False
67
68 try:
69 volume_map = yaml.safe_load(config.get('volume-map', '{}'))
70 except yaml.YAMLError as e:
71 hookenv.log("Error parsing YAML volume-map: {}".format(e),
72 hookenv.ERROR)
73 errors = True
74 if volume_map is None:
75 # probably an empty string
76 volume_map = {}
77 elif not isinstance(volume_map, dict):
78 hookenv.log("Volume-map should be a dictionary, not {}".format(
79 type(volume_map)))
80 errors = True
81
82 volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME'])
83 if volume_config['device'] and volume_config['ephemeral']:
84 # asked for ephemeral storage but also defined a volume ID
85 hookenv.log('A volume is defined for this unit, but ephemeral '
86 'storage was requested', hookenv.ERROR)
87 errors = True
88 elif not volume_config['device'] and not volume_config['ephemeral']:
89 # asked for permanent storage but did not define volume ID
90 hookenv.log('Ephemeral storage was requested, but there is no volume '
91 'defined for this unit.', hookenv.ERROR)
92 errors = True
93
94 unit_mount_name = hookenv.local_unit().replace('/', '-')
95 volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name)
96
97 if errors:
98 return None
99 return volume_config
100
101
102def mount_volume(config):
103 if os.path.exists(config['mountpoint']):
104 if not os.path.isdir(config['mountpoint']):
105 hookenv.log('Not a directory: {}'.format(config['mountpoint']))
106 raise VolumeConfigurationError()
107 else:
108 host.mkdir(config['mountpoint'])
109 if os.path.ismount(config['mountpoint']):
110 unmount_volume(config)
111 if not host.mount(config['device'], config['mountpoint'], persist=True):
112 raise VolumeConfigurationError()
113
114
115def unmount_volume(config):
116 if os.path.ismount(config['mountpoint']):
117 if not host.umount(config['mountpoint'], persist=True):
118 raise VolumeConfigurationError()
119
120
121def managed_mounts():
122 '''List of all mounted managed volumes'''
123 return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts())
124
125
126def configure_volume(before_change=lambda: None, after_change=lambda: None):
127 '''Set up storage (or don't) according to the charm's volume configuration.
128 Returns the mount point or "ephemeral". before_change and after_change
129 are optional functions to be called if the volume configuration changes.
130 '''
131
132 config = get_config()
133 if not config:
134 hookenv.log('Failed to read volume configuration', hookenv.CRITICAL)
135 raise VolumeConfigurationError()
136
137 if config['ephemeral']:
138 if os.path.ismount(config['mountpoint']):
139 before_change()
140 unmount_volume(config)
141 after_change()
142 return 'ephemeral'
143 else:
144 # persistent storage
145 if os.path.ismount(config['mountpoint']):
146 mounts = dict(managed_mounts())
147 if mounts.get(config['mountpoint']) != config['device']:
148 before_change()
149 unmount_volume(config)
150 mount_volume(config)
151 after_change()
152 else:
153 before_change()
154 mount_volume(config)
155 after_change()
156 return config['mountpoint']
0157
=== modified file 'hooks/hooks.py'
--- hooks/hooks.py 2014-09-30 03:41:06 +0000
+++ hooks/hooks.py 2014-11-18 01:06:55 +0000
@@ -20,6 +20,8 @@
20 relation_ids,20 relation_ids,
21 related_units,21 related_units,
22 relation_get,22 relation_get,
23 relations_of_type,
24 local_unit,
23 Hooks,25 Hooks,
24 UnregisteredHookError,26 UnregisteredHookError,
25 service_name27 service_name
@@ -48,6 +50,8 @@
48 format_ipv6_addr50 format_ipv6_addr
49)51)
5052
53from charmhelpers.contrib.charmsupport.nrpe import NRPE
54
51hooks = Hooks()55hooks = Hooks()
5256
5357
@@ -203,6 +207,34 @@
203 fatal=True)207 fatal=True)
204208
205209
210@hooks.hook('nrpe-external-master-relation-joined',
211 'nrpe-external-master-relation-changed')
212def update_nrpe_config():
213 # Find out if nrpe set nagios_hostname
214 hostname = None
215 host_context = None
216 for rel in relations_of_type('nrpe-external-master'):
217 if 'nagios_hostname' in rel:
218 hostname = rel['nagios_hostname']
219 host_context = rel['nagios_host_context']
220 break
221 nrpe = NRPE(hostname=hostname)
222 apt_install('python-dbus')
223
224 if host_context:
225 current_unit = "%s:%s" % (host_context, local_unit())
226 else:
227 current_unit = local_unit()
228
229 nrpe.add_check(
230 shortname='ceph-osd',
231 description='process check {%s}' % current_unit,
232 check_cmd='check_upstart_job ceph-osd',
233 )
234
235 nrpe.write()
236
237
206if __name__ == '__main__':238if __name__ == '__main__':
207 try:239 try:
208 hooks.execute(sys.argv)240 hooks.execute(sys.argv)
209241
=== added symlink 'hooks/nrpe-external-master-relation-changed'
=== target is u'hooks.py'
=== added symlink 'hooks/nrpe-external-master-relation-joined'
=== target is u'hooks.py'
=== modified file 'metadata.yaml'
--- metadata.yaml 2014-10-06 22:11:14 +0000
+++ metadata.yaml 2014-11-18 01:06:55 +0000
@@ -1,6 +1,10 @@
1name: ceph-osd1name: ceph-osd
2summary: Highly scalable distributed storage - Ceph OSD storage2summary: Highly scalable distributed storage - Ceph OSD storage
3maintainer: James Page <james.page@ubuntu.com>3maintainer: James Page <james.page@ubuntu.com>
4provides:
5 nrpe-external-master:
6 interface: nrpe-external-master
7 scope: container
4categories:8categories:
5 - misc9 - misc
6description: |10description: |

Subscribers

People subscribed via source and target branches