Merge lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status into lp:~cf-charmers/charms/trusty/cloudfoundry/trunk

Proposed by Cory Johns
Status: Merged
Merged at revision: 162
Proposed branch: lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status
Merge into: lp:~cf-charmers/charms/trusty/cloudfoundry/trunk
Diff against target: 110 lines (+34/-10)
5 files modified
charmhelpers/core/services/base.py (+2/-0)
charmhelpers/core/services/helpers.py (+1/-4)
cloudfoundry/health_checks.py (+19/-0)
cloudfoundry/jobs.py (+1/-1)
reconciler/app.py (+11/-5)
To merge this branch: bzr merge lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status
Reviewer Review Type Date Requested Status
Benjamin Saller (community) Approve
Whit Morriss (community) Approve
Review via email: mp+242372@code.launchpad.net

Description of the change

Since the monit status is so unreliable, switch the basic health check to use the self-reported Juju status until we have something that more deeply introspects.

To post a comment you must log in.
Revision history for this message
Cory Johns (johnsca) wrote :

Basically, relies on the extended hookenv.juju_status() report (which accounts for data_ready), if available, and the basic Juju status if not.

Should give us a much more accurate approximation of when the system is up, although there may still be a slight delay (due to, e.g., `monit restart all` taking some time).

Revision history for this message
Benjamin Saller (bcsaller) wrote :

This looks like a good intermediate step. Thanks for doing this.

review: Approve
Revision history for this message
Whit Morriss (whitmo) wrote :

This look good to me.

I think the next step might be for the service framework to leave some breadcrumbs ala the status spec. Basically a list of incomplete relationships from the perspective of the service definition.

review: Approve
164. By Cory Johns

Only log "Incomplete requirement" for reqs that are blocking

Revision history for this message
Cory Johns (johnsca) wrote :

Whit,

The extended status is stored in $CHARM_DIR/juju_status.json on each unit, and is retrieved as part of the health hook that the reconciler calls. It might be nice for this to be pushed up to the reconciler instead of pulled, but I don't think it's all that urgent.

Revision history for this message
Benjamin Saller (bcsaller) wrote :

still LGTM, thanks :)

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'charmhelpers/core/services/base.py'
--- charmhelpers/core/services/base.py 2014-10-31 16:11:51 +0000
+++ charmhelpers/core/services/base.py 2014-11-20 17:56:06 +0000
@@ -297,6 +297,8 @@
297 if getattr(req, 'manual', False):297 if getattr(req, 'manual', False):
298 manual = True298 manual = True
299 if hasattr(req, 'name'):299 if hasattr(req, 'name'):
300 hookenv.log('Incomplete requirement: %s' % req.name,
301 hookenv.DEBUG)
300 blockers.append(req.name)302 blockers.append(req.name)
301 hookenv.juju_status(status, manual=manual, blockers=blockers)303 hookenv.juju_status(status, manual=manual, blockers=blockers)
302304
303305
=== modified file 'charmhelpers/core/services/helpers.py'
--- charmhelpers/core/services/helpers.py 2014-10-31 13:20:03 +0000
+++ charmhelpers/core/services/helpers.py 2014-11-20 17:56:06 +0000
@@ -45,10 +45,7 @@
45 """45 """
46 Returns True if all of the `required_keys` are available from any units.46 Returns True if all of the `required_keys` are available from any units.
47 """47 """
48 ready = len(self.get(self.name, [])) > 048 return len(self.get(self.name, [])) > 0
49 if not ready:
50 hookenv.log('Incomplete relation: {}'.format(self.__class__.__name__), hookenv.DEBUG)
51 return ready
5249
53 def _is_ready(self, unit_data):50 def _is_ready(self, unit_data):
54 """51 """
5552
=== modified file 'cloudfoundry/health_checks.py'
--- cloudfoundry/health_checks.py 2014-08-24 21:36:50 +0000
+++ cloudfoundry/health_checks.py 2014-11-20 17:56:06 +0000
@@ -1,3 +1,4 @@
1from charmhelpers.core import hookenv
1from cloudfoundry import tasks2from cloudfoundry import tasks
23
34
@@ -17,3 +18,21 @@
17 message='not all services running',18 message='not all services running',
18 data={'services': summary})19 data={'services': summary})
19 return result20 return result
21
22
23def status(service):
24 result = {
25 'name': 'monit_summary',
26 'health': 'pass',
27 'message': None,
28 'data': {},
29 }
30 status = hookenv.juju_status()
31 if status['status'] == 'error':
32 return dict(result, health='fail', message=status['message'])
33 elif status['status'] == 'blocked' and status['manual']:
34 return dict(result, health='fail', message='Blocked: %s' % status['blockers'])
35 elif status['status'] == 'up':
36 return result
37 else:
38 return dict(result, health='warn', message='Working (%s)' % status['status'])
2039
=== modified file 'cloudfoundry/jobs.py'
--- cloudfoundry/jobs.py 2014-10-03 15:47:17 +0000
+++ cloudfoundry/jobs.py 2014-11-20 17:56:06 +0000
@@ -68,7 +68,7 @@
68 service_def = service_data[charm_name]68 service_def = service_data[charm_name]
69 results = []69 results = []
70 health = 'pass'70 health = 'pass'
71 checks = service_def.get('health', []) + [health_checks.monit_summary]71 checks = service_def.get('health', []) + [health_checks.status]
72 for health_check in checks:72 for health_check in checks:
73 result = health_check(service_def)73 result = health_check(service_def)
74 if result['health'] == 'fail':74 if result['health'] == 'fail':
7575
=== modified file 'reconciler/app.py'
--- reconciler/app.py 2014-11-14 17:00:08 +0000
+++ reconciler/app.py 2014-11-20 17:56:06 +0000
@@ -83,13 +83,14 @@
83 units = service.get('Units', {}) or {}83 units = service.get('Units', {}) or {}
84 for unit_name, unit in units.iteritems():84 for unit_name, unit in units.iteritems():
85 unit_addr = unit.get('PublicAddress')85 unit_addr = unit.get('PublicAddress')
86 unit_state = unit.get('AgentState')
86 if unit_addr:87 if unit_addr:
87 loop = tornado.ioloop.IOLoop.instance()88 loop = tornado.ioloop.IOLoop.instance()
88 loop.add_callback(check_health, service_name,89 loop.add_callback(check_health, service_name,
89 unit_name, unit_addr)90 unit_name, unit_addr, unit_state)
9091
9192
92def check_health(service_name, unit_name, unit_addr):93def check_health(service_name, unit_name, unit_addr, unit_state):
93 service = health.setdefault(service_name, {94 service = health.setdefault(service_name, {
94 'name': service_name,95 'name': service_name,
95 'health': 'unknown',96 'health': 'unknown',
@@ -118,8 +119,13 @@
118 unit['health'] = 'fail'119 unit['health'] = 'fail'
119 unit['state'] = {'message': 'Unable to parse health: {}'.format(output)}120 unit['state'] = {'message': 'Unable to parse health: {}'.format(output)}
120 except subprocess.CalledProcessError as e:121 except subprocess.CalledProcessError as e:
121 unit['health'] = 'warn'
122 unit['state'] = {'message': 'Unable to retrieve health: {}'.format(e.output)}122 unit['state'] = {'message': 'Unable to retrieve health: {}'.format(e.output)}
123 if unit_state == 'started':
124 unit['health'] = 'pass'
125 elif unit_state == 'error':
126 unit['health'] = 'fail'
127 else:
128 unit['health'] = 'warn'
123129
124 units_fail = [u['health'] == 'fail' for u in service['units'].values()]130 units_fail = [u['health'] == 'fail' for u in service['units'].values()]
125 units_not_pass = [u['health'] != 'pass' for u in service['units'].values()]131 units_not_pass = [u['health'] != 'pass' for u in service['units'].values()]

Subscribers

People subscribed via source and target branches