Merge lp:~stub/charms/precise/postgresql/fix-races into lp:charms/postgresql

Proposed by Stuart Bishop
Status: Merged
Approved by: Mark Mims
Approved revision: 75
Merged at revision: 63
Proposed branch: lp:~stub/charms/precise/postgresql/fix-races
Merge into: lp:charms/postgresql
Prerequisite: lp:~stub/charms/precise/postgresql/cleanups
Diff against target: 650 lines (+247/-131)
3 files modified
config.yaml (+7/-0)
hooks/hooks.py (+208/-109)
test.py (+32/-22)
To merge this branch: bzr merge lp:~stub/charms/precise/postgresql/fix-races
Reviewer: Mark Mims (community), review: Approve
Review via email: mp+181740@code.launchpad.net

Description of the change

The new local provider is quite fast, and exposes race conditions in the PostgreSQL charm.

This branch reworks the replication peer relationship so the test suite runs more reliably. For example, master election has been rewritten to cope with situations like the following: when a new service of three units is created, units 1 and 2 may already have elected a master between themselves before unit 0 joins the relation, so the old assumption that the lowest numbered unit in a new replication peer relation is the master no longer holds.
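
The heart of that rework is easiest to see stripped of the hook plumbing. The sketch below is illustrative only (choose_master() and its peer_states dict are hypothetical stand-ins; the charm itself works from hookenv.relation_get() state and unit_sorted() inside elected_master()), but it follows the same decision order as the diff: believe any peer that already claims to be master, wait if replication was previously set up but no master is currently claimed, and only fall back to "lowest numbered unit wins" for a genuinely new peer group.

    def choose_master(local_unit, peer_states):
        """Illustrative election logic mirroring the reworked elected_master().

        peer_states maps peer unit names to their published replication state
        ('standalone', 'master', 'hot standby' or 'failover'). Returns the
        master unit name, or None if we must wait for a peer to declare itself.
        """
        # If another peer thinks it is the master, believe it.
        for unit, state in peer_states.items():
            if state == 'master':
                return unit

        # Replication was set up before but no master is claimed yet:
        # wait for one of the participating units to be promoted.
        if any(state != 'standalone' for state in peer_states.values()):
            return None

        # Brand new peer group: lowest numbered unit wins.
        units = list(peer_states) + [local_unit]
        return sorted(units, key=lambda u: int(u.split('/')[1]))[0]

    # Unit 0 joining after units 1 and 2 have already elected a master:
    print(choose_master('postgresql/0',
                        {'postgresql/1': 'master',
                         'postgresql/2': 'hot standby'}))  # -> postgresql/1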

75. By Stuart Bishop

Merged cleanups into fix-races.

Mark Mims (mark-mims) :
review: Approve

Preview Diff

=== modified file 'config.yaml'
--- config.yaml 2013-05-27 14:37:19 +0000
+++ config.yaml 2013-08-23 09:40:16 +0000
@@ -313,3 +313,10 @@
     type: string
     description: |
       Extra archives to add with add-apt-repository(1).
+  advisory_lock_restart_key:
+    default: 765
+    type: int
+    description: |
+      An advisory lock key used internally by the charm. You do not need
+      to change it unless it happens to conflict with an advisory lock key
+      being used by your applications.

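The key configured above feeds the new restart_lock() context manager added to hooks/hooks.py below: a unit about to restart PostgreSQL takes the exclusive form of the advisory lock, while a unit cloning the master takes the shared form, so concurrent clones block a restart without blocking each other. A minimal psycopg2 sketch of that idea, assuming only a reachable PostgreSQL instance (the connection string is illustrative, not taken from the charm):

    import psycopg2

    ADVISORY_LOCK_RESTART_KEY = 765  # the config default above

    conn = psycopg2.connect("dbname=template1 user=postgres")
    conn.autocommit = True
    cur = conn.cursor()

    # Restarting unit: exclusive lock. A cloning unit would call
    # pg_advisory_lock_shared() instead, as restart_lock() does.
    cur.execute("SELECT pg_advisory_lock(%s)", (ADVISORY_LOCK_RESTART_KEY,))
    try:
        pass  # restart (or clone) happens while the lock is held
    finally:
        cur.execute("SELECT pg_advisory_unlock(%s)",
                    (ADVISORY_LOCK_RESTART_KEY,))
        conn.close()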
=== modified file 'hooks/hooks.py'
--- hooks/hooks.py 2013-08-23 09:40:15 +0000
+++ hooks/hooks.py 2013-08-23 09:40:16 +0000
@@ -20,7 +20,7 @@
 
 from charmhelpers.core import hookenv, host
 from charmhelpers.core.hookenv import (
-    CRITICAL, ERROR, WARNING, INFO, DEBUG, log,
+    CRITICAL, ERROR, WARNING, INFO, DEBUG,
     )
 
 hooks = hookenv.Hooks()
@@ -28,16 +28,24 @@
 # jinja2 may not be importable until the install hook has installed the
 # required packages.
 def Template(*args, **kw):
+    """jinja2.Template with deferred jinja2 import"""
     from jinja2 import Template
     return Template(*args, **kw)
 
 
 def log(msg, lvl=INFO):
-    # Per Bug #1208787, log messages sent via juju-log are being lost.
-    # Spit messages out to a log file to work around the problem.
+    '''Log a message.
+
+    Per Bug #1208787, log messages sent via juju-log are being lost.
+    Spit messages out to a log file to work around the problem.
+    It is also rather nice to have the log messages we explicitly emit
+    in a separate log file, rather than just mashed up with all the
+    juju noise.
+    '''
     myname = hookenv.local_unit().replace('/', '-')
-    with open('/tmp/{}-debug.log'.format(myname), 'a') as f:
-        f.write('{}: {}\n'.format(lvl, msg))
+    ts = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
+    with open('/var/log/juju/{}-debug.log'.format(myname), 'a') as f:
+        f.write('{} {}: {}\n'.format(ts, lvl, msg))
     hookenv.log(msg, lvl)
 
 
@@ -49,6 +57,7 @@
         self.load()
 
     def load(self):
+        '''Load stored state from local disk.'''
         if os.path.exists(self._state_file):
             state = pickle.load(open(self._state_file, 'rb'))
         else:
@@ -58,6 +67,7 @@
         self.update(state)
 
     def save(self):
+        '''Store state to local disk.'''
         state = {}
         state.update(self)
         pickle.dump(state, open(self._state_file, 'wb'))
@@ -181,13 +191,13 @@
 
 
 def postgresql_autostart(enabled):
+    startup_file = os.path.join(postgresql_config_dir, 'start.conf')
     if enabled:
         log("Enabling PostgreSQL startup in {}".format(startup_file))
         mode = 'auto'
     else:
         log("Disabling PostgreSQL startup in {}".format(startup_file))
         mode = 'manual'
-    startup_file = os.path.join(postgresql_config_dir, 'start.conf')
     contents = Template(open("templates/start_conf.tmpl").read()).render(
         {'mode': mode})
     host.write_file(
@@ -209,6 +219,7 @@
 
 
 def postgresql_is_running():
+    '''Return true if PostgreSQL is running.'''
     # init script always return true (9.1), add extra check to make it useful
     status, output = commands.getstatusoutput("invoke-rc.d postgresql status")
     if status != 0:
@@ -219,72 +230,65 @@
 
 
 def postgresql_stop():
-    host.service_stop('postgresql')
-    return not postgresql_is_running()
+    '''Shutdown PostgreSQL.'''
+    success = host.service_stop('postgresql')
+    return not (success and postgresql_is_running())
 
 
 def postgresql_start():
-    host.service_start('postgresql')
-    return postgresql_is_running()
+    '''Start PostgreSQL if it is not already running.'''
+    success = host.service_start('postgresql')
+    return success and postgresql_is_running()
 
 
 def postgresql_restart():
+    '''Restart PostgreSQL, or start it if it is not already running.'''
     if postgresql_is_running():
-        # If the database is in backup mode, we don't want to restart
-        # PostgreSQL and abort the procedure. This may be another unit being
-        # cloned, or a filesystem level backup is being made. There is no
-        # timeout here, as backups can take hours or days. Instead, keep
-        # logging so admins know wtf is going on.
-        last_warning = time.time()
-        while postgresql_is_in_backup_mode():
-            if time.time() + 120 > last_warning:
-                log("In backup mode. PostgreSQL restart blocked.", WARNING)
-                log(
-                    "Run \"psql -U postgres -c 'SELECT pg_stop_backup()'\""
-                    "to cancel backup mode and forcefully unblock this hook.")
-                last_warning = time.time()
-            time.sleep(5)
-
-        return host.service_restart('postgresql')
+        with restart_lock(hookenv.local_unit(), True):
+            # 'service postgresql restart' fails; it only does a reload.
+            # success = host.service_restart('postgresql')
+            try:
+                run('pg_ctlcluster -force {version} {cluster_name} '
+                    'restart'.format(**config_data))
+                success = True
+            except subprocess.CalledProcessError as e:
+                success = False
     else:
-        return host.service_start('postgresql')
+        success = host.service_start('postgresql')
 
     # Store a copy of our known live configuration so
     # postgresql_reload_or_restart() can make good choices.
-    if 'saved_config' in local_state:
+    if success and 'saved_config' in local_state:
         local_state['live_config'] = local_state['saved_config']
         local_state.save()
 
-    return postgresql_is_running()
+    return success and postgresql_is_running()
 
 
 def postgresql_reload():
+    '''Make PostgreSQL reload its configuration.'''
     # reload returns a reliable exit status
     status, output = commands.getstatusoutput("invoke-rc.d postgresql reload")
     return (status == 0)
 
 
-def postgresql_reload_or_restart():
-    """Reload PostgreSQL configuration, restarting if necessary."""
-    # Pull in current values of settings that can only be changed on
-    # server restart.
+def requires_restart():
+    '''Check for configuration changes requiring a restart to take effect.'''
     if not postgresql_is_running():
-        return postgresql_restart()
+        return True
 
-    # Suck in the config last written to postgresql.conf.
     saved_config = local_state.get('saved_config', None)
     if not saved_config:
         # No record of postgresql.conf state, perhaps an upgrade.
         # Better restart.
-        return postgresql_restart()
+        return True
 
-    # Suck in our live config from last time we restarted.
     live_config = local_state.setdefault('live_config', {})
 
     # Pull in a list of PostgreSQL settings.
     cur = db_cursor()
     cur.execute("SELECT name, context FROM pg_settings")
-    requires_restart = False
+    restart = False
     for name, context in cur.fetchall():
         live_value = live_config.get(name, None)
         new_value = saved_config.get(name, None)
@@ -296,23 +300,27 @@
             if context == 'postmaster':
                 # A setting has changed that requires PostgreSQL to be
                 # restarted before it will take effect.
-                requires_restart = True
-
-    if requires_restart:
-        # A change has been requested that requires a restart.
-        log(
-            "Configuration change requires PostgreSQL restart. Restarting.",
+                restart = True
+    return restart
+
+
+def postgresql_reload_or_restart():
+    """Reload PostgreSQL configuration, restarting if necessary."""
+    if requires_restart():
+        log("Configuration change requires PostgreSQL restart. Restarting.",
             WARNING)
-        rc = postgresql_restart()
+        success = postgresql_restart()
+        if not success or requires_restart():
+            log("Configuration changes failed to apply", WARNING)
+            success = False
     else:
-        log("PostgreSQL reload, config changes taking effect.", DEBUG)
-        rc = postgresql_reload()  # No pending need to bounce, just reload.
+        success = host.service_reload('postgresql')
 
-    if rc == 0 and 'saved_config' in local_state:
-        local_state['live_config'] = local_state['saved_config']
+    if success:
+        local_state['saved_config'] = local_state['live_config']
         local_state.save()
 
-    return rc
+    return success
 
 
 def get_service_port(postgresql_config):
@@ -344,8 +352,6 @@
         config_data["shared_buffers"] = \
             "%sMB" % (int(int(total_ram) * 0.15),)
         # XXX: This is very messy - should probably be a subordinate charm
-        # file overlaps with __builtin__.file ... renaming to conf_file
-        # negronjl
         conf_file = open("/etc/sysctl.d/50-postgresql.conf", "w")
         conf_file.write("kernel.sem = 250 32000 100 1024\n")
         conf_file.write("kernel.shmall = %s\n" %
@@ -579,7 +585,7 @@
 
 
 def db_cursor(autocommit=False, db='template1', user='postgres',
-              host=None, timeout=120):
+              host=None, timeout=30):
     import psycopg2
     if host:
         conn_str = "dbname={} host={} user={}".format(db, host, user)
@@ -855,14 +861,16 @@
 
 @hooks.hook()
 def start():
-    if not postgresql_restart():
+    if not postgresql_reload_or_restart():
         raise SystemExit(1)
 
 
 @hooks.hook()
 def stop():
-    if not postgresql_stop():
-        raise SystemExit(1)
+    if postgresql_is_running():
+        with restart_lock(hookenv.local_unit(), True):
+            if not postgresql_stop():
+                raise SystemExit(1)
 
 
 def quote_identifier(identifier):
@@ -1163,7 +1171,7 @@
 def db_relation_broken():
     from psycopg2.extensions import AsIs
 
-    relid = os.environ['JUJU_RELATION_ID']
+    relid = hookenv.relation_id()
     if relid not in local_state['relations']['db']:
         # This was to be a hot standby, but it had not yet got as far as
         # receiving and handling credentials from the master.
@@ -1174,7 +1182,7 @@
     # we used from there. Instead, we have to persist this information
     # ourselves.
     relation = local_state['relations']['db'][relid]
-    unit_relation_data = relation[os.environ['JUJU_UNIT_NAME']]
+    unit_relation_data = relation[hookenv.local_unit()]
 
     if local_state['state'] in ('master', 'standalone'):
         user = unit_relation_data.get('user', None)
@@ -1303,27 +1311,75 @@
1303 log("I am already the master", DEBUG)1311 log("I am already the master", DEBUG)
1304 return hookenv.local_unit()1312 return hookenv.local_unit()
13051313
1314 if local_state['state'] == 'hot standby':
1315 log("I am already following {}".format(
1316 local_state['following']), DEBUG)
1317 return local_state['following']
1318
1319 replication_relid = hookenv.relation_ids('replication')[0]
1320 replication_units = hookenv.related_units(replication_relid)
1321
1322 if local_state['state'] == 'standalone':
1323 log("I'm a standalone unit wanting to participate in replication")
1324 existing_replication = False
1325 for unit in replication_units:
1326 # If another peer thinks it is the master, believe it.
1327 remote_state = hookenv.relation_get(
1328 'state', unit, replication_relid)
1329 if remote_state == 'master':
1330 log("{} thinks it is the master, believing it".format(
1331 unit), DEBUG)
1332 return unit
1333
1334 # If we find a peer that isn't standalone, we know
1335 # replication has already been setup at some point.
1336 if remote_state != 'standalone':
1337 existing_replication = True
1338
1339 # If we are joining a peer relation where replication has
1340 # already been setup, but there is currently no master, wait
1341 # until one of the remaining participating units has been
1342 # promoted to master. Only they have the data we need to
1343 # preserve.
1344 if existing_replication:
1345 log("Peers participating in replication need to elect a master",
1346 DEBUG)
1347 return None
1348
1349 # There are no peers claiming to be master, and there is no
1350 # election in progress, so lowest numbered unit wins.
1351 units = replication_units + [hookenv.local_unit()]
1352 master = unit_sorted(units)[0]
1353 if master == hookenv.local_unit():
1354 log("I'm Master - lowest numbered unit in new peer group")
1355 return master
1356 else:
1357 log("Waiting on {} to declare itself Master".format(master), DEBUG)
1358 return None
1359
1306 if local_state['state'] == 'failover':1360 if local_state['state'] == 'failover':
1307 former_master = local_state['following']1361 former_master = local_state['following']
1308 log("Failover from {}".format(former_master))1362 log("Failover from {}".format(former_master))
13091363
1310 units_not_in_failover = set()1364 units_not_in_failover = set()
1311 for relid in hookenv.relation_ids('replication'):1365 candidates = set()
1312 for unit in hookenv.related_units(relid):1366 for unit in replication_units:
1313 if unit == former_master:1367 if unit == former_master:
1314 log("Found dying master {}".format(unit), DEBUG)1368 log("Found dying master {}".format(unit), DEBUG)
1315 continue1369 continue
13161370
1317 relation = hookenv.relation_get(unit=unit, rid=relid)1371 relation = hookenv.relation_get(unit=unit, rid=replication_relid)
13181372
1319 if relation['state'] == 'master':1373 if relation['state'] == 'master':
1320 log(1374 log("{} says it already won the election".format(unit),
1321 "{} says it already won the election".format(unit),1375 INFO)
1322 INFO)1376 return unit
1323 return unit1377
13241378 if relation['state'] == 'failover':
1325 if relation['state'] != 'failover':1379 candidates.add(unit)
1326 units_not_in_failover.add(unit)1380
1381 elif relation['state'] != 'standalone':
1382 units_not_in_failover.add(unit)
13271383
1328 if units_not_in_failover:1384 if units_not_in_failover:
1329 log("{} unaware of impending election. Deferring result.".format(1385 log("{} unaware of impending election. Deferring result.".format(
@@ -1333,35 +1389,24 @@
1333 log("Election in progress")1389 log("Election in progress")
1334 winner = None1390 winner = None
1335 winning_offset = -11391 winning_offset = -1
1336 for relid in hookenv.relation_ids('replication'):1392 candidates.add(hookenv.local_unit())
1337 candidates = set(hookenv.related_units(relid))1393 # Sort the unit lists so we get consistent results in a tie
1338 candidates.add(hookenv.local_unit())1394 # and lowest unit number wins.
1339 candidates.discard(former_master)1395 for unit in unit_sorted(candidates):
1340 # Sort the unit lists so we get consistent results in a tie1396 relation = hookenv.relation_get(unit=unit, rid=replication_relid)
1341 # and lowest unit number wins.1397 if int(relation['wal_received_offset']) > winning_offset:
1342 for unit in unit_sorted(candidates):1398 winner = unit
1343 relation = hookenv.relation_get(unit=unit, rid=relid)1399 winning_offset = int(relation['wal_received_offset'])
1344 if int(relation['wal_received_offset']) > winning_offset:
1345 winner = unit
1346 winning_offset = int(relation['wal_received_offset'])
13471400
1348 # All remaining hot standbys are in failover mode and have1401 # All remaining hot standbys are in failover mode and have
1349 # reported their wal_received_offset. We can declare victory.1402 # reported their wal_received_offset. We can declare victory.
1350 log("{} won the election as is the new master".format(winner))1403 if winner == hookenv.local_unit():
1351 return winner1404 log("I won the election, announcing myself winner")
13521405 return winner
1353 # Maybe another peer thinks it is the master?1406 else:
1354 for relid in hookenv.relation_ids('replication'):1407 log("Waiting for {} to announce its victory".format(winner),
1355 for unit in hookenv.related_units(relid):1408 DEBUG)
1356 if hookenv.relation_get('state', unit, relid) == 'master':1409 return None
1357 return unit
1358
1359 # New peer group. Lowest numbered unit will be the master.
1360 for relid in hookenv.relation_ids('replication'):
1361 units = hookenv.related_units(relid) + [hookenv.local_unit()]
1362 master = unit_sorted(units)[0]
1363 log("New peer group. {} is elected master".format(master))
1364 return master
13651410
13661411
1367@hooks.hook('replication-relation-joined', 'replication-relation-changed')1412@hooks.hook('replication-relation-joined', 'replication-relation-changed')
@@ -1419,10 +1464,7 @@
1419 log("Fresh unit. I will clone {} and become a hot standby".format(1464 log("Fresh unit. I will clone {} and become a hot standby".format(
1420 master))1465 master))
14211466
1422 # Before we start destroying anything, ensure that the
1423 # master is contactable.
1424 master_ip = hookenv.relation_get('private-address', master)1467 master_ip = hookenv.relation_get('private-address', master)
1425 wait_for_db(db='postgres', user='juju_replication', host=master_ip)
14261468
1427 clone_database(master, master_ip)1469 clone_database(master, master_ip)
14281470
@@ -1592,8 +1634,55 @@
         os.chdir(org_dir)
 
 
+@contextmanager
+def restart_lock(unit, exclusive):
+    '''Aquire the database restart lock on the given unit.
+
+    A database needing a restart should grab an exclusive lock before
+    doing so. To block a remote database from doing a restart, grab a shared
+    lock.
+    '''
+    import psycopg2
+    key = long(config_data['advisory_lock_restart_key'])
+    if exclusive:
+        lock_function = 'pg_advisory_lock'
+    else:
+        lock_function = 'pg_advisory_lock_shared'
+    q = 'SELECT {}({})'.format(lock_function, key)
+
+    # We will get an exception if the database is rebooted while waiting
+    # for a shared lock. If the connection is killed, we retry a few
+    # times to cope.
+    num_retries = 3
+
+    for count in range(0, num_retries):
+        try:
+            if unit == hookenv.local_unit():
+                cur = db_cursor(autocommit=True)
+            else:
+                host = hookenv.relation_get('private-address', unit)
+                cur = db_cursor(
+                    autocommit=True, db='postgres',
+                    user='juju_replication', host=host)
+            cur.execute(q)
+            break
+        except psycopg2.Error:
+            if count == num_retries - 1:
+                raise
+
+    try:
+        yield
+    finally:
+        # Close our connection, swallowing any exceptions as the database
+        # may be being rebooted now we have released our lock.
+        try:
+            del cur
+        except psycopg2.Error:
+            pass
+
+
 def clone_database(master_unit, master_host):
-    with pgpass():
+    with restart_lock(master_unit, False):
         postgresql_stop()
         log("Cloning master {}".format(master_unit))
 
@@ -1607,9 +1696,10 @@
             shutil.rmtree(postgresql_cluster_dir)
 
         try:
-            # Change directory the postgres user can read.
-            with switch_cwd('/tmp'):
-                # Run the sudo command.
+            # Change directory the postgres user can read, and need
+            # .pgpass too.
+            with switch_cwd('/tmp'), pgpass():
+                # Clone the master with pg_basebackup.
                 output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
             log(output, DEBUG)
             # Debian by default expects SSL certificates in the datadir.
@@ -1626,8 +1716,8 @@
             # can retry hooks again. Even assuming the charm is
             # functioning correctly, the clone may still fail
             # due to eg. lack of disk space.
-            log("Clone failed, db cluster destroyed", ERROR)
             log(x.output, ERROR)
+            log("Clone failed, local db destroyed", ERROR)
             if os.path.exists(postgresql_cluster_dir):
                 shutil.rmtree(postgresql_cluster_dir)
             if os.path.exists(postgresql_config_dir):
@@ -1652,6 +1742,15 @@
             os.path.join(postgresql_cluster_dir, 'backup_label'))
 
 
+def pg_basebackup_is_running():
+    cur = db_cursor(autocommit=True)
+    cur.execute("""
+        SELECT count(*) FROM pg_stat_activity
+        WHERE usename='juju_replication' AND application_name='pg_basebackup'
+        """)
+    return cur.fetchone()[0] > 0
+
+
 def postgresql_wal_received_offset():
     """How much WAL we have.
 
@@ -1694,7 +1793,7 @@
     try:
         nagios_uid = getpwnam('nagios').pw_uid
         nagios_gid = getgrnam('nagios').gr_gid
-    except:
+    except Exception:
         hookenv.log("Nagios user not set up.", hookenv.DEBUG)
         return
 
 
=== modified file 'test.py'
--- test.py 2013-08-23 09:40:15 +0000
+++ test.py 2013-08-23 09:40:16 +0000
@@ -74,12 +74,12 @@
         return None
 
     def deploy(self, charm, name=None, num_units=1):
-        # The first time we deploy a charm in the test run, it needs to
-        # deploy with --update to ensure we are testing the desired
-        # revision of the charm. Subsequent deploys we do not use
-        # --update to avoid overhead and needless incrementing of the
-        # revision number.
-        if charm.startswith('cs:') or charm in self._deployed_charms:
+        # The first time we deploy a local: charm in the test run, it
+        # needs to deploy with --update to ensure we are testing the
+        # desired revision of the charm. Subsequent deploys we do not
+        # use --update to avoid overhead and needless incrementing of the
+        # revision number.
+        if not charm.startswith('local:') or charm in self._deployed_charms:
             cmd = ['deploy']
         else:
             cmd = ['deploy', '-u']
@@ -102,7 +102,7 @@
         self.status = self.get_result(['status'])
         return self.status
 
-    def wait_until_ready(self):
+    def wait_until_ready(self, extra=45):
         ready = False
         while not ready:
             self.refresh_status()
@@ -128,7 +128,7 @@
         # enough that our system is probably stable. This means we have
         # extremely slow and flaky tests, but that is possibly better
         # than no tests.
-        time.sleep(45)
+        time.sleep(extra)
 
     def setUp(self):
         DEBUG("JujuFixture.setUp()")
@@ -156,7 +156,7 @@
         # Per Bug #1190250 (WONTFIX), we need to wait for dying services
         # to die before we can continue.
         if found_services:
-            self.wait_until_ready()
+            self.wait_until_ready(0)
 
         # We shouldn't reuse machines, as we have no guarantee they are
         # still in a usable state, so tear them down too. Per
@@ -305,15 +305,18 @@
         self.juju.do(['add-relation', 'postgresql:db', 'psql:db'])
         self.juju.wait_until_ready()
 
-        # On a freshly setup service, lowest numbered unit is always the
-        # master.
-        units = unit_sorted(
-            self.juju.status['services']['postgresql']['units'].keys())
-        master_unit, standby_unit_1, standby_unit_2 = units
-
-        self.assertIs(True, self.is_master(master_unit))
-        self.assertIs(False, self.is_master(standby_unit_1))
-        self.assertIs(False, self.is_master(standby_unit_2))
+        # Even on a freshly setup service, we have no idea which unit
+        # will become the master as we have no control over which two
+        # units join the peer relation first.
+        units = sorted((self.is_master(unit), unit)
+            for unit in
+            self.juju.status['services']['postgresql']['units'].keys())
+        self.assertFalse(units[0][0])
+        self.assertFalse(units[1][0])
+        self.assertTrue(units[2][0])
+        standby_unit_1 = units[0][1]
+        standby_unit_2 = units[1][1]
+        master_unit = units[2][1]
 
         self.sql('CREATE TABLE Token (x int)', master_unit)
 
@@ -390,11 +393,18 @@
         self.juju.do(['add-relation', 'postgresql:db-admin', 'psql:db-admin'])
         self.juju.wait_until_ready()
 
-        # On a freshly setup service, lowest numbered unit is always the
-        # master.
-        units = unit_sorted(
-            self.juju.status['services']['postgresql']['units'].keys())
-        master_unit, standby_unit_1, standby_unit_2 = units
+        # Even on a freshly setup service, we have no idea which unit
+        # will become the master as we have no control over which two
+        # units join the peer relation first.
+        units = sorted((self.is_master(unit, 'postgres'), unit)
+            for unit in
+            self.juju.status['services']['postgresql']['units'].keys())
+        self.assertFalse(units[0][0])
+        self.assertFalse(units[1][0])
+        self.assertTrue(units[2][0])
+        standby_unit_1 = units[0][1]
+        standby_unit_2 = units[1][1]
+        master_unit = units[2][1]
 
         # Shutdown PostgreSQL on standby_unit_1 and ensure
         # standby_unit_2 will have received more WAL information from
