Merge lp:~stub/charms/precise/postgresql/fix-races into lp:charms/postgresql
Proposed by: Stuart Bishop
Status: Merged
Approved by: Mark Mims
Approved revision: 75
Merged at revision: 63
Proposed branch: lp:~stub/charms/precise/postgresql/fix-races
Merge into: lp:charms/postgresql
Prerequisite: lp:~stub/charms/precise/postgresql/cleanups
Diff against target: 650 lines (+247/-131), 3 files modified: config.yaml (+7/-0), hooks/hooks.py (+208/-109), test.py (+32/-22)
To merge this branch: bzr merge lp:~stub/charms/precise/postgresql/fix-races
Related bugs: none
Reviewer: Mark Mims (community): Approve
Review via email: mp+181740@code.launchpad.net
Commit message
Description of the change
The new local provider is quite fast, and exposes race conditions in the PostgreSQL charm.
This branch reworks the replication peer relationship so that the test suite runs more reliably. For example, master election has been rewritten to cope with situations such as this: when a new service of three units is created, units 1 and 2 may already have elected a master between themselves before unit 0 joins the relation, so the old assumption that the lowest numbered unit in a new replication peer relation is always the master no longer holds.
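The reworked election boils down to a short decision order. The following is a rough sketch only, not the charm code itself; helper names such as replication_peers(), peer_state(), local_unit() and unit_sorted() are stand-ins for the charm's actual helpers (see the Preview Diff below for the real implementation):

    def elect_master():
        # 1. If any peer already claims to be the master, believe it.
        for peer in replication_peers():
            if peer_state(peer) == 'master':
                return peer
        # 2. If replication is already set up among the peers but no master
        #    is visible yet, defer the decision; only the units already
        #    participating hold the data that must be preserved.
        if any(peer_state(peer) != 'standalone' for peer in replication_peers()):
            return None  # try again on a later relation hook
        # 3. Brand new peer group: the lowest numbered unit, including
        #    ourselves, wins the election.
        return unit_sorted(replication_peers() + [local_unit()])[0]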
- 75. By Stuart Bishop: Merged cleanups into fix-races.
Revision history for this message
Mark Mims (mark-mims):
review: Approve
Preview Diff
=== modified file 'config.yaml'
--- config.yaml 2013-05-27 14:37:19 +0000
+++ config.yaml 2013-08-23 09:40:16 +0000
@@ -313,3 +313,10 @@
         type: string
         description: |
             Extra archives to add with add-apt-repository(1).
+    advisory_lock_restart_key:
+        default: 765
+        type: int
+        description: |
+            An advisory lock key used internally by the charm. You do not need
+            to change it unless it happens to conflict with an advisory lock key
+            being used by your applications.

=== modified file 'hooks/hooks.py'
--- hooks/hooks.py 2013-08-23 09:40:15 +0000
+++ hooks/hooks.py 2013-08-23 09:40:16 +0000
@@ -20,7 +20,7 @@
 
 from charmhelpers.core import hookenv, host
 from charmhelpers.core.hookenv import (
-    CRITICAL, ERROR, WARNING, INFO, DEBUG, log,
+    CRITICAL, ERROR, WARNING, INFO, DEBUG,
     )
 
 hooks = hookenv.Hooks()
@@ -28,16 +28,24 @@
 # jinja2 may not be importable until the install hook has installed the
 # required packages.
 def Template(*args, **kw):
+    """jinja2.Template with deferred jinja2 import"""
     from jinja2 import Template
     return Template(*args, **kw)
 
 
 def log(msg, lvl=INFO):
-    # Per Bug #1208787, log messages sent via juju-log are being lost.
-    # Spit messages out to a log file to work around the problem.
+    '''Log a message.
+
+    Per Bug #1208787, log messages sent via juju-log are being lost.
+    Spit messages out to a log file to work around the problem.
+    It is also rather nice to have the log messages we explicitly emit
+    in a separate log file, rather than just mashed up with all the
+    juju noise.
+    '''
     myname = hookenv.local_unit().replace('/', '-')
-    with open('/tmp/{}-debug.log'.format(myname), 'a') as f:
-        f.write('{}: {}\n'.format(lvl, msg))
+    ts = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
+    with open('/var/log/juju/{}-debug.log'.format(myname), 'a') as f:
+        f.write('{} {}: {}\n'.format(ts, lvl, msg))
     hookenv.log(msg, lvl)
 
 
@@ -49,6 +57,7 @@
         self.load()
 
     def load(self):
+        '''Load stored state from local disk.'''
        if os.path.exists(self._state_file):
            state = pickle.load(open(self._state_file, 'rb'))
        else:
@@ -58,6 +67,7 @@
         self.update(state)
 
     def save(self):
+        '''Store state to local disk.'''
         state = {}
         state.update(self)
         pickle.dump(state, open(self._state_file, 'wb'))
@@ -181,13 +191,13 @@
 
 
 def postgresql_autostart(enabled):
+    startup_file = os.path.join(postgresql_config_dir, 'start.conf')
     if enabled:
         log("Enabling PostgreSQL startup in {}".format(startup_file))
         mode = 'auto'
     else:
         log("Disabling PostgreSQL startup in {}".format(startup_file))
         mode = 'manual'
-    startup_file = os.path.join(postgresql_config_dir, 'start.conf')
     contents = Template(open("templates/start_conf.tmpl").read()).render(
         {'mode': mode})
     host.write_file(
@@ -209,6 +219,7 @@
 
 
 def postgresql_is_running():
+    '''Return true if PostgreSQL is running.'''
     # init script always return true (9.1), add extra check to make it useful
     status, output = commands.getstatusoutput("invoke-rc.d postgresql status")
     if status != 0:
@@ -219,72 +230,65 @@
 
 
 def postgresql_stop():
-    host.service_stop('postgresql')
-    return not postgresql_is_running()
+    '''Shutdown PostgreSQL.'''
+    success = host.service_stop('postgresql')
+    return not (success and postgresql_is_running())
 
 
 def postgresql_start():
-    host.service_start('postgresql')
-    return postgresql_is_running()
+    '''Start PostgreSQL if it is not already running.'''
+    success = host.service_start('postgresql')
+    return success and postgresql_is_running()
 
 
 def postgresql_restart():
+    '''Restart PostgreSQL, or start it if it is not already running.'''
     if postgresql_is_running():
-        # If the database is in backup mode, we don't want to restart
-        # PostgreSQL and abort the procedure. This may be another unit being
-        # cloned, or a filesystem level backup is being made. There is no
-        # timeout here, as backups can take hours or days. Instead, keep
-        # logging so admins know wtf is going on.
-        last_warning = time.time()
-        while postgresql_is_in_backup_mode():
-            if time.time() + 120 > last_warning:
-                log("In backup mode. PostgreSQL restart blocked.", WARNING)
-                log(
-                    "Run \"psql -U postgres -c 'SELECT pg_stop_backup()'\""
-                    "to cancel backup mode and forcefully unblock this hook.")
-                last_warning = time.time()
-            time.sleep(5)
-
-        return host.service_restart('postgresql')
+        with restart_lock(hookenv.local_unit(), True):
+            # 'service postgresql restart' fails; it only does a reload.
+            # success = host.service_restart('postgresql')
+            try:
+                run('pg_ctlcluster -force {version} {cluster_name} '
+                    'restart'.format(**config_data))
+                success = True
+            except subprocess.CalledProcessError as e:
+                success = False
     else:
-        return host.service_start('postgresql')
+        success = host.service_start('postgresql')
 
     # Store a copy of our known live configuration so
     # postgresql_reload_or_restart() can make good choices.
-    if 'saved_config' in local_state:
+    if success and 'saved_config' in local_state:
         local_state['live_config'] = local_state['saved_config']
         local_state.save()
 
-    return postgresql_is_running()
+    return success and postgresql_is_running()
 
 
 def postgresql_reload():
+    '''Make PostgreSQL reload its configuration.'''
     # reload returns a reliable exit status
     status, output = commands.getstatusoutput("invoke-rc.d postgresql reload")
     return (status == 0)
 
 
-def postgresql_reload_or_restart():
-    """Reload PostgreSQL configuration, restarting if necessary."""
-    # Pull in current values of settings that can only be changed on
-    # server restart.
+def requires_restart():
+    '''Check for configuration changes requiring a restart to take effect.'''
     if not postgresql_is_running():
-        return postgresql_restart()
+        return True
 
-    # Suck in the config last written to postgresql.conf.
     saved_config = local_state.get('saved_config', None)
     if not saved_config:
         # No record of postgresql.conf state, perhaps an upgrade.
         # Better restart.
-        return postgresql_restart()
+        return True
 
-    # Suck in our live config from last time we restarted.
     live_config = local_state.setdefault('live_config', {})
 
     # Pull in a list of PostgreSQL settings.
     cur = db_cursor()
     cur.execute("SELECT name, context FROM pg_settings")
-    requires_restart = False
+    restart = False
     for name, context in cur.fetchall():
         live_value = live_config.get(name, None)
         new_value = saved_config.get(name, None)
@@ -296,23 +300,27 @@
             if context == 'postmaster':
                 # A setting has changed that requires PostgreSQL to be
                 # restarted before it will take effect.
-                requires_restart = True
-
-    if requires_restart:
-        # A change has been requested that requires a restart.
-        log(
-            "Configuration change requires PostgreSQL restart. Restarting.",
+                restart = True
+    return restart
+
+
+def postgresql_reload_or_restart():
+    """Reload PostgreSQL configuration, restarting if necessary."""
+    if requires_restart():
+        log("Configuration change requires PostgreSQL restart. Restarting.",
             WARNING)
-        rc = postgresql_restart()
+        success = postgresql_restart()
+        if not success or requires_restart():
+            log("Configuration changes failed to apply", WARNING)
+            success = False
     else:
-        log("PostgreSQL reload, config changes taking effect.", DEBUG)
-        rc = postgresql_reload()  # No pending need to bounce, just reload.
+        success = host.service_reload('postgresql')
 
-    if rc == 0 and 'saved_config' in local_state:
-        local_state['live_config'] = local_state['saved_config']
+    if success:
+        local_state['saved_config'] = local_state['live_config']
         local_state.save()
 
-    return rc
+    return success
 
 
 def get_service_port(postgresql_config):
@@ -344,8 +352,6 @@
         config_data["shared_buffers"] = \
             "%sMB" % (int(int(total_ram) * 0.15),)
         # XXX: This is very messy - should probably be a subordinate charm
-        # file overlaps with __builtin__.file ... renaming to conf_file
-        # negronjl
         conf_file = open("/etc/sysctl.d/50-postgresql.conf", "w")
         conf_file.write("kernel.sem = 250 32000 100 1024\n")
         conf_file.write("kernel.shmall = %s\n" %
@@ -579,7 +585,7 @@
 
 
 def db_cursor(autocommit=False, db='template1', user='postgres',
-              host=None, timeout=120):
+              host=None, timeout=30):
     import psycopg2
     if host:
         conn_str = "dbname={} host={} user={}".format(db, host, user)
@@ -855,14 +861,16 @@
 
 @hooks.hook()
 def start():
-    if not postgresql_restart():
+    if not postgresql_reload_or_restart():
         raise SystemExit(1)
 
 
 @hooks.hook()
 def stop():
-    if not postgresql_stop():
-        raise SystemExit(1)
+    if postgresql_is_running():
+        with restart_lock(hookenv.local_unit(), True):
+            if not postgresql_stop():
+                raise SystemExit(1)
 
 
 def quote_identifier(identifier):
@@ -1163,7 +1171,7 @@
 def db_relation_broken():
     from psycopg2.extensions import AsIs
 
-    relid = os.environ['JUJU_RELATION_ID']
+    relid = hookenv.relation_id()
     if relid not in local_state['relations']['db']:
         # This was to be a hot standby, but it had not yet got as far as
         # receiving and handling credentials from the master.
@@ -1174,7 +1182,7 @@
     # we used from there. Instead, we have to persist this information
     # ourselves.
     relation = local_state['relations']['db'][relid]
-    unit_relation_data = relation[os.environ['JUJU_UNIT_NAME']]
+    unit_relation_data = relation[hookenv.local_unit()]
 
     if local_state['state'] in ('master', 'standalone'):
         user = unit_relation_data.get('user', None)
@@ -1303,27 +1311,75 @@
         log("I am already the master", DEBUG)
         return hookenv.local_unit()
 
+    if local_state['state'] == 'hot standby':
+        log("I am already following {}".format(
+            local_state['following']), DEBUG)
+        return local_state['following']
+
+    replication_relid = hookenv.relation_ids('replication')[0]
+    replication_units = hookenv.related_units(replication_relid)
+
+    if local_state['state'] == 'standalone':
+        log("I'm a standalone unit wanting to participate in replication")
+        existing_replication = False
+        for unit in replication_units:
+            # If another peer thinks it is the master, believe it.
+            remote_state = hookenv.relation_get(
+                'state', unit, replication_relid)
+            if remote_state == 'master':
+                log("{} thinks it is the master, believing it".format(
+                    unit), DEBUG)
+                return unit
+
+            # If we find a peer that isn't standalone, we know
+            # replication has already been setup at some point.
+            if remote_state != 'standalone':
+                existing_replication = True
+
+        # If we are joining a peer relation where replication has
+        # already been setup, but there is currently no master, wait
+        # until one of the remaining participating units has been
+        # promoted to master. Only they have the data we need to
+        # preserve.
+        if existing_replication:
+            log("Peers participating in replication need to elect a master",
+                DEBUG)
+            return None
+
+        # There are no peers claiming to be master, and there is no
+        # election in progress, so lowest numbered unit wins.
+        units = replication_units + [hookenv.local_unit()]
+        master = unit_sorted(units)[0]
+        if master == hookenv.local_unit():
+            log("I'm Master - lowest numbered unit in new peer group")
+            return master
+        else:
+            log("Waiting on {} to declare itself Master".format(master), DEBUG)
+            return None
+
     if local_state['state'] == 'failover':
         former_master = local_state['following']
         log("Failover from {}".format(former_master))
 
         units_not_in_failover = set()
-        for relid in hookenv.relation_ids('replication'):
-            for unit in hookenv.related_units(relid):
-                if unit == former_master:
-                    log("Found dying master {}".format(unit), DEBUG)
-                    continue
-
-                relation = hookenv.relation_get(unit=unit, rid=relid)
-
-                if relation['state'] == 'master':
-                    log(
-                        "{} says it already won the election".format(unit),
-                        INFO)
-                    return unit
-
-                if relation['state'] != 'failover':
-                    units_not_in_failover.add(unit)
+        candidates = set()
+        for unit in replication_units:
+            if unit == former_master:
+                log("Found dying master {}".format(unit), DEBUG)
+                continue
+
+            relation = hookenv.relation_get(unit=unit, rid=replication_relid)
+
+            if relation['state'] == 'master':
+                log("{} says it already won the election".format(unit),
+                    INFO)
+                return unit
+
+            if relation['state'] == 'failover':
+                candidates.add(unit)
+
+            elif relation['state'] != 'standalone':
+                units_not_in_failover.add(unit)
 
         if units_not_in_failover:
             log("{} unaware of impending election. Deferring result.".format(
@@ -1333,35 +1389,24 @@
         log("Election in progress")
         winner = None
         winning_offset = -1
-        for relid in hookenv.relation_ids('replication'):
-            candidates = set(hookenv.related_units(relid))
-            candidates.add(hookenv.local_unit())
-            candidates.discard(former_master)
-            # Sort the unit lists so we get consistent results in a tie
-            # and lowest unit number wins.
-            for unit in unit_sorted(candidates):
-                relation = hookenv.relation_get(unit=unit, rid=relid)
-                if int(relation['wal_received_offset']) > winning_offset:
-                    winner = unit
-                    winning_offset = int(relation['wal_received_offset'])
+        candidates.add(hookenv.local_unit())
+        # Sort the unit lists so we get consistent results in a tie
+        # and lowest unit number wins.
+        for unit in unit_sorted(candidates):
+            relation = hookenv.relation_get(unit=unit, rid=replication_relid)
+            if int(relation['wal_received_offset']) > winning_offset:
+                winner = unit
+                winning_offset = int(relation['wal_received_offset'])
 
         # All remaining hot standbys are in failover mode and have
        # reported their wal_received_offset. We can declare victory.
-        log("{} won the election as is the new master".format(winner))
-        return winner
-
-    # Maybe another peer thinks it is the master?
-    for relid in hookenv.relation_ids('replication'):
-        for unit in hookenv.related_units(relid):
-            if hookenv.relation_get('state', unit, relid) == 'master':
-                return unit
-
-    # New peer group. Lowest numbered unit will be the master.
-    for relid in hookenv.relation_ids('replication'):
-        units = hookenv.related_units(relid) + [hookenv.local_unit()]
-        master = unit_sorted(units)[0]
-        log("New peer group. {} is elected master".format(master))
-        return master
+        if winner == hookenv.local_unit():
+            log("I won the election, announcing myself winner")
+            return winner
+        else:
+            log("Waiting for {} to announce its victory".format(winner),
+                DEBUG)
+            return None
 
 
 @hooks.hook('replication-relation-joined', 'replication-relation-changed')
@@ -1419,10 +1464,7 @@
         log("Fresh unit. I will clone {} and become a hot standby".format(
             master))
 
-        # Before we start destroying anything, ensure that the
-        # master is contactable.
         master_ip = hookenv.relation_get('private-address', master)
-        wait_for_db(db='postgres', user='juju_replication', host=master_ip)
 
         clone_database(master, master_ip)
 
@@ -1592,8 +1634,55 @@
         os.chdir(org_dir)
 
 
+@contextmanager
+def restart_lock(unit, exclusive):
+    '''Aquire the database restart lock on the given unit.
+
+    A database needing a restart should grab an exclusive lock before
+    doing so. To block a remote database from doing a restart, grab a shared
+    lock.
+    '''
+    import psycopg2
+    key = long(config_data['advisory_lock_restart_key'])
+    if exclusive:
+        lock_function = 'pg_advisory_lock'
+    else:
+        lock_function = 'pg_advisory_lock_shared'
+    q = 'SELECT {}({})'.format(lock_function, key)
+
+    # We will get an exception if the database is rebooted while waiting
+    # for a shared lock. If the connection is killed, we retry a few
+    # times to cope.
+    num_retries = 3
+
+    for count in range(0, num_retries):
+        try:
+            if unit == hookenv.local_unit():
+                cur = db_cursor(autocommit=True)
+            else:
+                host = hookenv.relation_get('private-address', unit)
+                cur = db_cursor(
+                    autocommit=True, db='postgres',
+                    user='juju_replication', host=host)
+            cur.execute(q)
+            break
+        except psycopg2.Error:
+            if count == num_retries - 1:
+                raise
+
+    try:
+        yield
+    finally:
+        # Close our connection, swallowing any exceptions as the database
+        # may be being rebooted now we have released our lock.
+        try:
+            del cur
+        except psycopg2.Error:
+            pass
+
+
 def clone_database(master_unit, master_host):
-    with pgpass():
+    with restart_lock(master_unit, False):
         postgresql_stop()
         log("Cloning master {}".format(master_unit))
 
@@ -1607,9 +1696,10 @@
         shutil.rmtree(postgresql_cluster_dir)
 
         try:
-            # Change directory the postgres user can read.
-            with switch_cwd('/tmp'):
-                # Run the sudo command.
+            # Change directory the postgres user can read, and need
+            # .pgpass too.
+            with switch_cwd('/tmp'), pgpass():
+                # Clone the master with pg_basebackup.
                 output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
                 log(output, DEBUG)
                 # Debian by default expects SSL certificates in the datadir.
@@ -1626,8 +1716,8 @@
             # can retry hooks again. Even assuming the charm is
             # functioning correctly, the clone may still fail
            # due to eg. lack of disk space.
-            log("Clone failed, db cluster destroyed", ERROR)
             log(x.output, ERROR)
+            log("Clone failed, local db destroyed", ERROR)
             if os.path.exists(postgresql_cluster_dir):
                 shutil.rmtree(postgresql_cluster_dir)
             if os.path.exists(postgresql_config_dir):
@@ -1652,6 +1742,15 @@
         os.path.join(postgresql_cluster_dir, 'backup_label'))
 
 
+def pg_basebackup_is_running():
+    cur = db_cursor(autocommit=True)
+    cur.execute("""
+        SELECT count(*) FROM pg_stat_activity
+        WHERE usename='juju_replication' AND application_name='pg_basebackup'
+        """)
+    return cur.fetchone()[0] > 0
+
+
 def postgresql_wal_received_offset():
     """How much WAL we have.
 
@@ -1694,7 +1793,7 @@
     try:
         nagios_uid = getpwnam('nagios').pw_uid
         nagios_gid = getgrnam('nagios').gr_gid
-    except:
+    except Exception:
         hookenv.log("Nagios user not set up.", hookenv.DEBUG)
         return
 

=== modified file 'test.py'
--- test.py 2013-08-23 09:40:15 +0000
+++ test.py 2013-08-23 09:40:16 +0000
@@ -74,12 +74,12 @@
         return None
 
     def deploy(self, charm, name=None, num_units=1):
-        # The first time we deploy a charm in the test run, it needs to
-        # deploy with --update to ensure we are testing the desired
-        # revision of the charm. Subsequent deploys we do not use
-        # --update to avoid overhead and needless incrementing of the
+        # The first time we deploy a local: charm in the test run, it
+        # needs to deploy with --update to ensure we are testing the
+        # desired revision of the charm. Subsequent deploys we do not
+        # use --update to avoid overhead and needless incrementing of the
         # revision number.
-        if charm.startswith('cs:') or charm in self._deployed_charms:
+        if not charm.startswith('local:') or charm in self._deployed_charms:
             cmd = ['deploy']
         else:
             cmd = ['deploy', '-u']
@@ -102,7 +102,7 @@
         self.status = self.get_result(['status'])
         return self.status
 
-    def wait_until_ready(self):
+    def wait_until_ready(self, extra=45):
         ready = False
         while not ready:
             self.refresh_status()
@@ -128,7 +128,7 @@
         # enough that our system is probably stable. This means we have
         # extremely slow and flaky tests, but that is possibly better
         # than no tests.
-        time.sleep(45)
+        time.sleep(extra)
 
     def setUp(self):
         DEBUG("JujuFixture.setUp()")
@@ -156,7 +156,7 @@
         # Per Bug #1190250 (WONTFIX), we need to wait for dying services
         # to die before we can continue.
         if found_services:
-            self.wait_until_ready()
+            self.wait_until_ready(0)
 
         # We shouldn't reuse machines, as we have no guarantee they are
         # still in a usable state, so tear them down too. Per
@@ -305,15 +305,18 @@
         self.juju.do(['add-relation', 'postgresql:db', 'psql:db'])
         self.juju.wait_until_ready()
 
-        # On a freshly setup service, lowest numbered unit is always the
-        # master.
-        units = unit_sorted(
-            self.juju.status['services']['postgresql']['units'].keys())
-        master_unit, standby_unit_1, standby_unit_2 = units
-
-        self.assertIs(True, self.is_master(master_unit))
-        self.assertIs(False, self.is_master(standby_unit_1))
-        self.assertIs(False, self.is_master(standby_unit_2))
+        # Even on a freshly setup service, we have no idea which unit
+        # will become the master as we have no control over which two
+        # units join the peer relation first.
+        units = sorted((self.is_master(unit), unit)
+                       for unit in
+                       self.juju.status['services']['postgresql']['units'].keys())
+        self.assertFalse(units[0][0])
+        self.assertFalse(units[1][0])
+        self.assertTrue(units[2][0])
+        standby_unit_1 = units[0][1]
+        standby_unit_2 = units[1][1]
+        master_unit = units[2][1]
 
         self.sql('CREATE TABLE Token (x int)', master_unit)
 
@@ -390,11 +393,18 @@
         self.juju.do(['add-relation', 'postgresql:db-admin', 'psql:db-admin'])
         self.juju.wait_until_ready()
 
-        # On a freshly setup service, lowest numbered unit is always the
-        # master.
-        units = unit_sorted(
-            self.juju.status['services']['postgresql']['units'].keys())
-        master_unit, standby_unit_1, standby_unit_2 = units
+        # Even on a freshly setup service, we have no idea which unit
+        # will become the master as we have no control over which two
+        # units join the peer relation first.
+        units = sorted((self.is_master(unit, 'postgres'), unit)
+                       for unit in
+                       self.juju.status['services']['postgresql']['units'].keys())
+        self.assertFalse(units[0][0])
+        self.assertFalse(units[1][0])
+        self.assertTrue(units[2][0])
+        standby_unit_1 = units[0][1]
+        standby_unit_2 = units[1][1]
+        master_unit = units[2][1]
 
         # Shutdown PostgreSQL on standby_unit_1 and ensure
         # standby_unit_2 will have received more WAL information from