Merge lp:~stub/launchpad/replication into lp:launchpad/db-devel

Proposed by Stuart Bishop
Status: Merged
Approved by: Aaron Bentley
Approved revision: not available
Merged at revision: not available
Proposed branch: lp:~stub/launchpad/replication
Merge into: lp:launchpad/db-devel
Diff against target: 196 lines (+106/-19)
7 files modified
daemons/cache-database-replication-lag.py (+53/-0)
database/replication/helpers.py (+1/-0)
database/schema/comments.sql (+4/-0)
database/schema/patch-2207-28-1.sql (+9/-0)
database/schema/security.cfg (+6/-0)
database/schema/trusted.sql (+22/-0)
lib/canonical/launchpad/webapp/dbpolicy.py (+11/-19)
To merge this branch: bzr merge lp:~stub/launchpad/replication
Reviewer Review Type Date Requested Status
Aaron Bentley (community) Approve
Review via email: mp+18698@code.launchpad.net

Commit message

an alternative approach of stopping the appservers from getting blocked when they check replication lag under high load.

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote :

The fix to Bug #504696 fails on production, despite passing testing locally and on staging and I'm unable to reproduce.

This branch implements an alternative approach of stopping the appservers from getting blocked when they check replication lag under high load.

The database patch, stored procedures and permissions have been applied to production in case we want to test this on edge while I'm unavailable.

Revision history for this message
Aaron Bentley (abentley) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added symlink 'daemons/_pythonpath.py'
2=== target is u'../_pythonpath.py'
3=== added file 'daemons/cache-database-replication-lag.py'
4--- daemons/cache-database-replication-lag.py 1970-01-01 00:00:00 +0000
5+++ daemons/cache-database-replication-lag.py 2010-02-15 07:48:22 +0000
6@@ -0,0 +1,53 @@
7+#!/usr/bin/python2.5
8+#
9+# Copyright 2010 Canonical Ltd. This software is licensed under the
10+# GNU Affero General Public License version 3 (see the file LICENSE).
11+
12+"""Calculate database replication lag and cache it."""
13+
14+__metaclass__ = type
15+__all__ = []
16+
17+import _pythonpath
18+
19+import sys
20+import time
21+
22+import psycopg2
23+
24+from canonical.database.sqlbase import connect, ISOLATION_LEVEL_AUTOCOMMIT
25+from canonical.launchpad.scripts import db_options, logger
26+from lp.scripts.helpers import LPOptionParser
27+
28+
29+def main(args=None):
30+ parser = LPOptionParser()
31+ db_options(parser)
32+ parser.add_option(
33+ "-s", "--sleep", dest="sleep", type="int", default=5,
34+ metavar="SECS", help="Wait SECS seconds between refreshes.")
35+
36+ (options, args) = parser.parse_args(args)
37+ if len(args) != 0:
38+ parser.error("Too many arguments.")
39+
40+ log = logger(options)
41+
42+ while True:
43+ try:
44+ con = connect(user="lagmon", isolation=ISOLATION_LEVEL_AUTOCOMMIT)
45+ cur = con.cursor()
46+ while True:
47+ cur.execute("SELECT update_replication_lag_cache()")
48+ if cur.fetchone()[0]:
49+ log.info("Updated.")
50+ else:
51+ log.error("update_replication_lag_cache() failed.")
52+ time.sleep(options.sleep)
53+ except psycopg2.Error, x:
54+ log.error("%s. Retrying.", str(x).strip())
55+ time.sleep(options.sleep)
56+
57+
58+if __name__ == '__main__':
59+ sys.exit(main())
60
61=== modified file 'database/replication/helpers.py'
62--- database/replication/helpers.py 2010-01-22 06:25:48 +0000
63+++ database/replication/helpers.py 2010-02-15 07:48:22 +0000
64@@ -44,6 +44,7 @@
65 LPMAIN_SEED = frozenset([
66 ('public', 'person'),
67 ('public', 'launchpaddatabaserevision'),
68+ ('public', 'databasereplicationlag'),
69 ('public', 'fticache'),
70 ('public', 'nameblacklist'),
71 ('public', 'openidconsumerassociation'),
72
73=== modified file 'database/schema/comments.sql'
74--- database/schema/comments.sql 2010-02-03 15:55:01 +0000
75+++ database/schema/comments.sql 2010-02-15 07:48:22 +0000
76@@ -2352,3 +2352,7 @@
77 COMMENT ON COLUMN SourcePackageFormatSelection.distroseries IS 'Refers to the distroseries in question.';
78 COMMENT ON COLUMN SourcePackageFormatSelection.format IS 'The SourcePackageFormat to allow.';
79
80+COMMENT ON TABLE DatabaseReplicationLag IS 'A cached snapshot of database replication lag between our master Slony node and its slaves.';
81+COMMENT ON COLUMN DatabaseReplicationLag.node IS 'The Slony node number identifying the slave database.';
82+COMMENT ON COLUMN DatabaseReplicationLag.lag IS 'lag time.';
83+COMMENT ON COLUMN DatabaseReplicationLag.updated IS 'When this value was updated.';
84
85=== added file 'database/schema/patch-2207-28-1.sql'
86--- database/schema/patch-2207-28-1.sql 1970-01-01 00:00:00 +0000
87+++ database/schema/patch-2207-28-1.sql 2010-02-15 07:48:22 +0000
88@@ -0,0 +1,9 @@
89+SET client_min_messages = ERROR;
90+
91+CREATE TABLE DatabaseReplicationLag (
92+ node integer PRIMARY KEY,
93+ lag interval NOT NULL,
94+ updated timestamp without time zone
95+ DEFAULT (CURRENT_TIMESTAMP AT TIME ZONE 'UTC'));
96+
97+INSERT INTO LaunchpadDatabaseRevision VALUES (2207, 28, 1);
98
99=== modified file 'database/schema/security.cfg'
100--- database/schema/security.cfg 2010-02-12 15:44:24 +0000
101+++ database/schema/security.cfg 2010-02-15 07:48:22 +0000
102@@ -166,6 +166,7 @@
103 public.cvereference = SELECT, INSERT
104 public.cve = SELECT, INSERT, UPDATE
105 public.customlanguagecode = SELECT, INSERT, UPDATE, DELETE
106+public.databasereplicationlag = SELECT
107 public.diff = SELECT, INSERT, UPDATE
108 public.distributionbounty = SELECT, INSERT, UPDATE
109 public.distributionmirror = SELECT, INSERT, UPDATE, DELETE
110@@ -1898,6 +1899,11 @@
111 public.job = SELECT, UPDATE, DELETE
112 public.bugjob = SELECT, DELETE
113
114+[lagmon]
115+# cache-database-replication-lag.py
116+type=user
117+public.update_replication_lag_cache() = EXECUTE
118+
119 [process-apport-blobs]
120 type=user
121 groups=script,read
122
123=== modified file 'database/schema/trusted.sql'
124--- database/schema/trusted.sql 2010-02-02 10:57:26 +0000
125+++ database/schema/trusted.sql 2010-02-15 07:48:22 +0000
126@@ -94,6 +94,28 @@
127 'Returns the lag time of the lpmain replication set to the given node, or NULL if not a replicated installation. The node id parameter can be obtained by calling getlocalnodeid() on the relevant database. This function only returns meaningful results on the lpmain replication set master.';
128
129
130+CREATE OR REPLACE FUNCTION update_replication_lag_cache() RETURNS boolean
131+LANGUAGE plpgsql VOLATILE SECURITY DEFINER AS
132+$$
133+ BEGIN
134+ DELETE FROM DatabaseReplicationLag;
135+ INSERT INTO DatabaseReplicationLag (node, lag)
136+ SELECT st_received, st_lag_time FROM _sl.sl_status
137+ WHERE st_origin = _sl.getlocalnodeid('_sl');
138+ RETURN TRUE;
139+ -- Slony-I not installed here - non-replicated setup.
140+ EXCEPTION
141+ WHEN invalid_schema_name THEN
142+ RETURN FALSE;
143+ WHEN undefined_table THEN
144+ RETURN FALSE;
145+ END;
146+$$;
147+
148+COMMENT ON FUNCTION update_replication_lag_cache() IS
149+'Updates the DatabaseReplicationLag materialized view.';
150+
151+
152 CREATE OR REPLACE FUNCTION getlocalnodeid() RETURNS integer
153 LANGUAGE plpgsql STABLE SECURITY DEFINER AS
154 $$
155
156=== modified file 'lib/canonical/launchpad/webapp/dbpolicy.py'
157--- lib/canonical/launchpad/webapp/dbpolicy.py 2010-01-20 22:09:26 +0000
158+++ lib/canonical/launchpad/webapp/dbpolicy.py 2010-02-15 07:48:22 +0000
159@@ -293,26 +293,18 @@
160
161 # sl_status gives meaningful results only on the origin node.
162 master_store = self.getStore(MAIN_STORE, MASTER_FLAVOR)
163- # If it takes more than (by default) 0.25 seconds to query the
164- # replication lag, assume we are lagged. Normally the query
165- # takes <20ms. This can happen during heavy updates, as the
166- # Slony-I tables can get slow with lots of events. We use a
167- # SAVEPOINT to conveniently reset the statement timeout.
168- master_store.execute("""
169- SAVEPOINT lag_check; SET LOCAL statement_timeout TO %d
170- """ % config.launchpad.lag_check_timeout)
171- try:
172- try:
173- return master_store.execute(
174- "SELECT replication_lag(%d)" % slave_node_id).get_one()[0]
175- except TimeoutError:
176- logging.warn(
177- 'Gave up querying slave lag after %d ms',
178- (config.launchpad.lag_check_timeout))
179- return timedelta(days=999) # A long, long time.
180- finally:
181- master_store.execute("ROLLBACK TO lag_check")
182
183+ # Retrieve the cached lag.
184+ lag = master_store.execute("""
185+ SELECT lag + (CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - updated)
186+ FROM DatabaseReplicationLag WHERE node=%d
187+ """ % slave_node_id).get_one()
188+ if lag is None:
189+ logging.error(
190+ "No data in DatabaseReplicationLag for node %d"
191+ % slave_node_id)
192+ return timedelta(days=999) # A long, long time.
193+ return lag[0]
194
195
196 def WebServiceDatabasePolicyFactory(request):

Subscribers

People subscribed via source and target branches

to status/vote changes: