Merge lp:~stub/launchpad/db-deploy into lp:launchpad/db-devel

Proposed by Stuart Bishop
Status: Merged
Approved by: Robert Collins
Approved revision: no longer in the source branch.
Merged at revision: 10479
Proposed branch: lp:~stub/launchpad/db-deploy
Merge into: lp:launchpad/db-devel
Prerequisite: lp:~stub/launchpad/pending-db-changes
Diff against target: 308 lines (+282/-6)
3 files modified
database/schema/full-update.py (+56/-0)
database/schema/preflight.py (+222/-0)
database/schema/security.py (+4/-6)
To merge this branch: bzr merge lp:~stub/launchpad/db-deploy
Reviewer: Robert Collins (community) - Approve
Review via email: mp+59065@code.launchpad.net

Commit message

preflight check and automated production deployment script

Description of the change

There are a number of checks we need to make before proceeding with production database updates. preflight.py automates these checks.
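
For reference, the checks cover superuser access on every node, open non-system connections, long-running transactions, replication lag, and Slony sync propagation. A simplified sketch of how preflight.py combines them (the real implementation is in the diff below):

    # Simplified sketch of DatabasePreflight.check_all() from the diff below.
    def check_all(preflight):
        if not preflight.check_is_superuser():
            # Without superuser access the other checks would report bogus
            # results, so stop immediately.
            return False
        success = True
        for check in (preflight.check_open_connections,
                      preflight.check_long_running_transactions,
                      preflight.check_replication_lag,
                      preflight.check_can_sync):
            if not check():
                success = False
        return success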

full-update.py runs the database update steps in sequence, including the new preflight.py check, to make production rollouts smoother and minimize the downtime window.
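
In outline, full-update.py just chains the existing maintenance scripts, passing its own command line options through and stopping at the first non-zero exit code. A simplified sketch (the full script is in the diff below):

    # Simplified sketch of the full-update.py sequence from the diff below.
    import os.path
    import subprocess
    import sys

    def run_script(script, *extra_args):
        script_path = os.path.join(os.path.dirname(__file__), script)
        return subprocess.call([script_path] + sys.argv[1:] + list(extra_args))

    for step in [
            ('preflight.py',),             # refuse to start if the cluster is not ready
            ('upgrade.py',),               # apply the pending schema patches
            ('fti.py',),                   # update the full text indexes
            ('security.py', '--cluster'),  # reset permissions on every node
            ('preflight.py',),             # confirm the cluster is still healthy
            ]:
        rc = run_script(*step)
        if rc != 0:
            sys.exit(rc)
    sys.exit(0)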

Unfortunately, we can't enable preflight.py on staging yet. One of the checks it makes is to confirm there are no non-system connections open to the active databases, but the staging update scripts neglect to disable cronjobs when doing a code-only update, so preflight.py (and full-update.py) will normally fail there. This failure is correct - these rogue connections occasionally cause the staging update to fail and are exactly the sort of thing we are trying to catch during production rollouts.
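
For the record, the check that trips on staging boils down to the following query against pg_stat_activity, with a failure recorded for any connection owned by a user outside SYSTEM_USERS (simplified from check_open_connections in the diff below, assuming an open autocommit connection `con`):

    # Simplified excerpt of the open-connection check from the diff below.
    SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios'])

    cur = con.cursor()
    cur.execute("""
        SELECT datname, usename, COUNT(*) AS num_connections
        FROM pg_stat_activity
        WHERE datname = current_database()
            AND procpid <> pg_backend_pid()
        GROUP BY datname, usename
        """)
    rogue = [(db, user, n) for db, user, n in cur.fetchall()
             if user not in SYSTEM_USERS]
    # Any rogue connection (e.g. a staging cronjob) fails the preflight check.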

Revision history for this message
Robert Collins (lifeless):
review: Approve

Preview Diff

=== added file 'database/schema/full-update.py'
--- database/schema/full-update.py 1970-01-01 00:00:00 +0000
+++ database/schema/full-update.py 2011-04-27 09:17:24 +0000
@@ -0,0 +1,56 @@
+#!/usr/bin/python2.6 -S
+# Copyright 2011 Canonical Ltd. This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Full update process."""
+
+import _pythonpath
+
+import os.path
+from optparse import OptionParser
+import subprocess
+import sys
+
+from canonical.launchpad.scripts import (
+    db_options,
+    logger_options,
+    )
+
+
+def run_script(script, *extra_args):
+    script_path = os.path.join(os.path.dirname(__file__), script)
+    return subprocess.call([script_path] + sys.argv[1:] + list(extra_args))
+
+
+def main():
+    parser = OptionParser()
+
+    # Add all the command line arguments.
+    db_options(parser)
+    logger_options(parser)
+    (options, args) = parser.parse_args()
+    if args:
+        parser.error("Too many arguments")
+
+    preflight_rc = run_script('preflight.py')
+    if preflight_rc != 0:
+        return preflight_rc
+
+    upgrade_rc = run_script('upgrade.py')
+    if upgrade_rc != 0:
+        return upgrade_rc
+
+    fti_rc = run_script('fti.py')
+    if fti_rc != 0:
+        return fti_rc
+
+    security_rc = run_script('security.py', '--cluster')
+    if security_rc != 0:
+        return security_rc
+
+    preflight_rc = run_script('preflight.py')
+    return preflight_rc
+
+
+if __name__ == '__main__':
+    sys.exit(main())

=== added file 'database/schema/preflight.py'
--- database/schema/preflight.py 1970-01-01 00:00:00 +0000
+++ database/schema/preflight.py 2011-04-27 09:17:24 +0000
@@ -0,0 +1,222 @@
+#!/usr/bin/python2.6 -S
+# Copyright 2011 Canonical Ltd. This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Confirm the database systems are ready to be patched as best we can."""
+
+import _pythonpath
+
+from datetime import timedelta
+from optparse import OptionParser
+import sys
+
+import psycopg2
+
+from canonical.database.sqlbase import (
+    connect,
+    ISOLATION_LEVEL_AUTOCOMMIT,
+    )
+from canonical.launchpad.scripts import (
+    db_options,
+    logger,
+    logger_options,
+    )
+from canonical import lp
+import replication.helpers
+
+
+# Ignore connections by these users.
+SYSTEM_USERS = frozenset(['postgres', 'slony', 'nagios'])
+
+# How lagged the cluster can be before failing the preflight check.
+MAX_LAG = timedelta(seconds=45)
+
+
+class DatabasePreflight:
+    def __init__(self, log, master_con):
+        self.log = log
+        self.is_replicated = replication.helpers.slony_installed(master_con)
+        if self.is_replicated:
+            self.nodes = replication.helpers.get_all_cluster_nodes(master_con)
+            for node in self.nodes:
+                node.con = psycopg2.connect(node.connection_string)
+                node.con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
+        else:
+            node = replication.helpers.Node(None, None, None, True)
+            node.con = master_con
+            self.nodes = [node]
+
+    def check_is_superuser(self):
+        """Return True if all the node connections are as superusers."""
+        success = True
+        for node in self.nodes:
+            cur = node.con.cursor()
+            cur.execute("""
+                SELECT current_database(), pg_user.usesuper
+                FROM pg_user
+                WHERE usename = current_user
+                """)
+            dbname, is_super = cur.fetchone()
+            if is_super:
+                self.log.debug("Connected to %s as a superuser.", dbname)
+            else:
+                self.log.fatal("Not connected to %s as a superuser.", dbname)
+                success = False
+        return success
+
+    def check_open_connections(self):
+        """Return False if any nodes have connections from non-system users.
+
+        System users are defined by SYSTEM_USERS.
+        """
+        success = True
+        for node in self.nodes:
+            cur = node.con.cursor()
+            cur.execute("""
+                SELECT datname, usename, COUNT(*) AS num_connections
+                FROM pg_stat_activity
+                WHERE
+                    datname=current_database()
+                    AND procpid <> pg_backend_pid()
+                GROUP BY datname, usename
+                """)
+            for datname, usename, num_connections in cur.fetchall():
+                if usename in SYSTEM_USERS:
+                    self.log.debug(
+                        "%s has %d connections by %s",
+                        datname, num_connections, usename)
+                else:
+                    self.log.fatal(
+                        "%s has %d connections by %s",
+                        datname, num_connections, usename)
+                    success = False
+        if success:
+            self.log.info("Only system users connected to the cluster")
+        return success
+
+    def check_long_running_transactions(self, max_secs=10):
+        """Return False if any nodes have long running transactions open.
+
+        max_secs defines what is long running. For database rollouts,
+        this will be short. Even if the transaction is benign like an
+        autovacuum task, we should wait until things have settled down.
+        """
+        success = True
+        for node in self.nodes:
+            cur = node.con.cursor()
+            cur.execute("""
+                SELECT
+                    datname, usename,
+                    age(current_timestamp, xact_start) AS age, current_query
+                FROM pg_stat_activity
+                WHERE
+                    age(current_timestamp, xact_start) > interval '%d secs'
+                    AND datname=current_database()
+                """ % max_secs)
+            for datname, usename, age, current_query in cur.fetchall():
+                self.log.fatal(
+                    "%s has transaction by %s open %s",
+                    datname, usename, age)
+                success = False
+        if success:
+            self.log.info("No long running transactions detected.")
+        return success
+
+    def check_replication_lag(self):
+        """Return False if the replication cluster is badly lagged."""
+        if not self.is_replicated:
+            self.log.debug("Not replicated - no replication lag.")
+            return True
+
+        # Check replication lag on every node just in case there are
+        # disagreements.
+        max_lag = timedelta(seconds=-1)
+        max_lag_node = None
+        for node in self.nodes:
+            cur = node.con.cursor()
+            cur.execute("""
+                SELECT current_database(),
+                max(st_lag_time) AS lag FROM _sl.sl_status
+                """)
+            dbname, lag = cur.fetchone()
+            if lag > max_lag:
+                max_lag = lag
+                max_lag_node = node
+            self.log.debug(
+                "%s reports database lag of %s.", dbname, lag)
+        if max_lag <= MAX_LAG:
+            self.log.info("Database cluster lag is ok (%s)", max_lag)
+            return True
+        else:
+            self.log.fatal("Database cluster lag is high (%s)", max_lag)
+            return False
+
+    def check_can_sync(self):
+        """Return True if a sync event is acknowledged by all nodes.
+
+        We only wait 30 seconds for the sync, because we require the
+        cluster to be quiescent.
+        """
+        if self.is_replicated:
+            success = replication.helpers.sync(30)
+            if success:
+                self.log.info(
+                    "Replication events are being propagated.")
+            else:
+                self.log.fatal(
+                    "Replication events are not being propagated.")
+                self.log.fatal(
+                    "One or more replication daemons may be down.")
+                self.log.fatal(
+                    "Bounce the replication daemons and check the logs.")
+            return success
+        else:
+            return True
+
+    def check_all(self):
+        """Run all checks.
+
+        If any failed, return False. Otherwise return True.
+        """
+        if not self.check_is_superuser():
+            # No point continuing - results will be bogus without access
+            # to pg_stat_activity
+            return False
+
+        success = True
+        if not self.check_open_connections():
+            success = False
+        if not self.check_long_running_transactions():
+            success = False
+        if not self.check_replication_lag():
+            success = False
+        if not self.check_can_sync():
+            success = False
+        return success
+
+
+def main():
+    parser = OptionParser()
+    db_options(parser)
+    logger_options(parser)
+    (options, args) = parser.parse_args()
+    if args:
+        parser.error("Too many arguments")
+
+    log = logger(options)
+
+    master_con = connect(lp.dbuser)
+    master_con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
+
+    preflight_check = DatabasePreflight(log, master_con)
+
+    if preflight_check.check_all():
+        log.info('Preflight check succeeded. Good to go.')
+        return 0
+    else:
+        log.error('Preflight check failed.')
+        return 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())

=== modified file 'database/schema/security.py'
--- database/schema/security.py 2011-03-21 03:50:40 +0000
+++ database/schema/security.py 2011-04-27 09:17:24 +0000
@@ -172,12 +172,10 @@
                     node.nickname, node.connection_string))
                 reset_permissions(
                     psycopg2.connect(node.connection_string), config, options)
-        else:
-            log.error("--cluster requested, but not a Slony-I cluster.")
-            return 1
-    else:
-        log.info("Resetting permissions on single database")
-        reset_permissions(con, config, options)
+            return
+        log.warning("--cluster requested, but not a Slony-I cluster.")
+    log.info("Resetting permissions on single database")
+    reset_permissions(con, config, options)
 
 
 def list_identifiers(identifiers):
