Merge lp:~percona-toolkit-dev/percona-toolkit/pt-hearbeat-monitor-master-change into lp:percona-toolkit/2.2

Proposed by Daniel Nichter
Status: Needs review
Proposed branch: lp:~percona-toolkit-dev/percona-toolkit/pt-hearbeat-monitor-master-change
Merge into: lp:percona-toolkit/2.2
Diff against target: 395 lines (+250/-13)
2 files modified
bin/pt-heartbeat (+76/-13)
t/pt-heartbeat/check_master.t (+174/-0)
To merge this branch: bzr merge lp:~percona-toolkit-dev/percona-toolkit/pt-hearbeat-monitor-master-change
Reviewer Review Type Date Requested Status
Daniel Nichter Pending
Review via email: mp+205877@code.launchpad.net
To post a comment you must log in.
595. By Daniel Nichter

Implement --check-master-server-id.

596. By Daniel Nichter

Add more --check-master-server-id docs.

Unmerged revisions

596. By Daniel Nichter

Add more --check-master-server-id docs.

595. By Daniel Nichter

Implement --check-master-server-id.

594. By Daniel Nichter

Add --check-master-server-id and a failing test for it.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bin/pt-heartbeat'
2--- bin/pt-heartbeat 2013-12-18 23:53:01 +0000
3+++ bin/pt-heartbeat 2014-02-12 02:05:53 +0000
4@@ -4778,6 +4778,7 @@
5 use List::Util qw(min max sum);
6 use Time::HiRes qw(gettimeofday time sleep usleep);
7 use IO::File;
8+use Carp qw(confess);
9
10 use Percona::Toolkit;
11 use constant PTDEBUG => $ENV{PTDEBUG} || 0;
12@@ -4840,6 +4841,14 @@
13 if ( !$o->get('stop') && !$o->get('database') ) {
14 $o->save_error('--database must be specified');
15 }
16+
17+ if ( $o->get('check-master-server-id') > 0 && !$o->get('monitor') ) {
18+ $o->save_error('--check-master-server-id requires --monitor');
19+ }
20+ }
21+
22+ if ( $o->get('check-master-server-id') > 0 ) {
23+ $o->set('print-master-server-id', 1);
24 }
25
26 eval {
27@@ -4958,6 +4967,7 @@
28
29 # pk_col and pk_val are used to identify the heartbeat row to update
30 # or monitor.
31+ my ($ms, $master_server_id);
32 my ($pk_col, $pk_val);
33 if ( $id ) {
34 # Legacy mode: update heartbeat row WHERE id=1 and monitor heartbeat
35@@ -4973,10 +4983,10 @@
36 $pk_val = $server_id;
37 }
38 else { # monitor or check
39- my $master_server_id = $o->get('master-server-id');
40+ $master_server_id = $o->get('master-server-id');
41 if ( !$master_server_id ) {
42 eval {
43- my $ms = new MasterSlave(
44+ $ms = new MasterSlave(
45 OptionParser => $o,
46 DSNParser => $dp,
47 Quoter => $q,
48@@ -5077,7 +5087,7 @@
49
50 $heartbeat_sth = $dbh->prepare($heartbeat_sql);
51
52- my $ro_check = !!$o->get('check-read-only');
53+ my $ro_check = $o->get('check-read-only');
54 $update_heartbeat = sub {
55 my ($sth) = @_;
56 my @vals;
57@@ -5151,15 +5161,16 @@
58 . ($dbi_driver eq 'mysql' ? '/*!50038, @@hostname AS host*/' : '')
59 . ($id ? "" : ", server_id")
60 . " FROM $db_tbl "
61- . "WHERE $pk_col='$pk_val' "
62+ . "WHERE $pk_col=? "
63 . "LIMIT 1";
64 PTDEBUG && _d("SELECT SQL:", $heartbeat_sql);
65
66 $heartbeat_sth = $dbh->prepare($heartbeat_sql);
67
68 $get_delay = sub {
69- my ($sth) = @_;
70- $sth->execute();
71+ my ($sth, $id) = @_;
72+ confess "No master server ID given" unless $id;
73+ $sth->execute($id);
74 PTDEBUG && _d($sth->{Statement});
75 my ($ts, $hostname, $server_id) = $sth->fetchrow_array();
76 my $now = time;
77@@ -5176,7 +5187,7 @@
78 $delay = 0.00 if $delay < 0;
79
80 $sth->finish();
81- return ($delay, $hostname, $pk_val);
82+ return ($delay, $hostname, $server_id);
83 };
84
85 # https://bugs.launchpad.net/percona-toolkit/+bug/1163372
86@@ -5194,7 +5205,7 @@
87 $update_heartbeat->($heartbeat_sth);
88 }
89 else {
90- $get_delay->($heartbeat_sth);
91+ $get_delay->($heartbeat_sth, $pk_val);
92 }
93 $heartbeat_sth->finish();
94
95@@ -5224,6 +5235,7 @@
96 sth => $heartbeat_sth,
97 sql => $heartbeat_sql,
98 get_delay => $get_delay,
99+ server_id => $pk_val,
100 interval => $interval,
101 skew => $skew,
102 hires_ts => $hires_ts,
103@@ -5259,6 +5271,9 @@
104 # ########################################################################
105 # Monitor or update the heartbeat table.
106 # ########################################################################
107+ my $check_server_id_time = $o->get('check-master-server-id');
108+ my $last_server_id_check = int(time);
109+
110 my $end = $o->get('run-time') ? int(time + $o->get('run-time')) : 0;
111 PTDEBUG && _d($end ? ('Will exit at', ts($end)) : 'Running forever');
112
113@@ -5290,8 +5305,39 @@
114 }
115
116 if ( $o->get('monitor') ) {
117+ if ( $check_server_id_time ) {
118+ # Time to --check-master-server-id?
119+ my $now = int(time);
120+ if ( $now - $last_server_id_check >= $check_server_id_time ) {
121+ PTDEBUG && _d("Checking master server id");
122+ eval {
123+ my $master_dsn = $ms->get_master_dsn($dbh, $dsn, $dp)
124+ or die "This server is not a slave";
125+ my $master_dbh = $dp->get_dbh(
126+ $dp->get_cxn_params($master_dsn),
127+ { AutoCommit => 1 }
128+ );
129+ my ($new_master_server_id)
130+ = $master_dbh->selectrow_array('SELECT @@server_id');
131+ $master_dbh->disconnect;
132+ if ( $new_master_server_id
133+ && $new_master_server_id != $master_server_id ) {
134+ PTDEBUG && _d("Master server id changed:",
135+ $master_server_id, "to",
136+ $new_master_server_id);
137+ $pk_val = $new_master_server_id;
138+ }
139+ };
140+ if ( $EVAL_ERROR ) {
141+ PTDEBUG && _d("Error checking master id:", $EVAL_ERROR);
142+ }
143+ $last_server_id_check = $now;
144+ }
145+ }
146+
147 $heartbeat_sth ||= $dbh->prepare($heartbeat_sql);
148- my ($delay) = $get_delay->($heartbeat_sth);
149+ my ($delay, undef, $server_id)
150+ = $get_delay->($heartbeat_sth, $pk_val);
151
152 unshift @samples, $delay;
153 pop @samples if @samples > $limit;
154@@ -5302,7 +5348,7 @@
155 sum(@samples[0 .. $bound-1]) / $_;
156 } @$frames;
157
158- my $output = sprintf $format, $delay, @vals, $pk_val;
159+ my $output = sprintf $format, $delay, @vals, $server_id;
160 if ( my $file = $o->get('file') ) {
161 open my $file, '>', $file
162 or die "Can't open $file: $OS_ERROR";
163@@ -5354,11 +5400,11 @@
164 # Check the delay on a single server. Optionally recurse to all its slaves.
165 sub check_delay {
166 my ( %args ) = @_;
167- my @required_args = qw(dsn dbh sth sql get_delay interval skew OptionParser DSNParser);
168+ my @required_args = qw(dsn dbh sth sql get_delay server_id interval skew OptionParser DSNParser);
169 foreach my $arg ( @required_args ) {
170 die "I need a $arg argument" unless $args{$arg};
171 }
172- my ($dsn, $dbh, $sth, $sql, $get_delay, $interval, $skew, $o, $dp)
173+ my ($dsn, $dbh, $sth, $sql, $get_delay, $server_id, $interval, $skew, $o, $dp)
174 = @args{@required_args};
175 PTDEBUG && _d('Checking slave delay');
176
177@@ -5411,7 +5457,8 @@
178 }
179 sleep $next_interval - time;
180 PTDEBUG && _d('Woke up at', ts(time));
181- my ($delay, $hostname, $master_server_id) = $get_delay->($sth);
182+ my ($delay, $hostname, $master_server_id)
183+ = $get_delay->($sth, $server_id);
184
185 if ( $o->get('recurse') ) {
186 # Must print not only the delay, but the server's hostname if
187@@ -5642,6 +5689,8 @@
188
189 L<"--daemonize"> and L<"--check"> are mutually exclusive.
190
191+L<"--master-server-id"> and L<"--check-master-server-id"> are mutually exclusive.
192+
193 This tool accepts additional command-line arguments. Refer to the
194 L<"SYNOPSIS"> and usage information for details.
195
196@@ -5667,6 +5716,20 @@
197 their lag, too. The hostname or IP and port for each slave is printed
198 before its delay. L<"--recurse"> only works with MySQL.
199
200+=item --check-master-server-id
201+
202+type: time; default: 0
203+
204+Check the master server ID periodically and, if it has changed, make
205+L<"--monitor"> follow the new master's heartbeat row. The given time should
206+be longer than L<"--interval">. For example, to L<"--monitor"> every
207+5 seconds and check the master server ID every 1 minute:
208+
209+ --monitor --interval 5 --check-master-server-id 1m
210+
211+This option implies L<"--print-master-server-id">. When the master server ID
212+changes, the new value is printed.
213+
214 =item --check-read-only
215
214 Check if the server has read_only enabled; if it does, the tool skips doing
217
218=== added file 't/pt-heartbeat/check_master.t'
219--- t/pt-heartbeat/check_master.t 1970-01-01 00:00:00 +0000
220+++ t/pt-heartbeat/check_master.t 2014-02-12 02:05:53 +0000
221@@ -0,0 +1,174 @@
222+#!/usr/bin/env perl
223+
224+BEGIN {
225+ die "The PERCONA_TOOLKIT_BRANCH environment variable is not set.\n"
226+ unless $ENV{PERCONA_TOOLKIT_BRANCH} && -d $ENV{PERCONA_TOOLKIT_BRANCH};
227+ unshift @INC, "$ENV{PERCONA_TOOLKIT_BRANCH}/lib";
228+};
229+
230+use strict;
231+use warnings FATAL => 'all';
232+use English qw(-no_match_vars);
233+use Test::More;
234+use Data::Dumper;
235+use File::Temp qw(tempfile);
236+
237+use PerconaTest;
238+use Sandbox;
239+require "$trunk/bin/pt-heartbeat";
240+
241+my $dp = new DSNParser(opts=>$dsn_opts);
242+my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
243+my $master1_dbh = $sb->get_dbh_for('master');
244+
245+# Standard setup is:
246+# 12345 -> 12346 -> 12347
247+# So we don't mess with standard servers, start also:
248+# 12348 -> 12349
249+# Then --update 12345 and 12348, start --monitor on 12349, then change
250+# its master to 12345. Disregard the names "master1" and "master2".
251+
252+#diag("Starting extra sandboxes...");
253+
254+diag(`$trunk/sandbox/start-sandbox master 12348 >/dev/null`);
255+my $master2_dbh = $sb->get_dbh_for('master1');
256+
257+diag(`$trunk/sandbox/start-sandbox slave 12349 12348 >/dev/null`);
258+my $slave_dbh = $sb->get_dbh_for('master2');
259+
260+if ( !$master1_dbh ) {
261+ plan skip_all => 'Cannot connect to sandbox master';
262+}
263+elsif ( !$master2_dbh ) {
264+ plan skip_all => 'Cannot connect to second sandbox master';
265+}
266+elsif ( !$slave_dbh ) {
267+ plan skip_all => 'Cannot connect to sandbox slave1';
268+}
269+
270+$sb->load_file('master', 't/pt-heartbeat/samples/precision-time-table.sql');
271+$sb->load_file('master1', 't/pt-heartbeat/samples/precision-time-table.sql');
272+
273+# Get master relay pos before we create heartbeat table and row,
274+# else slave 12349 won't reply these queries and get heartbeat
275+# from master 12345.
276+my $s = $master1_dbh->selectrow_hashref("SHOW MASTER STATUS");
277+
278+$master1_dbh->do("INSERT INTO test.heartbeat (ts, server_id) VALUES (UTC_TIMESTAMP(), 12345)");
279+$master2_dbh->do("INSERT INTO test.heartbeat (ts, server_id) VALUES (UTC_TIMESTAMP(), 12348)");
280+
281+# ###########################################################################
282+# Helper subs
283+# ###########################################################################
284+
285+my $base_pidfile = (tempfile("/tmp/pt-heartbeat-test.XXXXXXXX", OPEN => 0, UNLINK => 0))[1];
286+my $master_port = $sb->port_for('master');
287+
288+my @exec_pids;
289+my @pidfiles;
290+
291+sub start_update_instance {
292+ my ($port) = @_;
293+ my $pidfile = "$base_pidfile.$port.pid";
294+ push @pidfiles, $pidfile;
295+
296+ my $pid = fork();
297+ if ( $pid == 0 ) {
298+ my $cmd = "$trunk/bin/pt-heartbeat";
299+ exec { $cmd } $cmd, qw(-h 127.0.0.1 -u msandbox -p msandbox -P), $port,
300+ qw(--database test --table heartbeat --create-table),
301+ qw(--utc --update --interval 0.5 --pid), $pidfile;
302+ exit 1;
303+ }
304+ push @exec_pids, $pid;
305+
306+ PerconaTest::wait_for_files($pidfile);
307+ ok(
308+ -f $pidfile,
309+ "--update on $port started"
310+ );
311+}
312+
313+sub stop_all_instances {
314+ my @pids = @exec_pids, map { chomp; $_ } map { slurp_file($_) } @pidfiles;
315+ diag(`$trunk/bin/pt-heartbeat --stop >/dev/null`);
316+
317+ waitpid($_, 0) for @pids;
318+ PerconaTest::wait_until(sub{ !-e $_ }) for @pidfiles;
319+
320+ unlink '/tmp/pt-heartbeat-sentinel';
321+}
322+
323+# ###########################################################################
324+# Test --check-master-server-id
325+# ###########################################################################
326+
327+# Start --update on both masters.
328+#diag("Starting --update instances...");
329+start_update_instance(12345);
330+start_update_instance(12348);
331+
332+# Start --monitor on slave currently attached to master 12348.
333+my $output_file = "/tmp/pt-heartbeat-monitor.$PID";
334+system("$trunk/bin/pt-heartbeat --monitor h=127.1,P=12349,u=msandbox,p=msandbox -D test --utc --check-master-server-id 1s --interval 0.5 --file $output_file --daemonize --run-time 5");
335+#diag("Waiting for slave monitor to start...");
336+PerconaTest::wait_for_files($output_file);
337+
338+# Slave monitor should report master ID is 12348.
339+#diag("Waiting for slave monitor output...");
340+my $output;
341+for (1..3) {
342+ $output = `cat $output_file`;
343+ if ($output ne "") {
344+ last;
345+ }
346+ sleep(1);
347+}
348+
349+like(
350+ $output,
351+ qr/12348$/,
352+ "Monitor sees master 12348"
353+);
354+
355+# Simulate master VIP change by actually changing slave's master.
356+#diag("Changing slave's master...");
357+$slave_dbh->do("STOP SLAVE");
358+
359+$slave_dbh->do("CHANGE MASTER TO master_host='127.0.0.1', master_user='msandbox', master_password='msandbox', master_port=12345, master_log_file='$s->{file}', master_log_pos=$s->{position}");
360+
361+$slave_dbh->do("START SLAVE");
362+
363+# Give pt-heartbeat time to detect change.
364+#diag("Waiting for pt-heartbeat to check master server ID...");
365+sleep(2);
366+
367+# Slave monitor should report new master ID 12345.
368+#diag("Waiting for slave monitor output...");
369+for (1..3) {
370+ $output = `cat $output_file`;
371+ if ($output ne "") {
372+ last;
373+ }
374+ sleep(1);
375+}
376+
377+like(
378+ $output,
379+ qr/12345$/,
380+ "Monitor changed to master 12345"
381+);
382+
383+#diag("Stopping --update instances...");
384+stop_all_instances();
385+
386+# #############################################################################
387+# Done.
388+# #############################################################################
389+#diag("Stopping extra sandboxes...");
390+diag(`$trunk/sandbox/stop-sandbox 12349 >/dev/null`);
391+diag(`$trunk/sandbox/stop-sandbox 12348 >/dev/null`);
392+diag(`rm -rf $output_file >/dev/null`);
393+$sb->wipe_clean($master1_dbh);
394+ok($sb->ok(), "Sandbox servers") or BAIL_OUT(__FILE__ . " broke the sandbox");
395+done_testing;

Subscribers

People subscribed via source and target branches