Merge lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434 into lp:percona-toolkit/2.1

Proposed by Daniel Nichter
Status: Merged
Merged at revision: 428
Proposed branch: lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434
Merge into: lp:percona-toolkit/2.1
Diff against target: 280 lines (+110/-28)
2 files modified
bin/pt-stalk (+43/-0)
t/pt-stalk/pt-stalk.t (+67/-28)
To merge this branch: bzr merge lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434
Reviewer Review Type Date Requested Status
Daniel Nichter Approve
Review via email: mp+131093@code.launchpad.net
To post a comment you must log in.
429. By Daniel Nichter

Make "not stalking" tests more reliable.

430. By Daniel Nichter

Really wait for expected data in files.

431. By Daniel Nichter

Even more waiting for _really_ slow Ubuntu 12 box.

432. By Daniel Nichter

Include collector output in diag.

Revision history for this message
Daniel Nichter (daniel-nichter) :
review: Approve
433. By Daniel Nichter

Document extra --run-time-related waits.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bin/pt-stalk'
2--- bin/pt-stalk 2012-10-13 20:15:37 +0000
3+++ bin/pt-stalk 2012-10-24 00:23:21 +0000
4@@ -1210,6 +1210,37 @@
5 purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME"
6 fi
7 done
8+
9+ # Before exiting, the last collector may still be running.
10+ # Wait for it to finish in case the tool is part of a script,
11+ # or part of a test, so the caller has access to the collected
12+ # data when the tool exists. collect() waits an additional
13+ # --run-time seconds for itself to complete, which means we
14+ # have to wait for 2 * run-time like it plus some overhead else
15+ # we may get in sync with the collector and kill it a microsecond
16+ # before it kills itself, thus 3 * run-time.
17+ # https://bugs.launchpad.net/percona-toolkit/+bug/1070434
18+ if [ "$(jobs)" ]; then
19+ local sleep_time=$((OPT_RUN_TIME * 3))
20+ log "Waiting up to $sleep_time seconds for collectors to finish..."
21+ local slept=0
22+ while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
23+ sleep 1
24+ slept=$((slept + 1))
25+ done
26+
27+ if [ "$(jobs)" ]; then
28+ for pid in $(jobs -p); do
29+ # This isn't an warning (we don't want exit status 1) because
30+ # the system may be running slowly so it's just "natural" that
31+ # a collector may get stuck or run really slowly.
32+ log "Killing collector $pid"
33+ kill $pid >/dev/null 2>&1
34+ done
35+ else
36+ log "All collectors have finished"
37+ fi
38+ fi
39 }
40
41 # ###########################################################################
42@@ -1754,6 +1785,18 @@
43 seconds hasn't gathered enough diagnostic data, running longer is not likely to
44 do so. In fact, in many cases a shorter collection period is appropriate.
45
46+This value is used two other times. After collecting, the collect subprocess
47+will wait another L<"--run-time"> seconds for its commands to finish. Some
48+commands can take awhile if the system is running very slowly (which can
49+likely be the case given that a collection was triggered). Since empty files
50+are deleted, the extra wait gives commands time to finish and write their
51+data. The value is potentially used again just before the tool exits to wait
52+again for any collect subprocesses to finish. In most cases this won't
53+happen because of the aforementioned extra wait. If it happens, the tool
54+will log "Waiting up to N seconds for collectors to finish..." where N is
55+three times L<"--run-time">. In both cases, after waiting, the tool kills
56+all of its subprocesses.
57+
58 =item --sleep
59
60 type: int; default: 300
61
62=== modified file 't/pt-stalk/pt-stalk.t'
63--- t/pt-stalk/pt-stalk.t 2012-10-13 18:58:55 +0000
64+++ t/pt-stalk/pt-stalk.t 2012-10-24 00:23:21 +0000
65@@ -86,20 +86,21 @@
66 "Creates --dest (collect) dir"
67 );
68
69-chomp($pid = `cat $pid_file`);
70+chomp($pid = `cat $pid_file 2>/dev/null`);
71 $retval = system("kill -0 $pid");
72 is(
73 $retval >> 0,
74 0,
75- "pt-stalk is running ($pid)"
76+ "pt-stalk is running"
77 );
78
79-$output = `cat $log_file`;
80+PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null");
81+$output = `cat $log_file 2>/dev/null`;
82 like(
83 $output,
84 qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/,
85 "Check results logged"
86-);
87+) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
88
89 $retval = system("kill $pid 2>/dev/null");
90 is(
91@@ -108,19 +109,19 @@
92 "Killed pt-stalk"
93 );
94
95-sleep 1;
96+PerconaTest::wait_until(sub { !-f $pid_file });
97
98 ok(
99 ! -f $pid_file,
100 "Removes PID file"
101 );
102
103-$output = `cat $log_file`;
104+$output = `cat $log_file 2>/dev/null`;
105 like(
106 $output,
107 qr/Caught signal, exiting/,
108 "Caught signal logged"
109-);
110+) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`);
111
112 # ###########################################################################
113 # Test collect.
114@@ -134,42 +135,61 @@
115 my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'");
116 my $threshold = $uptime + 2;
117
118-$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
119-
120-sleep 3;
121-
122-$output = `cat $dest/*-trigger`;
123+$retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
124+
125+PerconaTest::wait_until(sub { !-f $pid_file });
126+
127+$output = `cat $dest/*-trigger 2>/dev/null`;
128 like(
129 $output,
130 qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/,
131 "Collect triggered"
132+)
133+or diag(
134+ 'output', $output,
135+ 'log file', `cat $log_file 2>/dev/null`,
136+ 'collector', `cat $dest/*-output 2>/dev/null`,
137+ 'dest', `ls -l $dest/ 2>/dev/null`,
138+ 'df', `cat $dest/*-df 2>/dev/null`,
139 );
140
141 # There is some nondeterminism here. Sometimes it'll run for 2 samples because
142 # the samples may not be precisely 1 second apart.
143-chomp($output = `cat $dest/*-df | grep -c '^TS'`);
144+chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`);
145 ok(
146 $output >= 1 && $output <= 3,
147 "Collect ran for --run-time"
148-) or diag("Run time: $output");
149+)
150+or diag(
151+ 'output', $output,
152+ 'log file', `cat $log_file 2>/dev/null`,
153+ 'collector', `cat $dest/*-output 2>/dev/null`,
154+ 'dest', `ls -l $dest/ 2>/dev/null`,
155+ 'df', `cat $dest/*-df 2>/dev/null`,
156+);
157
158 ok(
159 PerconaTest::not_running("pt-stalk --iterations 1"),
160 "pt-stalk is not running"
161 );
162
163-$output = `cat $dest/*-trigger`;
164+$output = `cat $dest/*-trigger 2>/dev/null`;
165 like(
166 $output,
167 qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/,
168 "Trigger file logs how pt-stalk was ran"
169 );
170
171-chomp($output = `cat $log_file | grep 'Collector PID'`);
172+chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`);
173 like(
174 $output,
175 qr/Collector PID \d+/,
176 "Collector PID logged"
177+)
178+or diag(
179+ 'output', $output,
180+ 'log file', `cat $log_file 2>/dev/null`,
181+ 'collector', `cat $dest/*-output 2>/dev/null`,
182 );
183
184 # ###########################################################################
185@@ -184,9 +204,9 @@
186
187 $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
188
189-sleep 2;
190+PerconaTest::wait_until(sub { !-f $pid_file });
191
192-$output = `cat $log_file`;
193+$output = `cat $log_file 2>/dev/null`;
194 like(
195 $output,
196 qr/Collect triggered/,
197@@ -196,7 +216,7 @@
198 ok(
199 ! -f "$dest/*",
200 "No files collected"
201-);
202+) or diag(`ls -l $dest/ 2>/dev/null`);
203
204 ok(
205 PerconaTest::not_running("pt-stalk --no-collect"),
206@@ -221,7 +241,7 @@
207 "Killed pt-stalk"
208 );
209
210-$output = `cat $log_file`;
211+$output = `cat $log_file 2>/dev/null`;
212 like(
213 $output,
214 qr/Check results: Aborted_connects=|variable=Aborted_connects/,
215@@ -238,23 +258,32 @@
216 diag(`rm $log_file 2>/dev/null`);
217 diag(`rm $dest/* 2>/dev/null`);
218
219-$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1");
220-
221-PerconaTest::wait_for_files("$dest/nostalk-trigger");
222-$output = `cat $dest/nostalk-trigger`;
223+$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
224+
225+PerconaTest::wait_until(sub { !-f $pid_file });
226+
227+$output = `cat $dest/nostalk-trigger 2>/dev/null`;
228 like(
229 $output,
230 qr/Not stalking/,
231 "Not stalking, collect triggered"
232+)
233+or diag(
234+ 'dest', `ls -l $dest/ 2>/dev/null`,
235+ 'log_file', `cat $log_file 2>/dev/null`,
236+ 'collector', `cat $dest/*-output 2>/dev/null`,
237 );
238
239-PerconaTest::wait_for_files("$dest/nostalk-hostname");
240-PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2");
241-chomp($output = `grep -c '^TS' $dest/nostalk-df`);
242+chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`);
243 is(
244 $output,
245 2,
246 "Not stalking, collect ran for --run-time"
247+)
248+or diag(
249+ 'dest', `ls -l $dest/ 2>/dev/null`,
250+ 'log_file', `cat $log_file 2>/dev/null`,
251+ 'collector', `cat $dest/*-output 2>/dev/null`,
252 );
253
254 my $vmstat = `which vmstat 2>/dev/null`;
255@@ -269,14 +298,24 @@
256 };
257
258 is(
259- `cat $dest/nostalk-hostname`,
260+ `cat $dest/nostalk-hostname 2>/dev/null`,
261 `hostname`,
262 "Not stalking, collect gathered data"
263+)
264+or diag(
265+ 'dest', `ls -l $dest/ 2>/dev/null`,
266+ 'log_file', `cat $log_file 2>/dev/null`,
267+ 'collector', `cat $dest/*-output 2>/dev/null`,
268 );
269
270 ok(
271 PerconaTest::not_running("pt-stalk --no-stalk"),
272 "Not stalking, pt-stalk is not running"
273+)
274+or diag(
275+ 'dest', `ls -l $dest/ 2>/dev/null`,
276+ 'log_file', `cat $log_file 2>/dev/null`,
277+ 'collector', `cat $dest/*-output 2>/dev/null`,
278 );
279
280 # ############################################################################

Subscribers

People subscribed via source and target branches