Merge lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434 into lp:percona-toolkit/2.1
- pt-stalk-iter-1-bug-1070434
- Merge into 2.1
Proposed by
Daniel Nichter
Status: | Merged |
---|---|
Merged at revision: | 428 |
Proposed branch: | lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434 |
Merge into: | lp:percona-toolkit/2.1 |
Diff against target: |
280 lines (+110/-28) 2 files modified
bin/pt-stalk (+43/-0) t/pt-stalk/pt-stalk.t (+67/-28) |
To merge this branch: | bzr merge lp:~percona-toolkit-dev/percona-toolkit/pt-stalk-iter-1-bug-1070434 |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Daniel Nichter | Approve | ||
Review via email: mp+131093@code.launchpad.net |
Commit message
Description of the change
To post a comment you must log in.
- 429. By Daniel Nichter
-
Make "not stalking" tests more reliable.
- 430. By Daniel Nichter
-
Really wait for expected data in files.
- 431. By Daniel Nichter
-
Even more waiting for _really_ slow Ubuntu 12 box.
- 432. By Daniel Nichter
-
Include collector output in diag.
Revision history for this message
Daniel Nichter (daniel-nichter) : | # |
review:
Approve
- 433. By Daniel Nichter
-
Document extra --run-time-related waits.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'bin/pt-stalk' | |||
2 | --- bin/pt-stalk 2012-10-13 20:15:37 +0000 | |||
3 | +++ bin/pt-stalk 2012-10-24 00:23:21 +0000 | |||
4 | @@ -1210,6 +1210,37 @@ | |||
5 | 1210 | purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME" | 1210 | purge_samples "$OPT_DEST" "$OPT_RETENTION_TIME" |
6 | 1211 | fi | 1211 | fi |
7 | 1212 | done | 1212 | done |
8 | 1213 | |||
9 | 1214 | # Before exiting, the last collector may still be running. | ||
10 | 1215 | # Wait for it to finish in case the tool is part of a script, | ||
11 | 1216 | # or part of a test, so the caller has access to the collected | ||
12 | 1217 | # data when the tool exists. collect() waits an additional | ||
13 | 1218 | # --run-time seconds for itself to complete, which means we | ||
14 | 1219 | # have to wait for 2 * run-time like it plus some overhead else | ||
15 | 1220 | # we may get in sync with the collector and kill it a microsecond | ||
16 | 1221 | # before it kills itself, thus 3 * run-time. | ||
17 | 1222 | # https://bugs.launchpad.net/percona-toolkit/+bug/1070434 | ||
18 | 1223 | if [ "$(jobs)" ]; then | ||
19 | 1224 | local sleep_time=$((OPT_RUN_TIME * 3)) | ||
20 | 1225 | log "Waiting up to $sleep_time seconds for collectors to finish..." | ||
21 | 1226 | local slept=0 | ||
22 | 1227 | while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do | ||
23 | 1228 | sleep 1 | ||
24 | 1229 | slept=$((slept + 1)) | ||
25 | 1230 | done | ||
26 | 1231 | |||
27 | 1232 | if [ "$(jobs)" ]; then | ||
28 | 1233 | for pid in $(jobs -p); do | ||
29 | 1234 | # This isn't an warning (we don't want exit status 1) because | ||
30 | 1235 | # the system may be running slowly so it's just "natural" that | ||
31 | 1236 | # a collector may get stuck or run really slowly. | ||
32 | 1237 | log "Killing collector $pid" | ||
33 | 1238 | kill $pid >/dev/null 2>&1 | ||
34 | 1239 | done | ||
35 | 1240 | else | ||
36 | 1241 | log "All collectors have finished" | ||
37 | 1242 | fi | ||
38 | 1243 | fi | ||
39 | 1213 | } | 1244 | } |
40 | 1214 | 1245 | ||
41 | 1215 | # ########################################################################### | 1246 | # ########################################################################### |
42 | @@ -1754,6 +1785,18 @@ | |||
43 | 1754 | seconds hasn't gathered enough diagnostic data, running longer is not likely to | 1785 | seconds hasn't gathered enough diagnostic data, running longer is not likely to |
44 | 1755 | do so. In fact, in many cases a shorter collection period is appropriate. | 1786 | do so. In fact, in many cases a shorter collection period is appropriate. |
45 | 1756 | 1787 | ||
46 | 1788 | This value is used two other times. After collecting, the collect subprocess | ||
47 | 1789 | will wait another L<"--run-time"> seconds for its commands to finish. Some | ||
48 | 1790 | commands can take awhile if the system is running very slowly (which can | ||
49 | 1791 | likely be the case given that a collection was triggered). Since empty files | ||
50 | 1792 | are deleted, the extra wait gives commands time to finish and write their | ||
51 | 1793 | data. The value is potentially used again just before the tool exits to wait | ||
52 | 1794 | again for any collect subprocesses to finish. In most cases this won't | ||
53 | 1795 | happen because of the aforementioned extra wait. If it happens, the tool | ||
54 | 1796 | will log "Waiting up to N seconds for collectors to finish..." where N is | ||
55 | 1797 | three times L<"--run-time">. In both cases, after waiting, the tool kills | ||
56 | 1798 | all of its subprocesses. | ||
57 | 1799 | |||
58 | 1757 | =item --sleep | 1800 | =item --sleep |
59 | 1758 | 1801 | ||
60 | 1759 | type: int; default: 300 | 1802 | type: int; default: 300 |
61 | 1760 | 1803 | ||
62 | === modified file 't/pt-stalk/pt-stalk.t' | |||
63 | --- t/pt-stalk/pt-stalk.t 2012-10-13 18:58:55 +0000 | |||
64 | +++ t/pt-stalk/pt-stalk.t 2012-10-24 00:23:21 +0000 | |||
65 | @@ -86,20 +86,21 @@ | |||
66 | 86 | "Creates --dest (collect) dir" | 86 | "Creates --dest (collect) dir" |
67 | 87 | ); | 87 | ); |
68 | 88 | 88 | ||
70 | 89 | chomp($pid = `cat $pid_file`); | 89 | chomp($pid = `cat $pid_file 2>/dev/null`); |
71 | 90 | $retval = system("kill -0 $pid"); | 90 | $retval = system("kill -0 $pid"); |
72 | 91 | is( | 91 | is( |
73 | 92 | $retval >> 0, | 92 | $retval >> 0, |
74 | 93 | 0, | 93 | 0, |
76 | 94 | "pt-stalk is running ($pid)" | 94 | "pt-stalk is running" |
77 | 95 | ); | 95 | ); |
78 | 96 | 96 | ||
80 | 97 | $output = `cat $log_file`; | 97 | PerconaTest::wait_for_sh("grep -q 'Check results' $log_file >/dev/null"); |
81 | 98 | $output = `cat $log_file 2>/dev/null`; | ||
82 | 98 | like( | 99 | like( |
83 | 99 | $output, | 100 | $output, |
84 | 100 | qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/, | 101 | qr/Check results: Threads_running=\d+, matched=no, cycles_true=0/, |
85 | 101 | "Check results logged" | 102 | "Check results logged" |
87 | 102 | ); | 103 | ) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); |
88 | 103 | 104 | ||
89 | 104 | $retval = system("kill $pid 2>/dev/null"); | 105 | $retval = system("kill $pid 2>/dev/null"); |
90 | 105 | is( | 106 | is( |
91 | @@ -108,19 +109,19 @@ | |||
92 | 108 | "Killed pt-stalk" | 109 | "Killed pt-stalk" |
93 | 109 | ); | 110 | ); |
94 | 110 | 111 | ||
96 | 111 | sleep 1; | 112 | PerconaTest::wait_until(sub { !-f $pid_file }); |
97 | 112 | 113 | ||
98 | 113 | ok( | 114 | ok( |
99 | 114 | ! -f $pid_file, | 115 | ! -f $pid_file, |
100 | 115 | "Removes PID file" | 116 | "Removes PID file" |
101 | 116 | ); | 117 | ); |
102 | 117 | 118 | ||
104 | 118 | $output = `cat $log_file`; | 119 | $output = `cat $log_file 2>/dev/null`; |
105 | 119 | like( | 120 | like( |
106 | 120 | $output, | 121 | $output, |
107 | 121 | qr/Caught signal, exiting/, | 122 | qr/Caught signal, exiting/, |
108 | 122 | "Caught signal logged" | 123 | "Caught signal logged" |
110 | 123 | ); | 124 | ) or diag(`cat $log_file 2>/dev/null`, `cat $dest/*-output 2>/dev/null`); |
111 | 124 | 125 | ||
112 | 125 | # ########################################################################### | 126 | # ########################################################################### |
113 | 126 | # Test collect. | 127 | # Test collect. |
114 | @@ -134,42 +135,61 @@ | |||
115 | 134 | my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'"); | 135 | my (undef, $uptime) = $dbh->selectrow_array("SHOW STATUS LIKE 'Uptime'"); |
116 | 135 | my $threshold = $uptime + 2; | 136 | my $threshold = $uptime + 2; |
117 | 136 | 137 | ||
123 | 137 | $retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); | 138 | $retval = system("$trunk/bin/pt-stalk --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 2 --run-time 2 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); |
124 | 138 | 139 | ||
125 | 139 | sleep 3; | 140 | PerconaTest::wait_until(sub { !-f $pid_file }); |
126 | 140 | 141 | ||
127 | 141 | $output = `cat $dest/*-trigger`; | 142 | $output = `cat $dest/*-trigger 2>/dev/null`; |
128 | 142 | like( | 143 | like( |
129 | 143 | $output, | 144 | $output, |
130 | 144 | qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/, | 145 | qr/Check results: Uptime=\d+, matched=yes, cycles_true=2/, |
131 | 145 | "Collect triggered" | 146 | "Collect triggered" |
132 | 147 | ) | ||
133 | 148 | or diag( | ||
134 | 149 | 'output', $output, | ||
135 | 150 | 'log file', `cat $log_file 2>/dev/null`, | ||
136 | 151 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
137 | 152 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
138 | 153 | 'df', `cat $dest/*-df 2>/dev/null`, | ||
139 | 146 | ); | 154 | ); |
140 | 147 | 155 | ||
141 | 148 | # There is some nondeterminism here. Sometimes it'll run for 2 samples because | 156 | # There is some nondeterminism here. Sometimes it'll run for 2 samples because |
142 | 149 | # the samples may not be precisely 1 second apart. | 157 | # the samples may not be precisely 1 second apart. |
144 | 150 | chomp($output = `cat $dest/*-df | grep -c '^TS'`); | 158 | chomp($output = `cat $dest/*-df 2>/dev/null | grep -c '^TS'`); |
145 | 151 | ok( | 159 | ok( |
146 | 152 | $output >= 1 && $output <= 3, | 160 | $output >= 1 && $output <= 3, |
147 | 153 | "Collect ran for --run-time" | 161 | "Collect ran for --run-time" |
149 | 154 | ) or diag("Run time: $output"); | 162 | ) |
150 | 163 | or diag( | ||
151 | 164 | 'output', $output, | ||
152 | 165 | 'log file', `cat $log_file 2>/dev/null`, | ||
153 | 166 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
154 | 167 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
155 | 168 | 'df', `cat $dest/*-df 2>/dev/null`, | ||
156 | 169 | ); | ||
157 | 155 | 170 | ||
158 | 156 | ok( | 171 | ok( |
159 | 157 | PerconaTest::not_running("pt-stalk --iterations 1"), | 172 | PerconaTest::not_running("pt-stalk --iterations 1"), |
160 | 158 | "pt-stalk is not running" | 173 | "pt-stalk is not running" |
161 | 159 | ); | 174 | ); |
162 | 160 | 175 | ||
164 | 161 | $output = `cat $dest/*-trigger`; | 176 | $output = `cat $dest/*-trigger 2>/dev/null`; |
165 | 162 | like( | 177 | like( |
166 | 163 | $output, | 178 | $output, |
167 | 164 | qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/, | 179 | qr/pt-stalk ran with --function=status --variable=Uptime --threshold=$threshold/, |
168 | 165 | "Trigger file logs how pt-stalk was ran" | 180 | "Trigger file logs how pt-stalk was ran" |
169 | 166 | ); | 181 | ); |
170 | 167 | 182 | ||
172 | 168 | chomp($output = `cat $log_file | grep 'Collector PID'`); | 183 | chomp($output = `cat $log_file 2>/dev/null | grep 'Collector PID'`); |
173 | 169 | like( | 184 | like( |
174 | 170 | $output, | 185 | $output, |
175 | 171 | qr/Collector PID \d+/, | 186 | qr/Collector PID \d+/, |
176 | 172 | "Collector PID logged" | 187 | "Collector PID logged" |
177 | 188 | ) | ||
178 | 189 | or diag( | ||
179 | 190 | 'output', $output, | ||
180 | 191 | 'log file', `cat $log_file 2>/dev/null`, | ||
181 | 192 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
182 | 173 | ); | 193 | ); |
183 | 174 | 194 | ||
184 | 175 | # ########################################################################### | 195 | # ########################################################################### |
185 | @@ -184,9 +204,9 @@ | |||
186 | 184 | 204 | ||
187 | 185 | $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); | 205 | $retval = system("$trunk/bin/pt-stalk --no-collect --iterations 1 --dest $dest --variable Uptime --threshold $threshold --cycles 1 --run-time 1 --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); |
188 | 186 | 206 | ||
190 | 187 | sleep 2; | 207 | PerconaTest::wait_until(sub { !-f $pid_file }); |
191 | 188 | 208 | ||
193 | 189 | $output = `cat $log_file`; | 209 | $output = `cat $log_file 2>/dev/null`; |
194 | 190 | like( | 210 | like( |
195 | 191 | $output, | 211 | $output, |
196 | 192 | qr/Collect triggered/, | 212 | qr/Collect triggered/, |
197 | @@ -196,7 +216,7 @@ | |||
198 | 196 | ok( | 216 | ok( |
199 | 197 | ! -f "$dest/*", | 217 | ! -f "$dest/*", |
200 | 198 | "No files collected" | 218 | "No files collected" |
202 | 199 | ); | 219 | ) or diag(`ls -l $dest/ 2>/dev/null`); |
203 | 200 | 220 | ||
204 | 201 | ok( | 221 | ok( |
205 | 202 | PerconaTest::not_running("pt-stalk --no-collect"), | 222 | PerconaTest::not_running("pt-stalk --no-collect"), |
206 | @@ -221,7 +241,7 @@ | |||
207 | 221 | "Killed pt-stalk" | 241 | "Killed pt-stalk" |
208 | 222 | ); | 242 | ); |
209 | 223 | 243 | ||
211 | 224 | $output = `cat $log_file`; | 244 | $output = `cat $log_file 2>/dev/null`; |
212 | 225 | like( | 245 | like( |
213 | 226 | $output, | 246 | $output, |
214 | 227 | qr/Check results: Aborted_connects=|variable=Aborted_connects/, | 247 | qr/Check results: Aborted_connects=|variable=Aborted_connects/, |
215 | @@ -238,23 +258,32 @@ | |||
216 | 238 | diag(`rm $log_file 2>/dev/null`); | 258 | diag(`rm $log_file 2>/dev/null`); |
217 | 239 | diag(`rm $dest/* 2>/dev/null`); | 259 | diag(`rm $dest/* 2>/dev/null`); |
218 | 240 | 260 | ||
223 | 241 | $retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk -- --defaults-file=$cnf >$log_file 2>&1"); | 261 | $retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); |
224 | 242 | 262 | ||
225 | 243 | PerconaTest::wait_for_files("$dest/nostalk-trigger"); | 263 | PerconaTest::wait_until(sub { !-f $pid_file }); |
226 | 244 | $output = `cat $dest/nostalk-trigger`; | 264 | |
227 | 265 | $output = `cat $dest/nostalk-trigger 2>/dev/null`; | ||
228 | 245 | like( | 266 | like( |
229 | 246 | $output, | 267 | $output, |
230 | 247 | qr/Not stalking/, | 268 | qr/Not stalking/, |
231 | 248 | "Not stalking, collect triggered" | 269 | "Not stalking, collect triggered" |
232 | 270 | ) | ||
233 | 271 | or diag( | ||
234 | 272 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
235 | 273 | 'log_file', `cat $log_file 2>/dev/null`, | ||
236 | 274 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
237 | 249 | ); | 275 | ); |
238 | 250 | 276 | ||
242 | 251 | PerconaTest::wait_for_files("$dest/nostalk-hostname"); | 277 | chomp($output = `grep -c '^TS' $dest/nostalk-df 2>/dev/null`); |
240 | 252 | PerconaTest::wait_for_sh("test \$(grep -c '^TS' $dest/nostalk-df) -ge 2"); | ||
241 | 253 | chomp($output = `grep -c '^TS' $dest/nostalk-df`); | ||
243 | 254 | is( | 278 | is( |
244 | 255 | $output, | 279 | $output, |
245 | 256 | 2, | 280 | 2, |
246 | 257 | "Not stalking, collect ran for --run-time" | 281 | "Not stalking, collect ran for --run-time" |
247 | 282 | ) | ||
248 | 283 | or diag( | ||
249 | 284 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
250 | 285 | 'log_file', `cat $log_file 2>/dev/null`, | ||
251 | 286 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
252 | 258 | ); | 287 | ); |
253 | 259 | 288 | ||
254 | 260 | my $vmstat = `which vmstat 2>/dev/null`; | 289 | my $vmstat = `which vmstat 2>/dev/null`; |
255 | @@ -269,14 +298,24 @@ | |||
256 | 269 | }; | 298 | }; |
257 | 270 | 299 | ||
258 | 271 | is( | 300 | is( |
260 | 272 | `cat $dest/nostalk-hostname`, | 301 | `cat $dest/nostalk-hostname 2>/dev/null`, |
261 | 273 | `hostname`, | 302 | `hostname`, |
262 | 274 | "Not stalking, collect gathered data" | 303 | "Not stalking, collect gathered data" |
263 | 304 | ) | ||
264 | 305 | or diag( | ||
265 | 306 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
266 | 307 | 'log_file', `cat $log_file 2>/dev/null`, | ||
267 | 308 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
268 | 275 | ); | 309 | ); |
269 | 276 | 310 | ||
270 | 277 | ok( | 311 | ok( |
271 | 278 | PerconaTest::not_running("pt-stalk --no-stalk"), | 312 | PerconaTest::not_running("pt-stalk --no-stalk"), |
272 | 279 | "Not stalking, pt-stalk is not running" | 313 | "Not stalking, pt-stalk is not running" |
273 | 314 | ) | ||
274 | 315 | or diag( | ||
275 | 316 | 'dest', `ls -l $dest/ 2>/dev/null`, | ||
276 | 317 | 'log_file', `cat $log_file 2>/dev/null`, | ||
277 | 318 | 'collector', `cat $dest/*-output 2>/dev/null`, | ||
278 | 280 | ); | 319 | ); |
279 | 281 | 320 | ||
280 | 282 | # ############################################################################ | 321 | # ############################################################################ |