Merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665 into lp:percona-toolkit/2.2
- no-stalk-bug-1125665
- Merge into 2.2
Proposed by
Daniel Nichter
Status: | Merged |
---|---|
Approved by: | Daniel Nichter |
Approved revision: | 550 |
Merged at revision: | 546 |
Proposed branch: | lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665 |
Merge into: | lp:percona-toolkit/2.2 |
Diff against target: |
731 lines (+283/-161) 5 files modified
bin/pt-stalk (+208/-149) lib/bash/collect.sh (+3/-11) lib/bash/subshell.sh (+66/-0) t/lib/bash/collect.sh (+1/-0) t/pt-stalk/pt-stalk.t (+5/-1) |
To merge this branch: | bzr merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665 |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Daniel Nichter | Approve | ||
Review via email: mp+151664@code.launchpad.net |
Commit message
Description of the change
To post a comment you must log in.
- 550. By Daniel Nichter
-
Minor doc correction.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'bin/pt-stalk' |
2 | --- bin/pt-stalk 2013-02-27 00:01:17 +0000 |
3 | +++ bin/pt-stalk 2013-03-05 01:26:23 +0000 |
4 | @@ -55,6 +55,58 @@ |
5 | # ########################################################################### |
6 | |
7 | # ########################################################################### |
8 | +# subshell package |
9 | +# This package is a copy without comments from the original. The original |
10 | +# with comments and its test file can be found in the Bazaar repository at, |
11 | +# lib/bash/subshell.sh |
12 | +# t/lib/bash/subshell.sh |
13 | +# See https://launchpad.net/percona-toolkit for more information. |
14 | +# ########################################################################### |
15 | + |
16 | + |
17 | +set -u |
18 | + |
19 | +wait_for_subshells() { |
20 | + local max_wait=$1 |
21 | + if [ "$(jobs)" ]; then |
22 | + log "Waiting up to $max_wait seconds for subprocesses to finish..." |
23 | + local slept=0 |
24 | + while [ -n "$(jobs)" ]; do |
25 | + local subprocess_still_running="" |
26 | + for pid in $(jobs -p); do |
27 | + if kill -0 $pid >/dev/null 2>&1; then |
28 | + subprocess_still_running=1 |
29 | + fi |
30 | + done |
31 | + if [ "$subprocess_still_running" ]; then |
32 | + sleep 1 |
33 | + slept=$((slept + 1)) |
34 | + [ $slept -ge $max_wait ] && break |
35 | + else |
36 | + break |
37 | + fi |
38 | + done |
39 | + fi |
40 | +} |
41 | + |
42 | +kill_all_subshells() { |
43 | + if [ "$(jobs)" ]; then |
44 | + for pid in $(jobs -p); do |
45 | + if kill -0 $pid >/dev/null 2>&1; then |
46 | + log "Killing subprocess $pid" |
47 | + kill $pid >/dev/null 2>&1 |
48 | + fi |
49 | + done |
50 | + else |
51 | + log "All subprocesses have finished" |
52 | + fi |
53 | +} |
54 | + |
55 | +# ########################################################################### |
56 | +# End subshell package |
57 | +# ########################################################################### |
58 | + |
59 | +# ########################################################################### |
60 | # parse_options package |
61 | # This package is a copy without comments from the original. The original |
62 | # with comments and its test file can be found in the Bazaar repository at, |
63 | @@ -871,16 +923,8 @@ |
64 | |
65 | hostname > "$d/$p-hostname" |
66 | |
67 | - local slept=0 |
68 | - while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do |
69 | - sleep 1 |
70 | - slept=$((slept + 1)) |
71 | - done |
72 | - |
73 | - for pid in $(jobs -p); do |
74 | - kill $pid >/dev/null 2>&1 |
75 | - done |
76 | - |
77 | + wait_for_subshells $OPT_RUN_TIME |
78 | + kill_all_subshells |
79 | for file in "$d/$p-"*; do |
80 | if [ -z "$(grep -v '^TS ' --max-count 1 "$file")" ]; then |
81 | log "Removing empty file $file"; |
82 | @@ -1079,7 +1123,7 @@ |
83 | local seconds="$1" |
84 | local msg="${2:-""}" |
85 | if oktorun; then |
86 | - [ "$msg" ] && info "$msg" |
87 | + [ "$msg" ] && log "$msg" |
88 | sleep $seconds |
89 | fi |
90 | } |
91 | @@ -1244,27 +1288,8 @@ |
92 | # we may get in sync with the collector and kill it a microsecond |
93 | # before it kills itself, thus 3 * run-time. |
94 | # https://bugs.launchpad.net/percona-toolkit/+bug/1070434 |
95 | - if [ "$(jobs)" ]; then |
96 | - local sleep_time=$((OPT_RUN_TIME * 3)) |
97 | - log "Waiting up to $sleep_time seconds for collectors to finish..." |
98 | - local slept=0 |
99 | - while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do |
100 | - sleep 1 |
101 | - slept=$((slept + 1)) |
102 | - done |
103 | - |
104 | - if [ "$(jobs)" ]; then |
105 | - for pid in $(jobs -p); do |
106 | - # This isn't an warning (we don't want exit status 1) because |
107 | - # the system may be running slowly so it's just "natural" that |
108 | - # a collector may get stuck or run really slowly. |
109 | - log "Killing collector $pid" |
110 | - kill $pid >/dev/null 2>&1 |
111 | - done |
112 | - else |
113 | - log "All collectors have finished" |
114 | - fi |
115 | - fi |
116 | + wait_for_subshells $((OPT_RUN_TIME * 3)) |
117 | + kill_all_subshells |
118 | } |
119 | |
120 | # ########################################################################### |
121 | @@ -1333,10 +1358,8 @@ |
122 | |
123 | if [ -z "$OPT_STALK" -a "$OPT_COLLECT" ]; then |
124 | # Not stalking; do immediate collect once. |
125 | - OPT_ITERATIONS=1 |
126 | OPT_CYCLES=0 |
127 | - OPT_SLEEP=0 |
128 | - OPT_INTERVAL=0 |
129 | + echo "[iter=$OPT_ITERATIONS] [cycle=$OPT_CYCLES] [sleep=$OPT_SLEEP] [interval=$OPT_INTERVAL]" |
130 | fi |
131 | |
132 | usage_or_errors "$0" |
133 | @@ -1412,17 +1435,17 @@ |
134 | |
135 | =head1 NAME |
136 | |
137 | -pt-stalk - Gather forensic data about MySQL when a problem occurs. |
138 | +pt-stalk - Collect forensic data about MySQL when problems occur. |
139 | |
140 | =head1 SYNOPSIS |
141 | |
142 | Usage: pt-stalk [OPTIONS] [-- MYSQL OPTIONS] |
143 | |
144 | -pt-stalk watches for a trigger condition to become true, and then collects data |
145 | -to help in diagnosing problems. It is designed to run as a daemon with root |
146 | +pt-stalk waits for a trigger condition to occur, then collects data |
147 | +to help diagnose problems. The tool is designed to run as a daemon with root |
148 | privileges, so that you can diagnose intermittent problems that you cannot |
149 | -observe directly. You can also use it to execute a custom command, or to gather |
150 | -the data on demand without waiting for the trigger to happen. |
151 | +observe directly. You can also use it to execute a custom command, or to |
152 | +collect data on demand without waiting for the trigger to occur. |
153 | |
154 | =head1 RISKS |
155 | |
156 | @@ -1453,16 +1476,20 @@ |
157 | problems when you can't observe them? That's why pt-stalk exists. In addition to |
158 | using it when there's a known problem on your servers, it is a good idea to run |
159 | pt-stalk all the time, even when you think nothing is wrong. You will |
160 | -appreciate the data it gathers when a problem occurs, because problems such as |
161 | -MySQL lockups or spikes of activity typically leave no evidence to use in root |
162 | +appreciate the data it collects when a problem occurs, because problems such as |
163 | +MySQL lockups or spikes in activity typically leave no evidence to use in root |
164 | cause analysis. |
165 | |
166 | -This tool does two things: it watches a server (typically MySQL) for a trigger |
167 | -to occur, and it gathers diagnostic data. To use it effectively, you need to |
168 | -define a good trigger condition. A good trigger is sensitive enough to fire |
169 | -reliably when a problem occurs, so that you don't miss a chance to solve |
170 | -problems. On the other hand, a good trigger isn't prone to false positives, so |
171 | -you don't gather information when the server is functioning normally. |
172 | +pt-stalk does two things: it watches a MySQL server and waits for a trigger |
173 | +condition to occur, and it collects diagnostic data when that trigger occurs. |
174 | +To avoid false positives caused by short-lived problems, the trigger condition |
175 | +must be true at least L<"--cycles"> times before a L<"--collect"> is triggered. |
176 | + |
177 | +To use pt-stalk effectively, you need to define a good trigger. A good trigger |
178 | +is sensitive enough to fire reliably when a problem occurs, so that you don't |
179 | +miss a chance to solve problems. On the other hand, a good trigger isn't |
180 | +prone to false positives, so you don't gather information when the server |
181 | +is functioning normally. |
182 | |
183 | The most reliable triggers for MySQL tend to be the number of connections to the |
184 | server, and the number of queries running concurrently. These are available in |
185 | @@ -1472,55 +1499,76 @@ |
186 | appropriate trigger condition for the tool. Choose carefully, because the |
187 | quality of your results will depend on the trigger you choose. |
188 | |
189 | -You can define the trigger with the L<"--function">, L<"--variable">, and |
190 | -L<"--threshold"> options, among others. Please read the documentation for |
191 | ---function to learn how to do this. |
192 | - |
193 | -The pt-stalk tool, by default, simply watches MySQL repeatedly until the trigger |
194 | -becomes true. It then gathers diagnostics for a while, and sleeps afterwards for |
195 | -some time to prevent repeatedly gathering data if the condition remains true. |
196 | -In crude pseudocode, omitting some subtleties, |
197 | - |
198 | - while true; do |
199 | - if --variable from --function is greater than --threshold; then |
200 | - observations++ |
201 | - if observations is greater than --cycles; then |
202 | - capture diagnostics for --run-time seconds |
203 | - exit if --iterations is exceeded |
204 | - sleep for --sleep seconds |
205 | - done |
206 | - done |
207 | - clean up data that's older than --retention-time |
208 | - sleep for --interval seconds |
209 | - done |
210 | +You define the trigger with the L<"--function">, L<"--variable">, |
211 | +L<"--threshold">, and L<"--cycles"> options. The default values |
212 | +for these options define a reasonable trigger, but you should adjust |
213 | +or change them to suit your particular system and needs. |
214 | + |
215 | +By default, pt-stalk watches MySQL forever until the trigger occurs, |
216 | +then it collects diagnostic data for a while, and sleeps afterwards to avoid |
217 | +repeatedly collecting data if the trigger remains true. The general order of |
218 | +operations is: |
219 | + |
220 | + while true; do |
221 | + if --variable from --function > --threshold; then |
222 | + cycles_true++ |
223 | + if cycles_true >= --cycles; then |
224 | + --notify-by-email |
225 | + if --collect; then |
226 | + if --disk-bytes-free and --disk-pct-free ok; then |
227 | + (--collect for --run-time seconds) & |
228 | + fi |
229 | + rm files in --dest older than --retention-time |
230 | + fi |
231 | + iter++ |
232 | + cycles_true=0 |
233 | + fi |
234 | + if iter < --iterations; then |
235 | + sleep --sleep seconds |
236 | + else |
237 | + break |
238 | + fi |
239 | + else |
240 | + if iter < --iterations; then |
241 | + sleep --interval seconds |
242 | + else |
243 | + break |
244 | + fi |
245 | + fi |
246 | + done |
247 | + rm old --dest files older than --retention-time |
248 | + if --collect processes are still running; then |
249 | + wait up to --run-time * 3 seconds |
250 | + kill any remaining --collect processes |
251 | + fi |
252 | |
253 | The diagnostic data is written to files whose names begin with a timestamp, so |
254 | you can distinguish samples from each other in case the tool collects data |
255 | -multiple times. The pt-sift tool is designed to help you browse and analyze the |
256 | -resulting samples of data. |
257 | +multiple times. The pt-sift tool is designed to help you browse and analyze |
258 | +the resulting data samples. |
259 | |
260 | Although this sounds simple enough, in practice there are a number of |
261 | subtleties, such as detecting when the disk is beginning to fill up so that the |
262 | tool doesn't cause the server to run out of disk space. This tool handles these |
263 | types of potential problems, so it's a good idea to use this tool instead of |
264 | writing something from scratch and possibly experiencing some of the hazards |
265 | -this tool is designed to prevent. |
266 | +this tool is designed to avoid. |
267 | |
268 | =head1 CONFIGURING |
269 | |
270 | -You can use standard Percona Toolkit configuration files to set commandline |
271 | +You can use standard Percona Toolkit configuration files to set command line |
272 | options. |
273 | |
274 | You will probably want to run the tool as a daemon and customize at least the |
275 | -diagnostic threshold. Here's a sample configuration file for triggering when |
276 | +L<"--threshold">. Here's a sample configuration file for triggering when |
277 | there are more than 20 queries running at once: |
278 | |
279 | daemonize |
280 | threshold=20 |
281 | |
282 | -If you're not running the tool as it's designed (as a root user, daemonized) |
283 | -then you'll need to set several options, such as L<"--dest">, to locations that |
284 | -are writable by non-root users. |
285 | +If you don't run the tool as root, then you will need to specify several options, |
286 | +such as L<"--pid">, L<"--log">, and L<"--dest">, else the tool will probably |
287 | +fail to start. |
288 | |
289 | =head1 OPTIONS |
290 | |
291 | @@ -1530,8 +1578,8 @@ |
292 | |
293 | default: yes; negatable: yes |
294 | |
295 | -Collect system information. You can negate this option to make the tool watch |
296 | -the system but not actually gather any diagnostic data. |
297 | +Collect diagnostic data when the trigger occurs. Specify C<--no-collect> |
298 | +to make the tool watch the system but not collect data. |
299 | |
300 | See also L<"--stalk">. |
301 | |
302 | @@ -1581,9 +1629,8 @@ |
303 | |
304 | type: int; default: 5 |
305 | |
306 | -The number of times the trigger condition must be true before collecting data. |
307 | -This helps prevent false positives, and makes the trigger condition less likely |
308 | -to fire when the problem recovers quickly. |
309 | +How many times L<"--variable"> must be greater than L<"--threshold"> before triggering L<"--collect">. This helps prevent false positives, and makes |
310 | +the trigger condition less likely to fire when the problem recovers quickly. |
311 | |
312 | =item --daemonize |
313 | |
314 | @@ -1594,14 +1641,15 @@ |
315 | |
316 | type: string; default: /var/lib/pt-stalk |
317 | |
318 | -Where to store the diagnostic data. Each time the tool collects data, it writes |
319 | -to a new set of files, which are named with the current system timestamp. |
320 | +Where to save diagnostic data from L<"--collect">. Each time the tool |
321 | +collects data, it writes to a new set of files, which are named with the |
322 | +current system timestamp. |
323 | |
324 | =item --disk-bytes-free |
325 | |
326 | type: size; default: 100M |
327 | |
328 | -Don't collect data if the disk has less than this much free space. |
329 | +Do not L<"--collect"> if the disk has less than this much free space. |
330 | This prevents the tool from filling up the disk with diagnostic data. |
331 | |
332 | If the L<"--dest"> directory contains a previously captured sample of data, |
333 | @@ -1618,7 +1666,7 @@ |
334 | |
335 | type: int; default: 5 |
336 | |
337 | -Don't collect data if the disk has less than this percent free space. |
338 | +Do not L<"--collect"> if the disk has less than this percent free space. |
339 | This prevents the tool from filling up the disk with diagnostic data. |
340 | |
341 | This option works similarly to L<"--disk-bytes-free"> but specifies a |
342 | @@ -1630,57 +1678,57 @@ |
343 | |
344 | type: string; default: status |
345 | |
346 | -Specifies what to watch for a diagnostic trigger. The default value watches |
347 | -SHOW GLOBAL STATUS, but you can also watch SHOW PROCESSLIST or supply a plugin |
348 | -file with your own custom code. This function supplies the value of |
349 | +What to watch for the trigger. The default value watches |
350 | +C<SHOW GLOBAL STATUS>, but you can also watch C<SHOW PROCESSLIST> and specify |
351 | +a file with your own custom code. This function supplies the value of |
352 | L<"--variable">, which is then compared against L<"--threshold"> to see if the |
353 | -trigger condition is met. Additional options may be required as well; see |
354 | -below. Possible values: |
355 | +trigger condition is met. Additional options may be required as |
356 | +well; see below. Possible values are: |
357 | |
358 | =over |
359 | |
360 | =item * status |
361 | |
362 | -This value specifies that the source of data for the diagnostic trigger is SHOW |
363 | -GLOBAL STATUS. The value of L<"--variable"> then defines which status counter |
364 | -is the trigger. |
365 | +Watch C<SHOW GLOBAL STATUS> for the trigger. The value of |
366 | +L<"--variable"> then defines which status counter is the trigger. |
367 | |
368 | =item * processlist |
369 | |
370 | -This value specifies that the data for the diagnostic trigger comes from SHOW |
371 | -FULL PROCESSLIST. The trigger value is the count of processes whose |
372 | -L<"--variable"> column matches the L<"--match"> option. For example, to trigger |
373 | -when more than 10 processes are in the "statistics" state, use the following |
374 | -options: |
375 | +Watch C<SHOW FULL PROCESSLIST> for the trigger. The trigger |
376 | +value is the count of processes whose L<"--variable"> column matches the |
377 | +L<"--match"> option. For example, to trigger L<"--collect"> when more than |
378 | +10 processes are in the "statistics" state, specify: |
379 | |
380 | - --function processlist --variable State \ |
381 | - --match statistics --threshold 10 |
382 | + --function processlist \ |
383 | + --variable State \ |
384 | + --match statistics \ |
385 | + --threshold 10 |
386 | |
387 | =back |
388 | |
389 | -In addition, you can specify a file that contains your custom trigger function, |
390 | -written in Unix shell script. This can be a wrapper that executes anything you |
391 | -wish. If the argument to --function is a file, then it takes precedence over |
392 | -builtin functions, so if there is a file in the working directory named "status" |
393 | -or "processlist" then the tool will use that file as a plugin, even though those |
394 | -are otherwise recognized as reserved words for this option. |
395 | +In addition, you can specify a file that contains your custom trigger |
396 | +function, written in Unix shell script. This can be a wrapper that executes |
397 | +anything you wish. If the argument to L<"--function"> is a file, then it |
398 | +takes precedence over built-in functions, so if there is a file in the working |
399 | +directory named "status" or "processlist" then the tool will use that file |
400 | +even though those are valid built-in values. |
401 | |
402 | -The plugin file works by providing a function called C<trg_plugin>, and the tool |
403 | -simply sources the file and executes the function. For example, the function |
404 | -might look like the following: |
405 | +The file works by providing a function called C<trg_plugin>, and the tool |
406 | +simply sources the file and executes the function. For example, the file |
407 | +might contain: |
408 | |
409 | trg_plugin() { |
410 | mysql $EXT_ARGV -e "SHOW ENGINE INNODB STATUS" \ |
411 | | grep -c "has waited at" |
412 | } |
413 | |
414 | -This snippet will count the number of mutex waits inside of InnoDB. It |
415 | +This snippet will count the number of mutex waits inside InnoDB. It |
416 | illustrates the general principle: the function must output a number, which is |
417 | -then compared to the threshold as usual. The $EXT_ARGV variable contains the |
418 | -MySQL options mentioned in the L<"SYNOPSIS"> above. |
419 | +then compared to L<"--threshold"> as usual. The C<$EXT_ARGV> variable |
420 | +contains the MySQL options mentioned in the L<"SYNOPSIS"> above. |
421 | |
422 | -The plugin should not alter the tool's existing global variables. Prefix any |
423 | -plugin-specific global variables with "PLUGIN_" or make them local. |
424 | +The file should not alter the tool's existing global variables. Prefix any |
425 | +file-specific global variables with "PLUGIN_" or make them local. |
426 | |
427 | =item --help |
428 | |
429 | @@ -1690,15 +1738,17 @@ |
430 | |
431 | type: int; default: 1 |
432 | |
433 | -Interval between checks for the diagnostic trigger. |
434 | +How often to check if the trigger is true, in seconds. |
435 | |
436 | =item --iterations |
437 | |
438 | type: int |
439 | |
440 | -Exit after collecting diagnostics this many times. By default, the tool |
441 | -will continue to watch the server forever, but this is useful for scenarios |
442 | -where you want to capture once and then exit, for example. |
443 | +How many times to L<"--collect"> diagnostic data. By default, the tool |
444 | +runs forever and collects data every time the trigger occurs. |
445 | +Specify L<"--iterations"> to collect data a limited number of times. |
446 | +This option is also useful with C<--no-stalk> to collect data once and |
447 | +exit, for example. |
448 | |
449 | =item --log |
450 | |
451 | @@ -1710,14 +1760,14 @@ |
452 | |
453 | type: string |
454 | |
455 | -The pattern to use when watching SHOW PROCESSLIST. See the documentation for |
456 | -L<"--function"> for details. |
457 | +The pattern to use when watching SHOW PROCESSLIST. See L<"--function"> |
458 | +for details. |
459 | |
460 | =item --notify-by-email |
461 | |
462 | type: string |
463 | |
464 | -Send mail to this list of addresses when data is collected. |
465 | +Send an email to these addresses for every L<"--collect">. |
466 | |
467 | =item --pid |
468 | |
469 | @@ -1746,8 +1796,8 @@ |
470 | |
471 | =item before_collect |
472 | |
473 | -Called when the stalk condition is triggered, before running a collector |
474 | -process as a backgrounded subshell. |
475 | +Called when the trigger occurs, before running a L<"--collect"> |
476 | +subprocess in the background. |
477 | |
478 | =item after_collect |
479 | |
480 | @@ -1771,10 +1821,10 @@ |
481 | |
482 | =back |
483 | |
484 | -For example, a very simple plugin that touches a file when a collector |
485 | -process is triggered: |
486 | +For example, a very simple plugin that touches a file when L<"--collect"> |
487 | +is triggered: |
488 | |
489 | - before_colllect() { |
490 | + before_collect() { |
491 | touch /tmp/foo |
492 | } |
493 | |
494 | @@ -1797,9 +1847,9 @@ |
495 | |
496 | type: string |
497 | |
498 | -The filename prefix for diagnostic samples. By default, samples have a timestamp |
499 | -prefix based on the current local time, such as 2011_12_06_14_02_02, which is |
500 | -December 6, 2011 at 14:02:02. |
501 | +The filename prefix for diagnostic samples. By default, all files created |
502 | +by the same L<"--collect"> instance have a timestamp prefix based on the current |
503 | +local time, like C<2011_12_06_14_02_02>, which is December 6, 2011 at 14:02:02. |
504 | |
505 | =item --retention-time |
506 | |
507 | @@ -1812,10 +1862,12 @@ |
508 | |
509 | type: int; default: 30 |
510 | |
511 | -How long the tool will collect data when it triggers. This should not be longer |
512 | -than L<"--sleep">. It is usually not necessary to change this; if the default 30 |
513 | -seconds hasn't gathered enough diagnostic data, running longer is not likely to |
514 | -do so. In fact, in many cases a shorter collection period is appropriate. |
515 | +How long to L<"--collect"> diagnostic data when the trigger occurs. |
516 | +The value is in seconds and should not be longer than L<"--sleep">. It is |
517 | +usually not necessary to change this; if the default 30 seconds doesn't |
518 | +collect enough data, running longer is not likely to help because the system |
519 | +or MySQL server is probably too busy to respond. In fact, in many cases a |
520 | +shorter collection period is appropriate. |
521 | |
522 | This value is used two other times. After collecting, the collect subprocess |
523 | will wait another L<"--run-time"> seconds for its commands to finish. Some |
524 | @@ -1825,7 +1877,7 @@ |
525 | data. The value is potentially used again just before the tool exits to wait |
526 | again for any collect subprocesses to finish. In most cases this won't |
527 | happen because of the aforementioned extra wait. If it happens, the tool |
528 | -will log "Waiting up to N seconds for collectors to finish..." where N is |
529 | +will log "Waiting up to N seconds for subprocesses to finish..." where N is |
530 | three times L<"--run-time">. In both cases, after waiting, the tool kills |
531 | all of its subprocesses. |
532 | |
533 | @@ -1833,8 +1885,8 @@ |
534 | |
535 | type: int; default: 300 |
536 | |
537 | -How long to sleep after collecting data. This prevents the tool from triggering |
538 | -continuously, which might be a problem if the collection process is intrusive. |
539 | +How long to sleep after L<"--collect">. This prevents the tool |
540 | +from triggering continuously, which might be a problem if the collection process is intrusive. |
541 | It also prevents filling up the disk or gathering too much data to analyze |
542 | reasonably. |
543 | |
544 | @@ -1842,14 +1894,16 @@ |
545 | |
546 | default: yes; negatable: yes |
547 | |
548 | -Watch the server and wait for the trigger to occur. You can negate this option |
549 | -to make the tool immediately gather any diagnostic data once and exit. This is |
550 | -useful if a problem is already happening, but pt-stalk is not running, so |
551 | -you only want to collect diagnostic data. |
552 | - |
553 | -If this option is negate, L<"--daemonize">, L<"--log">, L<"--pid">, and other |
554 | -stalking-related options have no effect; the tool simply collects diagnostic |
555 | -data and exits. Safeguard options, like L<"--disk-bytes-free"> and |
556 | +Watch the server and wait for the trigger to occur. Specify C<--no-stalk> |
557 | +to collect diagnostic data immediately, that is, without waiting for the |
558 | +trigger to occur. You probably also want to specify values for |
559 | +L<"--interval">, L<"--iterations">, and L<"--sleep">. For example, to |
560 | +immediately collect data for 1 minute then exit, specify: |
561 | + |
562 | + --no-stalk --run-time 60 --iterations 1 |
563 | + |
564 | +L<"--cycles">, L<"--daemonize">, L<"--log"> and L<"--pid"> have no effect |
565 | +with C<--no-stalk>. Safeguard options, like L<"--disk-bytes-free"> and |
566 | L<"--disk-pct-free">, are still respected. |
567 | |
568 | See also L<"--collect">. |
569 | @@ -1858,14 +1912,18 @@ |
570 | |
571 | type: int; default: 25 |
572 | |
573 | -The threshold at which the diagnostic trigger should fire. See L<"--function"> |
574 | -for details. |
575 | +The maximum acceptable value for L<"--variable">. L<"--collect"> is |
576 | +triggered when the value of L<"--variable"> is greater than L<"--threshold"> |
577 | +for L<"--cycles"> many times. Currently, there is no way to define a lower |
578 | +threshold to check for a L<"--variable"> value that is too low. |
579 | + |
580 | +See also L<"--function">. |
581 | |
582 | =item --variable |
583 | |
584 | type: string; default: Threads_running |
585 | |
586 | -The variable to compare against the threshold. See L<"--function"> for details. |
587 | +The variable to compare against L<"--threshold">. See also L<"--function">. |
588 | |
589 | =item --verbose |
590 | |
591 | @@ -1995,7 +2053,8 @@ |
592 | |
593 | =head1 AUTHORS |
594 | |
595 | -Baron Schwartz, Justin Swanhart, Fernando Ipar, and Daniel Nichter |
596 | +Baron Schwartz, Justin Swanhart, Fernando Ipar, Daniel Nichter, |
597 | +and Brian Fraser. |
598 | |
599 | =head1 ABOUT PERCONA TOOLKIT |
600 | |
601 | |
602 | === modified file 'lib/bash/collect.sh' |
603 | --- lib/bash/collect.sh 2012-11-12 14:26:01 +0000 |
604 | +++ lib/bash/collect.sh 2013-03-05 01:26:23 +0000 |
605 | @@ -22,7 +22,7 @@ |
606 | # collect collects system information. |
607 | |
608 | # XXX |
609 | -# THIS LIB REQUIRES log_warn_die.sh, safeguards.sh, and alt_cmds.sh! |
610 | +# THIS LIB REQUIRES log_warn_die, safeguards, alt_cmds, and subshell! |
611 | # XXX |
612 | |
613 | set -u |
614 | @@ -289,16 +289,8 @@ |
615 | # it may leave an empty file. But first wait another --run-time |
616 | # seconds for any slow process to finish: |
617 | # https://bugs.launchpad.net/percona-toolkit/+bug/1047701 |
618 | - local slept=0 |
619 | - while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do |
620 | - sleep 1 |
621 | - slept=$((slept + 1)) |
622 | - done |
623 | - |
624 | - for pid in $(jobs -p); do |
625 | - kill $pid >/dev/null 2>&1 |
626 | - done |
627 | - |
628 | + wait_for_subshells $OPT_RUN_TIME |
629 | + kill_all_subshells |
630 | for file in "$d/$p-"*; do |
631 | # If there's not at least 1 line that's not a TS, |
632 | # then the file is empty. |
633 | |
634 | === added file 'lib/bash/subshell.sh' |
635 | --- lib/bash/subshell.sh 1970-01-01 00:00:00 +0000 |
636 | +++ lib/bash/subshell.sh 2013-03-05 01:26:23 +0000 |
637 | @@ -0,0 +1,66 @@ |
638 | +# This program is copyright 2013 Percona Ireland Ltd. |
639 | +# Feedback and improvements are welcome. |
640 | +# |
641 | +# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED |
642 | +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF |
643 | +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
644 | +# |
645 | +# This program is free software; you can redistribute it and/or modify it under |
646 | +# the terms of the GNU General Public License as published by the Free Software |
647 | +# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar |
648 | +# systems, you can issue `man perlgpl' or `man perlartistic' to read these |
649 | +# licenses. |
650 | +# |
651 | +# You should have received a copy of the GNU General Public License along with |
652 | +# this program; if not, write to the Free Software Foundation, Inc., 59 Temple |
653 | +# Place, Suite 330, Boston, MA 02111-1307 USA. |
654 | +# ########################################################################### |
655 | +# subshell package |
656 | +# ########################################################################### |
657 | + |
658 | +# Package: subshell |
659 | + |
660 | +set -u |
661 | + |
662 | +wait_for_subshells() { |
663 | + local max_wait=$1 |
664 | + if [ "$(jobs)" ]; then |
665 | + log "Waiting up to $max_wait seconds for subprocesses to finish..." |
666 | + local slept=0 |
667 | + while [ -n "$(jobs)" ]; do |
668 | + local subprocess_still_running="" |
669 | + for pid in $(jobs -p); do |
670 | + if kill -0 $pid >/dev/null 2>&1; then |
671 | + subprocess_still_running=1 |
672 | + fi |
673 | + done |
674 | + if [ "$subprocess_still_running" ]; then |
675 | + sleep 1 |
676 | + slept=$((slept + 1)) |
677 | + [ $slept -ge $max_wait ] && break |
678 | + else |
679 | + break |
680 | + fi |
681 | + done |
682 | + fi |
683 | +} |
684 | + |
685 | +kill_all_subshells() { |
686 | + if [ "$(jobs)" ]; then |
687 | + for pid in $(jobs -p); do |
688 | + if kill -0 $pid >/dev/null 2>&1; then |
689 | + # This isn't a warning (we don't want exit status 1) because |
690 | + # the system may be running slowly so it's just "natural" that |
691 | + # a collector may get stuck or run really slowly. |
692 | + log "Killing subprocess $pid" |
693 | + kill $pid >/dev/null 2>&1 |
694 | + fi |
695 | + done |
696 | + else |
697 | + log "All subprocesses have finished" |
698 | + fi |
699 | +} |
700 | + |
701 | +# ########################################################################### |
702 | +# End subshell package |
703 | +# ########################################################################### |
704 | |
705 | === modified file 't/lib/bash/collect.sh' |
706 | --- t/lib/bash/collect.sh 2012-11-12 14:26:01 +0000 |
707 | +++ t/lib/bash/collect.sh 2013-03-05 01:26:23 +0000 |
708 | @@ -10,6 +10,7 @@ |
709 | mkdir "$PT_TMPDIR/collect" 2>/dev/null |
710 | |
711 | source "$LIB_DIR/log_warn_die.sh" |
712 | +source "$LIB_DIR/subshell.sh" |
713 | source "$LIB_DIR/parse_options.sh" |
714 | source "$LIB_DIR/safeguards.sh" |
715 | source "$LIB_DIR/alt_cmds.sh" |
716 | |
717 | === modified file 't/pt-stalk/pt-stalk.t' |
718 | --- t/pt-stalk/pt-stalk.t 2013-01-24 18:13:39 +0000 |
719 | +++ t/pt-stalk/pt-stalk.t 2013-03-05 01:26:23 +0000 |
720 | @@ -317,7 +317,11 @@ |
721 | |
722 | cleanup(); |
723 | |
724 | -$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1"); |
725 | +# As of 2.2, --no-stalk means just that: don't stalk, just collect, so |
726 | +# we have to specify --iterations=1 else the tool will continue to run, |
727 | +# whereas in 2.1 --no-stalk implied/forced "collect once and exit". |
728 | + |
729 | +$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file --iterations 1 -- --defaults-file=$cnf >$log_file 2>&1"); |
730 | |
731 | PerconaTest::wait_until(sub { !-f $pid_file }); |
732 |
Tool passes full test run.