Merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665 into lp:percona-toolkit/2.2

Proposed by Daniel Nichter
Status: Merged
Approved by: Daniel Nichter
Approved revision: 550
Merged at revision: 546
Proposed branch: lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665
Merge into: lp:percona-toolkit/2.2
Diff against target: 731 lines (+283/-161)
5 files modified
bin/pt-stalk (+208/-149)
lib/bash/collect.sh (+3/-11)
lib/bash/subshell.sh (+66/-0)
t/lib/bash/collect.sh (+1/-0)
t/pt-stalk/pt-stalk.t (+5/-1)
To merge this branch: bzr merge lp:~percona-toolkit-dev/percona-toolkit/no-stalk-bug-1125665
Reviewer Review Type Date Requested Status
Daniel Nichter Approve
Review via email: mp+151664@code.launchpad.net
To post a comment you must log in.
550. By Daniel Nichter

Minor doc correction.

Revision history for this message
Daniel Nichter (daniel-nichter) wrote :

Tool passes full test run.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bin/pt-stalk'
2--- bin/pt-stalk 2013-02-27 00:01:17 +0000
3+++ bin/pt-stalk 2013-03-05 01:26:23 +0000
4@@ -55,6 +55,58 @@
5 # ###########################################################################
6
7 # ###########################################################################
8+# subshell package
9+# This package is a copy without comments from the original. The original
10+# with comments and its test file can be found in the Bazaar repository at,
11+# lib/bash/subshell.sh
12+# t/lib/bash/subshell.sh
13+# See https://launchpad.net/percona-toolkit for more information.
14+# ###########################################################################
15+
16+
17+set -u
18+
19+wait_for_subshells() {
20+ local max_wait=$1
21+ if [ "$(jobs)" ]; then
22+ log "Waiting up to $max_wait seconds for subprocesses to finish..."
23+ local slept=0
24+ while [ -n "$(jobs)" ]; do
25+ local subprocess_still_running=""
26+ for pid in $(jobs -p); do
27+ if kill -0 $pid >/dev/null 2>&1; then
28+ subprocess_still_running=1
29+ fi
30+ done
31+ if [ "$subprocess_still_running" ]; then
32+ sleep 1
33+ slept=$((slept + 1))
34+ [ $slept -ge $max_wait ] && break
35+ else
36+ break
37+ fi
38+ done
39+ fi
40+}
41+
42+kill_all_subshells() {
43+ if [ "$(jobs)" ]; then
44+ for pid in $(jobs -p); do
45+ if kill -0 $pid >/dev/null 2>&1; then
46+ log "Killing subprocess $pid"
47+ kill $pid >/dev/null 2>&1
48+ fi
49+ done
50+ else
51+ log "All subprocesses have finished"
52+ fi
53+}
54+
55+# ###########################################################################
56+# End subshell package
57+# ###########################################################################
58+
59+# ###########################################################################
60 # parse_options package
61 # This package is a copy without comments from the original. The original
62 # with comments and its test file can be found in the Bazaar repository at,
63@@ -871,16 +923,8 @@
64
65 hostname > "$d/$p-hostname"
66
67- local slept=0
68- while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
69- sleep 1
70- slept=$((slept + 1))
71- done
72-
73- for pid in $(jobs -p); do
74- kill $pid >/dev/null 2>&1
75- done
76-
77+ wait_for_subshells $OPT_RUN_TIME
78+ kill_all_subshells
79 for file in "$d/$p-"*; do
80 if [ -z "$(grep -v '^TS ' --max-count 1 "$file")" ]; then
81 log "Removing empty file $file";
82@@ -1079,7 +1123,7 @@
83 local seconds="$1"
84 local msg="${2:-""}"
85 if oktorun; then
86- [ "$msg" ] && info "$msg"
87+ [ "$msg" ] && log "$msg"
88 sleep $seconds
89 fi
90 }
91@@ -1244,27 +1288,8 @@
92 # we may get in sync with the collector and kill it a microsecond
93 # before it kills itself, thus 3 * run-time.
94 # https://bugs.launchpad.net/percona-toolkit/+bug/1070434
95- if [ "$(jobs)" ]; then
96- local sleep_time=$((OPT_RUN_TIME * 3))
97- log "Waiting up to $sleep_time seconds for collectors to finish..."
98- local slept=0
99- while [ -n "$(jobs)" -a $slept -lt $sleep_time ]; do
100- sleep 1
101- slept=$((slept + 1))
102- done
103-
104- if [ "$(jobs)" ]; then
105- for pid in $(jobs -p); do
106- # This isn't an warning (we don't want exit status 1) because
107- # the system may be running slowly so it's just "natural" that
108- # a collector may get stuck or run really slowly.
109- log "Killing collector $pid"
110- kill $pid >/dev/null 2>&1
111- done
112- else
113- log "All collectors have finished"
114- fi
115- fi
116+ wait_for_subshells $((OPT_RUN_TIME * 3))
117+ kill_all_subshells
118 }
119
120 # ###########################################################################
121@@ -1333,10 +1358,8 @@
122
123 if [ -z "$OPT_STALK" -a "$OPT_COLLECT" ]; then
124 # Not stalking; do immediate collect once.
125- OPT_ITERATIONS=1
126 OPT_CYCLES=0
127- OPT_SLEEP=0
128- OPT_INTERVAL=0
129+ echo "[iter=$OPT_ITERATIONS] [cycle=$OPT_CYCLES] [sleep=$OPT_SLEEP] [interval=$OPT_INTERVAL]"
130 fi
131
132 usage_or_errors "$0"
133@@ -1412,17 +1435,17 @@
134
135 =head1 NAME
136
137-pt-stalk - Gather forensic data about MySQL when a problem occurs.
138+pt-stalk - Collect forensic data about MySQL when problems occur.
139
140 =head1 SYNOPSIS
141
142 Usage: pt-stalk [OPTIONS] [-- MYSQL OPTIONS]
143
144-pt-stalk watches for a trigger condition to become true, and then collects data
145-to help in diagnosing problems. It is designed to run as a daemon with root
146+pt-stalk waits for a trigger condition to occur, then collects data
147+to help diagnose problems. The tool is designed to run as a daemon with root
148 privileges, so that you can diagnose intermittent problems that you cannot
149-observe directly. You can also use it to execute a custom command, or to gather
150-the data on demand without waiting for the trigger to happen.
151+observe directly. You can also use it to execute a custom command, or to
152+collect data on demand without waiting for the trigger to occur.
153
154 =head1 RISKS
155
156@@ -1453,16 +1476,20 @@
157 problems when you can't observe them? That's why pt-stalk exists. In addition to
158 using it when there's a known problem on your servers, it is a good idea to run
159 pt-stalk all the time, even when you think nothing is wrong. You will
160-appreciate the data it gathers when a problem occurs, because problems such as
161-MySQL lockups or spikes of activity typically leave no evidence to use in root
162+appreciate the data it collects when a problem occurs, because problems such as
163+MySQL lockups or spikes in activity typically leave no evidence to use in root
164 cause analysis.
165
166-This tool does two things: it watches a server (typically MySQL) for a trigger
167-to occur, and it gathers diagnostic data. To use it effectively, you need to
168-define a good trigger condition. A good trigger is sensitive enough to fire
169-reliably when a problem occurs, so that you don't miss a chance to solve
170-problems. On the other hand, a good trigger isn't prone to false positives, so
171-you don't gather information when the server is functioning normally.
172+pt-stalk does two things: it watches a MySQL server and waits for a trigger
173+condition to occur, and it collects diagnostic data when that trigger occurs.
174+To avoid false-positives caused by short-lived problems, the trigger condition
175+must be true at least L<"--cycles"> times before a L<"--collect"> is triggered.
176+
177+To use pt-stalk effectively, you need to define a good trigger. A good trigger
178+is sensitive enough to fire reliably when a problem occurs, so that you don't
179+miss a chance to solve problems. On the other hand, a good trigger isn't
180+prone to false positives, so you don't gather information when the server
181+is functioning normally.
182
183 The most reliable triggers for MySQL tend to be the number of connections to the
184 server, and the number of queries running concurrently. These are available in
185@@ -1472,55 +1499,76 @@
186 appropriate trigger condition for the tool. Choose carefully, because the
187 quality of your results will depend on the trigger you choose.
188
189-You can define the trigger with the L<"--function">, L<"--variable">, and
190-L<"--threshold"> options, among others. Please read the documentation for
191---function to learn how to do this.
192-
193-The pt-stalk tool, by default, simply watches MySQL repeatedly until the trigger
194-becomes true. It then gathers diagnostics for a while, and sleeps afterwards for
195-some time to prevent repeatedly gathering data if the condition remains true.
196-In crude pseudocode, omitting some subtleties,
197-
198- while true; do
199- if --variable from --function is greater than --threshold; then
200- observations++
201- if observations is greater than --cycles; then
202- capture diagnostics for --run-time seconds
203- exit if --iterations is exceeded
204- sleep for --sleep seconds
205- done
206- done
207- clean up data that's older than --retention-time
208- sleep for --interval seconds
209- done
210+You define the trigger with the L<"--function">, L<"--variable">,
211+L<"--threshold">, and L<"--cycles"> options. The default values
212+for these options define a reasonable trigger, but you should adjust
213+or change them to suit your particular system and needs.
214+
215+By default, pt-stalk watches MySQL forever until the trigger occurs,
216+then it collects diagnostic data for a while, and sleeps afterwards to avoid
217+repeatedly collecting data if the trigger remains true. The general order of
218+operations is:
219+
220+ while true; do
221+ if --variable from --function > --threshold; then
222+ cycles_true++
223+ if cycles_true >= --cycles; then
224+ --notify-by-email
225+ if --collect; then
226+ if --disk-bytes-free and --disk-pct-free ok; then
227+ (--collect for --run-time seconds) &
228+ fi
229+ rm files in --dest older than --retention-time
230+ fi
231+ iter++
232+ cycles_true=0
233+ fi
234+ if iter < --iterations; then
235+ sleep --sleep seconds
236+ else
237+ break
238+ fi
239+ else
240+ if iter < --iterations; then
241+ sleep --interval seconds
242+ else
243+ break
244+ fi
245+ fi
246+ done
247+ rm old --dest files older than --retention-time
248+ if --collect processes are still running; then
249+ wait up to --run-time * 3 seconds
250+ kill any remaining --collect processes
251+ fi
252
253 The diagnostic data is written to files whose names begin with a timestamp, so
254 you can distinguish samples from each other in case the tool collects data
255-multiple times. The pt-sift tool is designed to help you browse and analyze the
256-resulting samples of data.
257+multiple times. The pt-sift tool is designed to help you browse and analyze
258+the resulting data samples.
259
260 Although this sounds simple enough, in practice there are a number of
261 subtleties, such as detecting when the disk is beginning to fill up so that the
262 tool doesn't cause the server to run out of disk space. This tool handles these
263 types of potential problems, so it's a good idea to use this tool instead of
264 writing something from scratch and possibly experiencing some of the hazards
265-this tool is designed to prevent.
266+this tool is designed to avoid.
267
268 =head1 CONFIGURING
269
270-You can use standard Percona Toolkit configuration files to set commandline
271+You can use standard Percona Toolkit configuration files to set command line
272 options.
273
274 You will probably want to run the tool as a daemon and customize at least the
275-diagnostic threshold. Here's a sample configuration file for triggering when
276+L<"--threshold">. Here's a sample configuration file for triggering when
277 there are more than 20 queries running at once:
278
279 daemonize
280 threshold=20
281
282-If you're not running the tool as it's designed (as a root user, daemonized)
283-then you'll need to set several options, such as L<"--dest">, to locations that
284-are writable by non-root users.
285+If you don't run the tool as root, then you will need to specify several options,
286+such as L<"--pid">, L<"--log">, and L<"--dest">, else the tool will probably
287+fail to start.
288
289 =head1 OPTIONS
290
291@@ -1530,8 +1578,8 @@
292
293 default: yes; negatable: yes
294
295-Collect system information. You can negate this option to make the tool watch
296-the system but not actually gather any diagnostic data.
297+Collect diagnostic data when the trigger occurs. Specify C<--no-collect>
298+to make the tool watch the system but not collect data.
299
300 See also L<"--stalk">.
301
302@@ -1581,9 +1629,8 @@
303
304 type: int; default: 5
305
306-The number of times the trigger condition must be true before collecting data.
307-This helps prevent false positives, and makes the trigger condition less likely
308-to fire when the problem recovers quickly.
309+How many times L<"--variable"> must be greater than L<"--threshold"> before triggering L<"--collect">. This helps prevent false positives, and makes
310+the trigger condition less likely to fire when the problem recovers quickly.
311
312 =item --daemonize
313
314@@ -1594,14 +1641,15 @@
315
316 type: string; default: /var/lib/pt-stalk
317
318-Where to store the diagnostic data. Each time the tool collects data, it writes
319-to a new set of files, which are named with the current system timestamp.
320+Where to save diagnostic data from L<"--collect">. Each time the tool
321+collects data, it writes to a new set of files, which are named with the
322+current system timestamp.
323
324 =item --disk-bytes-free
325
326 type: size; default: 100M
327
328-Don't collect data if the disk has less than this much free space.
329+Do not L<"--collect"> if the disk has less than this much free space.
330 This prevents the tool from filling up the disk with diagnostic data.
331
332 If the L<"--dest"> directory contains a previously captured sample of data,
333@@ -1618,7 +1666,7 @@
334
335 type: int; default: 5
336
337-Don't collect data if the disk has less than this percent free space.
338+Do not L<"--collect"> if the disk has less than this percent free space.
339 This prevents the tool from filling up the disk with diagnostic data.
340
341 This option works similarly to L<"--disk-bytes-free"> but specifies a
342@@ -1630,57 +1678,57 @@
343
344 type: string; default: status
345
346-Specifies what to watch for a diagnostic trigger. The default value watches
347-SHOW GLOBAL STATUS, but you can also watch SHOW PROCESSLIST or supply a plugin
348-file with your own custom code. This function supplies the value of
349+What to watch for the trigger. The default value watches
350+C<SHOW GLOBAL STATUS>, but you can also watch C<SHOW PROCESSLIST> and specify
351+a file with your own custom code. This function supplies the value of
352 L<"--variable">, which is then compared against L<"--threshold"> to see if the
353-trigger condition is met. Additional options may be required as well; see
354-below. Possible values:
355+trigger condition is met. Additional options may be required as
356+well; see below. Possible values are:
357
358 =over
359
360 =item * status
361
362-This value specifies that the source of data for the diagnostic trigger is SHOW
363-GLOBAL STATUS. The value of L<"--variable"> then defines which status counter
364-is the trigger.
365+Watch C<SHOW GLOBAL STATUS> for the trigger. The value of
366+L<"--variable"> then defines which status counter is the trigger.
367
368 =item * processlist
369
370-This value specifies that the data for the diagnostic trigger comes from SHOW
371-FULL PROCESSLIST. The trigger value is the count of processes whose
372-L<"--variable"> column matches the L<"--match"> option. For example, to trigger
373-when more than 10 processes are in the "statistics" state, use the following
374-options:
375+Watch C<SHOW FULL PROCESSLIST> for the trigger. The trigger
376+value is the count of processes whose L<"--variable"> column matches the
377+L<"--match"> option. For example, to trigger L<"--collect"> when more than
378+10 processes are in the "statistics" state, specify:
379
380- --function processlist --variable State \
381- --match statistics --threshold 10
382+ --function processlist \
383+ --variable State \
384+ --match statistics \
385+ --threshold 10
386
387 =back
388
389-In addition, you can specify a file that contains your custom trigger function,
390-written in Unix shell script. This can be a wrapper that executes anything you
391-wish. If the argument to --function is a file, then it takes precedence over
392-builtin functions, so if there is a file in the working directory named "status"
393-or "processlist" then the tool will use that file as a plugin, even though those
394-are otherwise recognized as reserved words for this option.
395+In addition, you can specify a file that contains your custom trigger
396+function, written in Unix shell script. This can be a wrapper that executes
397+anything you wish. If the argument to L<"--function"> is a file, then it
398+takes precedence over built-in functions, so if there is a file in the working
399+directory named "status" or "processlist" then the tool will use that file
400+even though those are valid built-in values.
401
402-The plugin file works by providing a function called C<trg_plugin>, and the tool
403-simply sources the file and executes the function. For example, the function
404-might look like the following:
405+The file works by providing a function called C<trg_plugin>, and the tool
406+simply sources the file and executes the function. For example, the file
407+might contain:
408
409 trg_plugin() {
410 mysql $EXT_ARGV -e "SHOW ENGINE INNODB STATUS" \
411 | grep -c "has waited at"
412 }
413
414-This snippet will count the number of mutex waits inside of InnoDB. It
415+This snippet will count the number of mutex waits inside InnoDB. It
416 illustrates the general principle: the function must output a number, which is
417-then compared to the threshold as usual. The $EXT_ARGV variable contains the
418-MySQL options mentioned in the L<"SYNOPSIS"> above.
419+then compared to L<"--threshold"> as usual. The C<$EXT_ARGV> variable
420+contains the MySQL options mentioned in the L<"SYNOPSIS"> above.
421
422-The plugin should not alter the tool's existing global variables. Prefix any
423-plugin-specific global variables with "PLUGIN_" or make them local.
424+The file should not alter the tool's existing global variables. Prefix any
425+file-specific global variables with "PLUGIN_" or make them local.
426
427 =item --help
428
429@@ -1690,15 +1738,17 @@
430
431 type: int; default: 1
432
433-Interval between checks for the diagnostic trigger.
434+How often to check if the trigger is true, in seconds.
435
436 =item --iterations
437
438 type: int
439
440-Exit after collecting diagnostics this many times. By default, the tool
441-will continue to watch the server forever, but this is useful for scenarios
442-where you want to capture once and then exit, for example.
443+How many times to L<"--collect"> diagnostic data. By default, the tool
444+runs forever and collects data every time the trigger occurs.
445+Specify L<"--iterations"> to collect data a limited number of times.
446+This option is also useful with C<--no-stalk> to collect data once and
447+exit, for example.
448
449 =item --log
450
451@@ -1710,14 +1760,14 @@
452
453 type: string
454
455-The pattern to use when watching SHOW PROCESSLIST. See the documentation for
456-L<"--function"> for details.
457+The pattern to use when watching SHOW PROCESSLIST. See L<"--function">
458+for details.
459
460 =item --notify-by-email
461
462 type: string
463
464-Send mail to this list of addresses when data is collected.
465+Send an email to these addresses for every L<"--collect">.
466
467 =item --pid
468
469@@ -1746,8 +1796,8 @@
470
471 =item before_collect
472
473-Called when the stalk condition is triggered, before running a collector
474-process as a backgrounded subshell.
475+Called when the trigger occurs, before running a L<"--collect">
476+subprocess in the background.
477
478 =item after_collect
479
480@@ -1771,10 +1821,10 @@
481
482 =back
483
484-For example, a very simple plugin that touches a file when a collector
485-process is triggered:
486+For example, a very simple plugin that touches a file when L<"--collect">
487+is triggered:
488
489- before_colllect() {
490+ before_collect() {
491 touch /tmp/foo
492 }
493
494@@ -1797,9 +1847,9 @@
495
496 type: string
497
498-The filename prefix for diagnostic samples. By default, samples have a timestamp
499-prefix based on the current local time, such as 2011_12_06_14_02_02, which is
500-December 6, 2011 at 14:02:02.
501+The filename prefix for diagnostic samples. By default, all files created
502+by the same L<"--collect"> instance have a timestamp prefix based on the current
503+local time, like C<2011_12_06_14_02_02>, which is December 6, 2011 at 14:02:02.
504
505 =item --retention-time
506
507@@ -1812,10 +1862,12 @@
508
509 type: int; default: 30
510
511-How long the tool will collect data when it triggers. This should not be longer
512-than L<"--sleep">. It is usually not necessary to change this; if the default 30
513-seconds hasn't gathered enough diagnostic data, running longer is not likely to
514-do so. In fact, in many cases a shorter collection period is appropriate.
515+How long to L<"--collect"> diagnostic data when the trigger occurs.
516+The value is in seconds and should not be longer than L<"--sleep">. It is
517+usually not necessary to change this; if the default 30 seconds doesn't
518+collect enough data, running longer is not likely to help because the system
519+or MySQL server is probably too busy to respond. In fact, in many cases a
520+shorter collection period is appropriate.
521
522 This value is used two other times. After collecting, the collect subprocess
523 will wait another L<"--run-time"> seconds for its commands to finish. Some
524@@ -1825,7 +1877,7 @@
525 data. The value is potentially used again just before the tool exits to wait
526 again for any collect subprocesses to finish. In most cases this won't
527 happen because of the aforementioned extra wait. If it happens, the tool
528-will log "Waiting up to N seconds for collectors to finish..." where N is
529+will log "Waiting up to N seconds for subprocesses to finish..." where N is
530 three times L<"--run-time">. In both cases, after waiting, the tool kills
531 all of its subprocesses.
532
533@@ -1833,8 +1885,8 @@
534
535 type: int; default: 300
536
537-How long to sleep after collecting data. This prevents the tool from triggering
538-continuously, which might be a problem if the collection process is intrusive.
539+How long to sleep after L<"--collect">. This prevents the tool
540+from triggering continuously, which might be a problem if the collection process is intrusive.
541 It also prevents filling up the disk or gathering too much data to analyze
542 reasonably.
543
544@@ -1842,14 +1894,16 @@
545
546 default: yes; negatable: yes
547
548-Watch the server and wait for the trigger to occur. You can negate this option
549-to make the tool immediately gather any diagnostic data once and exit. This is
550-useful if a problem is already happening, but pt-stalk is not running, so
551-you only want to collect diagnostic data.
552-
553-If this option is negate, L<"--daemonize">, L<"--log">, L<"--pid">, and other
554-stalking-related options have no effect; the tool simply collects diagnostic
555-data and exits. Safeguard options, like L<"--disk-bytes-free"> and
556+Watch the server and wait for the trigger to occur. Specify C<--no-stalk>
557+to collect diagnostic data immediately, that is, without waiting for the
558+trigger to occur. You probably also want to specify values for
559+L<"--interval">, L<"--iterations">, and L<"--sleep">. For example, to
560+immediately collect data for 1 minute then exit, specify:
561+
562+ --no-stalk --run-time 60 --iterations 1
563+
564+L<"--cycles">, L<"--daemonize">, L<"--log"> and L<"--pid"> have no effect
565+with C<--no-stalk>. Safeguard options, like L<"--disk-bytes-free"> and
566 L<"--disk-pct-free">, are still respected.
567
568 See also L<"--collect">.
569@@ -1858,14 +1912,18 @@
570
571 type: int; default: 25
572
573-The threshold at which the diagnostic trigger should fire. See L<"--function">
574-for details.
575+The maximum acceptable value for L<"--variable">. L<"--collect"> is
576+triggered when the value of L<"--variable"> is greater than L<"--threshold">
577+for L<"--cycles"> many times. Currently, there is no way to define a lower
578+threshold to check for a L<"--variable"> value that is too low.
579+
580+See also L<"--function">.
581
582 =item --variable
583
584 type: string; default: Threads_running
585
586-The variable to compare against the threshold. See L<"--function"> for details.
587+The variable to compare against L<"--threshold">. See also L<"--function">.
588
589 =item --verbose
590
591@@ -1995,7 +2053,8 @@
592
593 =head1 AUTHORS
594
595-Baron Schwartz, Justin Swanhart, Fernando Ipar, and Daniel Nichter
596+Baron Schwartz, Justin Swanhart, Fernando Ipar, Daniel Nichter,
597+and Brian Fraser.
598
599 =head1 ABOUT PERCONA TOOLKIT
600
601
602=== modified file 'lib/bash/collect.sh'
603--- lib/bash/collect.sh 2012-11-12 14:26:01 +0000
604+++ lib/bash/collect.sh 2013-03-05 01:26:23 +0000
605@@ -22,7 +22,7 @@
606 # collect collects system information.
607
608 # XXX
609-# THIS LIB REQUIRES log_warn_die.sh, safeguards.sh, and alt_cmds.sh!
610+# THIS LIB REQUIRES log_warn_die, safeguards, alt_cmds, and subshell!
611 # XXX
612
613 set -u
614@@ -289,16 +289,8 @@
615 # it may leave an empty file. But first wait another --run-time
616 # seconds for any slow process to finish:
617 # https://bugs.launchpad.net/percona-toolkit/+bug/1047701
618- local slept=0
619- while [ -n "$(jobs)" -a $slept -lt $OPT_RUN_TIME ]; do
620- sleep 1
621- slept=$((slept + 1))
622- done
623-
624- for pid in $(jobs -p); do
625- kill $pid >/dev/null 2>&1
626- done
627-
628+ wait_for_subshells $OPT_RUN_TIME
629+ kill_all_subshells
630 for file in "$d/$p-"*; do
631 # If there's not at least 1 line that's not a TS,
632 # then the file is empty.
633
634=== added file 'lib/bash/subshell.sh'
635--- lib/bash/subshell.sh 1970-01-01 00:00:00 +0000
636+++ lib/bash/subshell.sh 2013-03-05 01:26:23 +0000
637@@ -0,0 +1,66 @@
638+# This program is copyright 2013 Percona Ireland Ltd.
639+# Feedback and improvements are welcome.
640+#
641+# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
642+# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
643+# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
644+#
645+# This program is free software; you can redistribute it and/or modify it under
646+# the terms of the GNU General Public License as published by the Free Software
647+# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
648+# systems, you can issue `man perlgpl' or `man perlartistic' to read these
649+# licenses.
650+#
651+# You should have received a copy of the GNU General Public License along with
652+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
653+# Place, Suite 330, Boston, MA 02111-1307 USA.
654+# ###########################################################################
655+# subshell package
656+# ###########################################################################
657+
658+# Package: subshell
659+
660+set -u
661+
662+wait_for_subshells() {
663+ local max_wait=$1
664+ if [ "$(jobs)" ]; then
665+ log "Waiting up to $max_wait seconds for subprocesses to finish..."
666+ local slept=0
667+ while [ -n "$(jobs)" ]; do
668+ local subprocess_still_running=""
669+ for pid in $(jobs -p); do
670+ if kill -0 $pid >/dev/null 2>&1; then
671+ subprocess_still_running=1
672+ fi
673+ done
674+ if [ "$subprocess_still_running" ]; then
675+ sleep 1
676+ slept=$((slept + 1))
677+ [ $slept -ge $max_wait ] && break
678+ else
679+ break
680+ fi
681+ done
682+ fi
683+}
684+
685+kill_all_subshells() {
686+ if [ "$(jobs)" ]; then
687+ for pid in $(jobs -p); do
688+ if kill -0 $pid >/dev/null 2>&1; then
689+ # This isn't a warning (we don't want exit status 1) because
690+ # the system may be running slowly so it's just "natural" that
691+ # a collector may get stuck or run really slowly.
692+ log "Killing subprocess $pid"
693+ kill $pid >/dev/null 2>&1
694+ fi
695+ done
696+ else
697+ log "All subprocesses have finished"
698+ fi
699+}
700+
701+# ###########################################################################
702+# End subshell package
703+# ###########################################################################
704
705=== modified file 't/lib/bash/collect.sh'
706--- t/lib/bash/collect.sh 2012-11-12 14:26:01 +0000
707+++ t/lib/bash/collect.sh 2013-03-05 01:26:23 +0000
708@@ -10,6 +10,7 @@
709 mkdir "$PT_TMPDIR/collect" 2>/dev/null
710
711 source "$LIB_DIR/log_warn_die.sh"
712+source "$LIB_DIR/subshell.sh"
713 source "$LIB_DIR/parse_options.sh"
714 source "$LIB_DIR/safeguards.sh"
715 source "$LIB_DIR/alt_cmds.sh"
716
717=== modified file 't/pt-stalk/pt-stalk.t'
718--- t/pt-stalk/pt-stalk.t 2013-01-24 18:13:39 +0000
719+++ t/pt-stalk/pt-stalk.t 2013-03-05 01:26:23 +0000
720@@ -317,7 +317,11 @@
721
722 cleanup();
723
724-$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file -- --defaults-file=$cnf >$log_file 2>&1");
725+# As of 2.2, --no-stalk means just that: don't stalk, just collect, so
726+# we have to specify --iterations=1 else the tool will continue to run,
727+# whereas in 2.1 --no-stalk implied/forced "collect once and exit".
728+
729+$retval = system("$trunk/bin/pt-stalk --no-stalk --run-time 2 --dest $dest --prefix nostalk --pid $pid_file --iterations 1 -- --defaults-file=$cnf >$log_file 2>&1");
730
731 PerconaTest::wait_until(sub { !-f $pid_file });
732

Subscribers

People subscribed via source and target branches