Merge lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327 into lp:percona-toolkit/2.1

Proposed by Daniel Nichter
Status: Work in progress
Proposed branch: lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327
Merge into: lp:percona-toolkit/2.1
Diff against target: 285 lines (+92/-67)
5 files modified
bin/pt-table-checksum (+15/-8)
lib/Quoter.pm (+18/-13)
lib/Sandbox.pm (+3/-3)
lib/TableChunker.pm (+2/-27)
t/lib/Quoter.t (+54/-16)
To merge this branch: bzr merge lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327
Reviewer Review Type Date Requested Status
Percona Toolkit developers Pending
Review via email: mp+122159@code.launchpad.net
To post a comment you must log in.

Unmerged revisions

390. By Brian Fraser

Updated ptc

389. By Brian Fraser

Merged https://bugs.launchpad.net/percona-toolkit/+bug/932327 and updated to work on Perl 5.16 and possibly resolve bug 925781

388. By Brian Fraser

Experimental fix to TableChunker: Don't use the built-in latin1 char range, let mysql calculate it

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bin/pt-table-checksum'
2--- bin/pt-table-checksum 2012-08-29 23:00:34 +0000
3+++ bin/pt-table-checksum 2012-08-30 21:48:19 +0000
4@@ -1061,7 +1061,7 @@
5 next unless exists $versions->{$item};
6 if ( ref($versions->{$item}) eq 'HASH' ) {
7 my $mysql_versions = $versions->{$item};
8- for my $id ( keys %$mysql_versions ) {
9+ for my $id ( sort keys %$mysql_versions ) {
10 push @lines, join(';', $id, $item, $mysql_versions->{$id});
11 }
12 }
13@@ -3176,7 +3176,14 @@
14 die "Cannot serialize multiple values with undef/NULL"
15 if grep { !defined $_ } @args;
16
17- return join ',', map { quotemeta } @args;
18+ return join ',', map {
19+ (my $res = $_) =~ s/([^A-Za-z0-9_])/\\$1/g;
20+ if ( utf8::is_utf8($res) ) {
21+ $res = "UTF8:$res";
22+ utf8::encode($res);
23+ }
24+ $res
25+ } @args;
26 }
27
28 sub deserialize_list {
29@@ -3199,10 +3206,10 @@
30 my @unescaped_parts = map {
31 my $part = $_;
32
33- my $char_class = utf8::is_utf8($part) # If it's a UTF-8 string,
34- ? qr/(?=\p{ASCII})\W/ # We only care about non-word
35- : qr/(?=\p{ASCII})\W|[\x{80}-\x{FF}]/; # Otherwise,
36- $part =~ s/\\($char_class)/$1/g;
37+ utf8::decode($part) if $part =~ s/\AUTF8://;
38+
39+
40+ $part =~ s/\\([^A-Za-z0-9_])/$1/g;
41 $part;
42 } @escaped_parts;
43
44@@ -10732,8 +10739,8 @@
45 chunk int NOT NULL,
46 chunk_time float NULL,
47 chunk_index varchar(200) NULL,
48- lower_boundary text NULL,
49- upper_boundary text NULL,
50+ lower_boundary text CHARACTER SET binary NULL,
51+ upper_boundary text CHARACTER SET binary NULL,
52 this_crc char(40) NOT NULL,
53 this_cnt int NOT NULL,
54 master_crc char(40) NULL,
55
56=== modified file 'lib/Quoter.pm'
57--- lib/Quoter.pm 2012-08-22 19:19:43 +0000
58+++ lib/Quoter.pm 2012-08-30 21:48:19 +0000
59@@ -164,7 +164,19 @@
60 die "Cannot serialize multiple values with undef/NULL"
61 if grep { !defined $_ } @args;
62
63- return join ',', map { quotemeta } @args;
64+ return join ',', map {
65+ (my $res = $_) =~ s/([^A-Za-z0-9_])/\\$1/g;
66+ if ( utf8::is_utf8($res) ) {
67+ # If the string is marked as UTF-8, we also mark it here.
68+ # Since all colons have already been backslash-escaped, prepending
69+ # an unescaped "UTF8:" marker of our own here is unambiguous.
70+ $res = "UTF8:$res";
71+ # Turn the UTF-8 flag off, so that DBD::mysql doesn't try
72+ # anything besides storing the string.
73+ utf8::encode($res);
74+ }
75+ $res
76+ } @args;
77 }
78
79 sub deserialize_list {
80@@ -190,23 +202,16 @@
81 # Undo the quotemeta().
82 my @unescaped_parts = map {
83 my $part = $_;
84+
85+ # If it starts with UTF8:, remove the prefix and decode.
86+ utf8::decode($part) if $part =~ s/\AUTF8://;
87+
88 # Here be weirdness. Unfortunately quotemeta() is broken, and exposes
89 # the internal representation of scalars. Namely, the latin-1 range,
90 # \128-\377 (\p{Latin1} in newer Perls) is all escaped in downgraded
91 # strings, but left alone in UTF-8 strings. Thus, this.
92
93- # TODO: quotemeta() might change in 5.16 to mean
94- # qr/(?=\p{ASCII})\W|\p{Pattern_Syntax}/
95- # And also fix this whole weird behavior under
96- # use feature 'unicode_strings' -- If/once that's
97- # implemented, this will have to change.
98- my $char_class = utf8::is_utf8($part) # If it's a UTF-8 string,
99- ? qr/(?=\p{ASCII})\W/ # We only care about non-word
100- # characters in the ASCII range
101- : qr/(?=\p{ASCII})\W|[\x{80}-\x{FF}]/; # Otherwise,
102- # same as above, but also
103- # unescape the latin-1 range.
104- $part =~ s/\\($char_class)/$1/g;
105+ $part =~ s/\\([^A-Za-z0-9_])/$1/g;
106 $part;
107 } @escaped_parts;
108
109
110=== modified file 'lib/Sandbox.pm'
111--- lib/Sandbox.pm 2012-08-09 18:33:42 +0000
112+++ lib/Sandbox.pm 2012-08-30 21:48:19 +0000
113@@ -288,11 +288,11 @@
114 return;
115 }
116
117-# This returns an empty string if all servers and data are OK. If it returns
118-# anything but empty string, there is a problem, and the string indicates what
119-# the problem is.
120+# This returns true if all servers and data are OK, or if no dbh for the
121+# master can be obtained. Otherwise, there is a problem, and that's warn'd out.
122 sub ok {
123 my ($self) = @_;
124+ return 1 unless $self->get_dbh_for('master');
125 my @errors;
126 # First, wait for all slaves to be caught up to their masters.
127 $self->wait_for_slaves();
128
129=== modified file 'lib/TableChunker.pm'
130--- lib/TableChunker.pm 2012-08-24 22:50:34 +0000
131+++ lib/TableChunker.pm 2012-08-30 21:48:19 +0000
132@@ -553,38 +553,13 @@
133 my $base;
134 my @chars;
135 PTDEBUG && _d("Table charset:", $args{tbl_struct}->{charset});
136- if ( ($args{tbl_struct}->{charset} || "") eq "latin1" ) {
137- # These are the unique, sorted latin1 character codes according to
138- # MySQL. You'll notice that many are missing. That's because MySQL
139- # treats many characters as the same, for example "e" and "é".
140- my @sorted_latin1_chars = (
141- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
142- 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
143- 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
144- 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
145- 88, 89, 90, 91, 92, 93, 94, 95, 96, 123, 124, 125, 126, 161,
146- 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
147- 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
148- 190, 191, 215, 216, 222, 223, 247, 255);
149-
150- my ($first_char, $last_char);
151- for my $i ( 0..$#sorted_latin1_chars ) {
152- $first_char = $i and last if $sorted_latin1_chars[$i] >= $min_col_ord;
153- }
154- for my $i ( $first_char..$#sorted_latin1_chars ) {
155- $last_char = $i and last if $sorted_latin1_chars[$i] >= $max_col_ord;
156- };
157-
158- @chars = map { chr $_; } @sorted_latin1_chars[$first_char..$last_char];
159- $base = scalar @chars;
160- }
161- else {
162+ {
163 # If the table's charset isn't latin1, who knows what charset is being
164 # used, what characters it contains, and how those characters are sorted.
165 # So we create a character map and let MySQL tell us these things.
166
167 # Create a temp table with the same char col def as the original table.
168- my $tmp_tbl = '__maatkit_char_chunking_map';
169+ my $tmp_tbl = '__percona_char_chunking_map';
170 my $tmp_db_tbl = $q->quote($args{db}, $tmp_tbl);
171 $sql = "DROP TABLE IF EXISTS $tmp_db_tbl";
172 PTDEBUG && _d($dbh, $sql);
173
174=== modified file 't/lib/Quoter.t'
175--- t/lib/Quoter.t 2012-08-22 19:19:43 +0000
176+++ t/lib/Quoter.t 2012-08-30 21:48:19 +0000
177@@ -145,7 +145,10 @@
178 [ 'a\\\\\\,aa', 'c', ],
179 [ 'a\\\,a,a', 'c,d,e,d,', ],
180 [ "\\\,\x{e8},a", '!!!!__!*`,`\\', ], # Latin-1
181+ [ "\\\,\N{U+e8},a", '!!!!__!*`,`\\', ], # UTF-8
182 [ "\x{30cb}\\\,\x{e8},a", '!!!!__!*`,`\\', ], # UTF-8
183+ [ "\x{30cb}\\\,\x{e8},a", "\x{e9}", ], # UTF-8 on the left, Latin-1 on the right
184+ [ "\x{e9}", "\x{30cb}\\\,\x{e8},a", ], # Latin-1 on the left, UTF-8 on the right
185 [ ",,,,,,,,,,,,,,", ",", ],
186 [ "\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,,,,\\", ":(", ],
187 [ "asdfa", "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\,a", ],
188@@ -162,12 +165,13 @@
189 my $sb = new Sandbox(basedir => '/tmp', DSNParser => $dp);
190 my $dbh = $sb->get_dbh_for('master');
191 SKIP: {
192- skip 'Cannot connect to sandbox master', scalar @serialize_tests unless $dbh;
193+ skip 'Cannot connect to sandbox master', scalar @serialize_tests + 11 unless $dbh;
194
195 # Prevent "Wide character in print at Test/Builder.pm" warnings.
196 binmode Test::More->builder->$_(), ':encoding(UTF-8)'
197 for qw(output failure_output);
198
199+ $dbh->do('DROP DATABASE IF EXISTS serialize_test');
200 $dbh->do('CREATE DATABASE IF NOT EXISTS serialize_test');
201 $dbh->do('DROP TABLE IF EXISTS serialize_test.serialize');
202 $dbh->do('CREATE TABLE serialize_test.serialize (id INT, foo TEXT)');
203@@ -182,10 +186,6 @@
204 for my $test_index ( 0..$#serialize_tests ) {
205 my $ser = $q->serialize_list( @{$serialize_tests[$test_index]} );
206
207- # Bit of a hack, but we want to test both of Perl's internal encodings
208- # for correctness.
209- local $dbh->{'mysql_enable_utf8'} = 1 if utf8::is_utf8($ser);
210-
211 $sth->execute($test_index, $ser);
212 $selsth->execute($test_index);
213
214@@ -196,22 +196,60 @@
215 . "]";
216 $flat_string =~ s/\n/\\n/g;
217
218- # diag($test_index);
219- SKIP: {
220- skip "DBD::mysql version $DBD::mysql::VERSION has utf8 bugs. "
221- . "See https://bugs.launchpad.net/percona-toolkit/+bug/932327",
222- 1 if $DBD::mysql::VERSION lt '4' && $test_index == 9;
223- is_deeply(
224- [ $q->deserialize_list($selsth->fetchrow_array()) ],
225- $serialize_tests[$test_index],
226- "Serialize $flat_string"
227- );
228+ my $res = [ $q->deserialize_list($selsth->fetchrow_array()) ];
229+ is_deeply(
230+ $res,
231+ $serialize_tests[$test_index],
232+ "Serialize $flat_string"
233+ );
234+
235+ if ( $flat_string =~ /\P{ASCII}/ && $flat_string =~ /[\x{80}-\x{FF}]/ ) {
236+ for my $i (0..$#{$res}) {
237+ my $form = utf8::is_utf8($serialize_tests[$test_index][$i])
238+ ? 'UTF-8'
239+ : 'Latin1';
240+ is(
241+ utf8::is_utf8($res->[$i]),
242+ utf8::is_utf8($serialize_tests[$test_index][$i]),
243+ "$res->[$i] and $serialize_tests[$test_index][$i] remain $form after deserialziation"
244+ );
245+ }
246 }
247 }
248-
249+
250 $sth->finish();
251 $selsth->finish();
252
253+ {
254+ $dbh->do('SET NAMES utf8');
255+ $dbh->do('CREATE TABLE serialize_test.serialize2 (id INT, foo TEXT CHARACTER SET binary)');
256+
257+ my $sth2 = $dbh->prepare(
258+ "INSERT INTO serialize_test.serialize2 (id, foo) VALUES (?, ?)"
259+ );
260+
261+ my $selsth2 = $dbh->prepare(
262+ "SELECT foo FROM serialize_test.serialize2 WHERE id=? LIMIT 1"
263+ );
264+
265+ my @list = ("\N{U+10000}", "\N{U+10001}", "\N{U+10002}");
266+ my $ser = $q->serialize_list( @list );
267+
268+ local $dbh->{'mysql_enable_utf8'} = 1;
269+
270+ $sth2->execute(1, $ser);
271+ $selsth2->execute(1);
272+ is_deeply(
273+ [ $q->deserialize_list($selsth2->fetchrow_array()) ],
274+ \@list,
275+ "Serialized astral plane characters correctly"
276+ );
277+
278+ $sth2->finish();
279+ $selsth2->finish();
280+
281+ }
282+
283 $dbh->do("DROP DATABASE serialize_test");
284 $sb->wipe_clean($dbh);
285 $dbh->disconnect();

Subscribers

People subscribed via source and target branches

to status/vote changes: