Percona Toolkit moved to https://jira.percona.com/projects/PT

Merge lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327 into lp:percona-toolkit/2.1

possible-fix-925781-932327
Merge into 2.1

Proposed by Daniel Nichter on 2012-08-30

Status:

Work in progress

Proposed branch:

lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327

Merge into:

lp:percona-toolkit/2.1

Diff against target:

285 lines (+92/-67)

5 files modified

bin/pt-table-checksum (+15/-8)
lib/Quoter.pm (+18/-13)
lib/Sandbox.pm (+3/-3)
lib/TableChunker.pm (+2/-27)
t/lib/Quoter.t (+54/-16)

To merge this branch:

bzr merge lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327

Related bugs:

Bug #925781: pt-table-checksum checksum error when default-character-set = utf8	High	Fix Released
Bug #932327: Quoter (de)serialize UTF8 data fails on CentOS 5.6	Medium	Invalid

Link a bug report

Reviewer	Review Type	Date Requested	Status
Percona Toolkit developers		2012-08-30	Pending
Review via email: mp+122159@code.launchpad.net

Unmerged revisions

390. By Brian Fraser on 2012-08-30: Updated ptc
389. By Brian Fraser on 2012-08-30: Merged https://bugs.launchpad.net/percona-toolkit/+bug/932327 and updated to work on Perl 5.16 and possibly resolve bug 925781
388. By Brian Fraser on 2012-08-30: Experimental fix to TableChunker: Don't use the built-in latin1 char range, let mysql calculate it

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Subscribers

People subscribed via source and target branches

to all changes:

Jonathan Quimbly

Percona Toolkit developers

to status/vote changes:

Heather Sullivan

Percona Toolkit moved to https://jira.percona.com/projects/PT

Merge lp:~percona-toolkit-dev/percona-toolkit/possible-fix-925781-932327 into lp:percona-toolkit/2.1

Commit message

Description of the change

Unmerged revisions

Preview Diff

Subscribers

 === modified file 'bin/pt-table-checksum'
 --- bin/pt-table-checksum	2012-08-29 23:00:34 +0000
 +++ bin/pt-table-checksum	2012-08-30 21:48:19 +0000
@@ -1061,7 +1061,7 @@
        next unless exists $versions->{$item};
        if ( ref($versions->{$item}) eq 'HASH' ) {
           my $mysql_versions = $versions->{$item};
--         for my $id ( keys %$mysql_versions ) {
++         for my $id ( sort keys %$mysql_versions ) {
              push @lines, join(';', $id, $item, $mysql_versions->{$id});
+          }
+       }
@@ -3176,7 +3176,14 @@
     die "Cannot serialize multiple values with undef/NULL"
        if grep { !defined $_ } @args;
--   return join ',', map { quotemeta } @args;
++   return join ',', map {
++         (my $res = $_) =~ s/([^A-Za-z0-9_])/\\$1/g;
++         if ( utf8::is_utf8($res) ) {
++            $res = "UTF8:$res";
++            utf8::encode($res);
++         }
++         $res
++      } @args;
+ }
  sub deserialize_list {
@@ -3199,10 +3206,10 @@
     my @unescaped_parts = map {
        my $part = $_;
--      my $char_class = utf8::is_utf8($part)  # If it's a UTF-8 string,
--                     ? qr/(?=\p{ASCII})\W/   # We only care about non-word
--                     : qr/(?=\p{ASCII})\W|[\x{80}-\x{FF}]/; # Otherwise,
--      $part =~ s/\\($char_class)/$1/g;
++      utf8::decode($part) if $part =~ s/\AUTF8://;
++
++
++      $part =~ s/\\([^A-Za-z0-9_])/$1/g;
        $part;
     } @escaped_parts;
@@ -10732,8 +10739,8 @@
       chunk          int          NOT NULL,
       chunk_time     float            NULL,
       chunk_index    varchar(200)     NULL,
--     lower_boundary text             NULL,
--     upper_boundary text             NULL,
++     lower_boundary text  CHARACTER SET binary NULL,
++     upper_boundary text   CHARACTER SET binary NULL,
       this_crc       char(40)     NOT NULL,
       this_cnt       int          NOT NULL,
       master_crc     char(40)         NULL,
 === modified file 'lib/Quoter.pm'
 --- lib/Quoter.pm	2012-08-22 19:19:43 +0000
 +++ lib/Quoter.pm	2012-08-30 21:48:19 +0000
@@ -164,7 +164,19 @@
     die "Cannot serialize multiple values with undef/NULL"
        if grep { !defined $_ } @args;
--   return join ',', map { quotemeta } @args;
++   return join ',', map {
++         (my $res = $_) =~ s/([^A-Za-z0-9_])/\\$1/g;
++         if ( utf8::is_utf8($res) ) {
++            # If the string is marked as UTF-8, we also mark it here.
++            # Since all colons have already been backslash-escaped above,
++            # prepending an unescaped one of our own here is unambiguous.
++            $res = "UTF8:$res";
++            # Turn the UTF-8 flag off, so that DBD::mysql doesn't try
++            # anything besides storing the string.
++            utf8::encode($res);
++         }
++         $res
++      } @args;
+ }
  sub deserialize_list {
@@ -190,23 +202,16 @@
     # Undo the quotemeta().
     my @unescaped_parts = map {
        my $part = $_;
++
++      # If it starts with UTF8:, remove the prefix and decode.
++      utf8::decode($part) if $part =~ s/\AUTF8://;
++
        # Here be weirdness. Unfortunately quotemeta() is broken, and exposes
        # the internal representation of scalars. Namely, the latin-1 range,
        # \128-\377 (\p{Latin1} in newer Perls) is all escaped in downgraded
        # strings, but left alone in UTF-8 strings. Thus, this.
--      # TODO: quotemeta() might change in 5.16 to mean
--      # qr/(?=\p{ASCII})\W|\p{Pattern_Syntax}/
--      # And also fix this whole weird behavior under
--      # use feature 'unicode_strings' --  If/once that's
--      # implemented, this will have to change.
--      my $char_class = utf8::is_utf8($part)  # If it's a UTF-8 string,
--                     ? qr/(?=\p{ASCII})\W/   # We only care about non-word
--                                             # characters in the ASCII range
--                     : qr/(?=\p{ASCII})\W|[\x{80}-\x{FF}]/; # Otherwise,
--                                             # same as above, but also
--                                             # unescape the latin-1 range.
--      $part =~ s/\\($char_class)/$1/g;
++      $part =~ s/\\([^A-Za-z0-9_])/$1/g;
        $part;
     } @escaped_parts;
 === modified file 'lib/Sandbox.pm'
 --- lib/Sandbox.pm	2012-08-09 18:33:42 +0000
 +++ lib/Sandbox.pm	2012-08-30 21:48:19 +0000
@@ -288,11 +288,11 @@
     return;
+ }
--# This returns an empty string if all servers and data are OK. If it returns
--# anything but empty string, there is a problem, and the string indicates what
--# the problem is.
++# This returns true if all servers and data are OK. Otherwise, there is a
++# problem, and that's warn'd out.
  sub ok {
     my ($self) = @_;
++   return 1 unless $self->get_dbh_for('master');
     my @errors;
     # First, wait for all slaves to be caught up to their masters.
     $self->wait_for_slaves();
 === modified file 'lib/TableChunker.pm'
 --- lib/TableChunker.pm	2012-08-24 22:50:34 +0000
 +++ lib/TableChunker.pm	2012-08-30 21:48:19 +0000
@@ -553,38 +553,13 @@
     my $base;
     my @chars;
     PTDEBUG && _d("Table charset:", $args{tbl_struct}->{charset});
--   if ( ($args{tbl_struct}->{charset} || "") eq "latin1" ) {
--      # These are the unique, sorted latin1 character codes according to
--      # MySQL.  You'll notice that many are missing.  That's because MySQL
--      # treats many characters as the same, for example "e" and "é".
--      my @sorted_latin1_chars = (
--          32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
--          46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
--          60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
--          74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
--          88,  89,  90,  91,  92,  93,  94,  95,  96, 123, 124, 125, 126, 161,
--         162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
--         176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
--         190, 191, 215, 216, 222, 223, 247, 255);
--
--      my ($first_char, $last_char);
--      for my $i ( 0..$#sorted_latin1_chars ) {
--         $first_char = $i and last if $sorted_latin1_chars[$i] >= $min_col_ord;
--      }
--      for my $i ( $first_char..$#sorted_latin1_chars ) {
--         $last_char = $i and last if $sorted_latin1_chars[$i] >= $max_col_ord;
--      };
--
--      @chars = map { chr $_; } @sorted_latin1_chars[$first_char..$last_char];
--      $base  = scalar @chars;
--   }
--   else {
++   {
        # If the table's charset isn't latin1, who knows what charset is being
        # used, what characters it contains, and how those characters are sorted.
        # So we create a character map and let MySQL tell us these things.
        # Create a temp table with the same char col def as the original table.
--      my $tmp_tbl    = '__maatkit_char_chunking_map';
++      my $tmp_tbl    = '__percona_char_chunking_map';
        my $tmp_db_tbl = $q->quote($args{db}, $tmp_tbl);
        $sql = "DROP TABLE IF EXISTS $tmp_db_tbl";
        PTDEBUG && _d($dbh, $sql);
 === modified file 't/lib/Quoter.t'
 --- t/lib/Quoter.t	2012-08-22 19:19:43 +0000
 +++ t/lib/Quoter.t	2012-08-30 21:48:19 +0000
@@ -145,7 +145,10 @@
     [ 'a\\\\\\,aa', 'c', ],
     [ 'a\\\,a,a', 'c,d,e,d,', ],
     [ "\\\,\x{e8},a", '!!!!__!*`,`\\', ], # Latin-1
++   [ "\\\,\N{U+e8},a", '!!!!__!*`,`\\', ], # UTF-8
     [ "\x{30cb}\\\,\x{e8},a", '!!!!__!*`,`\\', ], # UTF-8
++   [ "\x{30cb}\\\,\x{e8},a", "\x{e9}", ], # UTF-8 on the left, Latin-1 on the right
++   [ "\x{e9}", "\x{30cb}\\\,\x{e8},a", ], # Latin-1 on the left, UTF-8 on the right
     [ ",,,,,,,,,,,,,,", ",", ],
     [ "\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,\\,,,,\\", ":(", ],
     [ "asdfa", "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\,a", ],
@@ -162,12 +165,13 @@
  my $sb  = new Sandbox(basedir => '/tmp', DSNParser => $dp);
  my $dbh = $sb->get_dbh_for('master');
  SKIP: {
--   skip 'Cannot connect to sandbox master', scalar @serialize_tests unless $dbh;
++   skip 'Cannot connect to sandbox master', scalar @serialize_tests + 11 unless $dbh;
     # Prevent "Wide character in print at Test/Builder.pm" warnings.
     binmode Test::More->builder->$_(), ':encoding(UTF-8)'
        for qw(output failure_output);
++   $dbh->do('DROP DATABASE IF EXISTS serialize_test');
     $dbh->do('CREATE DATABASE IF NOT EXISTS serialize_test');
     $dbh->do('DROP TABLE IF EXISTS serialize_test.serialize');
     $dbh->do('CREATE TABLE serialize_test.serialize (id INT, foo TEXT)');
@@ -182,10 +186,6 @@
     for my $test_index ( 0..$#serialize_tests ) {
        my $ser = $q->serialize_list( @{$serialize_tests[$test_index]} );
--      # Bit of a hack, but we want to test both of Perl's internal encodings
--      # for correctness.
--      local $dbh->{'mysql_enable_utf8'} = 1 if utf8::is_utf8($ser);
--
        $sth->execute($test_index, $ser);
        $selsth->execute($test_index);
@@ -196,22 +196,60 @@
  					  . "]";
        $flat_string =~ s/\n/\\n/g;
--      # diag($test_index);
--      SKIP: {
--         skip "DBD::mysql version $DBD::mysql::VERSION has utf8 bugs. "
--	    . "See https://bugs.launchpad.net/percona-toolkit/+bug/932327",
--            1 if $DBD::mysql::VERSION lt '4' && $test_index == 9;
--         is_deeply(
--            [ $q->deserialize_list($selsth->fetchrow_array()) ],
--            $serialize_tests[$test_index],
--            "Serialize $flat_string"
--         );
++      my $res = [ $q->deserialize_list($selsth->fetchrow_array()) ];
++      is_deeply(
++         $res,
++         $serialize_tests[$test_index],
++         "Serialize $flat_string"
++      );
++
++      if ( $flat_string =~ /\P{ASCII}/ && $flat_string =~ /[\x{80}-\x{FF}]/ ) {
++         for my $i (0..$#{$res}) {
++            my $form = utf8::is_utf8($serialize_tests[$test_index][$i])
++                     ? 'UTF-8'
++                     : 'Latin1';
++            is(
++               utf8::is_utf8($res->[$i]),
++               utf8::is_utf8($serialize_tests[$test_index][$i]),
++               "$res->[$i] and $serialize_tests[$test_index][$i] remain $form after deserialziation"
++            );
++         }
+       }
+    }
--
++
     $sth->finish();
     $selsth->finish();
++   {
++      $dbh->do('SET NAMES utf8');
++      $dbh->do('CREATE TABLE serialize_test.serialize2 (id INT, foo TEXT CHARACTER SET binary)');
++
++      my $sth2 = $dbh->prepare(
++         "INSERT INTO serialize_test.serialize2 (id, foo) VALUES (?, ?)"
++      );
++
++      my $selsth2 = $dbh->prepare(
++         "SELECT foo FROM serialize_test.serialize2 WHERE id=? LIMIT 1"
++      );
++
++      my @list = ("\N{U+10000}", "\N{U+10001}", "\N{U+10002}");
++      my $ser = $q->serialize_list( @list );
++
++      local $dbh->{'mysql_enable_utf8'} = 1;
++
++      $sth2->execute(1, $ser);
++      $selsth2->execute(1);
++      is_deeply(
++         [ $q->deserialize_list($selsth2->fetchrow_array()) ],
++         \@list,
++         "Serialized astral plane characters correctly"
++      );
++
++      $sth2->finish();
++      $selsth2->finish();
++
++   }
++
     $dbh->do("DROP DATABASE serialize_test");
     $sb->wipe_clean($dbh);
     $dbh->disconnect();