Merge lp:~percona-toolkit-dev/percona-toolkit/fix-1127450-pt-archiver-bulk-insert-encoding into lp:percona-toolkit/2.2

Proposed by Brian Fraser
Status: Merged
Approved by: Daniel Nichter
Approved revision: 569
Merged at revision: 577
Proposed branch: lp:~percona-toolkit-dev/percona-toolkit/fix-1127450-pt-archiver-bulk-insert-encoding
Merge into: lp:percona-toolkit/2.2
Diff against target: 161 lines (+77/-19)
3 files modified
bin/pt-archiver (+29/-19)
t/pt-archiver/bulk_insert.t (+36/-0)
t/pt-archiver/samples/bug_1127450.sql (+12/-0)
To merge this branch: bzr merge lp:~percona-toolkit-dev/percona-toolkit/fix-1127450-pt-archiver-bulk-insert-encoding
Reviewer Review Type Date Requested Status
Daniel Nichter Approve
Review via email: mp+158686@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Daniel Nichter (daniel-nichter) wrote :

99 +use charnames ':full';

Will this work in 5.8?

8 + my $got_charset = $o->get('charset');
36 + my $charset = $got_charset || '';

Those seem redundant; just my $charset = $o->get('charset'); should work for both.

17 + . " INTO TABLE $dst->{db_tbl}"
18 + . ($got_charset ? "CHARACTER SET $got_charset" : "")
19 + . "("

I'm not aware of the "INSERT INTO TABLE foo CHARACTER SET bar" syntax, where is it documented? Is this tested?

review: Needs Fixing
Revision history for this message
Brian Fraser (fraserbn) wrote :

> 99 +use charnames ':full';
>
> Will this work in 5.8?
>

Yes. I actually added that line explicitly to be compatible with older Perls, since newer ones don't need it.

> 8 + my $got_charset = $o->get('charset');
> 36 + my $charset = $got_charset || '';
>
> Those seem redundant; just my $charset = $o->get('charset'); should work for
> both.

Technically yes, but further down pt-archiver overloads $charset to be the character set name *to perl*, which might be different than the MySQL charset. Simple example: --charset latin1, $got_charset would always remain 'latin1', but $charset later becomes 'iso-8859-1', which MySQL doesn't understand.
I want to refactor the charset handling code for 3.0, but figured that adding the new variable would be the least intrusive change for 2.2.

>
> 17 + . " INTO TABLE $dst->{db_tbl}"
> 18 + . ($got_charset ? "CHARACTER SET $got_charset" :
> "")
> 19 + . "("
>
> I'm not aware of the "INSERT INTO TABLE foo CHARACTER SET bar" syntax, where
> is it documented? Is this tested?

The diff probably cut off the important bit from that query: it's not INSERT INTO, it's "LOAD DATA LOCAL INFILE INTO TABLE foo CHARACTER SET doof"
It's documented on mysql's LOAD DATA doc from 5.0 onwards.

Revision history for this message
Daniel Nichter (daniel-nichter) wrote :

Ok, sounds good.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bin/pt-archiver'
2--- bin/pt-archiver 2013-03-20 17:53:36 +0000
3+++ bin/pt-archiver 2013-04-12 17:46:29 +0000
4@@ -5450,6 +5450,7 @@
5 my $archive_file = $o->get('file');
6 my $txnsize = $o->get('txn-size');
7 my $quiet = $o->get('quiet');
8+ my $got_charset = $o->get('charset');
9
10 # First things first: if --stop was given, create the sentinel file.
11 if ( $o->get('stop') ) {
12@@ -5833,7 +5834,9 @@
13 . ' LOCAL INFILE ?'
14 . ($o->get('replace') ? ' REPLACE' : '')
15 . ($o->get('ignore') ? ' IGNORE' : '')
16- . " INTO TABLE $dst->{db_tbl}("
17+ . " INTO TABLE $dst->{db_tbl}"
18+ . ($got_charset ? "CHARACTER SET $got_charset" : "")
19+ . "("
20 . join(",", map { $q->quote($_) } @{$ins_stmt->{cols}} )
21 . ")";
22 }
23@@ -5942,28 +5945,29 @@
24 return 0;
25 }
26
27- # Open the file and print the header to it.
28- if ( $archive_file ) {
29- my $need_hdr = $o->get('header') && !-f $archive_file;
30- my $charset = $o->get('charset') || '';
31- if ($charset eq 'utf8') {
32- $charset = ":$charset";
33- }
34- elsif ($charset) {
35- eval { require Encode }
36+ my $charset = $got_charset || '';
37+ if ($charset eq 'utf8') {
38+ $charset = ":$charset";
39+ }
40+ elsif ($charset) {
41+ eval { require Encode }
42 or (PTDEBUG &&
43 _d("Couldn't load Encode: ", $EVAL_ERROR,
44 "Going to try using the charset ",
45 "passed in without checking it."));
46- # No need to punish a user if they did their
47- # homework and passed in an official charset,
48- # rather than an alias.
49- $charset = ":encoding("
50- . (defined &Encode::resolve_alias
51- ? Encode::resolve_alias($charset) || $charset
52- : $charset)
53- . ")";
54- }
55+ # No need to punish a user if they did their
56+ # homework and passed in an official charset,
57+ # rather than an alias.
58+ $charset = ":encoding("
59+ . (defined &Encode::resolve_alias
60+ ? Encode::resolve_alias($charset) || $charset
61+ : $charset)
62+ . ")";
63+ }
64+
65+ # Open the file and print the header to it.
66+ if ( $archive_file ) {
67+ my $need_hdr = $o->get('header') && !-f $archive_file;
68 $archive_fh = IO::File->new($archive_file, ">>$charset")
69 or die "Cannot open $charset $archive_file: $OS_ERROR\n";
70 $archive_fh->autoflush(1) unless $o->get('buffer');
71@@ -5979,6 +5983,9 @@
72 require File::Temp;
73 $bulkins_file = File::Temp->new( SUFFIX => 'pt-archiver' )
74 or die "Cannot open temp file: $OS_ERROR\n";
75+ binmode($bulkins_file, $charset)
76+ or die "Cannot set $charset as an encoding for the bulk-insert "
77+ . "file: $OS_ERROR";
78 }
79
80 # This row is the first row fetched from each 'chunk'.
81@@ -6205,6 +6212,9 @@
82 if ( $o->get('bulk-insert') ) {
83 $bulkins_file = File::Temp->new( SUFFIX => 'pt-archiver' )
84 or die "Cannot open temp file: $OS_ERROR\n";
85+ binmode($bulkins_file, $charset)
86+ or die "Cannot set $charset as an encoding for the bulk-insert "
87+ . "file: $OS_ERROR";
88 }
89 } # no next row (do bulk operations)
90 else {
91
92=== modified file 't/pt-archiver/bulk_insert.t'
93--- t/pt-archiver/bulk_insert.t 2012-11-21 16:21:56 +0000
94+++ t/pt-archiver/bulk_insert.t 2013-04-12 17:46:29 +0000
95@@ -11,6 +11,8 @@
96 use English qw(-no_match_vars);
97 use Test::More;
98
99+use charnames ':full';
100+
101 use PerconaTest;
102 use Sandbox;
103 require "$trunk/bin/pt-archiver";
104@@ -85,6 +87,40 @@
105 );
106
107 # #############################################################################
108+# pt-archiver wide character errors / corrupted data with UTF-8 + bulk-insert
109+# https://bugs.launchpad.net/percona-toolkit/+bug/1127450
110+# #############################################################################
111+{
112+my $utf8_dbh = $sb->get_dbh_for('master', { mysql_enable_utf8 => 1, AutoCommit => 1 });
113+
114+$sb->load_file('master', 't/pt-archiver/samples/bug_1127450.sql');
115+my $sql = qq{INSERT INTO `bug_1127450`.`original` VALUES (1, "\N{KATAKANA LETTER NI}")};
116+$utf8_dbh->do($sql);
117+
118+$output = output(
119+ sub { pt_archiver::main(qw(--no-ascend --limit 50 --bulk-insert),
120+ qw(--bulk-delete --where 1=1 --statistics --charset utf8),
121+ '--source', "L=1,D=bug_1127450,t=original,F=$cnf",
122+ '--dest', "t=copy") }, stderr => 1
123+);
124+
125+my (undef, $val) = $utf8_dbh->selectrow_array('select * from bug_1127450.copy');
126+
127+ok(
128+ utf8::is_utf8($val),
129+ "--bulk-insert preserves UTF8ness"
130+);
131+
132+is(
133+ $val,
134+ "\N{KATAKANA LETTER NI}",
135+ "--bulk-insert can handle utf8 characters"
136+);
137+
138+unlike($output, qr/Wide character/, "no wide character warnings")
139+
140+}
141+# #############################################################################
142 # Done.
143 # #############################################################################
144 $sb->wipe_clean($dbh);
145
146=== added file 't/pt-archiver/samples/bug_1127450.sql'
147--- t/pt-archiver/samples/bug_1127450.sql 1970-01-01 00:00:00 +0000
148+++ t/pt-archiver/samples/bug_1127450.sql 2013-04-12 17:46:29 +0000
149@@ -0,0 +1,12 @@
150+DROP DATABASE IF EXISTS `bug_1127450`;
151+CREATE DATABASE `bug_1127450`;
152+CREATE TABLE `bug_1127450`.`original` (
153+ id int,
154+ t text CHARACTER SET utf8,
155+ PRIMARY KEY(id)
156+) engine=InnoDB DEFAULT CHARSET=utf8;
157+CREATE TABLE `bug_1127450`.`copy` (
158+ id int,
159+ t text CHARACTER SET utf8,
160+ PRIMARY KEY(id)
161+) engine=InnoDB DEFAULT CHARSET=utf8;

Subscribers

People subscribed via source and target branches