xay.lab: MySQLのUTF8照合順序はどうなってんの？

この記事はQiitaにも載せています。

やってみた

MySQLの照合順序、UTF-8の日本語で使える照合順序はいくつかあるけど、実際にどんなマッチの仕方をするのかわからなかったので、ちょっとやってみた。
ちなみに、MySQLのUTF-8で日本語を扱う場合に使用可能な照合順序はしたの3つ。

utf8_bin
utf8_general_ci
utf8_unicode_ci

確認したかったのは、半角・全角、小文字・大文字がどんな感じでマッチするのかという事。

ながーいので、結果を

utf8_bin
キャラクターコードが完全に一致するもののみマッチする。
utf8_general_ci
アルファベットの大文字・小文字は区別せずにマッチする。ただし、区別しない文字は、半角は半角の大文字・小文字、全角は全角の大文字・小文字のみ。
半角小文字と全角小文字同士はマッチしない。
utf8_unicode_ci
下記がマッチする。

アルファベットの大文字・小文字(全半角混合)
ひらがなの大文字・小文字とカタカナの大文字・小文字 (例えば、'あ'で検索すると、'ぁ', 'ア', 'ァ'もマッチする)

テーブル定義

検証のためのテーブル定義はこんな感じ。

drop table if exists `utf8_bin`;

create table `utf8_bin`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_bin;

drop table if exists `utf8_general_ci`;
create table `utf8_general_ci`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_general_ci;

drop table if exists `utf8_unicode_ci`;
create table `utf8_unicode_ci`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_unicode_ci;

テストデータ

テストデータはこんな感じ。
とりあえず、気になってた文字の比較。

create table `utf8_bin`
insert into `utf8_bin`(`str`, `num`)
values
('a', null),
('ａ', null),
('A', null),
('Ａ', null),
('あ', null),
('ぁ', null),
('ア', null),
('ァ', null),
('高橋', null),
('髙橋', null),
('-', null),
('−', null),
('+', null),
('＋', null),
('0', null),
('０', null),
('1', null),
('１', null),
(null, 0),
(null, 1),
(null, 9),
(null,  100)
;

insert into `utf8_general_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;

insert into `utf8_unicode_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;

確認してみた

これ、途中で気づいたんだけど、プロシージャで書けばよかった…
思いつきで書き始めたけど、なんか長くなってしまった。

select '## str = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'a'
;
show warnings;

select '## str = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'A'
;
show warnings;

select '## str = ａ(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ａ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ａ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ａ'
;
show warnings;

select '## str = Ａ(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'Ａ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'Ａ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'Ａ'
;
show warnings;

select '## str = あ(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'あ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'あ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'あ'
;
show warnings;

select '## str = ぁ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ぁ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ぁ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ぁ'
;
show warnings;

select '## str = ア(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ア'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ア'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ア'
;
show warnings;

select '## str = ァ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ァ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ァ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ァ'
;
show warnings;

select '## str = 高橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '高橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '高橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '高橋'
;
show warnings;

select '## str = 髙橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '髙橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '髙橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '髙橋'
;
show warnings;

select '## str = -(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '-'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '-'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '-'
;
show warnings;

select '## str = −(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '−'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '−'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '−'
;
show warnings;

select '## str = +(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '+'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '+'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '+'
;
show warnings;

select '## str = ＋(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '＋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '＋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '＋'
;
show warnings;

select '## str = 0(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '0'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '0'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '0'
;
show warnings;

select '## str = ０(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '０'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '０'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '０'
;
show warnings;

select '## str = 1(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '1'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '1'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '1'
;
show warnings;

select '## str = １(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '１'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '１'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '１'
;
show warnings;

select '## str like \'高%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '高%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '高%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '高%'
;
show warnings;

select '## str like \'髙%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '髙%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '髙%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '髙%'
;
show warnings;

select '## num = 0' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 0
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 0
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 0
;
show warnings;

select '## num = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'a'
;
show warnings;

select '## num = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'A'
;
show warnings;

select '## num = AAA(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'AAA'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'AAA'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'AAA'
;
show warnings;

select '## num = ａ(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'ａ'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'ａ'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'ａ'
;
show warnings;

select '## num = Ａ(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'Ａ'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'Ａ'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'Ａ'
;
show warnings;

select '## num like \'1%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like '1%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like '1%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like '1%'
;
show warnings;

select '## num like \'a%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like 'a%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like 'a%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like 'a%'
;
show warnings;

テスト結果

これもまただらだらいきます(；´∀｀)

str = a(半角)

table	id	str
utf8_bin	1	a
utf8_general_ci	1	a
utf8_general_ci	3	A
utf8_unicode_ci	1	a
utf8_unicode_ci	2	ａ
utf8_unicode_ci	3	A
utf8_unicode_ci	4	Ａ

エラー無し

str = A(半角)

table	id	str
utf8_bin	3	A
utf8_general_ci	1	a
utf8_general_ci	3	A
utf8_unicode_ci	1	a
utf8_unicode_ci	2	ａ
utf8_unicode_ci	3	A
utf8_unicode_ci	4	Ａ

エラー無し

str = ａ(全角)

table	id	str
utf8_bin	2	ａ
utf8_general_ci	2	ａ
utf8_general_ci	4	Ａ
utf8_unicode_ci	1	a
utf8_unicode_ci	2	ａ
utf8_unicode_ci	3	A
utf8_unicode_ci	4	Ａ

エラー無し

str = Ａ(全角)

table	id	str
utf8_bin	4	Ａ
utf8_general_ci	2	ａ
utf8_general_ci	4	Ａ
utf8_unicode_ci	1	a
utf8_unicode_ci	2	ａ
utf8_unicode_ci	3	A
utf8_unicode_ci	4	Ａ

エラー無し

str = あ(大文字)

table	id	str
utf8_bin	5	あ
utf8_general_ci	5	あ
utf8_unicode_ci	5	あ
utf8_unicode_ci	6	ぁ
utf8_unicode_ci	7	ア
utf8_unicode_ci	8	ァ

エラー無し

str = ぁ(小文字)

table	id	str
utf8_bin	6	ぁ
utf8_general_ci	6	ぁ
utf8_unicode_ci	5	あ
utf8_unicode_ci	6	ぁ
utf8_unicode_ci	7	ア
utf8_unicode_ci	8	ァ

エラー無し

str = ア(大文字)

table	id	str
utf8_bin	7	ア
utf8_general_ci	7	ア
utf8_unicode_ci	5	あ
utf8_unicode_ci	6	ぁ
utf8_unicode_ci	7	ア
utf8_unicode_ci	8	ァ

エラー無し

str = ァ(小文字)

table	id	str
utf8_bin	8	ァ
utf8_general_ci	8	ァ
utf8_unicode_ci	5	あ
utf8_unicode_ci	6	ぁ
utf8_unicode_ci	7	ア
utf8_unicode_ci	8	ァ

エラー無し

str = 高橋

table	id	str
utf8_bin	9	高橋
utf8_general_ci	9	高橋
utf8_unicode_ci	9	高橋

エラー無し

str = 髙橋

table	id	str
utf8_bin	10	髙橋
utf8_general_ci	10	髙橋
utf8_unicode_ci	10	髙橋

エラー無し

str = -(半角)

table	id	str
utf8_bin	11	-
utf8_general_ci	11	-
utf8_unicode_ci	11	-

エラー無し

str = −(全角)

table	id	str
utf8_bin	12	−
utf8_general_ci	12	−
utf8_unicode_ci	12	−

エラー無し

str = +(半角)

table	id	str
utf8_bin	13	+
utf8_general_ci	13	+
utf8_unicode_ci	13	+
utf8_unicode_ci	14	＋

エラー無し

str = ＋(全角)

table	id	str
utf8_bin	14	＋
utf8_general_ci	14	＋
utf8_unicode_ci	13	+
utf8_unicode_ci	14	＋

エラー無し

str = 0(半角)

table	id	str
utf8_bin	15	0
utf8_general_ci	15	0
utf8_unicode_ci	15	0
utf8_unicode_ci	16	０

エラー無し

str = ０(全角)

table	id	str
utf8_bin	16	０
utf8_general_ci	16	０
utf8_unicode_ci	15	0
utf8_unicode_ci	16	０

エラー無し

str = 1(半角)

table	id	str
utf8_bin	17	1
utf8_general_ci	17	1
utf8_unicode_ci	17	1
utf8_unicode_ci	18	１

エラー無し

str = １(全角)

table	id	str
utf8_bin	18	１
utf8_general_ci	18	１
utf8_unicode_ci	17	1
utf8_unicode_ci	18	１

エラー無し

str like '高%'

table	id	str
utf8_bin	9	高橋
utf8_general_ci	9	高橋
utf8_unicode_ci	9	高橋

エラー無し

str like '髙%'

table	id	str
utf8_bin	10	髙橋
utf8_general_ci	10	髙橋
utf8_unicode_ci	10	髙橋

エラー無し

num = 0

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラー無し

num = a(半角)

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラーあり

show warnings;

Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'

num = A(半角)

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラーあり

show warnings;

Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'

num = AAA(半角)

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラーあり

show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'

num = ａ(全角)

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラーあり

show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'ａ'
Warning 1292 Truncated incorrect DOUBLE value: 'ａ'
Warning 1292 Truncated incorrect DOUBLE value: 'ａ'

num = Ａ(全角)

table	id	str
utf8_bin	19	0
utf8_general_ci	19	0
utf8_unicode_ci	19	0

エラーあり

show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'Ａ'
Warning 1292 Truncated incorrect DOUBLE value: 'Ａ'
Warning 1292 Truncated incorrect DOUBLE value: 'Ａ'

num like '1%'

table	id	str
utf8_bin	20	1
utf8_bin	22	100
utf8_general_ci	20	1
utf8_general_ci	22	100
utf8_unicode_ci	20	1
utf8_unicode_ci	22	100

エラー無し

num like 'a%'

マッチせずエラー無し

思ったこと

utf8_bin / utf8_general_ciはそんなに違いは無いのね。
大文字・小文字を判別するかしないか、くらいっぽい。

utf8_unicode_ciはヒトが考えるマッチ条件に似ているかな。
全半角をゴチャッと検索できるのはうれしいかも。システム的には厳密じゃないけど。

理解した上で使い分けていと思わぬバグを生産しそうだけど、とっても便利だという事がわかりました。

2013-09-11

MySQLのUTF8照合順序はどうなってんの？

やってみた

ながーいので、結果を

テーブル定義

テストデータ

確認してみた

テスト結果

str = a(半角)

str = A(半角)

str = ａ(全角)

str = Ａ(全角)

str = あ(大文字)

str = ぁ(小文字)

str = ア(大文字)

str = ァ(小文字)

str = 高橋

str = 髙橋

str = -(半角)

str = −(全角)

str = +(半角)

str = ＋(全角)

str = 0(半角)

str = ０(全角)

str = 1(半角)

str = １(全角)

str like '高%'

str like '髙%'

num = 0

num = a(半角)

num = A(半角)

num = AAA(半角)

num = ａ(全角)

num = Ａ(全角)

num like '1%'

num like 'a%'

思ったこと

0 件のコメント:

コメントを投稿