2013-09-11

MySQLのUTF8照合順序はどうなってんの?

この記事はQiitaにも載せています。

やってみた

MySQLの照合順序、UTF-8の日本語で使える照合順序はいくつかあるけど、 実際にどんなマッチの仕方をするのかわからなかったので、ちょっとやってみた。
ちなみに、MySQLのUTF-8で日本語を扱う場合に使用可能な照合順序はしたの3つ。

  • utf8_bin
  • utf8_general_ci
  • utf8_unicode_ci
確認したかったのは、半角・全角、小文字・大文字がどんな感じでマッチするのかという事。


ながーいので、結果を

  • utf8_bin
    キャラクターコードが完全に一致するもののみマッチする。 
  • utf8_general_ci
    アルファベットの大文字・小文字は区別せずにマッチする。 ただし、区別しない文字は、半角は半角の大文字・小文字、全角は全角の大文字・小文字のみ。
    半角小文字と全角小文字同士はマッチしない。
  • utf8_unicode_ci
    下記がマッチする。
    • アルファベットの大文字・小文字(全半角混合)
    • ひらがなの大文字・小文字とカタカナの大文字・小文字 (例えば、'あ'で検索すると、'ぁ', 'ア', 'ァ'もマッチする) 

テーブル定義

検証のためのテーブル定義はこんな感じ。

drop table if exists `utf8_bin`;
create table `utf8_bin`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_bin;

drop table if exists `utf8_general_ci`;
create table `utf8_general_ci`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_general_ci;

drop table if exists `utf8_unicode_ci`;
create table `utf8_unicode_ci`
( `id` int(11) NOT NULL auto_increment,
  `str` varchar(255),
  `num` int(11),
  primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_unicode_ci; 

テストデータ

テストデータはこんな感じ。
とりあえず、気になってた文字の比較。
create table `utf8_bin`
insert into `utf8_bin`(`str`, `num`)
values
('a', null),
('a', null),
('A', null),
('A', null),
('あ', null),
('ぁ', null),
('ア', null),
('ァ', null),
('高橋', null),
('髙橋', null),
('-', null),
('−', null),
('+', null),
('+', null),
('0', null),
('0', null),
('1', null),
('1', null),
(null, 0),
(null, 1),
(null, 9),
(null,  100)
;

insert into `utf8_general_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;

insert into `utf8_unicode_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;

確認してみた

これ、途中で気づいたんだけど、プロシージャで書けばよかった…
思いつきで書き始めたけど、なんか長くなってしまった。

select '## str = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'a'
;
show warnings;

select '## str = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'A'
;
show warnings;

select '## str = a(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'a'
;
show warnings;

select '## str = A(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'A'
;
show warnings;

select '## str = あ(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'あ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'あ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'あ'
;
show warnings;

select '## str = ぁ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ぁ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ぁ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ぁ'
;
show warnings;

select '## str = ア(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ア'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ア'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ア'
;
show warnings;

select '## str = ァ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ァ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ァ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ァ'
;
show warnings;

select '## str = 高橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '高橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '高橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '高橋'
;
show warnings;

select '## str = 髙橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '髙橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '髙橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '髙橋'
;
show warnings;

select '## str = -(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '-'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '-'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '-'
;
show warnings;

select '## str = −(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '−'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '−'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '−'
;
show warnings;

select '## str = +(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '+'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '+'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '+'
;
show warnings;

select '## str = +(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '+'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '+'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '+'
;
show warnings;

select '## str = 0(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '0'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '0'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '0'
;
show warnings;

select '## str = 0(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '0'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '0'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '0'
;
show warnings;

select '## str = 1(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '1'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '1'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '1'
;
show warnings;

select '## str = 1(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '1'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '1'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '1'
;
show warnings;

select '## str like \'高%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '高%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '高%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '高%'
;
show warnings;

select '## str like \'髙%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '髙%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '髙%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '髙%'
;
show warnings;

select '## num = 0' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 0
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 0
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 0
;
show warnings;

select '## num = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'a'
;
show warnings;

select '## num = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'A'
;
show warnings;

select '## num = AAA(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'AAA'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'AAA'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'AAA'
;
show warnings;

select '## num = a(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'a'
;
show warnings;

select '## num = A(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'A'
;
show warnings;

select '## num like \'1%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like '1%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like '1%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like '1%'
;
show warnings;

select '## num like \'a%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like 'a%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like 'a%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like 'a%'
;
show warnings;

テスト結果

これもまただらだらいきます(;´∀`)

str = a(半角)

tableidstr
utf8_bin1a
utf8_general_ci1a
utf8_general_ci3A
utf8_unicode_ci1a
utf8_unicode_ci2
utf8_unicode_ci3A
utf8_unicode_ci4
エラー無し

str = A(半角)

tableidstr
utf8_bin3A
utf8_general_ci1a
utf8_general_ci3A
utf8_unicode_ci1a
utf8_unicode_ci2
utf8_unicode_ci3A
utf8_unicode_ci4
エラー無し

str = a(全角)

tableidstr
utf8_bin2
utf8_general_ci2
utf8_general_ci4
utf8_unicode_ci1a
utf8_unicode_ci2
utf8_unicode_ci3A
utf8_unicode_ci4
エラー無し

str = A(全角)

tableidstr
utf8_bin4
utf8_general_ci2
utf8_general_ci4
utf8_unicode_ci1a
utf8_unicode_ci2
utf8_unicode_ci3A
utf8_unicode_ci4
エラー無し

str = あ(大文字) 

tableidstr
utf8_bin5
utf8_general_ci5
utf8_unicode_ci5
utf8_unicode_ci6
utf8_unicode_ci7
utf8_unicode_ci8
エラー無し

str = ぁ(小文字)

tableidstr
utf8_bin6
utf8_general_ci6
utf8_unicode_ci5
utf8_unicode_ci6
utf8_unicode_ci7
utf8_unicode_ci8
エラー無し

str = ア(大文字)

tableidstr
utf8_bin7
utf8_general_ci7
utf8_unicode_ci5
utf8_unicode_ci6
utf8_unicode_ci7
utf8_unicode_ci8
エラー無し

str = ァ(小文字)

tableidstr
utf8_bin8
utf8_general_ci8
utf8_unicode_ci5
utf8_unicode_ci6
utf8_unicode_ci7
utf8_unicode_ci8
エラー無し

str = 高橋

tableidstr
utf8_bin9高橋
utf8_general_ci9高橋
utf8_unicode_ci9高橋
エラー無し

str = 髙橋

tableidstr
utf8_bin10髙橋
utf8_general_ci10髙橋
utf8_unicode_ci10髙橋
エラー無し

str = -(半角)

tableidstr
utf8_bin11-
utf8_general_ci11-
utf8_unicode_ci11-
エラー無し


str = −(全角)

tableidstr
utf8_bin12
utf8_general_ci12
utf8_unicode_ci12
エラー無し

str = +(半角)

tableidstr
utf8_bin13+
utf8_general_ci13+
utf8_unicode_ci13+
utf8_unicode_ci14
エラー無し

str = +(全角)

tableidstr
utf8_bin14
utf8_general_ci14
utf8_unicode_ci13+
utf8_unicode_ci14
エラー無し

str = 0(半角)

tableidstr
utf8_bin150
utf8_general_ci150
utf8_unicode_ci150
utf8_unicode_ci16
エラー無し

str = 0(全角)

tableidstr
utf8_bin16
utf8_general_ci16
utf8_unicode_ci150
utf8_unicode_ci16
エラー無し

str = 1(半角)

tableidstr
utf8_bin171
utf8_general_ci171
utf8_unicode_ci171
utf8_unicode_ci18
エラー無し

str = 1(全角)

tableidstr
utf8_bin18
utf8_general_ci18
utf8_unicode_ci171
utf8_unicode_ci18
エラー無し

str like '高%'

tableidstr
utf8_bin9高橋
utf8_general_ci9高橋
utf8_unicode_ci9高橋
エラー無し

str like '髙%'

tableidstr
utf8_bin10髙橋
utf8_general_ci10髙橋
utf8_unicode_ci10髙橋
エラー無し

num = 0

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラー無し

num = a(半角)

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラーあり
show warnings;

Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'

num = A(半角)

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラーあり
show warnings;

Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'


num = AAA(半角)

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラーあり
show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'

num = a(全角)

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラーあり
show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'

num = A(全角)

tableidstr
utf8_bin190
utf8_general_ci190
utf8_unicode_ci190
エラーあり
show warnings;

Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'

num like '1%'

tableidstr
utf8_bin201
utf8_bin22100
utf8_general_ci201
utf8_general_ci22100
utf8_unicode_ci201
utf8_unicode_ci22100
エラー無し

num like 'a%'

マッチせず エラー無し

思ったこと 

utf8_bin / utf8_general_ciはそんなに違いは無いのね。
大文字・小文字を判別するかしないか、くらいっぽい。

utf8_unicode_ciはヒトが考えるマッチ条件に似ているかな。
全半角をゴチャッと検索できるのはうれしいかも。システム的には厳密じゃないけど。

理解した上で使い分けていと思わぬバグを生産しそうだけど、 とっても便利だという事がわかりました。

0 件のコメント:

コメントを投稿