この記事は
Qiitaにも載せています。
やってみた
MySQLの照合順序、UTF-8の日本語で使える照合順序はいくつかあるけど、
実際にどんなマッチの仕方をするのかわからなかったので、ちょっとやってみた。
ちなみに、MySQLのUTF-8で日本語を扱う場合に使用可能な照合順序はしたの3つ。
- utf8_bin
- utf8_general_ci
- utf8_unicode_ci
確認したかったのは、半角・全角、小文字・大文字がどんな感じでマッチするのかという事。
ながーいので、結果を
- utf8_bin
キャラクターコードが完全に一致するもののみマッチする。
- utf8_general_ci
アルファベットの大文字・小文字は区別せずにマッチする。
ただし、区別しない文字は、半角は半角の大文字・小文字、全角は全角の大文字・小文字のみ。
半角小文字と全角小文字同士はマッチしない。
- utf8_unicode_ci
下記がマッチする。
- アルファベットの大文字・小文字(全半角混合)
- ひらがなの大文字・小文字とカタカナの大文字・小文字 (例えば、'あ'で検索すると、'ぁ', 'ア', 'ァ'もマッチする)
テーブル定義
検証のためのテーブル定義はこんな感じ。
drop table if exists `utf8_bin`;
create table `utf8_bin`
( `id` int(11) NOT NULL auto_increment,
`str` varchar(255),
`num` int(11),
primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_bin;
drop table if exists `utf8_general_ci`;
create table `utf8_general_ci`
( `id` int(11) NOT NULL auto_increment,
`str` varchar(255),
`num` int(11),
primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_general_ci;
drop table if exists `utf8_unicode_ci`;
create table `utf8_unicode_ci`
( `id` int(11) NOT NULL auto_increment,
`str` varchar(255),
`num` int(11),
primary key (`id`)
)
engine=InnoDB
default character set utf8 collate utf8_unicode_ci;
テストデータ
テストデータはこんな感じ。
とりあえず、気になってた文字の比較。
create table `utf8_bin`
insert into `utf8_bin`(`str`, `num`)
values
('a', null),
('a', null),
('A', null),
('A', null),
('あ', null),
('ぁ', null),
('ア', null),
('ァ', null),
('高橋', null),
('髙橋', null),
('-', null),
('−', null),
('+', null),
('+', null),
('0', null),
('0', null),
('1', null),
('1', null),
(null, 0),
(null, 1),
(null, 9),
(null, 100)
;
insert into `utf8_general_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;
insert into `utf8_unicode_ci`(`str`, `num`) select `str`, `num` from `utf8_bin`;
確認してみた
これ、途中で気づいたんだけど、プロシージャで書けばよかった…
思いつきで書き始めたけど、なんか長くなってしまった。
select '## str = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'a'
;
show warnings;
select '## str = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'A'
;
show warnings;
select '## str = a(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'a'
;
show warnings;
select '## str = A(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'A'
;
show warnings;
select '## str = あ(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'あ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'あ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'あ'
;
show warnings;
select '## str = ぁ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ぁ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ぁ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ぁ'
;
show warnings;
select '## str = ア(大文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ア'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ア'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ア'
;
show warnings;
select '## str = ァ(小文字)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = 'ァ'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = 'ァ'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = 'ァ'
;
show warnings;
select '## str = 高橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '高橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '高橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '高橋'
;
show warnings;
select '## str = 髙橋' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '髙橋'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '髙橋'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '髙橋'
;
show warnings;
select '## str = -(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '-'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '-'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '-'
;
show warnings;
select '## str = −(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '−'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '−'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '−'
;
show warnings;
select '## str = +(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '+'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '+'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '+'
;
show warnings;
select '## str = +(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '+'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '+'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '+'
;
show warnings;
select '## str = 0(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '0'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '0'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '0'
;
show warnings;
select '## str = 0(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '0'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '0'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '0'
;
show warnings;
select '## str = 1(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '1'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '1'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '1'
;
show warnings;
select '## str = 1(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` = '1'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` = '1'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` = '1'
;
show warnings;
select '## str like \'高%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '高%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '高%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '高%'
;
show warnings;
select '## str like \'髙%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `str` from `utf8_bin` where `str` like '髙%'
union select 'utf8_general_ci' as 'table', `id`, `str` from `utf8_general_ci` where `str` like '髙%'
union select 'utf8_unicode_ci' as 'table', `id`, `str` from `utf8_unicode_ci` where `str` like '髙%'
;
show warnings;
select '## num = 0' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 0
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 0
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 0
;
show warnings;
select '## num = a(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'a'
;
show warnings;
select '## num = A(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'A'
;
show warnings;
select '## num = AAA(半角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'AAA'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'AAA'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'AAA'
;
show warnings;
select '## num = a(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'a'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'a'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'a'
;
show warnings;
select '## num = A(全角)' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` = 'A'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` = 'A'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` = 'A'
;
show warnings;
select '## num like \'1%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like '1%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like '1%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like '1%'
;
show warnings;
select '## num like \'a%\'' as '実行内容';
select 'utf8_bin' as 'table', `id`, `num` from `utf8_bin` where `num` like 'a%'
union select 'utf8_general_ci' as 'table', `id`, `num` from `utf8_general_ci` where `num` like 'a%'
union select 'utf8_unicode_ci' as 'table', `id`, `num` from `utf8_unicode_ci` where `num` like 'a%'
;
show warnings;
テスト結果
これもまただらだらいきます(;´∀`)
str = a(半角)
table | id | str |
utf8_bin | 1 | a |
utf8_general_ci | 1 | a |
utf8_general_ci | 3 | A |
utf8_unicode_ci | 1 | a |
utf8_unicode_ci | 2 | a |
utf8_unicode_ci | 3 | A |
utf8_unicode_ci | 4 | A |
エラー無し
str = A(半角)
table | id | str |
utf8_bin | 3 | A |
utf8_general_ci | 1 | a |
utf8_general_ci | 3 | A |
utf8_unicode_ci | 1 | a |
utf8_unicode_ci | 2 | a |
utf8_unicode_ci | 3 | A |
utf8_unicode_ci | 4 | A |
エラー無し
str = a(全角)
table | id | str |
utf8_bin | 2 | a |
utf8_general_ci | 2 | a |
utf8_general_ci | 4 | A |
utf8_unicode_ci | 1 | a |
utf8_unicode_ci | 2 | a |
utf8_unicode_ci | 3 | A |
utf8_unicode_ci | 4 | A |
エラー無し
str = A(全角)
table | id | str |
utf8_bin | 4 | A |
utf8_general_ci | 2 | a |
utf8_general_ci | 4 | A |
utf8_unicode_ci | 1 | a |
utf8_unicode_ci | 2 | a |
utf8_unicode_ci | 3 | A |
utf8_unicode_ci | 4 | A |
エラー無し
str = あ(大文字)
table | id | str |
utf8_bin | 5 | あ |
utf8_general_ci | 5 | あ |
utf8_unicode_ci | 5 | あ |
utf8_unicode_ci | 6 | ぁ |
utf8_unicode_ci | 7 | ア |
utf8_unicode_ci | 8 | ァ |
エラー無し
str = ぁ(小文字)
table | id | str |
utf8_bin | 6 | ぁ |
utf8_general_ci | 6 | ぁ |
utf8_unicode_ci | 5 | あ |
utf8_unicode_ci | 6 | ぁ |
utf8_unicode_ci | 7 | ア |
utf8_unicode_ci | 8 | ァ |
エラー無し
str = ア(大文字)
table | id | str |
utf8_bin | 7 | ア |
utf8_general_ci | 7 | ア |
utf8_unicode_ci | 5 | あ |
utf8_unicode_ci | 6 | ぁ |
utf8_unicode_ci | 7 | ア |
utf8_unicode_ci | 8 | ァ |
エラー無し
str = ァ(小文字)
table | id | str |
utf8_bin | 8 | ァ |
utf8_general_ci | 8 | ァ |
utf8_unicode_ci | 5 | あ |
utf8_unicode_ci | 6 | ぁ |
utf8_unicode_ci | 7 | ア |
utf8_unicode_ci | 8 | ァ |
エラー無し
str = 高橋
table | id | str |
utf8_bin | 9 | 高橋 |
utf8_general_ci | 9 | 高橋 |
utf8_unicode_ci | 9 | 高橋 |
エラー無し
str = 髙橋
table | id | str |
utf8_bin | 10 | 髙橋 |
utf8_general_ci | 10 | 髙橋 |
utf8_unicode_ci | 10 | 髙橋 |
エラー無し
str = -(半角)
table | id | str |
utf8_bin | 11 | - |
utf8_general_ci | 11 | - |
utf8_unicode_ci | 11 | - |
エラー無し
str = −(全角)
table | id | str |
utf8_bin | 12 | − |
utf8_general_ci | 12 | − |
utf8_unicode_ci | 12 | − |
エラー無し
str = +(半角)
table | id | str |
utf8_bin | 13 | + |
utf8_general_ci | 13 | + |
utf8_unicode_ci | 13 | + |
utf8_unicode_ci | 14 | + |
エラー無し
str = +(全角)
table | id | str |
utf8_bin | 14 | + |
utf8_general_ci | 14 | + |
utf8_unicode_ci | 13 | + |
utf8_unicode_ci | 14 | + |
エラー無し
str = 0(半角)
table | id | str |
utf8_bin | 15 | 0 |
utf8_general_ci | 15 | 0 |
utf8_unicode_ci | 15 | 0 |
utf8_unicode_ci | 16 | 0 |
エラー無し
str = 0(全角)
table | id | str |
utf8_bin | 16 | 0 |
utf8_general_ci | 16 | 0 |
utf8_unicode_ci | 15 | 0 |
utf8_unicode_ci | 16 | 0 |
エラー無し
str = 1(半角)
table | id | str |
utf8_bin | 17 | 1 |
utf8_general_ci | 17 | 1 |
utf8_unicode_ci | 17 | 1 |
utf8_unicode_ci | 18 | 1 |
エラー無し
str = 1(全角)
table | id | str |
utf8_bin | 18 | 1 |
utf8_general_ci | 18 | 1 |
utf8_unicode_ci | 17 | 1 |
utf8_unicode_ci | 18 | 1 |
エラー無し
str like '高%'
table | id | str |
utf8_bin | 9 | 高橋 |
utf8_general_ci | 9 | 高橋 |
utf8_unicode_ci | 9 | 高橋 |
エラー無し
str like '髙%'
table | id | str |
utf8_bin | 10 | 髙橋 |
utf8_general_ci | 10 | 髙橋 |
utf8_unicode_ci | 10 | 髙橋 |
エラー無し
num = 0
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラー無し
num = a(半角)
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラーあり
show warnings;
Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
num = A(半角)
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラーあり
show warnings;
Level Code Message
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
num = AAA(半角)
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラーあり
show warnings;
Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
Warning 1292 Truncated incorrect DOUBLE value: 'AAA'
num = a(全角)
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラーあり
show warnings;
Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
Warning 1292 Truncated incorrect DOUBLE value: 'a'
num = A(全角)
table | id | str |
utf8_bin | 19 | 0 |
utf8_general_ci | 19 | 0 |
utf8_unicode_ci | 19 | 0 |
エラーあり
show warnings;
Level Code Message --
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
Warning 1292 Truncated incorrect DOUBLE value: 'A'
num like '1%'
table | id | str |
utf8_bin | 20 | 1 |
utf8_bin | 22 | 100 |
utf8_general_ci | 20 | 1 |
utf8_general_ci | 22 | 100 |
utf8_unicode_ci | 20 | 1 |
utf8_unicode_ci | 22 | 100 |
エラー無し
num like 'a%'
マッチせず
エラー無し
思ったこと
utf8_bin / utf8_general_ciはそんなに違いは無いのね。
大文字・小文字を判別するかしないか、くらいっぽい。
utf8_unicode_ciはヒトが考えるマッチ条件に似ているかな。
全半角をゴチャッと検索できるのはうれしいかも。システム的には厳密じゃないけど。
理解した上で使い分けていと思わぬバグを生産しそうだけど、
とっても便利だという事がわかりました。
0 件のコメント:
コメントを投稿