# coding: utf-8
# frozen_string_literal: false

# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
|
2014-10-22 13:00:04 +04:00
|
|
|
|
|
|
|
require 'test/unit'
|
2014-11-10 09:56:41 +03:00
|
|
|
require 'unicode_normalize/normalize'
|
2014-10-22 13:00:04 +04:00
|
|
|
|
2014-11-06 15:45:15 +03:00
|
|
|
# Tests for String#unicode_normalize, #unicode_normalize! and
# #unicode_normalized?.
#
# This opening of the class only sets up the location of the Unicode data
# files; the test methods themselves are added when the class is reopened.
class TestUnicodeNormalize < Test::Unit::TestCase
  # Unicode version as reported by RbConfig for the running Ruby.
  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']

  # Version-specific data directory, resolved relative to this file.
  path = File.expand_path("../enc/unicode/data/#{UNICODE_VERSION}", __dir__)

  # Use the "ucd" subdirectory when it exists, otherwise the version
  # directory itself.
  UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path

  # Returns the absolute path of "<basename>.txt" inside UNICODE_DATA_PATH.
  #
  # basename:: name of a data file without its ".txt" extension
  def self.expand_filename(basename)
    File.expand_path("#{basename}.txt", UNICODE_DATA_PATH)
  end
end
|
2014-11-06 15:45:15 +03:00
|
|
|
|
2016-06-28 09:56:50 +03:00
|
|
|
# The data-driven tests below are only defined when every required Unicode
# data file exists. `and` is used deliberately as control flow: the class body
# on its right-hand side is evaluated only if the check on the left is true.
%w[NormalizationTest].all? {|f|
  File.exist?(TestUnicodeNormalize.expand_filename(f))
} and
class TestUnicodeNormalize
  # One data record of NormalizationTest.txt: the first five ';'-separated
  # columns decoded to strings, plus the line number used in failure messages.
  NormTest = Struct.new :source, :NFC, :NFD, :NFKC, :NFKD, :line

  # Reads NormalizationTest.txt and returns an Array of NormTest records.
  #
  # As a side effect, defines the instance method
  # "test_0_normalizationtest_firstline", which asserts that the first line of
  # the file mentions "NormalizationTest-<UNICODE_VERSION>.txt".
  #
  # Lines starting with '#' or '@' and blank lines are skipped. Raises
  # whatever File.readlines raises when the file cannot be read.
  def self.read_tests
    lines = File.readlines(expand_filename('NormalizationTest'), encoding: 'utf-8')
    firstline = lines.shift
    define_method "test_0_normalizationtest_firstline" do
      assert_include(firstline, "NormalizationTest-#{UNICODE_VERSION}.txt")
    end
    # The header line was shifted off above, so the first remaining entry is
    # line 2 of the file; numbering from 2 keeps :line equal to the real line.
    lines
      .each.with_index(2)
      .reject { |linedata, _| linedata.start_with?('#', '@') || linedata.strip.empty? }
      .collect do |linedata, linenumber|
        # Each column is a whitespace-separated list of hexadecimal code points.
        columns = linedata.split(';').take(5).collect do |code_string|
          code_string.split(/\s/).collect { |cp| cp.to_i(16) }.pack('U*')
        end
        NormTest.new(*columns, linenumber)
      end
  end

  # Returns the code points of +string+ as an Array of uppercase hexadecimal
  # strings, left-padded with '0' to at least four digits.
  def to_codepoints(string)
    string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') }
  end

  # Load the records once; if the file is missing, fall back to an empty list
  # so the generated tests iterate over nothing.
  begin
    @@tests ||= read_tests
  rescue Errno::ENOENT
    @@tests ||= []
  end

  # Defines "test_normalize_to_<target>_from_<source>_with_<normalization>".
  #
  # The generated test walks all records and asserts that
  # record[source].unicode_normalize(normalization) equals record[target].
  #
  # target::        NormTest column holding the expected result
  # normalization:: form passed to String#unicode_normalize (:nfc, :nfd, ...)
  # source::        NormTest column holding the input
  # prechecked::    nil, or a NormTest column; when given, a record is only
  #                 checked if record[prechecked] == record[source]
  def self.generate_test_normalize(target, normalization, source, prechecked)
    define_method "test_normalize_to_#{target}_from_#{source}_with_#{normalization}" do
      expected = actual = test = nil
      # Built lazily: the proc reads the variables above at failure time.
      mesg = proc {"#{to_codepoints(expected)} expected but was #{to_codepoints(actual)} on line #{test[:line]} (#{normalization})"}
      @@tests.each do |t|
        test = t
        if prechecked.nil? or test[prechecked]==test[source]
          expected = test[target]
          actual = test[source].unicode_normalize(normalization)
          assert_equal expected, actual, mesg
        end
      end
    end
  end

  # source; NFC; NFD; NFKC; NFKD
  # NFC
  #   :NFC  == toNFC(:source) == toNFC(:NFC) == toNFC(:NFD)
  generate_test_normalize :NFC, :nfc, :source, nil
  generate_test_normalize :NFC, :nfc, :NFC, :source
  generate_test_normalize :NFC, :nfc, :NFD, :source
  #   :NFKC == toNFC(:NFKC) == toNFC(:NFKD)
  generate_test_normalize :NFKC, :nfc, :NFKC, nil
  generate_test_normalize :NFKC, :nfc, :NFKD, :NFKC
  #
  # NFD
  #   :NFD  == toNFD(:source) == toNFD(:NFC) == toNFD(:NFD)
  generate_test_normalize :NFD, :nfd, :source, nil
  generate_test_normalize :NFD, :nfd, :NFC, :source
  generate_test_normalize :NFD, :nfd, :NFD, :source
  #   :NFKD == toNFD(:NFKC) == toNFD(:NFKD)
  generate_test_normalize :NFKD, :nfd, :NFKC, nil
  generate_test_normalize :NFKD, :nfd, :NFKD, :NFKC
  #
  # NFKC
  #   :NFKC == toNFKC(:source) == toNFKC(:NFC) == toNFKC(:NFD) == toNFKC(:NFKC) == toNFKC(:NFKD)
  generate_test_normalize :NFKC, :nfkc, :source, nil
  generate_test_normalize :NFKC, :nfkc, :NFC, :source
  generate_test_normalize :NFKC, :nfkc, :NFD, :source
  generate_test_normalize :NFKC, :nfkc, :NFKC, :NFC
  generate_test_normalize :NFKC, :nfkc, :NFKD, :NFD
  #
  # NFKD
  #   :NFKD == toNFKD(:source) == toNFKD(:NFC) == toNFKD(:NFD) == toNFKD(:NFKC) == toNFKD(:NFKD)
  generate_test_normalize :NFKD, :nfkd, :source, nil
  generate_test_normalize :NFKD, :nfkd, :NFC, :source
  generate_test_normalize :NFKD, :nfkd, :NFD, :source
  generate_test_normalize :NFKD, :nfkd, :NFKC, :NFC
  generate_test_normalize :NFKD, :nfkd, :NFKD, :NFD

  # Defines "test_check_true_<source>_as_<normalization>".
  #
  # The generated test asserts, for every record, that
  # record[source].unicode_normalized?(normalization) is true.
  def self.generate_test_check_true(source, normalization)
    define_method "test_check_true_#{source}_as_#{normalization}" do
      test = nil
      mesg = proc {"#{to_codepoints(test[source])} should check as #{normalization} but does not on line #{test[:line]}"}
      @@tests.each do |t|
        test = t
        actual = test[source].unicode_normalized?(normalization)
        assert_equal true, actual, mesg
      end
    end
  end

  # Defines "test_check_false_<source>_as_<normalization>".
  #
  # The generated test asserts that
  # record[source].unicode_normalized?(normalization) is false, but only for
  # records where record[source] differs from record[compare].
  def self.generate_test_check_false(source, compare, normalization)
    define_method "test_check_false_#{source}_as_#{normalization}" do
      test = nil
      mesg = proc {"#{to_codepoints(test[source])} should not check as #{normalization} but does on line #{test[:line]}"}
      @@tests.each do |t|
        test = t
        if test[source] != test[compare]
          actual = test[source].unicode_normalized?(normalization)
          assert_equal false, actual, mesg
        end
      end
    end
  end

  generate_test_check_true :NFC, :nfc
  generate_test_check_true :NFD, :nfd
  generate_test_check_true :NFKC, :nfc
  generate_test_check_true :NFKC, :nfkc
  generate_test_check_true :NFKD, :nfd
  generate_test_check_true :NFKD, :nfkd

  generate_test_check_false :source, :NFD, :nfd
  generate_test_check_false :NFC, :NFD, :nfd
  generate_test_check_false :NFKC, :NFKD, :nfd
  generate_test_check_false :source, :NFC, :nfc
  generate_test_check_false :NFD, :NFC, :nfc
  generate_test_check_false :NFKD, :NFKC, :nfc
  generate_test_check_false :source, :NFKD, :nfkd
  generate_test_check_false :NFC, :NFKD, :nfkd
  generate_test_check_false :NFD, :NFKD, :nfkd
  generate_test_check_false :NFKC, :NFKD, :nfkd
  generate_test_check_false :source, :NFKC, :nfkc
  generate_test_check_false :NFC, :NFKC, :nfkc
  generate_test_check_false :NFD, :NFKC, :nfkc
  generate_test_check_false :NFKD, :NFKC, :nfkc
end
|
2014-10-22 13:00:04 +04:00
|
|
|
|
2018-07-28 12:11:13 +03:00
|
|
|
# Hand-written tests for String#unicode_normalize, #unicode_normalize! and
# #unicode_normalized? that do not depend on the Unicode data files.
class TestUnicodeNormalize
  # Normalization of a string in a Unicode encoding other than UTF-8.
  def test_non_UTF_8
    assert_equal "\u1E0A".encode('UTF-16BE'), "D\u0307".encode('UTF-16BE').unicode_normalize(:nfc)
    assert_equal true, "\u1E0A".encode('UTF-16BE').unicode_normalized?(:nfc)
    assert_equal false, "D\u0307".encode('UTF-16BE').unicode_normalized?(:nfc)
  end

  # U+212A followed by U+0327 is expected to compose to U+0136 under NFC.
  def test_singleton_with_accents
    assert_equal "\u0136", "\u212A\u0327".unicode_normalize(:nfc)
  end

  # U+AC00 followed by U+11A8 is expected to compose to U+AC01 under NFC.
  def test_partial_jamo_compose
    assert_equal "\uAC01", "\uAC00\u11A8".unicode_normalize(:nfc)
  end

  # The same input is expected to decompose to three jamo under NFD.
  def test_partial_jamo_decompose
    assert_equal "\u1100\u1161\u11A8", "\uAC00\u11A8".unicode_normalize(:nfd)
  end

  # preventive tests for (non-)bug #14934
  def test_no_trailing_jamo
    assert_equal "\u1100\u1176\u11a8", "\u1100\u1176\u11a8".unicode_normalize(:nfc)
    assert_equal "\uae30\u11a7", "\u1100\u1175\u11a7".unicode_normalize(:nfc)
    assert_equal "\uae30\u11c3", "\u1100\u1175\u11c3".unicode_normalize(:nfc)
  end

  # Accents following Hangul: U+0300 U+0323 is expected to come out as
  # U+0323 U+0300 in both NFC and NFD.
  def test_hangul_plus_accents
    assert_equal "\uAC00\u0323\u0300", "\uAC00\u0300\u0323".unicode_normalize(:nfc)
    assert_equal "\uAC00\u0323\u0300", "\u1100\u1161\u0300\u0323".unicode_normalize(:nfc)
    assert_equal "\u1100\u1161\u0323\u0300", "\uAC00\u0300\u0323".unicode_normalize(:nfd)
    assert_equal "\u1100\u1161\u0323\u0300", "\u1100\u1161\u0300\u0323".unicode_normalize(:nfd)
  end

  # Strings tagged ISO-8859-1 must be rejected with Encoding::CompatibilityError.
  # The literal is dup'ed before force_encoding so the test does not rely on
  # string literals being mutable.
  def test_raise_exception_for_non_unicode_encoding
    assert_raise(Encoding::CompatibilityError) { "abc".dup.force_encoding('ISO-8859-1').unicode_normalize }
    assert_raise(Encoding::CompatibilityError) { "abc".dup.force_encoding('ISO-8859-1').unicode_normalize! }
    assert_raise(Encoding::CompatibilityError) { "abc".dup.force_encoding('ISO-8859-1').unicode_normalized? }
  end

  # U+32FF is expected to become U+4EE4 U+548C under NFKC.
  def test_reiwa
    assert_equal "\u4EE4\u548C", "\u32FF".unicode_normalize(:nfkc)
  end

  # A US-ASCII string is expected to be unchanged by, and to check as
  # normalized under, the default form and :nfd, :nfkc, :nfkd.
  def test_us_ascii
    ascii_string = 'abc'.encode('US-ASCII')

    assert_equal ascii_string, ascii_string.unicode_normalize
    assert_equal ascii_string, ascii_string.unicode_normalize(:nfd)
    assert_equal ascii_string, ascii_string.unicode_normalize(:nfkc)
    assert_equal ascii_string, ascii_string.unicode_normalize(:nfkd)

    # dup so the in-place variants never touch the shared fixture string
    assert_equal ascii_string, ascii_string.dup.unicode_normalize!
    assert_equal ascii_string, ascii_string.dup.unicode_normalize!(:nfd)
    assert_equal ascii_string, ascii_string.dup.unicode_normalize!(:nfkc)
    assert_equal ascii_string, ascii_string.dup.unicode_normalize!(:nfkd)

    assert_equal true, ascii_string.unicode_normalized?
    assert_equal true, ascii_string.unicode_normalized?(:nfd)
    assert_equal true, ascii_string.unicode_normalized?(:nfkc)
    assert_equal true, ascii_string.unicode_normalized?(:nfkd)
  end
end
|