From e061b5613ea3b237941b78364119b055e6febaaa Mon Sep 17 00:00:00 2001 From: Jelmer Date: Sun, 16 Jan 2022 10:26:40 +0000 Subject: [PATCH] Treat most HTML elements as word-breaking (#286) --- .gitignore | 2 +- bergamot-translator-tests | 2 +- src/tests/units/html_tests.cpp | 328 +++++++++++++++++++---- src/translator/html.cpp | 474 ++++++++++++++++++--------------- src/translator/html.h | 67 ++++- 5 files changed, 595 insertions(+), 278 deletions(-) diff --git a/.gitignore b/.gitignore index 49093ba..64c1aa3 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,7 @@ _deps wasm/test_page/node_modules build-wasm models -wasm/test_page/bergamot-translator-worker.* +wasm/test_page/js/bergamot-translator-worker.* # VSCode .vscode diff --git a/bergamot-translator-tests b/bergamot-translator-tests index 332e976..b46987e 160000 --- a/bergamot-translator-tests +++ b/bergamot-translator-tests @@ -1 +1 @@ -Subproject commit 332e976df4583793a09b6483b80b972621fcfadb +Subproject commit b46987e96fc27b7e9488fbc36b53c07e1786784c diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp index 48af706..d1a604d 100644 --- a/src/tests/units/html_tests.cpp +++ b/src/tests/units/html_tests.cpp @@ -169,24 +169,136 @@ TEST_CASE("Test case html entities") { // These are all entities I would expect in innerHTML, since all other entities // can be encoded as UTF-8 so there's no need to encode them through &...; when // innerHTML encodes the DOM as HTML. - std::string input("

This is a sentence <with> named & entities

\n"); + std::string input("

This is a sentence <with> named & entities

"); HTML html(std::move(input), true); - CHECK(input == "This is a sentence named & entities\n"); + CHECK(input == "This is a sentence named & entities"); } -TEST_CASE("Test self-closing tags should be treated as spaces") { - std::string input("

Space
please?

\n"); +TEST_CASE("Test self-closing tags should be treated as paragraph break") { + std::string test_str("

Space
please?

"); + std::string input(test_str); HTML html(std::move(input), true); - CHECK(input == "Space please?\n"); + CHECK(input == "Space\n\nplease?"); + + Response response; + std::string source_str("Space\n\nplease?"); + std::vector source_tokens{ + string_view(source_str.data() + 0, 5), // Space + string_view(source_str.data() + 5, 0), // [EOS] + string_view(source_str.data() + 5, 2), // \n\n + string_view(source_str.data() + 7, 1), // p + string_view(source_str.data() + 8, 5), // lease + string_view(source_str.data() + 13, 1), // ? + string_view(source_str.data() + 14, 0), // EOS + }; + response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2); + response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end()); + + std::string target_str("Platz\n\nbitte?"); + std::vector target_tokens{ + string_view(target_str.data() + 0, 5), // Platz + string_view(target_str.data() + 5, 0), // [EOS] + string_view(target_str.data() + 5, 2), // \n\n + string_view(target_str.data() + 7, 5), // bitte + string_view(target_str.data() + 12, 1), // ? + string_view(target_str.data() + 13, 0), // [EOS] + }; + response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2); + response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end()); + response.alignments = {{ + {1.0, 0.0}, // Platz <- Space + {0.0, 1.0} // [EOS] <- [EOS] + }, + { + {0.1, 0.9, 0.0, 0.0}, // _bitte <- _p + lease + {0.0, 0.0, 1.0, 0.0}, // ? <- ? + {0.0, 0.0, 0.0, 1.0}, // [EOS] <- [EOS] + }}; + + // Main focus of this test is that the space that was introduced in the text + // that was being translated does not end up in the translation. + html.restore(response); + CHECK(response.source.text == "

Space
please?

"); + CHECK(response.target.text == "

Platz
bitte?

"); +} + +TEST_CASE("Test inline tags should be treated as spaces") { + std::string test_str("underline"); + + std::string input(test_str); + HTML html(std::move(input), true); + CHECK(input == "un der line"); + + Response response; + std::string source_str("un der line"); + std::vector source_tokens{ + string_view(source_str.data() + 0, 2), // un + string_view(source_str.data() + 2, 3), // _de + string_view(source_str.data() + 5, 1), // r + string_view(source_str.data() + 6, 5), // _line + string_view(source_str.data() + 11, 0), // EOS + }; + response.source.appendSentence("", source_tokens.begin(), source_tokens.end()); + + std::string target_str("una linea der"); + std::vector target_tokens{ + string_view(target_str.data() + 0, 3), // una + string_view(target_str.data() + 3, 6), // _linéa + string_view(target_str.data() + 9, 3), // _de + string_view(target_str.data() + 12, 1), // r + string_view(target_str.data() + 13, 0), // [EOS] + }; + response.target.appendSentence("", target_tokens.begin(), target_tokens.end()); + + response.alignments = {{{0.9795, 0.0127, 0.0002, 0.0066, 0.0009}, + {0.0098, 0.2967, 0.0156, 0.6640, 0.0138}, + {0.0214, 0.7472, 0.0626, 0.0745, 0.0943}, + {0.0022, 0.0230, 0.9357, 0.0165, 0.0226}, + {0.0122, 0.0240, 0.0085, 0.7427, 0.2125}}}; + + html.restore(response); + CHECK(response.source.text == "un der line"); // TODO leave spaces? 
+ CHECK(response.target.text == "una linea der"); +} + +TEST_CASE("Test inline tags should not break words") { + std::string test_str("underline"); + + std::string input(test_str); + HTML::Options options; + options.substituteInlineTagsWithSpaces = false; + HTML html(std::move(input), true, std::move(options)); + CHECK(input == "underline"); + + Response response; + std::string source_str("underline"); + std::vector source_tokens{ + string_view(source_str.data() + 0, 9), // underline + string_view(source_str.data() + 9, 0), // EOS + }; + response.source.appendSentence("", source_tokens.begin(), source_tokens.end()); + + std::string target_str("subrayar"); + std::vector target_tokens{ + string_view(target_str.data() + 0, 8), // subrayar + string_view(target_str.data() + 8, 0), // [EOS] + }; + response.target.appendSentence("", target_tokens.begin(), target_tokens.end()); + + response.alignments = {identity_matrix(2)}; + + html.restore(response); + CHECK(response.source.text == "underline"); // TODO not spread to whole word? + CHECK(response.target.text == "subrayar"); // TODO not spread to the whole word? } TEST_CASE("Test reconstruction of target sentence") { std::string input("

hello world

\n"); HTML html(std::move(input), true); - CHECK(input == "hello world\n"); + CHECK(input == "hello world\n\n\n"); // triple \n because \n + 

- AnnotatedText source("hello world\n"); + AnnotatedText source("hello world\n\n\n"); recordSentenceFromByteRange(source, { ByteRange{0, 4}, // 0.0 "hell" ByteRange{4, 5}, // 0.1 "o" @@ -194,7 +306,7 @@ TEST_CASE("Test reconstruction of target sentence") { ByteRange{11, 11} // 0.3 "" }); - AnnotatedText target("hallo Welt\n"); + AnnotatedText target("hallo Welt\n\n\n"); recordSentenceFromByteRange(target, { ByteRange{0, 4}, // 0.0 "hall" ByteRange{4, 5}, // 0.1 "o" @@ -218,11 +330,11 @@ TEST_CASE("Test reconstruction of target sentence") { } TEST_CASE("Test reconstruction of target sentence with entities") { - std::string input("

hello world & friends!

\n"); + std::string input("

hello world & friends!

"); HTML html(std::move(input), true); - CHECK(input == "hello world & friends!\n"); + CHECK(input == "hello world & friends!"); - AnnotatedText source("hello world & friends!\n"); + AnnotatedText source("hello world & friends!"); recordSentenceFromByteRange(source, { ByteRange{0, 4}, // 0.0 "hell" ByteRange{4, 5}, // 0.1 "o" @@ -233,7 +345,7 @@ TEST_CASE("Test reconstruction of target sentence with entities") { ByteRange{22, 22} // 0.6 "" }); - AnnotatedText target("hallo Welt & Freunde!\n"); + AnnotatedText target("hallo Welt & Freunde!"); recordSentenceFromByteRange(target, { ByteRange{0, 4}, // 0.0 "hall" ByteRange{4, 5}, // 0.1 "o" @@ -252,11 +364,11 @@ TEST_CASE("Test reconstruction of target sentence with entities") { html.restore(response); std::vector html_tokens_source{"", "

hell", "o", " world", " &", - " friends", "!", "", "

\n"}; + " friends", "!", "", "

"}; - std::vector html_tokens_target{"", "

hall", "o", " Welt", " &", + std::vector html_tokens_target{"", "

hall", "o", " Welt", " &", - " Freunde", "!", "", "

\n"}; + " Freunde", "!", "", "

"}; CHECK(asTokens(response.source) == html_tokens_source); CHECK(asTokens(response.target) == html_tokens_target); @@ -264,10 +376,10 @@ TEST_CASE("Test reconstruction of target sentence with entities") { TEST_CASE("Test reconstruction of target with multiple sentences") { std::string input( - "

hello world! How does this deal with multiple sentences? Will it work?

\n"); + "

hello world! How does this deal with multiple sentences? Will it work?

"); HTML html(std::move(input), true); - AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?\n"); + AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?"); CHECK(source.text == input); recordSentenceFromByteRange(source, { @@ -297,7 +409,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") { ByteRange{71, 71} // 2.4 "" }); - AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?\n"); + AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?"); recordSentenceFromByteRange(target, { ByteRange{0, 4}, // 0.0 "hall" ByteRange{4, 5}, // 0.1 "o" @@ -327,7 +439,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") { std::vector text_tokens_source{ "", "hall", "o", " Welt", "!", "", " ", "Wie", " geht", " das", " mit", " mehreren", - " Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", "\n"}; + " Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", ""}; CHECK(asTokens(target) == text_tokens_source); @@ -360,26 +472,56 @@ TEST_CASE("Test reconstruction of target with multiple sentences") { " work", "?", "", - "

\n"}; + "

"}; CHECK(asTokens(response.source) == html_tokens_source); } TEST_CASE("Test self-closing tag (HTML5)") { - std::string input("

hello world and other creatures

\n"); + std::string input("

hello world and other creatures

"); HTML html(std::move(input), true); - CHECK(input == "hello world and other creatures\n"); // Note double space between "hello" and "world" + CHECK(input == "hello world and other creatures"); // Note double space between "hello" and "world" } -TEST_CASE("Test empty self-closing tag at end of input") { +TEST_CASE("Test empty void tag at end of input") { std::string input("hello
"); HTML html(std::move(input), true); CHECK(input == "hello "); + + Response response; + std::string sentence_str("hello "); + std::vector sentence{ + string_view(sentence_str.data() + 0, 4), // 0.0 hell + string_view(sentence_str.data() + 4, 2), // 0.1 o_ + string_view(sentence_str.data() + 6, 0), // 0.2 [EOS] + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(3)}; + + html.restore(response); + CHECK(response.source.text == "hello
"); + CHECK(response.target.text == "hello
"); } TEST_CASE("Test empty tag pair at end of input") { std::string input("hello "); HTML html(std::move(input), true); CHECK(input == "hello "); + + Response response; + std::string sentence_str("hello "); + std::vector sentence{ + string_view(sentence_str.data() + 0, 4), // 0.0 hell + string_view(sentence_str.data() + 4, 2), // 0.1 o_ + string_view(sentence_str.data() + 6, 0), // 0.2 [EOS] + }; + response.source.appendSentence("", sentence.begin(), sentence.end()); + response.target.appendSentence("", sentence.begin(), sentence.end()); + response.alignments = {identity_matrix(3)}; + + html.restore(response); + CHECK(response.source.text == "hello "); + CHECK(response.target.text == "hello "); } TEST_CASE("Test empty self-closing pair at end of input in parent") { @@ -391,11 +533,11 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") { TEST_CASE("Test empty tag") { std::string test_str( "

hello world

\n"); + "id=\"1.2.3\">world

"); std::string input(test_str); HTML html(std::move(input), true); - CHECK(input == "hello world\n"); + CHECK(input == "hello world"); Response response; @@ -407,11 +549,7 @@ TEST_CASE("Test empty tag") { string_view(sentence_str.data() + 11, 0), // 0.3 "" }; response.source.appendSentence("", sentence.begin(), sentence.end()); - response.source.appendEndingWhitespace("\n"); - response.target.appendSentence("", sentence.begin(), sentence.end()); - response.target.appendEndingWhitespace("\n"); - response.alignments = {identity_matrix(4)}; html.restore(response); @@ -424,19 +562,20 @@ TEST_CASE("Test