From e061b5613ea3b237941b78364119b055e6febaaa Mon Sep 17 00:00:00 2001
From: Jelmer
Date: Sun, 16 Jan 2022 10:26:40 +0000
Subject: [PATCH] Treat most HTML elements as word-breaking (#286)
---
.gitignore | 2 +-
bergamot-translator-tests | 2 +-
src/tests/units/html_tests.cpp | 328 +++++++++++++++++++----
src/translator/html.cpp | 474 ++++++++++++++++++---------------
src/translator/html.h | 67 ++++-
5 files changed, 595 insertions(+), 278 deletions(-)
diff --git a/.gitignore b/.gitignore
index 49093ba..64c1aa3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,7 +19,7 @@ _deps
wasm/test_page/node_modules
build-wasm
models
-wasm/test_page/bergamot-translator-worker.*
+wasm/test_page/js/bergamot-translator-worker.*
# VSCode
.vscode
diff --git a/bergamot-translator-tests b/bergamot-translator-tests
index 332e976..b46987e 160000
--- a/bergamot-translator-tests
+++ b/bergamot-translator-tests
@@ -1 +1 @@
-Subproject commit 332e976df4583793a09b6483b80b972621fcfadb
+Subproject commit b46987e96fc27b7e9488fbc36b53c07e1786784c
diff --git a/src/tests/units/html_tests.cpp b/src/tests/units/html_tests.cpp
index 48af706..d1a604d 100644
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@@ -169,24 +169,136 @@ TEST_CASE("Test case html entities") {
// These are all entities I would expect in innerHTML, since all other entities
// can be encoded as UTF-8 so there's no need to encode them through &...; when
// innerHTML encodes the DOM as HTML.
- std::string input("This is a sentence <with> named & entities
\n");
+ std::string input("This is a sentence <with> named & entities
");
HTML html(std::move(input), true);
- CHECK(input == "This is a sentence named & entities\n");
+ CHECK(input == "This is a sentence named & entities");
}
-TEST_CASE("Test self-closing tags should be treated as spaces") {
- std::string input("Space
please?
\n");
+TEST_CASE("Test self-closing tags should be treated as paragraph break") {
+ std::string test_str("Space
please?
");
+ std::string input(test_str);
HTML html(std::move(input), true);
- CHECK(input == "Space please?\n");
+ CHECK(input == "Space\n\nplease?");
+
+ Response response;
+ std::string source_str("Space\n\nplease?");
+ std::vector source_tokens{
+ string_view(source_str.data() + 0, 5), // Space
+ string_view(source_str.data() + 5, 0), // [EOS]
+ string_view(source_str.data() + 5, 2), // \n\n
+ string_view(source_str.data() + 7, 1), // p
+ string_view(source_str.data() + 8, 5), // lease
+ string_view(source_str.data() + 13, 1), // ?
+ string_view(source_str.data() + 14, 0), // EOS
+ };
+ response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2);
+ response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end());
+
+ std::string target_str("Platz\n\nbitte?");
+ std::vector target_tokens{
+ string_view(target_str.data() + 0, 5), // Platz
+ string_view(target_str.data() + 5, 0), // [EOS]
+ string_view(target_str.data() + 5, 2), // \n\n
+ string_view(target_str.data() + 7, 5), // bitte
+ string_view(target_str.data() + 12, 1), // ?
+ string_view(target_str.data() + 13, 0), // [EOS]
+ };
+ response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2);
+ response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end());
+ response.alignments = {{
+ {1.0, 0.0}, // Platz <- Space
+ {0.0, 1.0} // [EOS] <- [EOS]
+ },
+ {
+ {0.1, 0.9, 0.0, 0.0}, // _bitte <- _p + lease
+ {0.0, 0.0, 1.0, 0.0}, // ? <- ?
+ {0.0, 0.0, 0.0, 1.0}, // [EOS] <- [EOS]
+ }};
+
+ // Main focus of this test is that the space that was introduced in the text
+ // that was being translated does not end up in the translation.
+ html.restore(response);
+ CHECK(response.source.text == "Space
please?
");
+ CHECK(response.target.text == "Platz
bitte?
");
+}
+
+TEST_CASE("Test inline tags should be treated as spaces") {
+ std::string test_str("underline");
+
+ std::string input(test_str);
+ HTML html(std::move(input), true);
+ CHECK(input == "un der line");
+
+ Response response;
+ std::string source_str("un der line");
+ std::vector source_tokens{
+ string_view(source_str.data() + 0, 2), // un
+ string_view(source_str.data() + 2, 3), // _de
+ string_view(source_str.data() + 5, 1), // r
+ string_view(source_str.data() + 6, 5), // _line
+ string_view(source_str.data() + 11, 0), // EOS
+ };
+ response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
+
+ std::string target_str("una linea der");
+ std::vector target_tokens{
+ string_view(target_str.data() + 0, 3), // una
+ string_view(target_str.data() + 3, 6), // _linéa
+ string_view(target_str.data() + 9, 3), // _de
+ string_view(target_str.data() + 12, 1), // r
+ string_view(target_str.data() + 13, 0), // [EOS]
+ };
+ response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
+
+ response.alignments = {{{0.9795, 0.0127, 0.0002, 0.0066, 0.0009},
+ {0.0098, 0.2967, 0.0156, 0.6640, 0.0138},
+ {0.0214, 0.7472, 0.0626, 0.0745, 0.0943},
+ {0.0022, 0.0230, 0.9357, 0.0165, 0.0226},
+ {0.0122, 0.0240, 0.0085, 0.7427, 0.2125}}};
+
+ html.restore(response);
+ CHECK(response.source.text == "un der line"); // TODO leave spaces?
+ CHECK(response.target.text == "una linea der");
+}
+
+TEST_CASE("Test inline tags should not break words") {
+ std::string test_str("underline");
+
+ std::string input(test_str);
+ HTML::Options options;
+ options.substituteInlineTagsWithSpaces = false;
+ HTML html(std::move(input), true, std::move(options));
+ CHECK(input == "underline");
+
+ Response response;
+ std::string source_str("underline");
+ std::vector source_tokens{
+ string_view(source_str.data() + 0, 9), // underline
+ string_view(source_str.data() + 9, 0), // EOS
+ };
+ response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
+
+ std::string target_str("subrayar");
+ std::vector target_tokens{
+ string_view(target_str.data() + 0, 8), // subrayar
+ string_view(target_str.data() + 8, 0), // [EOS]
+ };
+ response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
+
+ response.alignments = {identity_matrix(2)};
+
+ html.restore(response);
+ CHECK(response.source.text == "underline"); // TODO not spread to whole word?
+ CHECK(response.target.text == "subrayar"); // TODO not spread to the whole word?
}
TEST_CASE("Test reconstruction of target sentence") {
std::string input("hello world
\n");
HTML html(std::move(input), true);
- CHECK(input == "hello world\n");
+ CHECK(input == "hello world\n\n\n"); // triple \n because \n +
- AnnotatedText source("hello world\n");
+ AnnotatedText source("hello world\n\n\n");
recordSentenceFromByteRange(source, {
ByteRange{0, 4}, // 0.0 "hell"
ByteRange{4, 5}, // 0.1 "o"
@@ -194,7 +306,7 @@ TEST_CASE("Test reconstruction of target sentence") {
ByteRange{11, 11} // 0.3 ""
});
- AnnotatedText target("hallo Welt\n");
+ AnnotatedText target("hallo Welt\n\n\n");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@@ -218,11 +330,11 @@ TEST_CASE("Test reconstruction of target sentence") {
}
TEST_CASE("Test reconstruction of target sentence with entities") {
- std::string input("hello world & friends!
\n");
+ std::string input("hello world & friends!
");
HTML html(std::move(input), true);
- CHECK(input == "hello world & friends!\n");
+ CHECK(input == "hello world & friends!");
- AnnotatedText source("hello world & friends!\n");
+ AnnotatedText source("hello world & friends!");
recordSentenceFromByteRange(source, {
ByteRange{0, 4}, // 0.0 "hell"
ByteRange{4, 5}, // 0.1 "o"
@@ -233,7 +345,7 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
ByteRange{22, 22} // 0.6 ""
});
- AnnotatedText target("hallo Welt & Freunde!\n");
+ AnnotatedText target("hallo Welt & Freunde!");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@@ -252,11 +364,11 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
html.restore(response);
std::vector html_tokens_source{"", "hell", "o", " world", " &",
- " friends", "!", "", "
\n"};
+ " friends", "!", "", ""};
- std::vector html_tokens_target{"", "hall", "o", " Welt", " &",
+ std::vector html_tokens_target{"", "hall", "o", " Welt", " &",
- " Freunde", "!", "", "
\n"};
+ " Freunde", "!", "", "
"};
CHECK(asTokens(response.source) == html_tokens_source);
CHECK(asTokens(response.target) == html_tokens_target);
@@ -264,10 +376,10 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
TEST_CASE("Test reconstruction of target with multiple sentences") {
std::string input(
- "hello world! How does this deal with multiple sentences? Will it work?
\n");
+ "hello world! How does this deal with multiple sentences? Will it work?
");
HTML html(std::move(input), true);
- AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?\n");
+ AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?");
CHECK(source.text == input);
recordSentenceFromByteRange(source, {
@@ -297,7 +409,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
ByteRange{71, 71} // 2.4 ""
});
- AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?\n");
+ AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?");
recordSentenceFromByteRange(target, {
ByteRange{0, 4}, // 0.0 "hall"
ByteRange{4, 5}, // 0.1 "o"
@@ -327,7 +439,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
std::vector text_tokens_source{
"", "hall", "o", " Welt", "!", "", " ", "Wie", " geht", " das", " mit", " mehreren",
- " Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", "\n"};
+ " Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", ""};
CHECK(asTokens(target) == text_tokens_source);
@@ -360,26 +472,56 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
" work",
"?",
"",
- "\n"};
+ ""};
CHECK(asTokens(response.source) == html_tokens_source);
}
TEST_CASE("Test self-closing tag (HTML5)") {
- std::string input("hello world and other creatures
\n");
+ std::string input("hello world and other creatures
");
HTML html(std::move(input), true);
- CHECK(input == "hello world and other creatures\n"); // Note double space between "hello" and "world"
+ CHECK(input == "hello world and other creatures"); // Note double space between "hello" and "world"
}
-TEST_CASE("Test empty self-closing tag at end of input") {
+TEST_CASE("Test empty void tag at end of input") {
std::string input("hello
");
HTML html(std::move(input), true);
CHECK(input == "hello ");
+
+ Response response;
+ std::string sentence_str("hello ");
+ std::vector sentence{
+ string_view(sentence_str.data() + 0, 4), // 0.0 hell
+ string_view(sentence_str.data() + 4, 2), // 0.1 o_
+ string_view(sentence_str.data() + 6, 0), // 0.2 [EOS]
+ };
+ response.source.appendSentence("", sentence.begin(), sentence.end());
+ response.target.appendSentence("", sentence.begin(), sentence.end());
+ response.alignments = {identity_matrix(3)};
+
+ html.restore(response);
+ CHECK(response.source.text == "hello
");
+ CHECK(response.target.text == "hello
");
}
TEST_CASE("Test empty tag pair at end of input") {
std::string input("hello ");
HTML html(std::move(input), true);
CHECK(input == "hello ");
+
+ Response response;
+ std::string sentence_str("hello ");
+ std::vector sentence{
+ string_view(sentence_str.data() + 0, 4), // 0.0 hell
+ string_view(sentence_str.data() + 4, 2), // 0.1 o_
+ string_view(sentence_str.data() + 6, 0), // 0.2 [EOS]
+ };
+ response.source.appendSentence("", sentence.begin(), sentence.end());
+ response.target.appendSentence("", sentence.begin(), sentence.end());
+ response.alignments = {identity_matrix(3)};
+
+ html.restore(response);
+ CHECK(response.source.text == "hello ");
+ CHECK(response.target.text == "hello ");
}
TEST_CASE("Test empty self-closing pair at end of input in parent") {
@@ -391,11 +533,11 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") {
TEST_CASE("Test empty tag") {
std::string test_str(
"hello world
\n");
+ "id=\"1.2.3\">world");
std::string input(test_str);
HTML html(std::move(input), true);
- CHECK(input == "hello world\n");
+ CHECK(input == "hello world");
Response response;
@@ -407,11 +549,7 @@ TEST_CASE("Test empty tag") {
string_view(sentence_str.data() + 11, 0), // 0.3 ""
};
response.source.appendSentence("", sentence.begin(), sentence.end());
- response.source.appendEndingWhitespace("\n");
-
response.target.appendSentence("", sentence.begin(), sentence.end());
- response.target.appendEndingWhitespace("\n");
-
response.alignments = {identity_matrix(4)};
html.restore(response);
@@ -424,19 +562,20 @@ TEST_CASE("Test