Treat most HTML elements as word-breaking (#286)
Parent: 13c55e2693
Commit: e061b5613e
@@ -19,7 +19,7 @@ _deps
wasm/test_page/node_modules
build-wasm
models
wasm/test_page/bergamot-translator-worker.*
wasm/test_page/js/bergamot-translator-worker.*

# VSCode
.vscode

@@ -1 +1 @@
Subproject commit 332e976df4583793a09b6483b80b972621fcfadb
Subproject commit b46987e96fc27b7e9488fbc36b53c07e1786784c
@@ -169,24 +169,136 @@ TEST_CASE("Test case html entities") {
  // These are all entities I would expect in innerHTML, since all other entities
  // can be encoded as UTF-8 so there's no need to encode them through &...; when
  // innerHTML encodes the DOM as HTML.
  std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>\n");
  std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>");
  HTML html(std::move(input), true);
  CHECK(input == "This is a sentence <with> named & entities\n");
  CHECK(input == "This is a sentence <with> named & entities");
}

TEST_CASE("Test self-closing tags should be treated as spaces") {
  std::string input("<p>Space<br>please?</p>\n");
TEST_CASE("Test self-closing tags should be treated as paragraph break") {
  std::string test_str("<p>Space<br>please?</p>");

  std::string input(test_str);
  HTML html(std::move(input), true);
  CHECK(input == "Space please?\n");
  CHECK(input == "Space\n\nplease?");

  Response response;
  std::string source_str("Space\n\nplease?");
  std::vector<string_view> source_tokens{
      string_view(source_str.data() + 0, 5),   // Space
      string_view(source_str.data() + 5, 0),   // [EOS]
      string_view(source_str.data() + 5, 2),   // \n\n
      string_view(source_str.data() + 7, 1),   // p
      string_view(source_str.data() + 8, 5),   // lease
      string_view(source_str.data() + 13, 1),  // ?
      string_view(source_str.data() + 14, 0),  // EOS
  };
  response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2);
  response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end());

  std::string target_str("Platz\n\nbitte?");
  std::vector<string_view> target_tokens{
      string_view(target_str.data() + 0, 5),   // Platz
      string_view(target_str.data() + 5, 0),   // [EOS]
      string_view(target_str.data() + 5, 2),   // \n\n
      string_view(target_str.data() + 7, 5),   // bitte
      string_view(target_str.data() + 12, 1),  // ?
      string_view(target_str.data() + 13, 0),  // [EOS]
  };
  response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2);
  response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end());
  response.alignments = {{
                             {1.0, 0.0},  // Platz <- Space
                             {0.0, 1.0}   // [EOS] <- [EOS]
                         },
                         {
                             {0.1, 0.9, 0.0, 0.0},  // _bitte <- _p + lease
                             {0.0, 0.0, 1.0, 0.0},  // ? <- ?
                             {0.0, 0.0, 0.0, 1.0},  // [EOS] <- [EOS]
                         }};

  // Main focus of this test is that the space that was introduced in the text
  // that was being translated does not end up in the translation.
  html.restore(response);
  CHECK(response.source.text == "<p>Space<br>please?</p>");
  CHECK(response.target.text == "<p>Platz<br>bitte?</p>");
}

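The two tests above pin down the new contract: tags that are not in the inline set (such as `<p>` and `<br>`) become a `\n\n` paragraph break in the plain text handed to the translator, so ssplit in wrapped_text mode starts a new sentence there, while inline tags only guarantee a separating space. A minimal standalone sketch of that substitution rule, ignoring the de-duplication the real scanner loop does; the function name and the trimmed tag set are illustrative, not the library's API:

```cpp
#include <string>
#include <unordered_set>

// Illustrative only: mirrors the substitution the tests above expect.
// Non-inline tags become a paragraph break ("\n\n"), inline tags a space.
std::string breakFor(const std::string &tagName) {
  static const std::unordered_set<std::string> inlineTags{"a", "b", "i", "u", "em", "strong", "span"};
  return inlineTags.count(tagName) ? " " : "\n\n";
}
// breakFor("br") == "\n\n"  -> "Space<br>please?" becomes "Space\n\nplease?"
// breakFor("u")  == " "     -> "un<u>der</u>line"  becomes "un der line"
```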
TEST_CASE("Test inline tags should be treated as spaces") {
|
||||
std::string test_str("un<u>der</u>line");
|
||||
|
||||
std::string input(test_str);
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "un der line");
|
||||
|
||||
Response response;
|
||||
std::string source_str("un der line");
|
||||
std::vector<string_view> source_tokens{
|
||||
string_view(source_str.data() + 0, 2), // un
|
||||
string_view(source_str.data() + 2, 3), // _de
|
||||
string_view(source_str.data() + 5, 1), // r
|
||||
string_view(source_str.data() + 6, 5), // _line
|
||||
string_view(source_str.data() + 11, 0), // EOS
|
||||
};
|
||||
response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
|
||||
|
||||
std::string target_str("una linea der");
|
||||
std::vector<string_view> target_tokens{
|
||||
string_view(target_str.data() + 0, 3), // una
|
||||
string_view(target_str.data() + 3, 6), // _linéa
|
||||
string_view(target_str.data() + 9, 3), // _de
|
||||
string_view(target_str.data() + 12, 1), // r
|
||||
string_view(target_str.data() + 13, 0), // [EOS]
|
||||
};
|
||||
response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
|
||||
|
||||
response.alignments = {{{0.9795, 0.0127, 0.0002, 0.0066, 0.0009},
|
||||
{0.0098, 0.2967, 0.0156, 0.6640, 0.0138},
|
||||
{0.0214, 0.7472, 0.0626, 0.0745, 0.0943},
|
||||
{0.0022, 0.0230, 0.9357, 0.0165, 0.0226},
|
||||
{0.0122, 0.0240, 0.0085, 0.7427, 0.2125}}};
|
||||
|
||||
html.restore(response);
|
||||
CHECK(response.source.text == "un <u>der</u> line"); // TODO leave spaces?
|
||||
CHECK(response.target.text == "una linea <u>der</u>");
|
||||
}
|
||||
|
||||
TEST_CASE("Test inline tags should not break words") {
|
||||
std::string test_str("un<u>der</u>line");
|
||||
|
||||
std::string input(test_str);
|
||||
HTML::Options options;
|
||||
options.substituteInlineTagsWithSpaces = false;
|
||||
HTML html(std::move(input), true, std::move(options));
|
||||
CHECK(input == "underline");
|
||||
|
||||
Response response;
|
||||
std::string source_str("underline");
|
||||
std::vector<string_view> source_tokens{
|
||||
string_view(source_str.data() + 0, 9), // underline
|
||||
string_view(source_str.data() + 9, 0), // EOS
|
||||
};
|
||||
response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
|
||||
|
||||
std::string target_str("subrayar");
|
||||
std::vector<string_view> target_tokens{
|
||||
string_view(target_str.data() + 0, 8), // subrayar
|
||||
string_view(target_str.data() + 8, 0), // [EOS]
|
||||
};
|
||||
response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
|
||||
|
||||
response.alignments = {identity_matrix<float>(2)};
|
||||
|
||||
html.restore(response);
|
||||
CHECK(response.source.text == "<u></u>underline"); // TODO not spread <u> to whole word?
|
||||
CHECK(response.target.text == "<u></u>subrayar"); // TODO not spread <u> to the whole word?
|
||||
}
|
||||
|
||||
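For callers that want the old `un<u>der</u>line` → `underline` behaviour, the test above shows the knob: construct `HTML` with an `Options` struct whose `substituteInlineTagsWithSpaces` flag is disabled. A hedged usage sketch of that call, outside the test harness; only the `HTML`, `Options` and flag names come from this patch, everything else (including the include path) is assumed:

```cpp
#include <string>
#include <utility>

#include "html.h"  // assumed include path; adjust to your build

// Keep inline tags from splitting words, at the cost of their markup being
// attached to the whole word after restore() (see the TODOs in the test above).
marian::bergamot::HTML wrapMarkup(std::string &&page) {
  marian::bergamot::HTML::Options options;
  options.substituteInlineTagsWithSpaces = false;  // "un<u>der</u>line" -> "underline"
  return marian::bergamot::HTML(std::move(page), /*process_markup=*/true, std::move(options));
}
```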
TEST_CASE("Test reconstruction of target sentence") {
|
||||
std::string input("<p>hello <b>world</b></p>\n");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello world\n");
|
||||
CHECK(input == "hello world\n\n\n"); // tripple \n because \n + </p>
|
||||
|
||||
AnnotatedText source("hello world\n");
|
||||
AnnotatedText source("hello world\n\n\n");
|
||||
recordSentenceFromByteRange(source, {
|
||||
ByteRange{0, 4}, // 0.0 "hell"
|
||||
ByteRange{4, 5}, // 0.1 "o"
|
||||
|
@ -194,7 +306,7 @@ TEST_CASE("Test reconstruction of target sentence") {
|
|||
ByteRange{11, 11} // 0.3 ""
|
||||
});
|
||||
|
||||
AnnotatedText target("hallo Welt\n");
|
||||
AnnotatedText target("hallo Welt\n\n\n");
|
||||
recordSentenceFromByteRange(target, {
|
||||
ByteRange{0, 4}, // 0.0 "hall"
|
||||
ByteRange{4, 5}, // 0.1 "o"
|
||||
|
@ -218,11 +330,11 @@ TEST_CASE("Test reconstruction of target sentence") {
|
|||
}
|
||||
|
||||
TEST_CASE("Test reconstruction of target sentence with entities") {
|
||||
std::string input("<p>hello <b>world & friends!</b></p>\n");
|
||||
std::string input("<p>hello <b>world & friends!</b></p>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello world & friends!\n");
|
||||
CHECK(input == "hello world & friends!");
|
||||
|
||||
AnnotatedText source("hello world & friends!\n");
|
||||
AnnotatedText source("hello world & friends!");
|
||||
recordSentenceFromByteRange(source, {
|
||||
ByteRange{0, 4}, // 0.0 "hell"
|
||||
ByteRange{4, 5}, // 0.1 "o"
|
||||
|
@ -233,7 +345,7 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
|
|||
ByteRange{22, 22} // 0.6 ""
|
||||
});
|
||||
|
||||
AnnotatedText target("hallo Welt & Freunde!\n");
|
||||
AnnotatedText target("hallo Welt & Freunde!");
|
||||
recordSentenceFromByteRange(target, {
|
||||
ByteRange{0, 4}, // 0.0 "hall"
|
||||
ByteRange{4, 5}, // 0.1 "o"
|
||||
|
@ -252,11 +364,11 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
|
|||
html.restore(response);
|
||||
|
||||
std::vector<std::string> html_tokens_source{"", "<p>hell", "o", " <b>world", " &",
|
||||
" friends", "!", "", "</b></p>\n"};
|
||||
" friends", "!", "", "</b></p>"};
|
||||
|
||||
std::vector<std::string> html_tokens_target{"", "<p>hall", "o", " <b>Welt", " &",
|
||||
std::vector<std::string> html_tokens_target{"", "<p>hall", "o", " <b>Welt", " &",
|
||||
|
||||
" Freunde", "!", "", "</b></p>\n"};
|
||||
" Freunde", "!", "", "</b></p>"};
|
||||
|
||||
CHECK(asTokens(response.source) == html_tokens_source);
|
||||
CHECK(asTokens(response.target) == html_tokens_target);
|
||||
|
@ -264,10 +376,10 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
|
|||
|
||||
TEST_CASE("Test reconstruction of target with multiple sentences") {
|
||||
std::string input(
|
||||
"<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>\n");
|
||||
"<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>");
|
||||
HTML html(std::move(input), true);
|
||||
|
||||
AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?\n");
|
||||
AnnotatedText source("hello world! How does this deal with multiple sentences? Will it work?");
|
||||
CHECK(source.text == input);
|
||||
|
||||
recordSentenceFromByteRange(source, {
|
||||
|
@ -297,7 +409,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
|
|||
ByteRange{71, 71} // 2.4 ""
|
||||
});
|
||||
|
||||
AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?\n");
|
||||
AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?");
|
||||
recordSentenceFromByteRange(target, {
|
||||
ByteRange{0, 4}, // 0.0 "hall"
|
||||
ByteRange{4, 5}, // 0.1 "o"
|
||||
|
@ -327,7 +439,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
|
|||
|
||||
std::vector<std::string> text_tokens_source{
|
||||
"", "hall", "o", " Welt", "!", "", " ", "Wie", " geht", " das", " mit", " mehreren",
|
||||
" Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", "\n"};
|
||||
" Sätze", "n", " um", "?", "", " ", "Wird", " es", " funktionieren", "?", "", ""};
|
||||
|
||||
CHECK(asTokens(target) == text_tokens_source);
|
||||
|
||||
|
@ -360,26 +472,56 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
|
|||
" work",
|
||||
"?",
|
||||
"",
|
||||
"</p>\n"};
|
||||
"</p>"};
|
||||
CHECK(asTokens(response.source) == html_tokens_source);
|
||||
}
|
||||
|
||||
TEST_CASE("Test self-closing tag (HTML5)") {
|
||||
std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>\n");
|
||||
std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello world and other creatures\n"); // Note double space between "hello" and "world"
|
||||
CHECK(input == "hello world and other creatures"); // Note double space between "hello" and "world"
|
||||
}
|
||||
|
||||
TEST_CASE("Test empty self-closing tag at end of input") {
|
||||
TEST_CASE("Test empty void tag at end of input") {
|
||||
std::string input("hello <br>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello ");
|
||||
|
||||
Response response;
|
||||
std::string sentence_str("hello ");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 4), // 0.0 hell
|
||||
string_view(sentence_str.data() + 4, 2), // 0.1 o_
|
||||
string_view(sentence_str.data() + 6, 0), // 0.2 [EOS]
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.alignments = {identity_matrix<float>(3)};
|
||||
|
||||
html.restore(response);
|
||||
CHECK(response.source.text == "hello <br>");
|
||||
CHECK(response.target.text == "hello <br>");
|
||||
}
|
||||
|
||||
TEST_CASE("Test empty tag pair at end of input") {
|
||||
std::string input("hello <u></u>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello ");
|
||||
|
||||
Response response;
|
||||
std::string sentence_str("hello ");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 4), // 0.0 hell
|
||||
string_view(sentence_str.data() + 4, 2), // 0.1 o_
|
||||
string_view(sentence_str.data() + 6, 0), // 0.2 [EOS]
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.alignments = {identity_matrix<float>(3)};
|
||||
|
||||
html.restore(response);
|
||||
CHECK(response.source.text == "hello <u></u>");
|
||||
CHECK(response.target.text == "hello <u></u>");
|
||||
}
|
||||
|
||||
TEST_CASE("Test empty self-closing pair at end of input in parent") {
|
||||
|
@ -391,11 +533,11 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") {
|
|||
TEST_CASE("Test empty tag") {
|
||||
std::string test_str(
|
||||
"<p id=\"1\">hello <img id=\"1.1\"><span id=\"1.2\"><u id=\"1.2.1\"></u><b id=\"1.2.2\"></b><img "
|
||||
"id=\"1.2.3\">world</span></p>\n");
|
||||
"id=\"1.2.3\">world</span></p>");
|
||||
|
||||
std::string input(test_str);
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello world\n");
|
||||
CHECK(input == "hello world");
|
||||
|
||||
Response response;
|
||||
|
||||
|
@ -407,11 +549,7 @@ TEST_CASE("Test empty tag") {
|
|||
string_view(sentence_str.data() + 11, 0), // 0.3 ""
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.source.appendEndingWhitespace("\n");
|
||||
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.target.appendEndingWhitespace("\n");
|
||||
|
||||
response.alignments = {identity_matrix<float>(4)};
|
||||
|
||||
html.restore(response);
|
||||
|
@ -424,19 +562,20 @@ TEST_CASE("Test <script> element") {
|
|||
|
||||
std::string input(test_str);
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "hello world");
|
||||
CHECK(input == "hello \n\nworld");
|
||||
|
||||
Response response;
|
||||
std::string sentence_str("hello world");
|
||||
std::string sentence_str("hello \n\nworld");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 4), // 0.0 hell
|
||||
string_view(sentence_str.data() + 4, 1), // 0.1 o
|
||||
string_view(sentence_str.data() + 5, 6), // 0.2 _world
|
||||
string_view(sentence_str.data() + 11, 0), // 0.3 ""
|
||||
string_view(sentence_str.data() + 4, 2), // 0.1 o_
|
||||
string_view(sentence_str.data() + 6, 2), // 0.2 \n\n
|
||||
string_view(sentence_str.data() + 8, 5), // 0.3 world
|
||||
string_view(sentence_str.data() + 13, 0), // 0.4 ""
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.alignments = {identity_matrix<float>(4)};
|
||||
response.alignments = {identity_matrix<float>(5)};
|
||||
|
||||
html.restore(response);
|
||||
CHECK(response.source.text == test_str);
|
||||
|
@ -466,10 +605,10 @@ TEST_CASE("Test comment") {
|
|||
CHECK(response.target.text == test_str);
|
||||
}
|
||||
|
||||
TEST_CASE("End-to-end translation") {
|
||||
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>\n");
|
||||
TEST_CASE("End-to-end translation", "[!mayfail]") {
|
||||
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "I like to drive this car.\n");
|
||||
CHECK(input == "I like to drive this car.");
|
||||
|
||||
Response response;
|
||||
|
||||
|
@ -500,7 +639,6 @@ TEST_CASE("End-to-end translation") {
|
|||
string_view(sentence_str.data() + 25, 0), // 0.7 ""
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.source.appendEndingWhitespace("\n");
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -517,7 +655,6 @@ TEST_CASE("End-to-end translation") {
|
|||
string_view(sentence_str.data() + 28, 0), // 0.8 ""
|
||||
};
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
response.target.appendEndingWhitespace("\n");
|
||||
}
|
||||
|
||||
html.restore(response);
|
||||
|
@ -536,27 +673,116 @@ TEST_CASE("End-to-end translation") {
|
|||
string_view(sentence_str.data() + 42, 0), // 0.7 ""
|
||||
};
|
||||
source.appendSentence("", sentence.begin(), sentence.end());
|
||||
source.appendEndingWhitespace("</p>\n");
|
||||
source.appendEndingWhitespace("</p>");
|
||||
|
||||
CHECK(asTokens(response.source) == asTokens(source));
|
||||
}
|
||||
|
||||
{
|
||||
AnnotatedText target;
|
||||
std::string sentence_str("<p>Ich <u>fahre</u> <b>gerne</b> dieses Auto.");
|
||||
// Empty <b></b> because the space token after "Ich" has "<p><b>" markup, passed down from "<b>like</b>"
|
||||
std::string sentence_str("<p>Ich <b></b><u>fahre</u> <b>gerne</b> dieses Auto.");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 6), // 0.0 "<p>Ich"
|
||||
string_view(sentence_str.data() + 6, 4), // 0.1 " <u>"
|
||||
string_view(sentence_str.data() + 10, 4), // 0.2 "fahr"
|
||||
string_view(sentence_str.data() + 14, 1), // 0.3 "e"
|
||||
string_view(sentence_str.data() + 15, 13), // 0.4 "</u> <b>gerne"
|
||||
string_view(sentence_str.data() + 28, 11), // 0.5 "</b> dieses"
|
||||
string_view(sentence_str.data() + 39, 5), // 0.6 " Auto"
|
||||
string_view(sentence_str.data() + 44, 1), // 0.7 "."
|
||||
string_view(sentence_str.data() + 45, 0), // 0.8 ""
|
||||
string_view(sentence_str.data() + 6, 4), // 0.1 " <b>"
|
||||
string_view(sentence_str.data() + 10, 11), // 0.2 "</b><u>fahr"
|
||||
string_view(sentence_str.data() + 21, 1), // 0.3 "e"
|
||||
string_view(sentence_str.data() + 22, 13), // 0.4 "</u> <b>gerne"
|
||||
string_view(sentence_str.data() + 35, 11), // 0.5 "</b> dieses"
|
||||
string_view(sentence_str.data() + 46, 5), // 0.6 " Auto"
|
||||
string_view(sentence_str.data() + 51, 1), // 0.7 "."
|
||||
string_view(sentence_str.data() + 52, 0), // 0.8 ""
|
||||
};
|
||||
target.appendSentence("", sentence.begin(), sentence.end());
|
||||
target.appendEndingWhitespace("</p>\n");
|
||||
target.appendEndingWhitespace("</p>");
|
||||
|
||||
CHECK(asTokens(response.target) == asTokens(target));
|
||||
}
|
||||
}
|
||||
|
||||
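The end-to-end cases above and below are now tagged `[!mayfail]`. In Catch2 that tag means the assertions are still executed and reported, but a failure does not fail the test run — presumably useful here because the exact markup placement depends on alignment heuristics that may still change. A minimal illustration of the tag, unrelated to this codebase:

```cpp
TEST_CASE("Known-shaky expectation", "[!mayfail]") {
  CHECK(1 + 1 == 3);  // reported as failed, but does not fail the suite
}
```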
TEST_CASE("End-to-end translation when no words with markup align", "[!mayfail]") {
|
||||
std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
|
||||
HTML html(std::move(input), true);
|
||||
CHECK(input == "I like to drive this car.");
|
||||
|
||||
Response response;
|
||||
|
||||
// clang-format off
|
||||
response.alignments = std::vector<std::vector<std::vector<float>>>{{
|
||||
{0.5360, 0.4405, 0.0142, 0.0061, 0.0029, 0.0001, 0.0000, 0.0001},
|
||||
{0.0451, 0.0602, 0.5120, 0.2584, 0.1145, 0.0062, 0.0019, 0.0017},
|
||||
{0.0392, 0.0009, 0.6535, 0.2293, 0.0492, 0.0199, 0.0014, 0.0067},
|
||||
{0.0007, 0.0036, 0.0112, 0.0118, 0.9209, 0.0449, 0.0050, 0.0019},
|
||||
{0.0000, 0.0004, 0.0008, 0.0047, 0.0163, 0.9683, 0.0045, 0.0050},
|
||||
{0.0011, 0.0046, 0.0039, 0.0090, 0.0023, 0.0024, 0.9648, 0.0119},
|
||||
{0.0840, 0.0744, 0.1545, 0.1330, 0.1818, 0.1722, 0.0859, 0.1143},
|
||||
}};
|
||||
// clang-format on
|
||||
|
||||
{
|
||||
std::string sentence_str("I like to drive this car.");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 1), // 0.0 "I"
|
||||
string_view(sentence_str.data() + 1, 5), // 0.1 " like"
|
||||
string_view(sentence_str.data() + 6, 3), // 0.2 " to"
|
||||
string_view(sentence_str.data() + 9, 6), // 0.3 " drive"
|
||||
string_view(sentence_str.data() + 15, 5), // 0.4 " this"
|
||||
string_view(sentence_str.data() + 20, 4), // 0.5 " car"
|
||||
string_view(sentence_str.data() + 24, 1), // 0.6 "."
|
||||
string_view(sentence_str.data() + 25, 0), // 0.7 [EOS]
|
||||
};
|
||||
response.source.appendSentence("", sentence.begin(), sentence.end());
|
||||
}
|
||||
|
||||
{
|
||||
std::string sentence_str("Rád řídím to auto.");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 4), // 0.0 "Rád"
|
||||
string_view(sentence_str.data() + 4, 6), // 0.1 " říd"
|
||||
string_view(sentence_str.data() + 10, 3), // 0.2 "ím"
|
||||
string_view(sentence_str.data() + 13, 3), // 0.3 "_to"
|
||||
string_view(sentence_str.data() + 16, 5), // 0.4 " auto"
|
||||
string_view(sentence_str.data() + 21, 1), // 0.5 "."
|
||||
string_view(sentence_str.data() + 22, 0), // 0.6 [EOS]
|
||||
};
|
||||
response.target.appendSentence("", sentence.begin(), sentence.end());
|
||||
}
|
||||
|
||||
html.restore(response);
|
||||
|
||||
{
|
||||
AnnotatedText source;
|
||||
std::string sentence_str("<p>I <b>like</b> to <u>drive</u> this car.");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 4), // 0.0 "<p>I"
|
||||
string_view(sentence_str.data() + 4, 8), // 0.1 " <b>like"
|
||||
string_view(sentence_str.data() + 12, 7), // 0.2 "</b> to"
|
||||
string_view(sentence_str.data() + 19, 9), // 0.3 " <u>drive"
|
||||
string_view(sentence_str.data() + 28, 9), // 0.4 "</u> this"
|
||||
string_view(sentence_str.data() + 37, 4), // 0.5 " car"
|
||||
string_view(sentence_str.data() + 41, 1), // 0.6 "."
|
||||
string_view(sentence_str.data() + 42, 0), // 0.7 ""
|
||||
};
|
||||
source.appendSentence("", sentence.begin(), sentence.end());
|
||||
source.appendEndingWhitespace("</p>");
|
||||
|
||||
CHECK(asTokens(response.source) == asTokens(source));
|
||||
}
|
||||
|
||||
{
|
||||
AnnotatedText target;
|
||||
std::string sentence_str("<p>Rád <b></b>řídím <u></u>to auto.");
|
||||
std::vector<string_view> sentence{
|
||||
string_view(sentence_str.data() + 0, 7), // 0.0 "<p>Rád"
|
||||
string_view(sentence_str.data() + 7, 13), // 0.1 " <b></b>říd"
|
||||
string_view(sentence_str.data() + 20, 3), // 0.2 "ím"
|
||||
string_view(sentence_str.data() + 23, 10), // 0.3 "_<u></u>to"
|
||||
string_view(sentence_str.data() + 33, 5), // 0.4 " auto"
|
||||
string_view(sentence_str.data() + 38, 1), // 0.5 "."
|
||||
string_view(sentence_str.data() + 39, 0), // 0.6 [EOS]
|
||||
};
|
||||
target.appendSentence("", sentence.begin(), sentence.end());
|
||||
target.appendEndingWhitespace("</p>");
|
||||
|
||||
CHECK(asTokens(response.target) == asTokens(target));
|
||||
}
|
||||
|
|
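Several of the tests above build their soft alignments with an `identity_matrix<float>(n)` helper that is defined elsewhere in the test file and not shown in this diff. A plausible sketch of such a helper, for reading the tests in isolation (an assumption, not the actual definition):

```cpp
#include <cstddef>
#include <vector>

// Builds an n-by-n soft-alignment matrix with 1.0 on the diagonal,
// i.e. target token i aligns only with source token i.
template <typename T>
std::vector<std::vector<T>> identity_matrix(std::size_t n) {
  std::vector<std::vector<T>> matrix(n, std::vector<T>(n, T(0)));
  for (std::size_t i = 0; i < n; ++i) matrix[i][i] = T(1);
  return matrix;
}
```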
@@ -43,7 +43,7 @@ void encodeEntities(string_view const &input, std::string &output) {

size_t countPrefixWhitespaces(string_view const &input) {
  size_t size = 0;
  while (size < input.size() && input[size] == ' ') ++size;
  while (size < input.size() && std::isspace(input[size])) ++size;
  return size;
}

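One hedged aside on the `countPrefixWhitespaces` change: `std::isspace` takes an `int` and has undefined behaviour for negative values other than EOF, which UTF-8 continuation bytes can produce on platforms where `char` is signed. A defensive variant would cast first; this is a caution written as a standalone sketch, not something the patch itself does:

```cpp
#include <cctype>
#include <cstddef>
#include <string_view>

std::size_t countPrefixWhitespacesSafe(std::string_view input) {
  std::size_t size = 0;
  // Cast to unsigned char before calling std::isspace to stay in defined territory.
  while (size < input.size() && std::isspace(static_cast<unsigned char>(input[size]))) ++size;
  return size;
}
```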
@@ -59,6 +59,8 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
      return out << "<!--" << tag->data << "-->";
    case HTML::Tag::PROCESSING_INSTRUCTION:
      return out << "<?" << tag->data << "?>";
    case HTML::Tag::WHITESPACE:
      return out << "[inserted space]";
  }
  return out << "[Unknown tag type]";
}

@@ -107,27 +109,8 @@ class reversed {
  T const &container_;
};

bool isBlockElement(std::string_view const &name) {
  // List of elements that we expect might occur inside words, and that should
  // not introduce spacings around them. Not strictly inline elements, nor flow
  // elements. See also https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories
  static std::unordered_set<std::string> inlineishElements{
      "abbr",  "a",    "b",      "em",  "i",   "kbd", "mark", "math", "output", "q",   "ruby",
      "small", "span", "strong", "sub", "sup", "time", "u",   "var",  "wbr",    "ins", "del"};

  return inlineishElements.find(std::string(name)) == inlineishElements.end();
}

bool isVoidTag(std::string_view const &name) {
  // List of elements for which we do not expect a closing tag, or self-closing
  // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
  // More relevant source of this list:
  // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
  static std::unordered_set<std::string> voidElements{"area",  "base",  "basefont", "bgsound", "br",    "col",
                                                      "embed", "frame", "hr",       "img",     "input", "keygen",
                                                      "link",  "meta",  "param",    "source",  "track", "wbr"};

  return voidElements.find(std::string(name)) != voidElements.end();
bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
  return set.find(name) != set.end();
}

void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {

@@ -187,8 +170,6 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
  return out;
}

bool isContinuation(string_view str) { return !str.empty() && str.compare(0, 1, " ", 1) != 0; }

bool hasAlignments(Response const &response) {
  // Test for each sentence individually as a sentence may be empty (or there)
  // might be no sentences, so just testing for alignments.empty() would not be

@@ -207,85 +188,11 @@ bool hasAlignments(Response const &response) {
  return true;
}

void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
  // For each sentence...
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    alignments.emplace_back();

    // Hard-align: find for each target token the most prevalent source token
    // Note: only search from 0 to N-1 because token N is end-of-sentence token
    // that can only align with the end-of-sentence token of the target
    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      size_t maxS = 0;
      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
          maxS = s;
        }
      }

      alignments.back().push_back(maxS);
    }

    // Next, we try to smooth out these selected alignments with a few heuristics
    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      // If this token is a continuation of a previous token, pick the tags from the most
      // prevalent token for the whole word.
      if (isContinuation(response.target.word(sentenceIdx, t))) {
        // Note: only looking at the previous token since that will already
        // have this treatment applied to it.
        size_t currSentenceIdx = alignments.back()[t];
        size_t prevSentenceIdx = alignments.back()[t - 1];
        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];

        if (currScore > prevScore) {
          // Apply this to all previous tokens in the word
          for (size_t i = t;; --i) {
            alignments.back()[i] = currSentenceIdx;

            // Stop if this was the first token or the beginning of the word
            if (i == 0 || !isContinuation(response.target.word(sentenceIdx, i))) break;
          }
        } else {
          alignments.back()[t] = prevSentenceIdx;
        }
      }
    }

    // Always align target end with source end
    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
  }
}

// Internal type used to point to a position in HTML::spans_.
typedef std::vector<HTML::Span>::const_iterator SpanIterator;

void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
               std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
  size_t offset = 0;

  // Fill targetTokenSpans based on the alignments we just made up.
  // NOTE: this should match the exact order of Apply()
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
      size_t s = alignments[sentenceIdx][t];
      assert(s < response.source.numWords(sentenceIdx));
      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
    }

    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
  }

  assert(offset < sourceTokenSpans.size());
  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
}

// Little helper class to append HTML to a token
class TokenFormatter {
 public:
  explicit TokenFormatter(string_view token)
      : html_(), offset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
      : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
    // Do encoding of any entities that popped up in the translation
    encodeEntities(token, html_);
  }

@@ -303,6 +210,7 @@ class TokenFormatter {
      std::string closeTag = format("</{}>", tag->name);
      html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
      offset_ += closeTag.size();
      if (closeLeft_) whitespaceOffset_ += closeTag.size();
    }

    for (HTML::Tag const *tag : opening) {

@@ -318,17 +226,28 @@ class TokenFormatter {
        case HTML::Tag::PROCESSING_INSTRUCTION:
          openTag = format("<?{}?>", tag->data);
          break;
        case HTML::Tag::WHITESPACE: {
          // Try to eat two newlines (paragraph break) from our segment
          auto pos = html_.find("\n\n", whitespaceOffset_);
          if (pos != std::string::npos && pos < whitespaceOffset_ + whitespaceSize_) {
            html_.erase(pos, 2);
            whitespaceSize_ -= 2;
          }
        } break;
      }

      html_.insert(offset_ + whitespaceSize_, openTag);
      offset_ += openTag.size();
      closeLeft_ = false;
      closeLeft_ = closeLeft_ && openTag.empty();
    }
  }

 private:
  std::string html_;       // Output html
  size_t offset_;          // Size added by prepending HTML
  size_t whitespaceSize_;  // number of prefix whitespace characters
  std::string html_;          // Output html
  size_t offset_;             // Size added by prepending HTML
  size_t whitespaceOffset_;   // position of prefix whitespace characters
                              // (it moves as closing tags are prepended)
  size_t whitespaceSize_;     // number of prefix whitespace characters

  // Close tags we want to show up left (before) the token, but open tags
  // ideally come directly after any prefix whitespace. However, some tokens

@@ -339,96 +258,6 @@ class TokenFormatter {
  bool closeLeft_;
};

AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
                            std::vector<SpanIterator> &sourceTokenSpans) {
  auto spanIt = sourceSpans.begin();
  auto prevIt = sourceSpans.begin();  // safe because first span is always empty span, and
                                      // and the while-loop below will do the rest
  assert(prevIt == sourceSpans.end() || prevIt->tags.empty());

  return apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // Potential issue: spans and tokens can intersect, e.g.
    //
    // text  <p> h <u> e </u> ll o </p>
    // spans     |1|    |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
    // tokens    |111111111111111|2|
    //
    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
    // Note: only relevant if isBlockElement is used. If we just insert spaces
    // around all elements, every segment of `hello` will be a token.

    // Seek to the last span that overlaps with this token
    while (true) {
      formatter.append(prevIt->tags, spanIt->tags);
      prevIt = spanIt;

      if (spanIt + 1 != sourceSpans.end() && ((spanIt + 1)->begin < range.end || last)) {
        spanIt++;
        continue;
      }

      break;
    }

    // TODO: This is just the taint of the last span, not the ones in between.
    // This makes us lose some markup of parts of tokens as described above.
    sourceTokenSpans.push_back(prevIt);

    return std::move(formatter.html());
  });
}

AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
                            std::vector<SpanIterator> const &targetTokenSpans) {
  auto prevSpan = sourceSpans.begin();
  auto targetSpanIt = targetTokenSpans.begin();

  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // First we scan through spans_ to catch up to the span assigned to this
    // token. We're only interested in empty spans (empty and void elements)
    for (auto span_it = prevSpan + 1; span_it < *targetSpanIt; span_it++) {
      // We're only interested in empty spans between the spans in targetSpanIt
      if (span_it->size() != 0) continue;

      formatter.append(prevSpan->tags, span_it->tags);

      // Note: here, not in 3rd part of for-statement because we don't want to
      // set prevSpan if the continue clause at the beginning of this for-loop
      // was hit.
      prevSpan = span_it;
    }

    // Now do the same thing but for our target set of tags. Note that we cannot
    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
    // because there is no guarantee that the order in `targetTokenSpans` is
    // the same as that of `spans`.
    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);

    // If this is the last token of the response, close all open tags.
    if (last) {
      // Note: this assert is true due to our current implementation of
      // HardAlignments() that always matches the last token of the input with
      // the last token of the output. But lets assume someone someday changes
      // HardAlignments(), and then this for-loop will be necessary.
      // assert((*targetSpanIt)->tags.empty());
      formatter.append((*targetSpanIt)->tags, HTML::Taint());
    }

    prevSpan = *targetSpanIt++;

    return std::move(formatter.html());
  });

  // Assert that we did in fact use all our taints
  assert(targetSpanIt == targetTokenSpans.end());

  return out;
}

size_t debugCountTokens(AnnotatedText const &text) {
  size_t tokens = 1;  // for the ending gap
  for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {

@@ -441,8 +270,9 @@ size_t debugCountTokens(AnnotatedText const &text) {

namespace marian::bergamot {

HTML::HTML(std::string &&source, bool process_markup) {
HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
  if (!process_markup) return;

  std::string original = std::move(source);
  markup::instream in(original.data(), original.data() + original.size());
  markup::Scanner scanner(in);

@@ -450,6 +280,8 @@ HTML::HTML(std::string &&source, bool process_markup) {

  Tag *tag;
  Taint stack;
  bool addSentenceBreak = false;
  bool addSpace = false;
  spans_.push_back(Span{0, 0, {}});

  bool stop = false;

@@ -463,24 +295,41 @@ HTML::HTML(std::string &&source, bool process_markup) {
        break;

      case markup::Scanner::TT_TEXT: {
        // If the previous segment was the open or close tag of a block element
        // we treat the text after it as a new sentence.
        if (addSentenceBreak) {
          if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == "\n\n"))) {
            stack.push_back(makeTag({Tag::WHITESPACE}));
            // Important: span->size() == 0 to make it behave as a void element.
            // Also important: position before the \n\n tokens, not after, to
            // make it easier to remove them later through apply().
            spans_.push_back(Span{source.size(), source.size(), stack});
            source.append("\n\n");  // TODO assumes ssplit-mode = wrapped_text
            stack.pop_back();
          }
          addSentenceBreak = false;
        }

        // If the previous segment was an open or close tag, it might be best
        // to add a space to make sure we don't append to the previous word.
        if (addSpace) {
          if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
              !std::isspace(scanner.value()[0])) {
            source.push_back(' ');
          }
          addSpace = false;
        }

        auto begin = source.size();
        source.append(scanner.value());
        spans_.push_back(Span{begin, source.size(), stack});
      } break;

      case markup::Scanner::TT_TAG_START:
        // If it makes sense to treat this element as a break in a word (e.g.
        // <br>, <img>, <li>) make sure it does so in this text as well.
        // TODO: Strong assumption here that the language uses spaces to
        // separate words
        if (isBlockElement(scanner.tag()) && !source.empty() && source.back() != ' ') source.push_back(' ');

        // pool_ takes ownership of our tag, makes sure it's freed when necessary
        pool_.emplace_back(new Tag{isVoidTag(scanner.tag()) ? Tag::VOID_ELEMENT : Tag::ELEMENT,
                                   std::string(scanner.tag()), std::string()});
      case markup::Scanner::TT_TAG_START: {
        std::string name(scanner.tag());

        // Tag *tag is used by attribute parsing
        tag = pool_.back().get();
        tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});

        stack.push_back(tag);

@@ -491,7 +340,14 @@ HTML::HTML(std::string &&source, bool process_markup) {
          spans_.push_back(Span{source.size(), source.size(), stack});
          stack.pop_back();
        }
        break;

        // Treat non-inline HTML tags as spaces that break up words.
        if (!contains(options_.inlineTags, tag->name)) {
          addSentenceBreak = true;
        } else {
          addSpace = true;
        }
      } break;

      case markup::Scanner::TT_TAG_END:
        // Note: self-closing tags emit TT_TAG_END immediately after TT_TAG_START

@@ -508,6 +364,13 @@ HTML::HTML(std::string &&source, bool process_markup) {
        spans_.push_back(Span{source.size(), source.size(), stack});

        stack.pop_back();

        // Add space if necessary
        if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
          addSentenceBreak = true;
        } else {
          addSpace = true;
        }
        break;

      case markup::Scanner::TT_ATTRIBUTE:

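A hedged usage sketch of what the scanner loop above now produces for a small page fragment: non-inline boundaries are materialised lazily as a `\n\n` paragraph break (guarded by `addSentenceBreak` so trailing tags add nothing), while inline tags at most add a single space. Expected values follow the tests earlier in this commit; the include path is an assumption:

```cpp
#include <string>
#include <utility>

#include "html.h"  // assumed include path; adjust to your build

void example() {
  std::string text("<p>hello <b>world</b></p><p>again</p>");
  marian::bergamot::HTML html(std::move(text), /*process_markup=*/true);
  // `text` now holds the translatable plain text. The </p><p> boundary should
  // come out as a paragraph break, the inline <b> adds no extra space here:
  // "hello world\n\nagain"
}
```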
@@ -516,18 +379,16 @@ HTML::HTML(std::string &&source, bool process_markup) {
        break;

      case markup::Scanner::TT_COMMENT_START:
        // pool_ takes ownership of our tag, makes sure it's freed when necessary
        pool_.emplace_back(new Tag{Tag::COMMENT});
        tag = pool_.back().get();
        // Tag *tag is used when TT_DATA is seen to add the comment's content.
        tag = makeTag({Tag::COMMENT});
        stack.push_back(tag);
        spans_.push_back(Span{source.size(), source.size(), stack});
        stack.pop_back();
        break;

      case markup::Scanner::TT_PROCESSING_INSTRUCTION_START:
        // pool_ takes ownership of our tag, makes sure it's freed when necessary
        pool_.emplace_back(new Tag{Tag::PROCESSING_INSTRUCTION});
        tag = pool_.back().get();
        // Tag *tag is used when TT_DATA is seen to add the PI's content.
        tag = makeTag({Tag::PROCESSING_INSTRUCTION});
        stack.push_back(tag);
        spans_.push_back(Span{source.size(), source.size(), stack});
        stack.pop_back();

@@ -551,7 +412,7 @@ HTML::HTML(std::string &&source, bool process_markup) {
  if (!stack.empty()) throw BadHTML(format("Not all tags were closed: {}", stack));

  // Add a trailing span (that's empty) to signify all closed tags.
  spans_.emplace_back(Span{source.size() + 1, source.size() + 1, stack});
  spans_.emplace_back(Span{source.size(), source.size(), stack});
}

void HTML::restore(Response &response) {

@@ -580,7 +441,7 @@ void HTML::restore(Response &response) {

  // RestoreSource re-inserts HTML into the source text, but also identifies
  // which span each source token fits into best.
  AnnotatedText source = restoreSource(response.source, spans_, sourceTokenSpans);
  AnnotatedText source = restoreSource(response.source, sourceTokenSpans);
  assert(sourceTokenSpans.size() == debugCountTokens(response.source));

  // Find for every token in target the token in source that best matches.

@@ -591,10 +452,193 @@ void HTML::restore(Response &response) {
  copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
  assert(targetTokenSpans.size() == debugCountTokens(response.target));

  AnnotatedText target = restoreTarget(response.target, spans_, targetTokenSpans);
  AnnotatedText target = restoreTarget(response.target, targetTokenSpans);

  response.source = source;
  response.target = target;
}

AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans) {
  auto spanIt = spans_.begin();
  auto prevIt = spans_.begin();  // safe because first span is always empty span, and
                                 // and the while-loop below will do the rest
  assert(prevIt == spans_.end() || prevIt->tags.empty());

  return apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // Potential issue: spans and tokens can intersect, e.g.
    //
    // text  <p> h <u> e </u> ll o </p>
    // spans     |1|    |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
    // tokens    |111111111111111|2|
    //
    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
    // Note: only relevant if isBlockElement is used. If we just insert spaces
    // around all elements, every segment of `hello` will be a token.

    // Seek to the last span that overlaps with this token
    while (true) {
      formatter.append(prevIt->tags, spanIt->tags);
      prevIt = spanIt;

      if (spanIt + 1 != spans_.end() && ((spanIt + 1)->begin < range.end || last)) {
        spanIt++;
        continue;
      }

      break;
    }

    // TODO: This is just the taint of the last span, not the ones in between.
    // This makes us lose some markup of parts of tokens as described above.
    sourceTokenSpans.push_back(prevIt);

    return std::move(formatter.html());
  });
}

AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
  auto prevSpan = spans_.cbegin();
  auto targetSpanIt = targetTokenSpans.begin();

  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
    TokenFormatter formatter(token);

    // First we scan through spans_ to catch up to the span assigned to this
    // token. We're only interested in empty spans (empty and void elements)
    for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
      // We're only interested in empty spans or spans that would otherwise get
      // lost because they didn't align with anything between the spans in
      // targetSpanIt
      // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
      if (span_it->size() != 0 &&
          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
        continue;

      formatter.append(prevSpan->tags, span_it->tags);

      // Note: here, not in 3rd part of for-statement because we don't want to
      // set prevSpan if the continue clause at the beginning of this for-loop
      // was hit.
      prevSpan = span_it;
    }

    // Now do the same thing but for our target set of tags. Note that we cannot
    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
    // because there is no guarantee that the order in `targetTokenSpans` is
    // the same as that of `spans`.
    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);

    // If this is the last token of the response, close all open tags.
    if (last) {
      // Note: this assert is true due to our current implementation of
      // HardAlignments() that always matches the last token of the input with
      // the last token of the output. But lets assume someone someday changes
      // HardAlignments(), and then this for-loop will be necessary.
      // assert((*targetSpanIt)->tags.empty());
      formatter.append((*targetSpanIt)->tags, HTML::Taint());
    }

    prevSpan = *targetSpanIt;
    ++targetSpanIt;

    return std::move(formatter.html());
  });

  // Assert that we did in fact use all our taints
  assert(targetSpanIt == targetTokenSpans.end());

  return out;
}

HTML::Tag *HTML::makeTag(Tag &&tag) {
  pool_.emplace_front(std::move(tag));
  return &pool_.front();
}
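Note the switch from `std::vector<std::unique_ptr<Tag>>` to `std::forward_list<Tag>` for `pool_` (see the header change below): a `forward_list` never relocates its elements, so the raw `Tag*` pointers handed out by `makeTag` and stored in `Taint` stacks stay valid as more tags are allocated. The previous vector-of-unique_ptr achieved the same stability at the cost of a separate heap allocation per tag. A minimal demonstration of the property, independent of this codebase:

```cpp
#include <cassert>
#include <forward_list>
#include <string>

struct Tag { std::string name; };

int main() {
  std::forward_list<Tag> pool;
  pool.emplace_front(Tag{"p"});
  Tag *first = &pool.front();
  for (int i = 0; i < 1000; ++i) pool.emplace_front(Tag{"b"});  // never reallocates existing nodes
  assert(first->name == "p");  // pointer taken earlier is still valid
}
```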

void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
  size_t offset = 0;  // Sentence offset in sourceTokenSpans

  // Fill targetTokenSpans based on the alignments we just made up.
  // NOTE: this should match the exact order of Apply()
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
      size_t s = alignments[sentenceIdx][t];
      assert(s < response.source.numWords(sentenceIdx));
      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
    }

    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
  }

  assert(offset + 1 == sourceTokenSpans.size());
  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
}

// Reports if token `str` is likely to be a continuation of a word. This is used
// to determine whether we should share the markup, or whether we should see
// this token as a fresh start. This implementation will treat "hello[world]"
// as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
bool HTML::isContinuation(string_view prev, string_view str) {
  if (options_.continuationDelimiters.empty()) return false;
  if (prev.empty() || str.empty()) return false;
  return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
         options_.continuationDelimiters.find(prev.back()) == std::string::npos;
}
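`isContinuation` now looks at both the previous token and the current one, and is driven entirely by `Options::continuationDelimiters` (empty by default, which disables the heuristic). A standalone restatement of the logic above with a few worked cases, using the commented-out delimiter set from html.h as an assumed configuration:

```cpp
#include <cassert>
#include <string>
#include <string_view>

// Free-function restatement of HTML::isContinuation, for experimenting with delimiter sets.
bool isContinuation(std::string_view prev, std::string_view str, std::string const &delims) {
  if (delims.empty() || prev.empty() || str.empty()) return false;
  return delims.find(str.front()) == std::string::npos && delims.find(prev.back()) == std::string::npos;
}

int main() {
  std::string delims = "\n ,.(){}[]";                   // the commented-out default from html.h
  assert(isContinuation("hell", "o", delims));          // same word: keep sharing markup
  assert(!isContinuation("hello", " world", delims));   // next token starts a new word
  assert(!isContinuation("hello", "[world", delims));   // '[' is a delimiter
  assert(!isContinuation("hell", "o", ""));             // empty set disables the heuristic entirely
}
```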

void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
  // For each sentence...
  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
    alignments.emplace_back();

    // Hard-align: find for each target token the most prevalent source token
    // Note: only search from 0 to N-1 because token N is end-of-sentence token
    // that can only align with the end-of-sentence token of the target
    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      size_t maxS = 0;
      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
          maxS = s;
        }
      }

      alignments.back().push_back(maxS);
    }

    // Next, we try to smooth out these selected alignments with a few heuristics
    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
      // If this token is a continuation of a previous token, pick the tags from the most
      // prevalent token for the whole word.
      if (isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) {
        // Note: only looking at the previous token since that will already
        // have this treatment applied to it.
        size_t currSentenceIdx = alignments.back()[t];
        size_t prevSentenceIdx = alignments.back()[t - 1];
        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];

        if (currScore >= prevScore) {
          // Apply this to all previous tokens in the word
          for (size_t i = t;; --i) {
            alignments.back()[i] = currSentenceIdx;

            // Stop if this was the first token or the beginning of the word
            if (i == 0 ||
                !isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
              break;
          }
        } else {
          alignments.back()[t] = prevSentenceIdx;
        }
      }
    }

    // Always align target end with source end
    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
  }
}

}  // namespace marian::bergamot
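`hardAlignments` (now a private method, with the continuation smoothing added on top) starts from a plain argmax over the soft alignment matrix: for every target token pick the source token with the highest probability, and always tie the end-of-sentence tokens together. Stripped of the smoothing pass and of the `Response` API, the core selection looks roughly like this sketch over a plain matrix:

```cpp
#include <cstddef>
#include <vector>

// rows = target tokens, cols = source tokens; the last row/column are [EOS].
std::vector<std::size_t> hardAlign(std::vector<std::vector<float>> const &soft) {
  std::vector<std::size_t> picks;
  if (soft.empty()) return picks;
  std::size_t targetWords = soft.size();
  std::size_t sourceWords = soft.front().size();
  for (std::size_t t = 0; t + 1 < targetWords; ++t) {
    std::size_t best = 0;
    for (std::size_t s = 1; s + 1 < sourceWords; ++s)
      if (soft[t][s] > soft[t][best]) best = s;
    picks.push_back(best);
  }
  picks.push_back(sourceWords - 1);  // [EOS] always maps to [EOS]
  return picks;
}
```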

@@ -1,9 +1,12 @@
#ifndef SRC_BERGAMOT_HTML_H_
#define SRC_BERGAMOT_HTML_H_

#include <forward_list>
#include <stdexcept>
#include <string>
#include <unordered_set>

#include "annotation.h"
#include "definitions.h"

namespace marian {

@@ -18,40 +21,84 @@ class BadHTML : public std::runtime_error {

class HTML {
 public:
  struct Options {
    // List of elements for which we do not expect a closing tag, or self-closing
    // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
    // More relevant source of this list:
    // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
    std::unordered_set<std::string> voidTags{"area",  "base",  "basefont", "bgsound", "br",    "col",
                                             "embed", "frame", "hr",       "img",     "input", "keygen",
                                             "link",  "meta",  "param",    "source",  "track", "wbr"};

    std::unordered_set<std::string> inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
                                               "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
                                               "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};

    // List of characters that occur at the start of a token that indicate that
    // the this token is probably *not* a continuation of a word. Set to empty
    // to never mark a token as a continuation of the word.
    // std::string continuationDelimiters = "\n ,.(){}[]";
    std::string continuationDelimiters;

    // Should we always add spaces to the places where tags used to be? I.e.
    // `un<u>der</u>line` should become `un der line`?
    bool substituteInlineTagsWithSpaces = true;
  };

  struct Tag {
    enum NodeType {
      ELEMENT,
      VOID_ELEMENT,
      COMMENT,
      PROCESSING_INSTRUCTION,
      WHITESPACE,  // negative space
    };

    NodeType type;  // Type of the node
    std::string name;
    std::string attributes;
    std::string data;  // Raw data of an element that just needs to be
                       // copied as is, e.g. <script> or <style>
    // TODO: replace with string_view if input lives that long
    NodeType type;           // Type of the node
    std::string name;        // Tag name (if type is ELEMENT or VOID_ELEMENT)
    std::string attributes;  // Tag attributes (as raw HTML string, including
                             // entities and prefix whitespace)
    std::string data;        // Raw data of an element that just needs to be
                             // copied as is, e.g. <script> or <style>
    // @TODO: if the original HTML stays in memory, we could replace
    // `attributes` and `data` with string_views pointing to it.
  };

  typedef std::vector<Tag *> Taint;
  using Taint = std::vector<Tag *>;

  struct Span {
    size_t begin;
    size_t end;
    Taint tags;  // Note: free pointer! Lifetime of tags is managed by pool_
    Taint tags;  // Note: free pointers! Lifetime of tags is managed by pool_
    inline size_t size() const { return end - begin; }
  };

  explicit HTML(std::string &&source, bool process_markup);
  explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
  explicit HTML(std::string &&source, bool process_markup, Options &&options);
  void restore(Response &response);

 private:
  using SpanIterator = std::vector<HTML::Span>::const_iterator;
  using AnnotatedText = marian::bergamot::AnnotatedText;

  AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
  AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
  void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
                 std::vector<HTML::SpanIterator> const &sourceTokenSpans,
                 std::vector<HTML::SpanIterator> &targetTokenSpans);
  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
  bool isContinuation(string_view prev, string_view str);
  // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
  // in Taints. Pointer is valid as long as this HTML instance lives on.
  Tag *makeTag(Tag &&tag);

  Options options_;

  // List of text spans, and which tags are applied to them
  std::vector<Span> spans_;

  // a pool of tags that we free when HTML goes out of scope
  std::vector<std::unique_ptr<Tag>> pool_;
  std::forward_list<Tag> pool_;
};

} // namespace bergamot
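The new `Options` struct makes the previously hard-coded element lists user-configurable. A hedged example of how a caller might tweak them; the chosen tag names are illustrative, and the include path is an assumption:

```cpp
#include <string>
#include <utility>

#include "html.h"  // assumed include path; adjust to your build

marian::bergamot::HTML makeWrapper(std::string &&page) {
  marian::bergamot::HTML::Options options;
  options.inlineTags.insert("samp");               // treat <samp> like the other inline markup
  options.voidTags.insert("spacer");               // a legacy element that never has a closing tag
  options.continuationDelimiters = "\n ,.(){}[]";  // opt in to the word-continuation heuristic
  return marian::bergamot::HTML(std::move(page), /*process_markup=*/true, std::move(options));
}
```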