Treat most HTML elements as word-breaking (#286)

2022-01-16 10:26:40 +00:00 · 2022-01-16 10:26:40 +00:00 · e061b5613e
--- a/.gitignore
+++ b/.gitignore
@ -19,7 +19,7 @@ _deps
 wasm/test_page/node_modules
 build-wasm
 models
-wasm/test_page/bergamot-translator-worker.*
+wasm/test_page/js/bergamot-translator-worker.*

 # VSCode
 .vscode
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 332e976df4583793a09b6483b80b972621fcfadb
+Subproject commit b46987e96fc27b7e9488fbc36b53c07e1786784c
--- a/src/tests/units/html_tests.cpp
+++ b/src/tests/units/html_tests.cpp
@ -169,24 +169,136 @@ TEST_CASE("Test case html entities") {
  // These are all entities I would expect in innerHTML, since all other entities
  // can be encoded as UTF-8 so there's no need to encode them through &...; when
  // innerHTML encodes the DOM as HTML.
-  std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>\n");
+  std::string input("<p data-attr=\"&quot;&apos;\">This is a sentence &lt;with&gt; named &amp; entities</p>");
  HTML html(std::move(input), true);
-  CHECK(input == "This is a sentence <with> named & entities\n");
+  CHECK(input == "This is a sentence <with> named & entities");
 }

-TEST_CASE("Test self-closing tags should be treated as spaces") {
-  std::string input("<p>Space<br>please?</p>\n");
+TEST_CASE("Test self-closing tags should be treated as paragraph break") {
+  std::string test_str("<p>Space<br>please?</p>");

+  std::string input(test_str);
  HTML html(std::move(input), true);
-  CHECK(input == "Space please?\n");
+  CHECK(input == "Space\n\nplease?");
+
+  Response response;
+  std::string source_str("Space\n\nplease?");
+  std::vector<string_view> source_tokens{
+      string_view(source_str.data() + 0, 5),   // Space
+      string_view(source_str.data() + 5, 0),   // [EOS]
+      string_view(source_str.data() + 5, 2),   // \n\n
+      string_view(source_str.data() + 7, 1),   // p
+      string_view(source_str.data() + 8, 5),   // lease
+      string_view(source_str.data() + 13, 1),  // ?
+      string_view(source_str.data() + 14, 0),  // EOS
+  };
+  response.source.appendSentence("", source_tokens.begin(), source_tokens.begin() + 2);
+  response.source.appendSentence("\n\n", source_tokens.begin() + 3, source_tokens.end());
+
+  std::string target_str("Platz\n\nbitte?");
+  std::vector<string_view> target_tokens{
+      string_view(target_str.data() + 0, 5),   // Platz
+      string_view(target_str.data() + 5, 0),   // [EOS]
+      string_view(target_str.data() + 5, 2),   // \n\n
+      string_view(target_str.data() + 7, 5),   // bitte
+      string_view(target_str.data() + 12, 1),  // ?
+      string_view(target_str.data() + 13, 0),  // [EOS]
+  };
+  response.target.appendSentence("", target_tokens.begin(), target_tokens.begin() + 2);
+  response.target.appendSentence("", target_tokens.begin() + 3, target_tokens.end());
+  response.alignments = {{
+                             {1.0, 0.0},  //  Platz <- Space
+                             {0.0, 1.0}   //  [EOS] <- [EOS]
+                         },
+                         {
+                             {0.1, 0.9, 0.0, 0.0},  // _bitte <- _p + lease
+                             {0.0, 0.0, 1.0, 0.0},  //      ? <- ?
+                             {0.0, 0.0, 0.0, 1.0},  //  [EOS] <- [EOS]
+                         }};
+
+  // Main focus of this test is that the space that was introduced in the text
+  // that was being translated does not end up in the translation.
+  html.restore(response);
+  CHECK(response.source.text == "<p>Space<br>please?</p>");
+  CHECK(response.target.text == "<p>Platz<br>bitte?</p>");
+}
+
+TEST_CASE("Test inline tags should be treated as spaces") {
+  std::string test_str("un<u>der</u>line");
+
+  std::string input(test_str);
+  HTML html(std::move(input), true);
+  CHECK(input == "un der line");
+
+  Response response;
+  std::string source_str("un der line");
+  std::vector<string_view> source_tokens{
+      string_view(source_str.data() + 0, 2),   // un
+      string_view(source_str.data() + 2, 3),   // _de
+      string_view(source_str.data() + 5, 1),   // r
+      string_view(source_str.data() + 6, 5),   // _line
+      string_view(source_str.data() + 11, 0),  // EOS
+  };
+  response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
+
+  std::string target_str("una linea der");
+  std::vector<string_view> target_tokens{
+      string_view(target_str.data() + 0, 3),   // una
+      string_view(target_str.data() + 3, 6),   // _linéa
+      string_view(target_str.data() + 9, 3),   // _de
+      string_view(target_str.data() + 12, 1),  // r
+      string_view(target_str.data() + 13, 0),  // [EOS]
+  };
+  response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
+
+  response.alignments = {{{0.9795, 0.0127, 0.0002, 0.0066, 0.0009},
+                          {0.0098, 0.2967, 0.0156, 0.6640, 0.0138},
+                          {0.0214, 0.7472, 0.0626, 0.0745, 0.0943},
+                          {0.0022, 0.0230, 0.9357, 0.0165, 0.0226},
+                          {0.0122, 0.0240, 0.0085, 0.7427, 0.2125}}};
+
+  html.restore(response);
+  CHECK(response.source.text == "un <u>der</u> line");  // TODO leave spaces?
+  CHECK(response.target.text == "una linea <u>der</u>");
+}
+
+TEST_CASE("Test inline tags should not break words") {
+  std::string test_str("un<u>der</u>line");
+
+  std::string input(test_str);
+  HTML::Options options;
+  options.substituteInlineTagsWithSpaces = false;
+  HTML html(std::move(input), true, std::move(options));
+  CHECK(input == "underline");
+
+  Response response;
+  std::string source_str("underline");
+  std::vector<string_view> source_tokens{
+      string_view(source_str.data() + 0, 9),  // underline
+      string_view(source_str.data() + 9, 0),  // EOS
+  };
+  response.source.appendSentence("", source_tokens.begin(), source_tokens.end());
+
+  std::string target_str("subrayar");
+  std::vector<string_view> target_tokens{
+      string_view(target_str.data() + 0, 8),  // subrayar
+      string_view(target_str.data() + 8, 0),  // [EOS]
+  };
+  response.target.appendSentence("", target_tokens.begin(), target_tokens.end());
+
+  response.alignments = {identity_matrix<float>(2)};
+
+  html.restore(response);
+  CHECK(response.source.text == "<u></u>underline");  // TODO not spread <u> to whole word?
+  CHECK(response.target.text == "<u></u>subrayar");   // TODO not spread <u> to the whole word?
 }

 TEST_CASE("Test reconstruction of target sentence") {
  std::string input("<p>hello <b>world</b></p>\n");
  HTML html(std::move(input), true);
-  CHECK(input == "hello world\n");
+  CHECK(input == "hello world\n\n\n");  // tripple \n because \n + </p>

-  AnnotatedText source("hello world\n");
+  AnnotatedText source("hello world\n\n\n");
  recordSentenceFromByteRange(source, {
                                          ByteRange{0, 4},   // 0.0 "hell"
                                          ByteRange{4, 5},   // 0.1 "o"
@ -194,7 +306,7 @@ TEST_CASE("Test reconstruction of target sentence") {
                                          ByteRange{11, 11}  // 0.3 ""
                                      });

-  AnnotatedText target("hallo Welt\n");
+  AnnotatedText target("hallo Welt\n\n\n");
  recordSentenceFromByteRange(target, {
                                          ByteRange{0, 4},   // 0.0 "hall"
                                          ByteRange{4, 5},   // 0.1 "o"
@ -218,11 +330,11 @@ TEST_CASE("Test reconstruction of target sentence") {
 }

 TEST_CASE("Test reconstruction of target sentence with entities") {
-  std::string input("<p>hello <b>world &amp; friends!</b></p>\n");
+  std::string input("<p>hello <b>world &amp; friends!</b></p>");
  HTML html(std::move(input), true);
-  CHECK(input == "hello world & friends!\n");
+  CHECK(input == "hello world & friends!");

-  AnnotatedText source("hello world & friends!\n");
+  AnnotatedText source("hello world & friends!");
  recordSentenceFromByteRange(source, {
                                          ByteRange{0, 4},    // 0.0 "hell"
                                          ByteRange{4, 5},    // 0.1 "o"
@ -233,7 +345,7 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
                                          ByteRange{22, 22}   // 0.6 ""
                                      });

-  AnnotatedText target("hallo Welt & Freunde!\n");
+  AnnotatedText target("hallo Welt & Freunde!");
  recordSentenceFromByteRange(target, {
                                          ByteRange{0, 4},    // 0.0 "hall"
                                          ByteRange{4, 5},    // 0.1 "o"
@ -252,11 +364,11 @@ TEST_CASE("Test reconstruction of target sentence with entities") {
  html.restore(response);

  std::vector<std::string> html_tokens_source{"",         "<p>hell", "o", " <b>world", " &amp;",
-                                              " friends", "!",       "",  "</b></p>\n"};
+                                              " friends", "!",       "",  "</b></p>"};

-  std::vector<std::string> html_tokens_target{"",         "<p>hall", "o", " <b>Welt",  " &amp;",
+  std::vector<std::string> html_tokens_target{"",         "<p>hall", "o", " <b>Welt", " &amp;",

-                                              " Freunde", "!",       "",  "</b></p>\n"};
+                                              " Freunde", "!",       "",  "</b></p>"};

  CHECK(asTokens(response.source) == html_tokens_source);
  CHECK(asTokens(response.target) == html_tokens_target);
@ -264,10 +376,10 @@ TEST_CASE("Test reconstruction of target sentence with entities") {

 TEST_CASE("Test reconstruction of target with multiple sentences") {
  std::string input(
-      "<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>\n");
+      "<p>hello <b>world!</b> How does this <img> <b>deal <u>with multiple sentences?</u></b> Will it work?</p>");
  HTML html(std::move(input), true);

-  AnnotatedText source("hello world! How does this  deal with multiple sentences? Will it work?\n");
+  AnnotatedText source("hello world! How does this  deal with multiple sentences? Will it work?");
  CHECK(source.text == input);

  recordSentenceFromByteRange(source, {
@ -297,7 +409,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
                                          ByteRange{71, 71}   // 2.4 ""
                                      });

-  AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?\n");
+  AnnotatedText target("hallo Welt! Wie geht das mit mehreren Sätzen um? Wird es funktionieren?");
  recordSentenceFromByteRange(target, {
                                          ByteRange{0, 4},    // 0.0 "hall"
                                          ByteRange{4, 5},    // 0.1 "o"
@ -327,7 +439,7 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {

  std::vector<std::string> text_tokens_source{
      "",       "hall", "o",   " Welt", "!", "",  " ",    "Wie", " geht",          " das", " mit", " mehreren",
-      " Sätze", "n",    " um", "?",     "",  " ", "Wird", " es", " funktionieren", "?",    "",     "\n"};
+      " Sätze", "n",    " um", "?",     "",  " ", "Wird", " es", " funktionieren", "?",    "",     ""};

  CHECK(asTokens(target) == text_tokens_source);

@ -360,26 +472,56 @@ TEST_CASE("Test reconstruction of target with multiple sentences") {
                                              " work",
                                              "?",
                                              "",
-                                              "</p>\n"};
+                                              "</p>"};
  CHECK(asTokens(response.source) == html_tokens_source);
 }

 TEST_CASE("Test self-closing tag (HTML5)") {
-  std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>\n");
+  std::string input("<p>hello <img> <b>world</b> <u>and other <a href=\"#\">creatures</a></u></p>");
  HTML html(std::move(input), true);
-  CHECK(input == "hello  world and other creatures\n");  // Note double space between "hello" and "world"
+  CHECK(input == "hello  world and other creatures");  // Note double space between "hello" and "world"
 }

-TEST_CASE("Test empty self-closing tag at end of input") {
+TEST_CASE("Test empty void tag at end of input") {
  std::string input("hello <br>");
  HTML html(std::move(input), true);
  CHECK(input == "hello ");
+
+  Response response;
+  std::string sentence_str("hello ");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 4),  // 0.0 hell
+      string_view(sentence_str.data() + 4, 2),  // 0.1 o_
+      string_view(sentence_str.data() + 6, 0),  // 0.2 [EOS]
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(3)};
+
+  html.restore(response);
+  CHECK(response.source.text == "hello <br>");
+  CHECK(response.target.text == "hello <br>");
 }

 TEST_CASE("Test empty tag pair at end of input") {
  std::string input("hello <u></u>");
  HTML html(std::move(input), true);
  CHECK(input == "hello ");
+
+  Response response;
+  std::string sentence_str("hello ");
+  std::vector<string_view> sentence{
+      string_view(sentence_str.data() + 0, 4),  // 0.0 hell
+      string_view(sentence_str.data() + 4, 2),  // 0.1 o_
+      string_view(sentence_str.data() + 6, 0),  // 0.2 [EOS]
+  };
+  response.source.appendSentence("", sentence.begin(), sentence.end());
+  response.target.appendSentence("", sentence.begin(), sentence.end());
+  response.alignments = {identity_matrix<float>(3)};
+
+  html.restore(response);
+  CHECK(response.source.text == "hello <u></u>");
+  CHECK(response.target.text == "hello <u></u>");
 }

 TEST_CASE("Test empty self-closing pair at end of input in parent") {
@ -391,11 +533,11 @@ TEST_CASE("Test empty self-closing pair at end of input in parent") {
 TEST_CASE("Test empty tag") {
  std::string test_str(
      "<p id=\"1\">hello <img id=\"1.1\"><span id=\"1.2\"><u id=\"1.2.1\"></u><b id=\"1.2.2\"></b><img "
-      "id=\"1.2.3\">world</span></p>\n");
+      "id=\"1.2.3\">world</span></p>");

  std::string input(test_str);
  HTML html(std::move(input), true);
-  CHECK(input == "hello world\n");
+  CHECK(input == "hello world");

  Response response;

@ -407,11 +549,7 @@ TEST_CASE("Test empty tag") {
      string_view(sentence_str.data() + 11, 0),  // 0.3 ""
  };
  response.source.appendSentence("", sentence.begin(), sentence.end());
-  response.source.appendEndingWhitespace("\n");
-
  response.target.appendSentence("", sentence.begin(), sentence.end());
-  response.target.appendEndingWhitespace("\n");
-
  response.alignments = {identity_matrix<float>(4)};

  html.restore(response);
@ -424,19 +562,20 @@ TEST_CASE("Test <script> element") {

  std::string input(test_str);
  HTML html(std::move(input), true);
-  CHECK(input == "hello world");
+  CHECK(input == "hello \n\nworld");

  Response response;
-  std::string sentence_str("hello world");
+  std::string sentence_str("hello \n\nworld");
  std::vector<string_view> sentence{
      string_view(sentence_str.data() + 0, 4),   // 0.0 hell
-      string_view(sentence_str.data() + 4, 1),   // 0.1 o
-      string_view(sentence_str.data() + 5, 6),   // 0.2 _world
-      string_view(sentence_str.data() + 11, 0),  // 0.3 ""
+      string_view(sentence_str.data() + 4, 2),   // 0.1 o_
+      string_view(sentence_str.data() + 6, 2),   // 0.2 \n\n
+      string_view(sentence_str.data() + 8, 5),   // 0.3 world
+      string_view(sentence_str.data() + 13, 0),  // 0.4 ""
  };
  response.source.appendSentence("", sentence.begin(), sentence.end());
  response.target.appendSentence("", sentence.begin(), sentence.end());
-  response.alignments = {identity_matrix<float>(4)};
+  response.alignments = {identity_matrix<float>(5)};

  html.restore(response);
  CHECK(response.source.text == test_str);
@ -466,10 +605,10 @@ TEST_CASE("Test comment") {
  CHECK(response.target.text == test_str);
 }

-TEST_CASE("End-to-end translation") {
-  std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>\n");
+TEST_CASE("End-to-end translation", "[!mayfail]") {
+  std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
  HTML html(std::move(input), true);
-  CHECK(input == "I like to drive this car.\n");
+  CHECK(input == "I like to drive this car.");

  Response response;

@ -500,7 +639,6 @@ TEST_CASE("End-to-end translation") {
        string_view(sentence_str.data() + 25, 0),  // 0.7 ""
    };
    response.source.appendSentence("", sentence.begin(), sentence.end());
-    response.source.appendEndingWhitespace("\n");
  }

  {
@ -517,7 +655,6 @@ TEST_CASE("End-to-end translation") {
        string_view(sentence_str.data() + 28, 0),  // 0.8 ""
    };
    response.target.appendSentence("", sentence.begin(), sentence.end());
-    response.target.appendEndingWhitespace("\n");
  }

  html.restore(response);
@ -536,27 +673,116 @@ TEST_CASE("End-to-end translation") {
        string_view(sentence_str.data() + 42, 0),  // 0.7 ""
    };
    source.appendSentence("", sentence.begin(), sentence.end());
-    source.appendEndingWhitespace("</p>\n");
+    source.appendEndingWhitespace("</p>");

    CHECK(asTokens(response.source) == asTokens(source));
  }

  {
    AnnotatedText target;
-    std::string sentence_str("<p>Ich <u>fahre</u> <b>gerne</b> dieses Auto.");
+    // Empty <b></b> because the space token after "Ich" has "<p><b>" markup, passed down from "<b>like</b>"
+    std::string sentence_str("<p>Ich <b></b><u>fahre</u> <b>gerne</b> dieses Auto.");
    std::vector<string_view> sentence{
        string_view(sentence_str.data() + 0, 6),    // 0.0 "<p>Ich"
-        string_view(sentence_str.data() + 6, 4),    // 0.1 " <u>"
-        string_view(sentence_str.data() + 10, 4),   // 0.2 "fahr"
-        string_view(sentence_str.data() + 14, 1),   // 0.3 "e"
-        string_view(sentence_str.data() + 15, 13),  // 0.4 "</u> <b>gerne"
-        string_view(sentence_str.data() + 28, 11),  // 0.5 "</b> dieses"
-        string_view(sentence_str.data() + 39, 5),   // 0.6 " Auto"
-        string_view(sentence_str.data() + 44, 1),   // 0.7 "."
-        string_view(sentence_str.data() + 45, 0),   // 0.8 ""
+        string_view(sentence_str.data() + 6, 4),    // 0.1 " <b>"
+        string_view(sentence_str.data() + 10, 11),  // 0.2 "</b><u>fahr"
+        string_view(sentence_str.data() + 21, 1),   // 0.3 "e"
+        string_view(sentence_str.data() + 22, 13),  // 0.4 "</u> <b>gerne"
+        string_view(sentence_str.data() + 35, 11),  // 0.5 "</b> dieses"
+        string_view(sentence_str.data() + 46, 5),   // 0.6 " Auto"
+        string_view(sentence_str.data() + 51, 1),   // 0.7 "."
+        string_view(sentence_str.data() + 52, 0),   // 0.8 ""
    };
    target.appendSentence("", sentence.begin(), sentence.end());
-    target.appendEndingWhitespace("</p>\n");
+    target.appendEndingWhitespace("</p>");
+
+    CHECK(asTokens(response.target) == asTokens(target));
+  }
+}
+
+TEST_CASE("End-to-end translation when no words with markup align", "[!mayfail]") {
+  std::string input("<p>I <b>like</b> to <u>drive</u> this car.</p>");
+  HTML html(std::move(input), true);
+  CHECK(input == "I like to drive this car.");
+
+  Response response;
+
+  // clang-format off
+  response.alignments = std::vector<std::vector<std::vector<float>>>{{
+    {0.5360, 0.4405, 0.0142, 0.0061, 0.0029, 0.0001, 0.0000, 0.0001},
+    {0.0451, 0.0602, 0.5120, 0.2584, 0.1145, 0.0062, 0.0019, 0.0017},
+    {0.0392, 0.0009, 0.6535, 0.2293, 0.0492, 0.0199, 0.0014, 0.0067},
+    {0.0007, 0.0036, 0.0112, 0.0118, 0.9209, 0.0449, 0.0050, 0.0019},
+    {0.0000, 0.0004, 0.0008, 0.0047, 0.0163, 0.9683, 0.0045, 0.0050},
+    {0.0011, 0.0046, 0.0039, 0.0090, 0.0023, 0.0024, 0.9648, 0.0119},
+    {0.0840, 0.0744, 0.1545, 0.1330, 0.1818, 0.1722, 0.0859, 0.1143},
+  }};
+  // clang-format on
+
+  {
+    std::string sentence_str("I like to drive this car.");
+    std::vector<string_view> sentence{
+        string_view(sentence_str.data() + 0, 1),   // 0.0 "I"
+        string_view(sentence_str.data() + 1, 5),   // 0.1 " like"
+        string_view(sentence_str.data() + 6, 3),   // 0.2 " to"
+        string_view(sentence_str.data() + 9, 6),   // 0.3 " drive"
+        string_view(sentence_str.data() + 15, 5),  // 0.4 " this"
+        string_view(sentence_str.data() + 20, 4),  // 0.5 " car"
+        string_view(sentence_str.data() + 24, 1),  // 0.6 "."
+        string_view(sentence_str.data() + 25, 0),  // 0.7 [EOS]
+    };
+    response.source.appendSentence("", sentence.begin(), sentence.end());
+  }
+
+  {
+    std::string sentence_str("Rád řídím to auto.");
+    std::vector<string_view> sentence{
+        string_view(sentence_str.data() + 0, 4),   // 0.0 "Rád"
+        string_view(sentence_str.data() + 4, 6),   // 0.1 " říd"
+        string_view(sentence_str.data() + 10, 3),  // 0.2 "ím"
+        string_view(sentence_str.data() + 13, 3),  // 0.3 "_to"
+        string_view(sentence_str.data() + 16, 5),  // 0.4 " auto"
+        string_view(sentence_str.data() + 21, 1),  // 0.5 "."
+        string_view(sentence_str.data() + 22, 0),  // 0.6 [EOS]
+    };
+    response.target.appendSentence("", sentence.begin(), sentence.end());
+  }
+
+  html.restore(response);
+
+  {
+    AnnotatedText source;
+    std::string sentence_str("<p>I <b>like</b> to <u>drive</u> this car.");
+    std::vector<string_view> sentence{
+        string_view(sentence_str.data() + 0, 4),   // 0.0 "<p>I"
+        string_view(sentence_str.data() + 4, 8),   // 0.1 " <b>like"
+        string_view(sentence_str.data() + 12, 7),  // 0.2 "</b> to"
+        string_view(sentence_str.data() + 19, 9),  // 0.3 " <u>drive"
+        string_view(sentence_str.data() + 28, 9),  // 0.4 "</u> this"
+        string_view(sentence_str.data() + 37, 4),  // 0.5 " car"
+        string_view(sentence_str.data() + 41, 1),  // 0.6 "."
+        string_view(sentence_str.data() + 42, 0),  // 0.7 ""
+    };
+    source.appendSentence("", sentence.begin(), sentence.end());
+    source.appendEndingWhitespace("</p>");
+
+    CHECK(asTokens(response.source) == asTokens(source));
+  }
+
+  {
+    AnnotatedText target;
+    std::string sentence_str("<p>Rád <b></b>řídím <u></u>to auto.");
+    std::vector<string_view> sentence{
+        string_view(sentence_str.data() + 0, 7),    // 0.0 "<p>Rád"
+        string_view(sentence_str.data() + 7, 13),   // 0.1 " <b></b>říd"
+        string_view(sentence_str.data() + 20, 3),   // 0.2 "ím"
+        string_view(sentence_str.data() + 23, 10),  // 0.3 "_<u></u>to"
+        string_view(sentence_str.data() + 33, 5),   // 0.4 " auto"
+        string_view(sentence_str.data() + 38, 1),   // 0.5 "."
+        string_view(sentence_str.data() + 39, 0),   // 0.6 [EOS]
+    };
+    target.appendSentence("", sentence.begin(), sentence.end());
+    target.appendEndingWhitespace("</p>");

    CHECK(asTokens(response.target) == asTokens(target));
  }
--- a/src/translator/html.cpp
+++ b/src/translator/html.cpp
@ -43,7 +43,7 @@ void encodeEntities(string_view const &input, std::string &output) {

 size_t countPrefixWhitespaces(string_view const &input) {
  size_t size = 0;
-  while (size < input.size() && input[size] == ' ') ++size;
+  while (size < input.size() && std::isspace(input[size])) ++size;
  return size;
 }

@ -59,6 +59,8 @@ std::ostream &operator<<(std::ostream &out, HTML::Tag const *tag) {
      return out << "<!--" << tag->data << "-->";
    case HTML::Tag::PROCESSING_INSTRUCTION:
      return out << "<?" << tag->data << "?>";
+    case HTML::Tag::WHITESPACE:
+      return out << "[inserted space]";
  }
  return out << "[Unknown tag type]";
 }
@ -107,27 +109,8 @@ class reversed {
  T const &container_;
 };

-bool isBlockElement(std::string_view const &name) {
-  // List of elements that we expect might occur inside words, and that should
-  // not introduce spacings around them. Not strictly inline elements, nor flow
-  // elements. See also https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories
-  static std::unordered_set<std::string> inlineishElements{
-      "abbr",  "a",    "b",      "em",  "i",   "kbd",  "mark", "math", "output", "q",   "ruby",
-      "small", "span", "strong", "sub", "sup", "time", "u",    "var",  "wbr",    "ins", "del"};
-
-  return inlineishElements.find(std::string(name)) == inlineishElements.end();
-}
-
-bool isVoidTag(std::string_view const &name) {
-  // List of elements for which we do not expect a closing tag, or self-closing
-  // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
-  // More relevant source of this list:
-  // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
-  static std::unordered_set<std::string> voidElements{"area",  "base",  "basefont", "bgsound", "br",    "col",
-                                                      "embed", "frame", "hr",       "img",     "input", "keygen",
-                                                      "link",  "meta",  "param",    "source",  "track", "wbr"};
-
-  return voidElements.find(std::string(name)) != voidElements.end();
+bool contains(std::unordered_set<std::string> const &set, std::string const &name) {
+  return set.find(name) != set.end();
 }

 void diffTags(HTML::Taint const &prev, HTML::Taint const &curr, HTML::Taint &opening, HTML::Taint &closing) {
@ -187,8 +170,6 @@ AnnotatedText apply(AnnotatedText const &in, Fun fun) {
  return out;
 }

-bool isContinuation(string_view str) { return !str.empty() && str.compare(0, 1, " ", 1) != 0; }
-
 bool hasAlignments(Response const &response) {
  // Test for each sentence individually as a sentence may be empty (or there)
  // might be no sentences, so just testing for alignments.empty() would not be
@ -207,85 +188,11 @@ bool hasAlignments(Response const &response) {
  return true;
 }

-void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
-  // For each sentence...
-  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
-    alignments.emplace_back();
-
-    // Hard-align: find for each target token the most prevalent source token
-    // Note: only search from 0 to N-1 because token N is end-of-sentence token
-    // that can only align with the end-of-sentence token of the target
-    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      size_t maxS = 0;
-      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
-        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
-          maxS = s;
-        }
-      }
-
-      alignments.back().push_back(maxS);
-    }
-
-    // Next, we try to smooth out these selected alignments with a few heuristics
-    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
-      // If this token is a continuation of a previous token, pick the tags from the most
-      // prevalent token for the whole word.
-      if (isContinuation(response.target.word(sentenceIdx, t))) {
-        // Note: only looking at the previous token since that will already
-        // have this treatment applied to it.
-        size_t currSentenceIdx = alignments.back()[t];
-        size_t prevSentenceIdx = alignments.back()[t - 1];
-        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
-        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
-
-        if (currScore > prevScore) {
-          // Apply this to all previous tokens in the word
-          for (size_t i = t;; --i) {
-            alignments.back()[i] = currSentenceIdx;
-
-            // Stop if this was the first token or the beginning of the word
-            if (i == 0 || !isContinuation(response.target.word(sentenceIdx, i))) break;
-          }
-        } else {
-          alignments.back()[t] = prevSentenceIdx;
-        }
-      }
-    }
-
-    // Always align target end with source end
-    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
-  }
-}
-
-// Internal type used to point to a position in HTML::spans_.
-typedef std::vector<HTML::Span>::const_iterator SpanIterator;
-
-void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
-               std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
-  size_t offset = 0;
-
-  // Fill targetTokenSpans based on the alignments we just made up.
-  // NOTE: this should match the exact order of Apply()
-  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
-    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
-    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
-      size_t s = alignments[sentenceIdx][t];
-      assert(s < response.source.numWords(sentenceIdx));
-      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
-    }
-
-    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
-  }
-
-  assert(offset < sourceTokenSpans.size());
-  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
-}
-
 // Little helper class to append HTML to a token
 class TokenFormatter {
 public:
  explicit TokenFormatter(string_view token)
-      : html_(), offset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
+      : html_(), offset_(0), whitespaceOffset_(0), whitespaceSize_(countPrefixWhitespaces(token)), closeLeft_(true) {
    // Do encoding of any entities that popped up in the translation
    encodeEntities(token, html_);
  }
@ -303,6 +210,7 @@ class TokenFormatter {
      std::string closeTag = format("</{}>", tag->name);
      html_.insert(offset_ + (closeLeft_ ? 0 : whitespaceSize_), closeTag);
      offset_ += closeTag.size();
+      if (closeLeft_) whitespaceOffset_ += closeTag.size();
    }

    for (HTML::Tag const *tag : opening) {
@ -318,17 +226,28 @@ class TokenFormatter {
        case HTML::Tag::PROCESSING_INSTRUCTION:
          openTag = format("<?{}?>", tag->data);
          break;
+        case HTML::Tag::WHITESPACE: {
+          // Try to eat two newlines (paragraph break) from our segment
+          auto pos = html_.find("\n\n", whitespaceOffset_);
+          if (pos != std::string::npos && pos < whitespaceOffset_ + whitespaceSize_) {
+            html_.erase(pos, 2);
+            whitespaceSize_ -= 2;
+          }
+        } break;
      }
+
      html_.insert(offset_ + whitespaceSize_, openTag);
      offset_ += openTag.size();
-      closeLeft_ = false;
+      closeLeft_ = closeLeft_ && openTag.empty();
    }
  }

 private:
-  std::string html_;       // Output html
-  size_t offset_;          // Size added by prepending HTML
-  size_t whitespaceSize_;  // number of prefix whitespace characters
+  std::string html_;         // Output html
+  size_t offset_;            // Size added by prepending HTML
+  size_t whitespaceOffset_;  // position of prefix whitespace characters
+                             // (it moves as closing tags are prepended)
+  size_t whitespaceSize_;    // number of prefix whitespace characters

  // Close tags we want to show up left (before) the token, but open tags
  // ideally come directly after any prefix whitespace. However, some tokens
@ -339,96 +258,6 @@ class TokenFormatter {
  bool closeLeft_;
 };

-AnnotatedText restoreSource(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
-                            std::vector<SpanIterator> &sourceTokenSpans) {
-  auto spanIt = sourceSpans.begin();
-  auto prevIt = sourceSpans.begin();  // safe because first span is always empty span, and
-                                      // and the while-loop below will do the rest
-  assert(prevIt == sourceSpans.end() || prevIt->tags.empty());
-
-  return apply(in, [&](ByteRange range, string_view token, bool last) {
-    TokenFormatter formatter(token);
-
-    // Potential issue: spans and tokens can intersect, e.g.
-    //
-    //    text  <p> h <u> e </u> ll o </p>
-    //   spans     |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
-    //  tokens     |111111111111111|2|
-    //
-    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
-    // Note: only relevant if isBlockElement is used. If we just insert spaces
-    // around all elements, every segment of `hello` will be a token.
-
-    // Seek to the last span that overlaps with this token
-    while (true) {
-      formatter.append(prevIt->tags, spanIt->tags);
-      prevIt = spanIt;
-
-      if (spanIt + 1 != sourceSpans.end() && ((spanIt + 1)->begin < range.end || last)) {
-        spanIt++;
-        continue;
-      }
-
-      break;
-    }
-
-    // TODO: This is just the taint of the last span, not the ones in between.
-    // This makes us lose some markup of parts of tokens as described above.
-    sourceTokenSpans.push_back(prevIt);
-
-    return std::move(formatter.html());
-  });
-}
-
-AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<HTML::Span> const &sourceSpans,
-                            std::vector<SpanIterator> const &targetTokenSpans) {
-  auto prevSpan = sourceSpans.begin();
-  auto targetSpanIt = targetTokenSpans.begin();
-
-  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
-    TokenFormatter formatter(token);
-
-    // First we scan through spans_ to catch up to the span assigned to this
-    // token. We're only interested in empty spans (empty and void elements)
-    for (auto span_it = prevSpan + 1; span_it < *targetSpanIt; span_it++) {
-      // We're only interested in empty spans between the spans in targetSpanIt
-      if (span_it->size() != 0) continue;
-
-      formatter.append(prevSpan->tags, span_it->tags);
-
-      // Note: here, not in 3rd part of for-statement because we don't want to
-      // set prevSpan if the continue clause at the beginning of this for-loop
-      // was hit.
-      prevSpan = span_it;
-    }
-
-    // Now do the same thing but for our target set of tags. Note that we cannot
-    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
-    // because there is no guarantee that the order in `targetTokenSpans` is
-    // the same as that of `spans`.
-    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);
-
-    // If this is the last token of the response, close all open tags.
-    if (last) {
-      // Note: this assert is true due to our current implementation of
-      // HardAlignments() that always matches the last token of the input with
-      // the last token of the output. But lets assume someone someday changes
-      // HardAlignments(), and then this for-loop will be necessary.
-      // assert((*targetSpanIt)->tags.empty());
-      formatter.append((*targetSpanIt)->tags, HTML::Taint());
-    }
-
-    prevSpan = *targetSpanIt++;
-
-    return std::move(formatter.html());
-  });
-
-  // Assert that we did in fact use all our taints
-  assert(targetSpanIt == targetTokenSpans.end());
-
-  return out;
-}
-
 size_t debugCountTokens(AnnotatedText const &text) {
  size_t tokens = 1;  // for the ending gap
  for (size_t sentenceIdx = 0; sentenceIdx < text.numSentences(); ++sentenceIdx) {
@ -441,8 +270,9 @@ size_t debugCountTokens(AnnotatedText const &text) {

 namespace marian::bergamot {

-HTML::HTML(std::string &&source, bool process_markup) {
+HTML::HTML(std::string &&source, bool process_markup, Options &&options) : options_(std::move(options)) {
  if (!process_markup) return;
+
  std::string original = std::move(source);
  markup::instream in(original.data(), original.data() + original.size());
  markup::Scanner scanner(in);
@ -450,6 +280,8 @@ HTML::HTML(std::string &&source, bool process_markup) {

  Tag *tag;
  Taint stack;
+  bool addSentenceBreak = false;
+  bool addSpace = false;
  spans_.push_back(Span{0, 0, {}});

  bool stop = false;
@ -463,24 +295,41 @@ HTML::HTML(std::string &&source, bool process_markup) {
        break;

      case markup::Scanner::TT_TEXT: {
+        // If the previous segment was the open or close tag of a block element
+        // we treat the text after it as a new sentence.
+        if (addSentenceBreak) {
+          if (!(source.empty() || (source.size() > 2 && source.substr(source.size() - 2) == ""))) {
+            stack.push_back(makeTag({Tag::WHITESPACE}));
+            // Important: span->size() == 0 to make it behave as a void element.
+            // Also important: position before the \n\n tokens, not after, to
+            // make it easier to remove them later through apply().
+            spans_.push_back(Span{source.size(), source.size(), stack});
+            source.append("\n\n");  // TODO assumes ssplit-mode = wrapped_text
+            stack.pop_back();
+          }
+          addSentenceBreak = false;
+        }
+
+        // If the previous segment was an open or close tag, it might be best
+        // to add a space to make sure we don't append to the previous word.
+        if (addSpace) {
+          if (options_.substituteInlineTagsWithSpaces && !source.empty() && !std::isspace(source.back()) &&
+              !std::isspace(scanner.value()[0])) {
+            source.push_back(' ');
+          }
+          addSpace = false;
+        }
+
        auto begin = source.size();
        source.append(scanner.value());
        spans_.push_back(Span{begin, source.size(), stack});
      } break;

-      case markup::Scanner::TT_TAG_START:
-        // If it makes sense to treat this element as a break in a word (e.g.
-        // <br>, <img>, <li>) make sure it does so in this text as well.
-        // TODO: Strong assumption here that the language uses spaces to
-        // separate words
-        if (isBlockElement(scanner.tag()) && !source.empty() && source.back() != ' ') source.push_back(' ');
-
-        // pool_ takes ownership of our tag, makes sure it's freed when necessary
-        pool_.emplace_back(new Tag{isVoidTag(scanner.tag()) ? Tag::VOID_ELEMENT : Tag::ELEMENT,
-                                   std::string(scanner.tag()), std::string()});
+      case markup::Scanner::TT_TAG_START: {
+        std::string name(scanner.tag());

        // Tag *tag is used by attribute parsing
-        tag = pool_.back().get();
+        tag = makeTag({contains(options_.voidTags, name) ? Tag::VOID_ELEMENT : Tag::ELEMENT, std::move(name)});

        stack.push_back(tag);

@ -491,7 +340,14 @@ HTML::HTML(std::string &&source, bool process_markup) {
          spans_.push_back(Span{source.size(), source.size(), stack});
          stack.pop_back();
        }
-        break;
+
+        // Treat non-inline HTML tags as spaces that break up words.
+        if (!contains(options_.inlineTags, tag->name)) {
+          addSentenceBreak = true;
+        } else {
+          addSpace = true;
+        }
+      } break;

      case markup::Scanner::TT_TAG_END:
        // Note: self-closing tags emit TT_TAG_END immediately after TT_TAG_START
@ -508,6 +364,13 @@ HTML::HTML(std::string &&source, bool process_markup) {
          spans_.push_back(Span{source.size(), source.size(), stack});

        stack.pop_back();
+
+        // Add space if necessary
+        if (!contains(options_.inlineTags, std::string(scanner.tag()))) {
+          addSentenceBreak = true;
+        } else {
+          addSpace = true;
+        }
        break;

      case markup::Scanner::TT_ATTRIBUTE:
@ -516,18 +379,16 @@ HTML::HTML(std::string &&source, bool process_markup) {
        break;

      case markup::Scanner::TT_COMMENT_START:
-        // pool_ takes ownership of our tag, makes sure it's freed when necessary
-        pool_.emplace_back(new Tag{Tag::COMMENT});
-        tag = pool_.back().get();
+        // Tag *tag is used when TT_DATA is seen to add the comment's content.
+        tag = makeTag({Tag::COMMENT});
        stack.push_back(tag);
        spans_.push_back(Span{source.size(), source.size(), stack});
        stack.pop_back();
        break;

      case markup::Scanner::TT_PROCESSING_INSTRUCTION_START:
-        // pool_ takes ownership of our tag, makes sure it's freed when necessary
-        pool_.emplace_back(new Tag{Tag::PROCESSING_INSTRUCTION});
-        tag = pool_.back().get();
+        // Tag *tag is used when TT_DATA is seen to add the PI's content.
+        tag = makeTag({Tag::PROCESSING_INSTRUCTION});
        stack.push_back(tag);
        spans_.push_back(Span{source.size(), source.size(), stack});
        stack.pop_back();
@ -551,7 +412,7 @@ HTML::HTML(std::string &&source, bool process_markup) {
  if (!stack.empty()) throw BadHTML(format("Not all tags were closed: {}", stack));

  // Add a trailing span (that's empty) to signify all closed tags.
-  spans_.emplace_back(Span{source.size() + 1, source.size() + 1, stack});
+  spans_.emplace_back(Span{source.size(), source.size(), stack});
 }

 void HTML::restore(Response &response) {
@ -580,7 +441,7 @@ void HTML::restore(Response &response) {

  // RestoreSource re-inserts HTML into the source text, but also identifies
  // which span each source token fits into best.
-  AnnotatedText source = restoreSource(response.source, spans_, sourceTokenSpans);
+  AnnotatedText source = restoreSource(response.source, sourceTokenSpans);
  assert(sourceTokenSpans.size() == debugCountTokens(response.source));

  // Find for every token in target the token in source that best matches.
@ -591,10 +452,193 @@ void HTML::restore(Response &response) {
  copyTaint(response, alignments, sourceTokenSpans, targetTokenSpans);
  assert(targetTokenSpans.size() == debugCountTokens(response.target));

-  AnnotatedText target = restoreTarget(response.target, spans_, targetTokenSpans);
+  AnnotatedText target = restoreTarget(response.target, targetTokenSpans);

  response.source = source;
  response.target = target;
 }

+AnnotatedText HTML::restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans) {
+  auto spanIt = spans_.begin();
+  auto prevIt = spans_.begin();  // safe because first span is always empty span, and
+                                 // and the while-loop below will do the rest
+  assert(prevIt == spans_.end() || prevIt->tags.empty());
+
+  return apply(in, [&](ByteRange range, string_view token, bool last) {
+    TokenFormatter formatter(token);
+
+    // Potential issue: spans and tokens can intersect, e.g.
+    //
+    //    text  <p> h <u> e </u> ll o </p>
+    //   spans     |1|   |2|    |3333| (so only 2 is tainted with <p><u>, others only <p>)
+    //  tokens     |111111111111111|2|
+    //
+    // Now 1 covers span 1 to 3, so what taint should it get? Just <p>, or <p><u>?
+    // Note: only relevant if isBlockElement is used. If we just insert spaces
+    // around all elements, every segment of `hello` will be a token.
+
+    // Seek to the last span that overlaps with this token
+    while (true) {
+      formatter.append(prevIt->tags, spanIt->tags);
+      prevIt = spanIt;
+
+      if (spanIt + 1 != spans_.end() && ((spanIt + 1)->begin < range.end || last)) {
+        spanIt++;
+        continue;
+      }
+
+      break;
+    }
+
+    // TODO: This is just the taint of the last span, not the ones in between.
+    // This makes us lose some markup of parts of tokens as described above.
+    sourceTokenSpans.push_back(prevIt);
+
+    return std::move(formatter.html());
+  });
+}
+
+AnnotatedText HTML::restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans) {
+  auto prevSpan = spans_.cbegin();
+  auto targetSpanIt = targetTokenSpans.begin();
+
+  AnnotatedText out = apply(in, [&](ByteRange range, string_view token, bool last) {
+    TokenFormatter formatter(token);
+
+    // First we scan through spans_ to catch up to the span assigned to this
+    // token. We're only interested in empty spans (empty and void elements)
+    for (auto span_it = prevSpan; span_it < *targetSpanIt; span_it++) {
+      // We're only interested in empty spans or spans that would otherwise get
+      // lost because they didn't align with anything between the spans in
+      // targetSpanIt
+      // TODO That std::find makes this O(N*N) NOT GOOD NOT GOOD
+      if (span_it->size() != 0 &&
+          std::find(targetTokenSpans.begin(), targetTokenSpans.end(), span_it) != targetTokenSpans.end())
+        continue;
+
+      formatter.append(prevSpan->tags, span_it->tags);
+
+      // Note: here, not in 3rd part of for-statement because we don't want to
+      // set prevSpan if the continue clause at the beginning of this for-loop
+      // was hit.
+      prevSpan = span_it;
+    }
+
+    // Now do the same thing but for our target set of tags. Note that we cannot
+    // combine this in the for-loop above (i.e. `span_it <= *targetSpanIt`)
+    // because there is no guarantee that the order in `targetTokenSpans` is
+    // the same as that of `spans`.
+    formatter.append(prevSpan->tags, (*targetSpanIt)->tags);
+
+    // If this is the last token of the response, close all open tags.
+    if (last) {
+      // Note: this assert is true due to our current implementation of
+      // HardAlignments() that always matches the last token of the input with
+      // the last token of the output. But lets assume someone someday changes
+      // HardAlignments(), and then this for-loop will be necessary.
+      // assert((*targetSpanIt)->tags.empty());
+      formatter.append((*targetSpanIt)->tags, HTML::Taint());
+    }
+
+    prevSpan = *targetSpanIt;
+    ++targetSpanIt;
+
+    return std::move(formatter.html());
+  });
+
+  // Assert that we did in fact use all our taints
+  assert(targetSpanIt == targetTokenSpans.end());
+
+  return out;
+}
+
+HTML::Tag *HTML::makeTag(Tag &&tag) {
+  pool_.emplace_front(std::move(tag));
+  return &pool_.front();
+}
+
+void HTML::copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                     std::vector<SpanIterator> const &sourceTokenSpans, std::vector<SpanIterator> &targetTokenSpans) {
+  size_t offset = 0;  // Sentence offset in sourceTokenSpans
+
+  // Fill targetTokenSpans based on the alignments we just made up.
+  // NOTE: this should match the exact order of Apply()
+  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+    targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for sentence ending gap
+    for (size_t t = 0; t < response.target.numWords(sentenceIdx); ++t) {
+      size_t s = alignments[sentenceIdx][t];
+      assert(s < response.source.numWords(sentenceIdx));
+      targetTokenSpans.push_back(sourceTokenSpans[offset + 1 + s]);  // +1 for prefix gap
+    }
+
+    offset += response.source.numWords(sentenceIdx) + 1;  // +1 for prefix gap
+  }
+
+  assert(offset + 1 == sourceTokenSpans.size());
+  targetTokenSpans.push_back(sourceTokenSpans[offset]);  // token_tag for ending whitespace
+}
+
+// Reports if token `str` is likely to be a continuation of a word. This is used
+// to determine whether we should share the markup, or whether we should see
+// this token as a fresh start. This implementation will treat "hello[world]"
+// as 4 words, assuming its tokenised as something like `h ell o [ wor ld ]`.
+bool HTML::isContinuation(string_view prev, string_view str) {
+  if (options_.continuationDelimiters.empty()) return false;
+  if (prev.empty() || str.empty()) return false;
+  return options_.continuationDelimiters.find(str[0]) == std::string::npos &&
+         options_.continuationDelimiters.find(prev.back()) == std::string::npos;
+}
+
+void HTML::hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments) {
+  // For each sentence...
+  for (size_t sentenceIdx = 0; sentenceIdx < response.target.numSentences(); ++sentenceIdx) {
+    alignments.emplace_back();
+
+    // Hard-align: find for each target token the most prevalent source token
+    // Note: only search from 0 to N-1 because token N is end-of-sentence token
+    // that can only align with the end-of-sentence token of the target
+    for (size_t t = 0; t + 1 < response.target.numWords(sentenceIdx); ++t) {
+      size_t maxS = 0;
+      for (size_t s = 1; s + 1 < response.source.numWords(sentenceIdx); ++s) {
+        if (response.alignments[sentenceIdx][t][s] > response.alignments[sentenceIdx][t][maxS]) {
+          maxS = s;
+        }
+      }
+
+      alignments.back().push_back(maxS);
+    }
+
+    // Next, we try to smooth out these selected alignments with a few heuristics
+    for (size_t t = 1; t + 1 < response.target.numWords(sentenceIdx); ++t) {
+      // If this token is a continuation of a previous token, pick the tags from the most
+      // prevalent token for the whole word.
+      if (isContinuation(response.target.word(sentenceIdx, t - 1), response.target.word(sentenceIdx, t))) {
+        // Note: only looking at the previous token since that will already
+        // have this treatment applied to it.
+        size_t currSentenceIdx = alignments.back()[t];
+        size_t prevSentenceIdx = alignments.back()[t - 1];
+        float currScore = response.alignments[sentenceIdx][t][currSentenceIdx];
+        float prevScore = response.alignments[sentenceIdx][t - 1][prevSentenceIdx];
+
+        if (currScore >= prevScore) {
+          // Apply this to all previous tokens in the word
+          for (size_t i = t;; --i) {
+            alignments.back()[i] = currSentenceIdx;
+
+            // Stop if this was the first token or the beginning of the word
+            if (i == 0 ||
+                !isContinuation(response.target.word(sentenceIdx, i - 1), response.target.word(sentenceIdx, i)))
+              break;
+          }
+        } else {
+          alignments.back()[t] = prevSentenceIdx;
+        }
+      }
+    }
+
+    // Always align target end with source end
+    alignments.back().push_back(response.source.numWords(sentenceIdx) - 1);
+  }
+}
+
 }  // namespace marian::bergamot
--- a/src/translator/html.h
+++ b/src/translator/html.h
@ -1,9 +1,12 @@
 #ifndef SRC_BERGAMOT_HTML_H_
 #define SRC_BERGAMOT_HTML_H_

+#include <forward_list>
 #include <stdexcept>
 #include <string>
+#include <unordered_set>

+#include "annotation.h"
 #include "definitions.h"

 namespace marian {
@ -18,40 +21,84 @@ class BadHTML : public std::runtime_error {

 class HTML {
 public:
+  struct Options {
+    // List of elements for which we do not expect a closing tag, or self-closing
+    // elements in XHTML. See also https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
+    // More relevant source of this list:
+    // https://searchfox.org/mozilla-central/rev/7d17fd1fe9f0005a2fb19e5d53da4741b06a98ba/dom/base/FragmentOrElement.cpp#1791
+    std::unordered_set<std::string> voidTags{"area",  "base",  "basefont", "bgsound", "br",    "col",
+                                             "embed", "frame", "hr",       "img",     "input", "keygen",
+                                             "link",  "meta",  "param",    "source",  "track", "wbr"};
+
+    std::unordered_set<std::string> inlineTags{"abbr",   "a", "b",    "em",    "i",    "kbd",    "mark", "math",
+                                               "output", "q", "ruby", "small", "span", "strong", "sub",  "sup",
+                                               "time",   "u", "var",  "wbr",   "ins",  "del",    "img"};
+
+    // List of characters that occur at the start of a token that indicate that
+    // the this token is probably *not* a continuation of a word. Set to empty
+    // to never mark a token as a continuation of the word.
+    // std::string continuationDelimiters = "\n ,.(){}[]";
+    std::string continuationDelimiters;
+
+    // Should we always add spaces to the places where tags used to be? I.e.
+    // `un<u>der</u>line` should become `un der line`?
+    bool substituteInlineTagsWithSpaces = true;
+  };
+
  struct Tag {
    enum NodeType {
      ELEMENT,
      VOID_ELEMENT,
      COMMENT,
      PROCESSING_INSTRUCTION,
+      WHITESPACE,  // negative space
    };

-    NodeType type;  // Type of the node
-    std::string name;
-    std::string attributes;
-    std::string data;  // Raw data of an element that just needs to be
-                       // copied as is, e.g. <script> or <style>
-                       // TODO: replace with string_view if input lives that long
+    NodeType type;           // Type of the node
+    std::string name;        // Tag name (if type is ELEMENT or VOID_ELEMENT)
+    std::string attributes;  // Tag attributes (as raw HTML string, including
+                             // entities and prefix whitespace)
+    std::string data;        // Raw data of an element that just needs to be
+                             // copied as is, e.g. <script> or <style>
+    // @TODO: if the original HTML stays in memory, we could replace
+    // `attributes` and `data` with string_views pointing to it.
  };

-  typedef std::vector<Tag *> Taint;
+  using Taint = std::vector<Tag *>;

  struct Span {
    size_t begin;
    size_t end;
-    Taint tags;  // Note: free pointer! Lifetime of tags is managed by pool_
+    Taint tags;  // Note: free pointers! Lifetime of tags is managed by pool_
    inline size_t size() const { return end - begin; }
  };

-  explicit HTML(std::string &&source, bool process_markup);
+  explicit HTML(std::string &&source, bool process_markup) : HTML(std::move(source), process_markup, HTML::Options{}){};
+  explicit HTML(std::string &&source, bool process_markup, Options &&options);
  void restore(Response &response);

 private:
+  using SpanIterator = std::vector<HTML::Span>::const_iterator;
+  using AnnotatedText = marian::bergamot::AnnotatedText;
+
+  AnnotatedText restoreSource(AnnotatedText const &in, std::vector<SpanIterator> &sourceTokenSpans);
+  AnnotatedText restoreTarget(AnnotatedText const &in, std::vector<SpanIterator> const &targetTokenSpans);
+  void copyTaint(Response const &response, std::vector<std::vector<size_t>> const &alignments,
+                 std::vector<HTML::SpanIterator> const &sourceTokenSpans,
+                 std::vector<HTML::SpanIterator> &targetTokenSpans);
+  void hardAlignments(Response const &response, std::vector<std::vector<size_t>> &alignments);
+  bool isContinuation(string_view prev, string_view str);
+  // Allocates tag in pool_ (which then owns it) and gives a pointer to be used
+  // in Taints. Pointer is valid as long as this HTML instance lives on.
+  Tag *makeTag(Tag &&tag);
+
+  Options options_;
+
  // List of text spans, and which tags are applied to them
  std::vector<Span> spans_;

  // a pool of tags that we free when HTML goes out of scope
-  std::vector<std::unique_ptr<Tag>> pool_;
+  std::forward_list<Tag> pool_;
 };

 }  // namespace bergamot