Support additional dictionaries for BERT Japanese tokenizers (#6515)
* Update BERT Japanese tokenizers
* Update CircleCI config to download unidic
* Specify to use the latest dictionary packages
Parent: 423eb5b1d7
Commit: 48c6c6139f
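For orientation, a minimal sketch of the API this change introduces (illustrative only, not part of the diff; the import path follows the repository's own tests, and the dictionary packages come from the updated `ja` extra):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

# The new mecab_dic argument selects the bundled dictionary (default: "ipadic").
ipadic_tok = MecabTokenizer(mecab_dic="ipadic")
unidic_lite_tok = MecabTokenizer(mecab_dic="unidic_lite")  # dictionary ships with the pip package
unidic_tok = MecabTokenizer(mecab_dic="unidic")            # needs `python -m unidic download` first
```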
@@ -150,6 +150,7 @@ jobs:
                     - v0.3-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
             - run: pip install .[ja,testing]
+            - run: python -m unidic download
             - save_cache:
                 key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
                 paths:
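The added CI step pre-downloads the full UniDic data: unlike ipadic and unidic_lite, the unidic package ships only a downloader. A sketch of the same precondition checked from Python (hypothetical snippet; it relies only on the `unidic.DICDIR` attribute that the tokenizer code below also uses):

```python
import os

import unidic

# The full UniDic package ships without data; MecabTokenizer(mecab_dic="unidic")
# raises a RuntimeError until the dictionary directory actually exists.
if not os.path.isdir(unidic.DICDIR):
    raise RuntimeError("UniDic data missing; run `python -m unidic download` first.")
```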
setup.py (2 changes)
@@ -65,7 +65,7 @@ if stale_egg_info.exists():
 
 extras = {}
 
-extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
 extras["sklearn"] = ["scikit-learn"]
 
 # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
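With the widened `ja` extra, the `pip install .[ja,testing]` step in the CI job above brings in every dictionary the tokenizer can select. A small, hypothetical helper for checking which of them are importable in a local environment:

```python
import importlib.util

# Report which optional dictionary packages from the updated `ja` extra are available.
for name in ("fugashi", "ipadic", "unidic_lite", "unidic"):
    found = importlib.util.find_spec(name) is not None
    print(f"{name}: {'installed' if found else 'missing'}")
```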
@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
 class MecabTokenizer:
     """Runs basic tokenization with MeCab morphological parser."""
 
-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
         """Constructs a MecabTokenizer.
 
         Args:
             **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
+                Whether to lowercase the input.
             **never_split**: (`optional`) list of str
                 Kept for backward compatibility purposes.
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
+                List of tokens not to split.
             **normalize_text**: (`optional`) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
+            **mecab_dic**: (`optional`) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization.
+                If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
+            **mecab_option**: (`optional`) string
+                String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
         self.never_split = never_split if never_split is not None else []
         self.normalize_text = normalize_text
 
-        import fugashi
-        import ipadic
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )
 
-        # Use ipadic by default (later options can override it)
         mecab_option = mecab_option or ""
-        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+
+                dic_dir = ipadic.DICDIR
+
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+
+                dic_dir = unidic_lite.DICDIR
+
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option
 
         self.mecab = fugashi.GenericTagger(mecab_option)
 
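The new constructor resolves a bundled dictionary to an explicit `-d <dicdir> -r <mecabrc>` option, and falls back to the raw `mecab_option` string when `mecab_dic=None`, which is how the docstring says system-installed dictionaries should be used. A sketch of both styles (illustrative; the system dictionary path below is an assumption, substitute your own MeCab installation's dicdir):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

# 1) Bundled dictionary: the tokenizer builds "-d <DICDIR> -r <mecabrc>" itself.
bundled = MecabTokenizer(mecab_dic="unidic_lite")

# 2) System dictionary: skip the bundled lookup and pass MeCab arguments directly.
#    The directory below is an example path, not something the PR prescribes.
system = MecabTokenizer(
    mecab_dic=None,
    mecab_option="-d /var/lib/mecab/dic/ipadic-utf8",
)
```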
@@ -213,7 +272,7 @@ class MecabTokenizer:
         return tokens
 
 
-class CharacterTokenizer(object):
+class CharacterTokenizer:
     """Runs Character tokenization."""
 
     def __init__(self, vocab, unk_token, normalize_text=True):
@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
             text = unicodedata.normalize("NFKC", text)
 
         output_tokens = []
-        for i, char in enumerate(text):
+        for char in text:
             if char not in self.vocab:
                 output_tokens.append(self.unk_token)
                 continue
@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
 
-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
+    def test_mecab_tokenizer_ipadic(self):
+        tokenizer = MecabTokenizer(mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )
 
+    def test_mecab_tokenizer_unidic_lite(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    def test_mecab_tokenizer_unidic(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
     def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
+        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
 
     def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
+        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
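The new tests also pin down the behavioural difference between the dictionaries: ipadic keeps "アップルストア" as a single token, while both UniDic variants split it into "アップル" and "ストア". Outside the test suite the same comparison looks roughly like this (illustrative snippet, assuming the relevant dictionary packages are installed):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

text = "アップルストアでiPhone8 が 発売された 。"

print(MecabTokenizer(mecab_dic="ipadic").tokenize(text))
# expected per the new tests: ['アップルストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']

print(MecabTokenizer(mecab_dic="unidic_lite").tokenize(text))
# expected per the new tests: ['アップル', 'ストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']
```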