Support additional dictionaries for BERT Japanese tokenizers (#6515)

* Update BERT Japanese tokenizers

* Update CircleCI config to download unidic

* Specify to use the latest dictionary packages
Masatoshi Suzuki 2020-08-17 13:00:23 +09:00 committed by GitHub
Parent 423eb5b1d7
Commit 48c6c6139f
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 15 deletions
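In short, MecabTokenizer gains a `mecab_dic` argument that selects which bundled MeCab dictionary to load: "ipadic" (still the default), "unidic_lite", or "unidic", plus `None` for a system-installed dictionary configured through `mecab_option`. A minimal usage sketch, assuming the module path used at the time of this PR and that the relevant dictionary packages are installed:

    from transformers.tokenization_bert_japanese import MecabTokenizer

    # Default is unchanged: ipadic is used when mecab_dic is not given.
    tok_default = MecabTokenizer()

    # Explicitly select one of the newly supported dictionaries.
    tok_lite = MecabTokenizer(mecab_dic="unidic_lite")
    tok_full = MecabTokenizer(mecab_dic="unidic")  # needs `python -m unidic download` beforehand

    # Use a system-installed dictionary instead (the path here is only an illustration).
    tok_system = MecabTokenizer(mecab_dic=None, mecab_option="-d /usr/lib/mecab/dic/ipadic")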

View file

@@ -150,6 +150,7 @@ jobs:
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[ja,testing]
- run: python -m unidic download
- save_cache:
key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
paths:
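The extra CI step is needed because the unidic package ships without its dictionary data; `python -m unidic download` fetches it, and the tokenizer later refuses to start if the data directory is missing (see the RuntimeError guard further down). A quick check, assuming unidic is installed:

    import os
    import unidic

    # DICDIR is where `python -m unidic download` places the dictionary;
    # before the download has run, the directory does not exist yet.
    print(unidic.DICDIR)
    print(os.path.isdir(unidic.DICDIR))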

View file

@@ -65,7 +65,7 @@ if stale_egg_info.exists():
extras = {}
extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
extras["sklearn"] = ["scikit-learn"]
# keras2onnx and onnxconverter-common version is specified through a commit until 1.7.0 lands on pypi
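With the widened ja extras, `pip install .[ja]` now pulls in both UniDic flavours alongside fugashi and ipadic. A trivial post-install sanity check (only unidic additionally needs its data downloaded separately):

    # All four packages from the ja extras should import cleanly.
    import fugashi
    import ipadic
    import unidic_lite
    import unidic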

View file

@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
class MecabTokenizer:
"""Runs basic tokenization with MeCab morphological parser."""
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
def __init__(
self,
do_lower_case=False,
never_split=None,
normalize_text=True,
mecab_dic: Optional[str] = "ipadic",
mecab_option: Optional[str] = None,
):
"""Constructs a MecabTokenizer.
Args:
**do_lower_case**: (`optional`) boolean (default False)
Whether to lower case the input.
Whether to lowercase the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
List of token not to split.
List of tokens not to split.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
**mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
**mecab_dic**: (`optional`) string (default "ipadic")
Name of dictionary to be used for MeCab initialization.
If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
**mecab_option**: (`optional`) string
String passed to MeCab constructor.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
import fugashi
import ipadic
try:
import fugashi
except ModuleNotFoundError as error:
raise error.__class__(
"You need to install fugashi to use MecabTokenizer. "
"See https://pypi.org/project/fugashi/ for installation."
)
# Use ipadic by default (later options can override it)
mecab_option = mecab_option or ""
mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
if mecab_dic is not None:
if mecab_dic == "ipadic":
try:
import ipadic
except ModuleNotFoundError as error:
raise error.__class__(
"The ipadic dictionary is not installed. "
"See https://github.com/polm/ipadic-py for installation."
)
dic_dir = ipadic.DICDIR
elif mecab_dic == "unidic_lite":
try:
import unidic_lite
except ModuleNotFoundError as error:
raise error.__class__(
"The unidic_lite dictionary is not installed. "
"See https://github.com/polm/unidic-lite for installation."
)
dic_dir = unidic_lite.DICDIR
elif mecab_dic == "unidic":
try:
import unidic
except ModuleNotFoundError as error:
raise error.__class__(
"The unidic dictionary is not installed. "
"See https://github.com/polm/unidic-py for installation."
)
dic_dir = unidic.DICDIR
if not os.path.isdir(dic_dir):
raise RuntimeError(
"The unidic dictionary itself is not found."
"See https://github.com/polm/unidic-py for installation."
)
else:
raise ValueError("Invalid mecab_dic is specified.")
mecabrc = os.path.join(dic_dir, "mecabrc")
mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option
self.mecab = fugashi.GenericTagger(mecab_option)
@@ -213,7 +272,7 @@ class MecabTokenizer:
return tokens
class CharacterTokenizer(object):
class CharacterTokenizer:
"""Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True):
@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
text = unicodedata.normalize("NFKC", text)
output_tokens = []
for i, char in enumerate(text):
for char in text:
if char not in self.vocab:
output_tokens.append(self.unk_token)
continue
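To make the dictionary-dependent behaviour concrete, here is a small sketch mirroring the new tests below (assuming the dictionary packages are installed): ipadic keeps アップルストア as a single token, while both UniDic variants split it into アップル / ストア.

    from transformers.tokenization_bert_japanese import MecabTokenizer

    text = "アップルストアでiPhone８が発売された。"

    print(MecabTokenizer(mecab_dic="ipadic").tokenize(text))       # ['アップルストア', 'で', ...]
    print(MecabTokenizer(mecab_dic="unidic_lite").tokenize(text))  # ['アップル', 'ストア', 'で', ...]
    print(MecabTokenizer(mecab_dic="unidic").tokenize(text))       # ['アップル', 'ストア', 'で', ...]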

View file

@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_mecab_tokenizer(self):
tokenizer = MecabTokenizer()
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "),
["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
)
def test_mecab_tokenizer_unidic_lite(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "),
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
)
def test_mecab_tokenizer_unidic(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "),
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
)
def test_mecab_tokenizer_lower(self):
tokenizer = MecabTokenizer(do_lower_case=True)
tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "),
@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False)
tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone８ が  \n 発売された　。  "),
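The unidic_lite and unidic tests above skip themselves (via the try/except ModuleNotFoundError and early return) when the corresponding package is not installed; the same guard works when experimenting locally, for example falling back to the bundled unidic_lite data:

    from transformers.tokenization_bert_japanese import MecabTokenizer

    try:
        # Fails if unidic is missing, or with RuntimeError if its data was never downloaded.
        tokenizer = MecabTokenizer(mecab_dic="unidic")
    except (ModuleNotFoundError, RuntimeError):
        tokenizer = MecabTokenizer(mecab_dic="unidic_lite")

    print(tokenizer.tokenize("発売された。"))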