Support additional dictionaries for BERT Japanese tokenizers (#6515)
* Update BERT Japanese tokenizers
* Update CircleCI config to download unidic
* Specify to use the latest dictionary packages
Parent: 423eb5b1d7
Commit: 48c6c6139f
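For orientation, a minimal sketch of the API this change introduces (illustrative only, not part of the diff; the import path follows the repository's own tests, and the dictionary packages come from the updated `ja` extra):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

# The new mecab_dic argument selects the bundled dictionary (default: "ipadic").
ipadic_tok = MecabTokenizer(mecab_dic="ipadic")
unidic_lite_tok = MecabTokenizer(mecab_dic="unidic_lite")  # dictionary ships with the pip package
unidic_tok = MecabTokenizer(mecab_dic="unidic")            # needs `python -m unidic download` first
```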
@@ -150,6 +150,7 @@ jobs:
                     - v0.3-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
             - run: pip install .[ja,testing]
+            - run: python -m unidic download
             - save_cache:
                 key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
                 paths:
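The added CI step pre-downloads the full UniDic data: unlike ipadic and unidic_lite, the unidic package ships only a downloader. A sketch of the same precondition checked from Python (hypothetical snippet; it relies only on the `unidic.DICDIR` attribute that the tokenizer code below also uses):

```python
import os

import unidic

# The full UniDic package ships without data; MecabTokenizer(mecab_dic="unidic")
# raises a RuntimeError until the dictionary directory actually exists.
if not os.path.isdir(unidic.DICDIR):
    raise RuntimeError("UniDic data missing; run `python -m unidic download` first.")
```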
setup.py (2 changes)
@@ -65,7 +65,7 @@ if stale_egg_info.exists():
 
 extras = {}
 
-extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0.0,<2.0", "unidic_lite>=1.0.7", "unidic>=1.0.2"]
 extras["sklearn"] = ["scikit-learn"]
 
 # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
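With the widened `ja` extra, the `pip install .[ja,testing]` step in the CI job above brings in every dictionary the tokenizer can select. A small, hypothetical helper for checking which of them are importable in a local environment:

```python
import importlib.util

# Report which optional dictionary packages from the updated `ja` extra are available.
for name in ("fugashi", "ipadic", "unidic_lite", "unidic"):
    found = importlib.util.find_spec(name) is not None
    print(f"{name}: {'installed' if found else 'missing'}")
```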
@@ -167,30 +167,89 @@ class BertJapaneseTokenizer(BertTokenizer):
 class MecabTokenizer:
     """Runs basic tokenization with MeCab morphological parser."""
 
-    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None):
+    def __init__(
+        self,
+        do_lower_case=False,
+        never_split=None,
+        normalize_text=True,
+        mecab_dic: Optional[str] = "ipadic",
+        mecab_option: Optional[str] = None,
+    ):
         """Constructs a MecabTokenizer.
 
         Args:
             **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input.
+                Whether to lowercase the input.
             **never_split**: (`optional`) list of str
                 Kept for backward compatibility purposes.
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
-                List of token not to split.
+                List of tokens not to split.
             **normalize_text**: (`optional`) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "")
+            **mecab_dic**: (`optional`) string (default "ipadic")
+                Name of dictionary to be used for MeCab initialization.
+                If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`.
+            **mecab_option**: (`optional`) string
+                String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
         self.never_split = never_split if never_split is not None else []
         self.normalize_text = normalize_text
 
-        import fugashi
-        import ipadic
+        try:
+            import fugashi
+        except ModuleNotFoundError as error:
+            raise error.__class__(
+                "You need to install fugashi to use MecabTokenizer. "
+                "See https://pypi.org/project/fugashi/ for installation."
+            )
 
-        # Use ipadic by default (later options can override it)
         mecab_option = mecab_option or ""
-        mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+        if mecab_dic is not None:
+            if mecab_dic == "ipadic":
+                try:
+                    import ipadic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The ipadic dictionary is not installed. "
+                        "See https://github.com/polm/ipadic-py for installation."
+                    )
+
+                dic_dir = ipadic.DICDIR
+
+            elif mecab_dic == "unidic_lite":
+                try:
+                    import unidic_lite
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic_lite dictionary is not installed. "
+                        "See https://github.com/polm/unidic-lite for installation."
+                    )
+
+                dic_dir = unidic_lite.DICDIR
+
+            elif mecab_dic == "unidic":
+                try:
+                    import unidic
+                except ModuleNotFoundError as error:
+                    raise error.__class__(
+                        "The unidic dictionary is not installed. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+                dic_dir = unidic.DICDIR
+                if not os.path.isdir(dic_dir):
+                    raise RuntimeError(
+                        "The unidic dictionary itself is not found. "
+                        "See https://github.com/polm/unidic-py for installation."
+                    )
+
+            else:
+                raise ValueError("Invalid mecab_dic is specified.")
+
+            mecabrc = os.path.join(dic_dir, "mecabrc")
+            mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc) + mecab_option
 
         self.mecab = fugashi.GenericTagger(mecab_option)
 
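The new constructor resolves a bundled dictionary to an explicit `-d <dicdir> -r <mecabrc>` option, and falls back to the raw `mecab_option` string when `mecab_dic=None`, which is how the docstring says system-installed dictionaries should be used. A sketch of both styles (illustrative; the system dictionary path below is an assumption, substitute your own MeCab installation's dicdir):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

# 1) Bundled dictionary: the tokenizer builds "-d <DICDIR> -r <mecabrc>" itself.
bundled = MecabTokenizer(mecab_dic="unidic_lite")

# 2) System dictionary: skip the bundled lookup and pass MeCab arguments directly.
#    The directory below is an example path, not something the PR prescribes.
system = MecabTokenizer(
    mecab_dic=None,
    mecab_option="-d /var/lib/mecab/dic/ipadic-utf8",
)
```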
@@ -213,7 +272,7 @@ class MecabTokenizer:
         return tokens
 
 
-class CharacterTokenizer(object):
+class CharacterTokenizer:
     """Runs Character tokenization."""
 
     def __init__(self, vocab, unk_token, normalize_text=True):
@@ -247,7 +306,7 @@ class CharacterTokenizer(object):
             text = unicodedata.normalize("NFKC", text)
 
         output_tokens = []
-        for i, char in enumerate(text):
+        for char in text:
             if char not in self.vocab:
                 output_tokens.append(self.unk_token)
                 continue
@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
 
-    def test_mecab_tokenizer(self):
-        tokenizer = MecabTokenizer()
+    def test_mecab_tokenizer_ipadic(self):
+        tokenizer = MecabTokenizer(mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )
 
+    def test_mecab_tokenizer_unidic_lite(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
+    def test_mecab_tokenizer_unidic(self):
+        try:
+            tokenizer = MecabTokenizer(mecab_dic="unidic")
+        except ModuleNotFoundError:
+            return
+
+        self.assertListEqual(
+            tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
+            ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
+        )
+
     def test_mecab_tokenizer_lower(self):
-        tokenizer = MecabTokenizer(do_lower_case=True)
+        tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         )
 
     def test_mecab_tokenizer_no_normalize(self):
-        tokenizer = MecabTokenizer(normalize_text=False)
+        tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
 
         self.assertListEqual(
             tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
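The new tests also pin down the behavioural difference between the dictionaries: ipadic keeps "アップルストア" as a single token, while both UniDic variants split it into "アップル" and "ストア". Outside the test suite the same comparison looks roughly like this (illustrative snippet, assuming the relevant dictionary packages are installed):

```python
from transformers.tokenization_bert_japanese import MecabTokenizer

text = "アップルストアでiPhone8 が 発売された 。"

print(MecabTokenizer(mecab_dic="ipadic").tokenize(text))
# expected per the new tests: ['アップルストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']

print(MecabTokenizer(mecab_dic="unidic_lite").tokenize(text))
# expected per the new tests: ['アップル', 'ストア', 'で', 'iPhone', '8', 'が', '発売', 'さ', 'れ', 'た', '。']
```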