# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import logging

import jieba
from nltk.tokenize.util import align_tokens

from .Stopwords import Stopwords

# Suppress jieba's dictionary-loading messages below the INFO level.
jieba.setLogLevel(logging.INFO)


class ChineseTokenizer(object):
    """Tokenizes Chinese text with jieba, optionally removing stopwords."""

    def __init__(self, tokenizer='jieba', remove_stopwords=False):
        self.__tokenizer = tokenizer
        self.__remove_stopwords = remove_stopwords
        if self.__remove_stopwords:
            self.__stop_words = Stopwords.chinese_stopwords
        else:
            self.__stop_words = None

    def tokenize(self, string):
        """Split the input string into a list of tokens."""
        if self.__tokenizer == 'jieba':
            tokens = list(jieba.cut(string))
        else:
            raise ValueError('Unsupported tokenizer: {}'.format(self.__tokenizer))

        if self.__remove_stopwords:
            tokens = [word for word in tokens if word not in self.__stop_words]
        return tokens

    def span_tokenize(self, string):
        """Return (start, end) character offsets of each token in the string."""
        tokens = self.tokenize(string)
        spans = align_tokens(tokens, string)
        return spans


if __name__ == '__main__':
    # nltk.data.path.append(r"C:\Users\wutlin\OneDrive - Microsoft\workspace\DNNMatchingToolkit\dataset\nltk_data")
    tokenizer = ChineseTokenizer(tokenizer='jieba', remove_stopwords=True)
    print(tokenizer.tokenize("我爱北京天安门,天安门上太阳升。"))
    print(tokenizer.span_tokenize("我爱北京天安门,天安门上太阳升。"))
    print(tokenizer.tokenize("给每一条河每一座山取一个温暖的名字;陌生人,我也为你祝福;愿你有一个灿烂的前程;愿你有情人终成眷属;愿你在尘世获得幸福;我只愿面朝大海,春暖花开。"))
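    # A minimal usage sketch (illustrative addition, not part of the original file):
    # each span from span_tokenize is a (start, end) character offset into the
    # input string, so slicing the string with a span recovers its token.
    sentence = "我爱北京天安门,天安门上太阳升。"
    for start, end in tokenizer.span_tokenize(sentence):
        print(sentence[start:end], (start, end))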