This commit is contained in:
Taku Kudo 2017-08-29 00:26:23 +09:00
Родитель f56e2aac6a
Коммит 8177dab62b
15 изменённых файлов: 5194 добавлений и 4 удалений

8
.gitignore поставляемый
Просмотреть файл

@ -30,6 +30,7 @@ Makefile.in
*.lo
*.a
*.la
*.pyc
.libs
.deps
@ -49,3 +50,10 @@ spm_test
*.pb.cc
*.pb.h
.DS_Store
*.egg-info/
dist/
*.swp
*.swo
*.pyc

Просмотреть файл

@ -1,10 +1,13 @@
AUTOMAKE_OPTIONS = foreign
SUBDIRS = src
EXTRA_DIRS = m4 third_party data doc
EXTRA_DIRS = m4 third_party data doc python
EXTRA_DIST = README.md LICENSE
ACLOCAL_AMFLAGS = -I third_party/m4
pkgconfigdir = @pkgconfigdir@
pkgconfig_DATA = sentencepiece.pc
dist-hook:
for subdir in $(EXTRA_DIRS); do \
cp -rp $$subdir $(distdir); \
@ -17,6 +20,6 @@ dist-hook:
rm -rf $(distdir)/*/*/.svn; \
rm -rf $(distdir)/$$subdir/*/CVS; \
rm -rf $(distdir)/$$subdir/*/.svn; \
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
find $(distdir) -name .svn | xargs rm -fr; \
done

Просмотреть файл

@ -58,6 +58,19 @@ if test "${enable_gcov}" = "yes"; then
LIBS="$LIBS -lgcov"
fi
# pkgconfigdir: installation directory for sentencepiece.pc, overridable with
# --with-pkgconfigdir=DIR. The default is kept as a literal '${libdir}/pkgconfig'
# (single-quoted) so that ${libdir} is expanded by make, not by configure.
AC_ARG_WITH(pkgconfigdir,
  AS_HELP_STRING([--with-pkgconfigdir=DIR],
                 [Use the specified pkgconfig dir (default is libdir/pkgconfig)]),
  [pkgconfigdir=${withval}],
  [pkgconfigdir='${libdir}/pkgconfig'])
AC_MSG_NOTICE([pkgconfig directory is ${pkgconfigdir}])
# Snapshot the flags accumulated so far and export them, together with the
# directory, as @pkgconfigdir@, @pkgconfigcflags@ and @pkgconfiglibs@.
pkgconfigcflags=$CFLAGS
pkgconfiglibs=$LIBS
AC_SUBST([pkgconfigdir])
AC_SUBST([pkgconfigcflags])
AC_SUBST([pkgconfiglibs])
# Checks for header files.
AC_CHECK_HEADERS([unistd.h])
@ -69,6 +82,7 @@ AC_FUNC_STRTOD
AC_CHECK_FUNCS([memchr memset])
AC_CONFIG_FILES([Makefile
src/Makefile])
src/Makefile
sentencepiece.pc])
AC_OUTPUT

3
python/MANIFEST.in Normal file
Просмотреть файл

@ -0,0 +1,3 @@
recursive-include test *.py *.model
include *.i *.md

47
python/README.md Normal file
Просмотреть файл

@ -0,0 +1,47 @@
# SentencePiece Python Wrapper
Python wrapper for SentencePiece with SWIG. This module wraps the sentencepiece::SentencePieceProcessor class with the following modifications:
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
* SentencePieceText proto is not supported.
## Build and Install SentencePiece
You need to install SentencePiece before installing this Python wrapper.
```
% pip install sentencepiece
```
You can install this module manually as follows:
```
% python setup.py build
% sudo python setup.py install
```
If you don't have write permission to the global site-packages directory or don't want to install into it, please try:
```
% python setup.py install --user
```
## Usage
```
% python
>>> import sentencepiece as spm
>>> sp = spm.SentencePieceProcessor()
>>> sp.Load("test/test_model.model")
True
>>> sp.EncodeAsPieces("This is a test")
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']
>>> sp.EncodeAsIds("This is a test")
[284, 47, 11, 4, 15, 400]
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
'This is a test'
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
'This is a test'
>>> sp.GetPieceSize()
1000
>>> sp.IdToPiece(2)
'</s>'
>>> sp.PieceToId('</s>')
2
```

155
python/sentencepiece.i Normal file
Просмотреть файл

@ -0,0 +1,155 @@
%module sentencepiece
# Python wrapper is generated with:
# % swig -python -c++ sentencepiece.i
%{
#include <sentencepiece_processor.h>
%}
// Hide the C++ Encode/Decode overloads that return their result through an
// output pointer. The %extend block in this file supplies value-returning
// replacements for the std::vector / std::string variants; the
// SentencePieceText variants get no replacement.
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<std::string>*) const;
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<int>*) const;
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, SentencePieceText *) const;
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &,std::string *) const;
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, std::string *) const;
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &, SentencePieceText *) const;
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, SentencePieceText *) const;
// Also hide the model proto accessor and the std::istream-based loaders.
// NOTE(review): model_proto is ignored twice (with and without parentheses) —
// confirm whether both spellings are needed.
%ignore sentencepiece::SentencePieceProcessor::model_proto;
%ignore sentencepiece::SentencePieceProcessor::Load(std::istream *);
%ignore sentencepiece::SentencePieceProcessor::LoadOrDie(std::istream *);
%ignore sentencepiece::SentencePieceProcessor::model_proto();
%extend sentencepiece::SentencePieceProcessor {
  // Value-returning wrappers: each one calls the pointer-output
  // Encode()/Decode() overload on $self and hands the filled container back
  // by value.

  // Returns the string-vector output of Encode() for |input|.
  std::vector<std::string> Encode(const std::string& input) const {
    std::vector<std::string> pieces;
    $self->Encode(input, &pieces);
    return pieces;
  }

  // Same body as Encode() above, under an explicit name.
  std::vector<std::string> EncodeAsPieces(const std::string& input) const {
    std::vector<std::string> pieces;
    $self->Encode(input, &pieces);
    return pieces;
  }

  // Returns the int-vector output of Encode() for |input|.
  std::vector<int> EncodeAsIds(const std::string& input) const {
    std::vector<int> ids;
    $self->Encode(input, &ids);
    return ids;
  }

  // Returns the string produced by Decode() for a vector of strings.
  std::string Decode(const std::vector<std::string>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // Same body as Decode() above, under an explicit name.
  std::string DecodePieces(const std::vector<std::string>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // Returns the string produced by Decode() for a vector of ints.
  std::string DecodeIds(const std::vector<int>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // Python len(): forwards to GetPieceSize().
  int __len__() {
    return $self->GetPieceSize();
  }

  // Python obj[key]: forwards to PieceToId(key).
  int __getitem__(const std::string& key) const {
    return $self->PieceToId(key);
  }
}
%typemap(out) std::vector<int> {
  // Build a Python list with one Python int per vector element.
  const size_t count = $1.size();
  $result = PyList_New(count);
  for (size_t idx = 0; idx < count; ++idx) {
    PyList_SetItem($result, idx, PyInt_FromLong(static_cast<long>($1[idx])));
  }
}
%typemap(out) std::vector<std::string> {
  // Build a Python list with one Python string per vector element; the
  // explicit size keeps any embedded NUL bytes.
  const size_t count = $1.size();
  $result = PyList_New(count);
  for (size_t idx = 0; idx < count; ++idx) {
    PyList_SetItem($result, idx,
                   PyString_FromStringAndSize($1[idx].data(), $1[idx].size()));
  }
}
%typemap(out) std::string {
  // Convert using an explicit length rather than relying on NUL termination.
  const std::string &value = $1;
  $result = PyString_FromStringAndSize(value.data(), value.size());
}
%typemap(in) const std::string & {
  // Reject anything that is not a Python string before allocating.
  if (!PyString_Check($input)) {
    PyErr_SetString(PyExc_TypeError,"not a string");
    return NULL;
  }
  // Copy the bytes into a heap-allocated std::string; the freearg typemap
  // for this type deletes it.
  char *buffer = nullptr;
  Py_ssize_t length = 0;
  PyString_AsStringAndSize($input, &buffer, &length);
  $1 = new std::string(buffer, length);
}
%typemap(in) const std::vector<std::string>& {
  // Converts a Python list of strings into a heap-allocated
  // std::vector<std::string>. On success the vector is handed to $1 and is
  // deleted by the freearg typemap for this type. On failure a TypeError is
  // set and the wrapper returns NULL.
  std::vector<std::string> *out = nullptr;
  if (PyList_Check($input)) {
    const size_t size = PyList_Size($input);
    out = new std::vector<std::string>(size);
    for (size_t i = 0; i < size; ++i) {
      PyObject *o = PyList_GetItem($input, i);
      if (PyString_Check(o)) {
        char *str = nullptr;
        Py_ssize_t str_size = 0;
        PyString_AsStringAndSize(o, &str, &str_size);
        (*out)[i] = std::string(str, static_cast<size_t>(str_size));
      } else {
        // $1 has not been assigned yet and the early return skips freearg,
        // so the vector must be released here or it leaks.
        delete out;
        PyErr_SetString(PyExc_TypeError,"list must contain strings");
        return NULL;
      }
    }
  } else {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  $1 = out;
}
%typemap(in) const std::vector<int>& {
  // Converts a Python list of ints into a heap-allocated std::vector<int>.
  // On success the vector is handed to $1 and is deleted by the freearg
  // typemap for this type. On failure a TypeError is set and the wrapper
  // returns NULL.
  std::vector<int> *out = nullptr;
  if (PyList_Check($input)) {
    const size_t size = PyList_Size($input);
    out = new std::vector<int>(size);
    for (size_t i = 0; i < size; ++i) {
      PyObject *o = PyList_GetItem($input, i);
      if (PyInt_Check(o)) {
        (*out)[i] = static_cast<int>(PyInt_AsLong(o));
      } else {
        // $1 has not been assigned yet and the early return skips freearg,
        // so the vector must be released here or it leaks.
        delete out;
        PyErr_SetString(PyExc_TypeError,"list must contain integers");
        return NULL;
      }
    }
  } else {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  $1 = out;
}
// Cleanup counterparts of the typemap(in) definitions in this file: each
// releases the object that the matching input typemap allocated with new.
%typemap(freearg) const std::string& {
delete $1;
}
%typemap(freearg) const std::vector<std::string>& {
delete $1;
}
%typemap(freearg) const std::vector<int>& {
delete $1;
}
%include <sentencepiece_processor.h>

107
python/sentencepiece.py Normal file
Просмотреть файл

@ -0,0 +1,107 @@
# This file was automatically generated by SWIG (http://www.swig.org).
# Version 2.0.11
#
# Do not make changes to this file unless you know what you are doing--modify
# the SWIG interface file instead.
from sys import version_info
if version_info >= (2,6,0):
def swig_import_helper():
from os.path import dirname
import imp
fp = None
try:
fp, pathname, description = imp.find_module('_sentencepiece', [dirname(__file__)])
except ImportError:
import _sentencepiece
return _sentencepiece
if fp is not None:
try:
_mod = imp.load_module('_sentencepiece', fp, pathname, description)
finally:
fp.close()
return _mod
_sentencepiece = swig_import_helper()
del swig_import_helper
else:
import _sentencepiece
del version_info
try:
_swig_property = property
except NameError:
pass # Python < 2.2 doesn't have 'property'.
def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
if (name == "thisown"): return self.this.own(value)
if (name == "this"):
if type(value).__name__ == 'SwigPyObject':
self.__dict__[name] = value
return
method = class_type.__swig_setmethods__.get(name,None)
if method: return method(self,value)
if (not static):
self.__dict__[name] = value
else:
raise AttributeError("You cannot add attributes to %s" % self)
def _swig_setattr(self,class_type,name,value):
return _swig_setattr_nondynamic(self,class_type,name,value,0)
def _swig_getattr(self,class_type,name):
if (name == "thisown"): return self.this.own()
method = class_type.__swig_getmethods__.get(name,None)
if method: return method(self)
raise AttributeError(name)
def _swig_repr(self):
try: strthis = "proxy of " + self.this.__repr__()
except: strthis = ""
return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
try:
_object = object
_newclass = 1
except AttributeError:
class _object : pass
_newclass = 0
class SentencePieceProcessor(_object):
__swig_setmethods__ = {}
__setattr__ = lambda self, name, value: _swig_setattr(self, SentencePieceProcessor, name, value)
__swig_getmethods__ = {}
__getattr__ = lambda self, name: _swig_getattr(self, SentencePieceProcessor, name)
__repr__ = _swig_repr
def __init__(self):
this = _sentencepiece.new_SentencePieceProcessor()
try: self.this.append(this)
except: self.this = this
__swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor
__del__ = lambda self : None;
def Load(self, *args): return _sentencepiece.SentencePieceProcessor_Load(self, *args)
def LoadOrDie(self, *args): return _sentencepiece.SentencePieceProcessor_LoadOrDie(self, *args)
def SetEncodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, *args)
def SetDecodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, *args)
def GetPieceSize(self): return _sentencepiece.SentencePieceProcessor_GetPieceSize(self)
def PieceToId(self, *args): return _sentencepiece.SentencePieceProcessor_PieceToId(self, *args)
def IdToPiece(self, *args): return _sentencepiece.SentencePieceProcessor_IdToPiece(self, *args)
def GetScore(self, *args): return _sentencepiece.SentencePieceProcessor_GetScore(self, *args)
def IsUnknown(self, *args): return _sentencepiece.SentencePieceProcessor_IsUnknown(self, *args)
def IsControl(self, *args): return _sentencepiece.SentencePieceProcessor_IsControl(self, *args)
def Encode(self, *args): return _sentencepiece.SentencePieceProcessor_Encode(self, *args)
def EncodeAsPieces(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsPieces(self, *args)
def EncodeAsIds(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsIds(self, *args)
def Decode(self, *args): return _sentencepiece.SentencePieceProcessor_Decode(self, *args)
def DecodePieces(self, *args): return _sentencepiece.SentencePieceProcessor_DecodePieces(self, *args)
def DecodeIds(self, *args): return _sentencepiece.SentencePieceProcessor_DecodeIds(self, *args)
def __len__(self): return _sentencepiece.SentencePieceProcessor___len__(self)
def __getitem__(self, *args): return _sentencepiece.SentencePieceProcessor___getitem__(self, *args)
SentencePieceProcessor_swigregister = _sentencepiece.SentencePieceProcessor_swigregister
SentencePieceProcessor_swigregister(SentencePieceProcessor)
# This file is compatible with both classic and new-style classes.

Разница между файлами не показана из-за своего большого размера Загрузить разницу

2
python/setup.cfg Normal file
Просмотреть файл

@ -0,0 +1,2 @@
[metadata]
description-file = README.md

42
python/setup.py Executable file
Просмотреть файл

@ -0,0 +1,42 @@
#!/usr/bin/env python
from setuptools import setup, Extension
import string
import sys
import os
sys.path.append('./test')
with open("README.md") as f:
long_description = f.read()
def cmd(line):
    """Run a shell command and return the first line of its output as tokens.

    Used below to turn `pkg-config` output into compiler and linker flag lists.

    Args:
        line: Shell command line to execute.

    Returns:
        The whitespace-separated tokens of the first output line. An empty
        list is returned (and a warning written to stderr) when the command
        prints nothing, instead of failing with an IndexError.
    """
    pipe = os.popen(line)
    try:
        lines = pipe.readlines()
    finally:
        # Close the pipe explicitly rather than leaving it to garbage collection.
        pipe.close()
    if not lines:
        sys.stderr.write('warning: no output from command: %s\n' % line)
        return []
    # split() already discards the trailing newline; slicing off the last
    # character would eat a real character when the output has no newline.
    return lines[0].split()
setup(name = 'sentencepiece',
author = 'Taku Kudo',
author_email='taku@google.com',
description = 'SentencePiece python wrapper',
long_description = long_description,
url = 'https://github.com/google/sentencepiece',
license = 'Apache',
platforms = 'Unix',
py_modules=['sentencepiece'],
ext_modules = [Extension('_sentencepiece',
sources=['sentencepiece_wrap.cxx'],
extra_compile_args=['-std=c++11'] +
cmd('pkg-config sentencepiece --cflags'),
extra_link_args=cmd('pkg-config sentencepiece --libs'))
],
classifiers = [
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Operating System :: Unix',
'Programming Language :: Python',
'Topic :: Text Processing :: Linguistic',
'Topic :: Software Development :: Libraries :: Python Modules'
],
test_suite = 'sentencepiece_test.suite')

0
python/test/__init__.py Normal file
Просмотреть файл

Просмотреть файл

@ -0,0 +1,42 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sentencepiece as spm
import unittest
class TestSentencepieceProcessor(unittest.TestCase):
"""Test case for SentencePieceProcessor"""
# Create a fresh processor and load 'test/test_model.model' before each test.
# The path is relative, so it is resolved against the current working directory.
def setUp(self):
self.sp_ = spm.SentencePieceProcessor()
self.assertTrue(self.sp_.Load('test/test_model.model'))
# Checks the size of the loaded model, the ids of the three special pieces,
# and that IdToPiece / PieceToId are inverses over every id.
def test_load(self):
self.assertEqual(1000, self.sp_.GetPieceSize())
self.assertEqual(0, self.sp_.PieceToId('<unk>'))
self.assertEqual(1, self.sp_.PieceToId('<s>'))
self.assertEqual(2, self.sp_.PieceToId('</s>'))
self.assertEqual('<unk>', self.sp_.IdToPiece(0))
self.assertEqual('<s>', self.sp_.IdToPiece(1))
self.assertEqual('</s>', self.sp_.IdToPiece(2))
for i in range(self.sp_.GetPieceSize()):
piece = self.sp_.IdToPiece(i)
self.assertEqual(i, self.sp_.PieceToId(piece))
# Checks that Encode and EncodeAsPieces agree, and that decoding either the
# pieces or the ids reproduces the original text.
def test_roundtrip(self):
text = 'I saw a girl with a telescope.'
ids = self.sp_.EncodeAsIds(text)
pieces1 = self.sp_.EncodeAsPieces(text)
pieces2 = self.sp_.Encode(text)
self.assertEqual(pieces1, pieces2)
self.assertEqual(text, self.sp_.Decode(pieces1))
self.assertEqual(text, self.sp_.DecodePieces(pieces2))
self.assertEqual(text, self.sp_.DecodeIds(ids))
def suite():
    """Collect every test of TestSentencepieceProcessor into one suite.

    Returns:
        A unittest.TestSuite containing the tests of TestSentencepieceProcessor.
    """
    # unittest.makeSuite() is deprecated and removed in Python 3.13; the loader
    # call below is the supported equivalent and also exists on Python 2.7.
    # The local is not called `suite`, so it no longer shadows this function.
    tests = unittest.TestSuite()
    tests.addTests(
        unittest.defaultTestLoader.loadTestsFromTestCase(TestSentencepieceProcessor))
    return tests
if __name__ == '__main__':
unittest.main()

Двоичные данные
python/test/test_model.model Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -17,6 +17,7 @@ libsentencepiece_la_SOURCES = \
word_model.h word_model.cc \
char_model.h char_model.cc \
bpe_model.h bpe_model.cc
include_HEADERS = sentencepiece_processor.h
noinst_LIBRARIES = libtrain.a
libtrain_a_SOURCES = builder.cc builder.h \
@ -37,7 +38,7 @@ BUILT_SOURCES = \
sentencepiece.pb.cc \
sentencepiece_model.pb.cc
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
bin_PROGRAMS = spm_encode spm_decode spm_normalize spm_train spm_export_vocab
noinst_PROGRAMS = compile_charsmap

Просмотреть файл

@ -168,6 +168,7 @@ class SentencePieceProcessor {
// Returns true if |id| is control symbol.
virtual bool IsControl(int id) const;
#ifndef SWIG
//////////////////////////////////////////////////////////////
// Model management.
//
@ -176,6 +177,7 @@ class SentencePieceProcessor {
// Allows injection of a normalizer instance. |normalizer| is moved.
void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
#endif
// Returns immutable model proto. Useful to obtain extended
// or experimental parameters encoded in model_proto.