Added Python wrapper
This commit is contained in:
Родитель
f56e2aac6a
Коммит
8177dab62b
|
@ -30,6 +30,7 @@ Makefile.in
|
|||
*.lo
|
||||
*.a
|
||||
*.la
|
||||
*.pyc
|
||||
|
||||
.libs
|
||||
.deps
|
||||
|
@ -49,3 +50,10 @@ spm_test
|
|||
|
||||
*.pb.cc
|
||||
*.pb.h
|
||||
|
||||
.DS_Store
|
||||
*.egg-info/
|
||||
dist/
|
||||
*.swp
|
||||
*.swo
|
||||
*.pyc
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
AUTOMAKE_OPTIONS = foreign
|
||||
SUBDIRS = src
|
||||
|
||||
EXTRA_DIRS = m4 third_party data doc
|
||||
EXTRA_DIRS = m4 third_party data doc python
|
||||
EXTRA_DIST = README.md LICENSE
|
||||
ACLOCAL_AMFLAGS = -I third_party/m4
|
||||
|
||||
pkgconfigdir = @pkgconfigdir@
|
||||
pkgconfig_DATA = sentencepiece.pc
|
||||
|
||||
dist-hook:
|
||||
for subdir in $(EXTRA_DIRS); do \
|
||||
cp -rp $$subdir $(distdir); \
|
||||
|
@ -17,6 +20,6 @@ dist-hook:
|
|||
rm -rf $(distdir)/*/*/.svn; \
|
||||
rm -rf $(distdir)/$$subdir/*/CVS; \
|
||||
rm -rf $(distdir)/$$subdir/*/.svn; \
|
||||
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
|
||||
rm -rf $(distdir)/$$subdir/*/.pb.cc; \
|
||||
find $(distdir) -name .svn | xargs rm -fr; \
|
||||
done
|
||||
|
|
16
configure.ac
16
configure.ac
|
@ -58,6 +58,19 @@ if test "${enable_gcov}" = "yes"; then
|
|||
LIBS="$LIBS -lgcov"
|
||||
fi
|
||||
|
||||
# pkgconfigdir
|
||||
AC_ARG_WITH(pkgconfigdir,
|
||||
AC_HELP_STRING([--with-pkgconfigdir],
|
||||
[Use the specified pkgconfig dir (default is libdir/pkgconfig)]),
|
||||
[pkgconfigdir=${withval}],
|
||||
[pkgconfigdir='${libdir}/pkgconfig'])
|
||||
AC_MSG_NOTICE([pkgconfig directory is ${pkgconfigdir}])
|
||||
pkgconfigcflags=$CFLAGS
|
||||
pkgconfiglibs=$LIBS
|
||||
AC_SUBST([pkgconfigdir])
|
||||
AC_SUBST([pkgconfigcflags])
|
||||
AC_SUBST([pkgconfiglibs])
|
||||
|
||||
# Checks for header files.
|
||||
AC_CHECK_HEADERS([unistd.h])
|
||||
|
||||
|
@ -69,6 +82,7 @@ AC_FUNC_STRTOD
|
|||
AC_CHECK_FUNCS([memchr memset])
|
||||
|
||||
AC_CONFIG_FILES([Makefile
|
||||
src/Makefile])
|
||||
src/Makefile
|
||||
sentencepiece.pc])
|
||||
|
||||
AC_OUTPUT
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
recursive-include test *.py *.model
|
||||
include *.i *.md
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
# SentencePiece Python Wrapper
|
||||
|
||||
Python wrapper for SentencePiece with SWIG. This module wraps the sentencepiece::SentencePieceProcessor class with the following modifications:
|
||||
* Encode and Decode methods are re-defined as EncodeAsIds, EncodeAsPieces, DecodeIds and DecodePieces respectively.
|
||||
* SentencePieceText proto is not supported.
|
||||
|
||||
## Build and Install SentencePiece
|
||||
You need to install SentencePiece before installing this python wrapper.
|
||||
|
||||
```
|
||||
% pip install sentencepiece
|
||||
```
|
||||
|
||||
You can install this module manually as follows:
|
||||
```
|
||||
% python setup.py build
|
||||
% sudo python setup.py install
|
||||
```
|
||||
|
||||
If you don’t have write permission to the global site-packages directory or don’t want to install into it, please try:
|
||||
```
|
||||
% python setup.py install --user
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
% python
|
||||
>>> import sentencepiece as spm
|
||||
>>> sp = spm.SentencePieceProcessor()
|
||||
>>> sp.Load("test/test_model.model")
|
||||
True
|
||||
>>> sp.EncodeAsPieces("This is a test")
|
||||
['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est']
|
||||
>>> sp.EncodeAsIds("This is a test")
|
||||
[284, 47, 11, 4, 15, 400]
|
||||
>>> sp.DecodePieces(['\xe2\x96\x81This', '\xe2\x96\x81is', '\xe2\x96\x81a', '\xe2\x96\x81', 't', 'est'])
|
||||
'This is a test'
|
||||
>>> sp.DecodeIds([284, 47, 11, 4, 15, 400])
|
||||
'This is a test'
|
||||
>>> sp.GetPieceSize()
|
||||
1000
|
||||
>>> sp.IdToPiece(2)
|
||||
'</s>'
|
||||
>>> sp.PieceToId('</s>')
|
||||
2
|
||||
```
|
|
@ -0,0 +1,155 @@
|
|||
%module sentencepiece
|
||||
|
||||
# Python wrapper is generated with:
|
||||
# % swig -python -c++ sentencepiece.i
|
||||
|
||||
%{
|
||||
#include <sentencepiece_processor.h>
|
||||
%}
|
||||
|
||||
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<std::string>*) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, std::vector<int>*) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Encode(std::string const &, SentencePieceText *) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &,std::string *) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, std::string *) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<std::string> const &, SentencePieceText *) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Decode(std::vector<int> const &, SentencePieceText *) const;
|
||||
%ignore sentencepiece::SentencePieceProcessor::model_proto;
|
||||
%ignore sentencepiece::SentencePieceProcessor::Load(std::istream *);
|
||||
%ignore sentencepiece::SentencePieceProcessor::LoadOrDie(std::istream *);
|
||||
%ignore sentencepiece::SentencePieceProcessor::model_proto();
|
||||
|
||||
// Value-returning convenience methods added to the wrapped class. Each one
// forwards to a native overload that writes through an output pointer and
// hands the filled container back by value.
%extend sentencepiece::SentencePieceProcessor {
  // Same body as EncodeAsPieces: fills a vector of pieces via the native Encode.
  std::vector<std::string> Encode(const std::string& input) const {
    std::vector<std::string> pieces;
    $self->Encode(input, &pieces);
    return pieces;
  }

  // Returns the pieces produced by the native Encode for |input|.
  std::vector<std::string> EncodeAsPieces(const std::string& input) const {
    std::vector<std::string> pieces;
    $self->Encode(input, &pieces);
    return pieces;
  }

  // Returns the ids produced by the native Encode for |input|.
  std::vector<int> EncodeAsIds(const std::string& input) const {
    std::vector<int> ids;
    $self->Encode(input, &ids);
    return ids;
  }

  // Same body as DecodePieces: builds a string via the native Decode.
  std::string Decode(const std::vector<std::string>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // Returns the string produced by the native Decode for a list of pieces.
  std::string DecodePieces(const std::vector<std::string>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // Returns the string produced by the native Decode for a list of ids.
  std::string DecodeIds(const std::vector<int>& input) const {
    std::string text;
    $self->Decode(input, &text);
    return text;
  }

  // len(processor) reports GetPieceSize().
  int __len__() {
    return $self->GetPieceSize();
  }

  // processor[piece] reports PieceToId(piece).
  int __getitem__(const std::string& key) const {
    return $self->PieceToId(key);
  }
}
|
||||
|
||||
// Converts a returned std::vector<int> into a Python list of ints.
// NOTE(review): PyInt_FromLong is a Python 2 C-API name.
%typemap(out) std::vector<int> {
  const size_t count = $1.size();
  $result = PyList_New(count);
  for (size_t idx = 0; idx < count; ++idx) {
    // PyList_SetItem takes ownership of the freshly created int object.
    PyList_SetItem($result, idx, PyInt_FromLong(static_cast<long>($1[idx])));
  }
}
|
||||
|
||||
// Converts a returned std::vector<std::string> into a Python list of byte
// strings; sizes are passed explicitly, so the data is not treated as
// NUL-terminated.
%typemap(out) std::vector<std::string> {
  const size_t count = $1.size();
  $result = PyList_New(count);
  for (size_t idx = 0; idx < count; ++idx) {
    const std::string &piece = $1[idx];
    PyList_SetItem($result, idx, PyString_FromStringAndSize(piece.data(), piece.size()));
  }
}
|
||||
|
||||
// Converts a returned std::string into a Python string using its explicit
// length.
%typemap(out) std::string {
  const char *bytes = $1.data();
  const size_t length = $1.size();
  $result = PyString_FromStringAndSize(bytes, length);
}
|
||||
|
||||
// Converts a Python string argument into a heap-allocated std::string.
// Anything that fails PyString_Check raises TypeError("not a string").
%typemap(in) const std::string & {
  if (!PyString_Check($input)) {
    PyErr_SetString(PyExc_TypeError,"not a string");
    return NULL;
  }
  char *buffer = nullptr;
  Py_ssize_t length = 0;
  PyString_AsStringAndSize($input, &buffer, &length);
  // Ownership passes to $1; the freearg typemap for this type deletes it.
  $1 = new std::string(buffer, length);
}
|
||||
|
||||
// Converts a Python list of strings into a heap-allocated
// std::vector<std::string>.
//   - not a list            -> TypeError("not a list")
//   - non-string element    -> TypeError("list must contain strings")
// On success ownership passes to $1 (released by the freearg typemap for this
// type). On failure nothing stays allocated.
%typemap(in) const std::vector<std::string>& {
  if (!PyList_Check($input)) {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  const size_t size = PyList_Size($input);
  std::vector<std::string> *out = new std::vector<std::string>(size);
  for (size_t i = 0; i < size; ++i) {
    PyObject *o = PyList_GetItem($input, i);  // borrowed reference
    if (!PyString_Check(o)) {
      // $1 has not been assigned yet, so nothing else will free |out|;
      // release it here instead of leaking it on the error path.
      delete out;
      PyErr_SetString(PyExc_TypeError,"list must contain strings");
      return NULL;
    }
    char *str = nullptr;
    Py_ssize_t str_size = 0;
    PyString_AsStringAndSize(o, &str, &str_size);
    (*out)[i] = std::string(str, static_cast<size_t>(str_size));
  }
  $1 = out;
}
|
||||
|
||||
// Converts a Python list of ints into a heap-allocated std::vector<int>.
//   - not a list            -> TypeError("not a list")
//   - non-integer element   -> TypeError("list must contain integers")
// On success ownership passes to $1 (released by the freearg typemap for this
// type). On failure nothing stays allocated.
%typemap(in) const std::vector<int>& {
  if (!PyList_Check($input)) {
    PyErr_SetString(PyExc_TypeError,"not a list");
    return NULL;
  }
  const size_t size = PyList_Size($input);
  std::vector<int> *out = new std::vector<int>(size);
  for (size_t i = 0; i < size; ++i) {
    PyObject *o = PyList_GetItem($input, i);  // borrowed reference
    if (!PyInt_Check(o)) {
      // $1 has not been assigned yet, so nothing else will free |out|;
      // release it here instead of leaking it on the error path.
      delete out;
      PyErr_SetString(PyExc_TypeError,"list must contain integers");
      return NULL;
    }
    (*out)[i] = static_cast<int>(PyInt_AsLong(o));
  }
  $1 = out;
}
|
||||
|
||||
// Deletes the argument object after the wrapped call; the input typemap for
// const std::string & assigns $1 from `new std::string(...)`.
%typemap(freearg) const std::string& {
|
||||
delete $1;
|
||||
}
|
||||
|
||||
// Deletes the argument object after the wrapped call; the input typemap for
// const std::vector<std::string>& assigns $1 from `new std::vector<std::string>(size)`.
%typemap(freearg) const std::vector<std::string>& {
|
||||
delete $1;
|
||||
}
|
||||
|
||||
// Deletes the argument object after the wrapped call; the input typemap for
// const std::vector<int>& assigns $1 from `new std::vector<int>(size)`.
%typemap(freearg) const std::vector<int>& {
|
||||
delete $1;
|
||||
}
|
||||
|
||||
%include <sentencepiece_processor.h>
|
|
@ -0,0 +1,107 @@
|
|||
# This file was automatically generated by SWIG (http://www.swig.org).
|
||||
# Version 2.0.11
|
||||
#
|
||||
# Do not make changes to this file unless you know what you are doing--modify
|
||||
# the SWIG interface file instead.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
from sys import version_info
|
||||
if version_info >= (2,6,0):
|
||||
def swig_import_helper():
|
||||
from os.path import dirname
|
||||
import imp
|
||||
fp = None
|
||||
try:
|
||||
fp, pathname, description = imp.find_module('_sentencepiece', [dirname(__file__)])
|
||||
except ImportError:
|
||||
import _sentencepiece
|
||||
return _sentencepiece
|
||||
if fp is not None:
|
||||
try:
|
||||
_mod = imp.load_module('_sentencepiece', fp, pathname, description)
|
||||
finally:
|
||||
fp.close()
|
||||
return _mod
|
||||
_sentencepiece = swig_import_helper()
|
||||
del swig_import_helper
|
||||
else:
|
||||
import _sentencepiece
|
||||
del version_info
|
||||
try:
|
||||
_swig_property = property
|
||||
except NameError:
|
||||
pass # Python < 2.2 doesn't have 'property'.
|
||||
def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
|
||||
if (name == "thisown"): return self.this.own(value)
|
||||
if (name == "this"):
|
||||
if type(value).__name__ == 'SwigPyObject':
|
||||
self.__dict__[name] = value
|
||||
return
|
||||
method = class_type.__swig_setmethods__.get(name,None)
|
||||
if method: return method(self,value)
|
||||
if (not static):
|
||||
self.__dict__[name] = value
|
||||
else:
|
||||
raise AttributeError("You cannot add attributes to %s" % self)
|
||||
|
||||
def _swig_setattr(self,class_type,name,value):
|
||||
return _swig_setattr_nondynamic(self,class_type,name,value,0)
|
||||
|
||||
def _swig_getattr(self,class_type,name):
|
||||
if (name == "thisown"): return self.this.own()
|
||||
method = class_type.__swig_getmethods__.get(name,None)
|
||||
if method: return method(self)
|
||||
raise AttributeError(name)
|
||||
|
||||
def _swig_repr(self):
|
||||
try: strthis = "proxy of " + self.this.__repr__()
|
||||
except: strthis = ""
|
||||
return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
|
||||
|
||||
try:
|
||||
_object = object
|
||||
_newclass = 1
|
||||
except AttributeError:
|
||||
class _object : pass
|
||||
_newclass = 0
|
||||
|
||||
|
||||
class SentencePieceProcessor(_object):
|
||||
__swig_setmethods__ = {}
|
||||
__setattr__ = lambda self, name, value: _swig_setattr(self, SentencePieceProcessor, name, value)
|
||||
__swig_getmethods__ = {}
|
||||
__getattr__ = lambda self, name: _swig_getattr(self, SentencePieceProcessor, name)
|
||||
__repr__ = _swig_repr
|
||||
def __init__(self):
|
||||
this = _sentencepiece.new_SentencePieceProcessor()
|
||||
try: self.this.append(this)
|
||||
except: self.this = this
|
||||
__swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor
|
||||
__del__ = lambda self : None;
|
||||
def Load(self, *args): return _sentencepiece.SentencePieceProcessor_Load(self, *args)
|
||||
def LoadOrDie(self, *args): return _sentencepiece.SentencePieceProcessor_LoadOrDie(self, *args)
|
||||
def SetEncodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, *args)
|
||||
def SetDecodeExtraOptions(self, *args): return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, *args)
|
||||
def GetPieceSize(self): return _sentencepiece.SentencePieceProcessor_GetPieceSize(self)
|
||||
def PieceToId(self, *args): return _sentencepiece.SentencePieceProcessor_PieceToId(self, *args)
|
||||
def IdToPiece(self, *args): return _sentencepiece.SentencePieceProcessor_IdToPiece(self, *args)
|
||||
def GetScore(self, *args): return _sentencepiece.SentencePieceProcessor_GetScore(self, *args)
|
||||
def IsUnknown(self, *args): return _sentencepiece.SentencePieceProcessor_IsUnknown(self, *args)
|
||||
def IsControl(self, *args): return _sentencepiece.SentencePieceProcessor_IsControl(self, *args)
|
||||
def Encode(self, *args): return _sentencepiece.SentencePieceProcessor_Encode(self, *args)
|
||||
def EncodeAsPieces(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsPieces(self, *args)
|
||||
def EncodeAsIds(self, *args): return _sentencepiece.SentencePieceProcessor_EncodeAsIds(self, *args)
|
||||
def Decode(self, *args): return _sentencepiece.SentencePieceProcessor_Decode(self, *args)
|
||||
def DecodePieces(self, *args): return _sentencepiece.SentencePieceProcessor_DecodePieces(self, *args)
|
||||
def DecodeIds(self, *args): return _sentencepiece.SentencePieceProcessor_DecodeIds(self, *args)
|
||||
def __len__(self): return _sentencepiece.SentencePieceProcessor___len__(self)
|
||||
def __getitem__(self, *args): return _sentencepiece.SentencePieceProcessor___getitem__(self, *args)
|
||||
SentencePieceProcessor_swigregister = _sentencepiece.SentencePieceProcessor_swigregister
|
||||
SentencePieceProcessor_swigregister(SentencePieceProcessor)
|
||||
|
||||
# This file is compatible with both classic and new-style classes.
|
||||
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,2 @@
|
|||
[metadata]
|
||||
description-file = README.md
|
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from setuptools import setup, Extension
|
||||
import string
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.append('./test')
|
||||
|
||||
with open("README.md") as f:
|
||||
long_description = f.read()
|
||||
|
||||
def cmd(line):
    """Run ``line`` in a shell and return the first output line as a token list.

    Args:
        line: Shell command to execute (e.g. a ``pkg-config`` invocation).

    Returns:
        The whitespace-separated tokens of the first line written to stdout,
        or an empty list when the command prints nothing.
    """
    with os.popen(line) as pipe:
        first_line = pipe.readline()
    # split() already discards the trailing newline, so no manual slicing is
    # needed: output without a final newline keeps its last character, and
    # empty output yields [] instead of raising IndexError.
    return first_line.split()
|
||||
|
||||
setup(name = 'sentencepiece',
|
||||
author = 'Taku Kudo',
|
||||
author_email='taku@google.com',
|
||||
description = 'SentencePiece python wrapper',
|
||||
long_description = long_description,
|
||||
url = 'https://github.com/google/sentencepiece',
|
||||
license = 'Apache',
|
||||
platforms = 'Unix',
|
||||
py_modules=['sentencepiece'],
|
||||
ext_modules = [Extension('_sentencepiece',
|
||||
sources=['sentencepiece_wrap.cxx'],
|
||||
extra_compile_args=['-std=c++11'] +
|
||||
cmd('pkg-config sentencepiece --cflags'),
|
||||
extra_link_args=cmd('pkg-config sentencepiece --libs'))
|
||||
],
|
||||
classifiers = [
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Operating System :: Unix',
|
||||
'Programming Language :: Python',
|
||||
'Topic :: Text Processing :: Linguistic',
|
||||
'Topic :: Software Development :: Libraries :: Python Modules'
|
||||
],
|
||||
test_suite = 'sentencepiece_test.suite')
|
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sentencepiece as spm
|
||||
import unittest
|
||||
|
||||
class TestSentencepieceProcessor(unittest.TestCase):
    """Checks the SentencePieceProcessor binding against test/test_model.model."""

    def setUp(self):
        # A fresh processor is loaded for every test method.
        self.sp_ = spm.SentencePieceProcessor()
        loaded = self.sp_.Load('test/test_model.model')
        self.assertTrue(loaded)

    def test_load(self):
        """Vocabulary size, reserved symbols, and id <-> piece consistency."""
        self.assertEqual(1000, self.sp_.GetPieceSize())

        reserved = ('<unk>', '<s>', '</s>')
        for expected_id, symbol in enumerate(reserved):
            self.assertEqual(expected_id, self.sp_.PieceToId(symbol))
        for piece_id, symbol in enumerate(reserved):
            self.assertEqual(symbol, self.sp_.IdToPiece(piece_id))

        # Every id must map to a piece that maps back to the same id.
        for piece_id in range(self.sp_.GetPieceSize()):
            piece = self.sp_.IdToPiece(piece_id)
            self.assertEqual(piece_id, self.sp_.PieceToId(piece))

    def test_roundtrip(self):
        """Encoding then decoding returns the original sentence."""
        text = 'I saw a girl with a telescope.'
        ids = self.sp_.EncodeAsIds(text)
        from_encode_as_pieces = self.sp_.EncodeAsPieces(text)
        from_encode = self.sp_.Encode(text)

        self.assertEqual(from_encode_as_pieces, from_encode)
        self.assertEqual(text, self.sp_.Decode(from_encode_as_pieces))
        self.assertEqual(text, self.sp_.DecodePieces(from_encode))
        self.assertEqual(text, self.sp_.DecodeIds(ids))
|
||||
|
||||
def suite():
    """Build the test suite referenced by ``test_suite`` in setup.py.

    Returns:
        A ``unittest.TestSuite`` containing every test method of
        ``TestSentencepieceProcessor``.
    """
    suite = unittest.TestSuite()
    # unittest.makeSuite is deprecated and removed in Python 3.13; the loader
    # API collects the same test methods.
    suite.addTests(
        unittest.defaultTestLoader.loadTestsFromTestCase(TestSentencepieceProcessor))
    return suite
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Двоичный файл не отображается.
|
@ -17,6 +17,7 @@ libsentencepiece_la_SOURCES = \
|
|||
word_model.h word_model.cc \
|
||||
char_model.h char_model.cc \
|
||||
bpe_model.h bpe_model.cc
|
||||
include_HEADERS = sentencepiece_processor.h
|
||||
|
||||
noinst_LIBRARIES = libtrain.a
|
||||
libtrain_a_SOURCES = builder.cc builder.h \
|
||||
|
@ -37,7 +38,7 @@ BUILT_SOURCES = \
|
|||
sentencepiece.pb.cc \
|
||||
sentencepiece_model.pb.cc
|
||||
|
||||
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
|
||||
EXTRA_DIST = sentencepiece.proto sentencepiece_model.proto
|
||||
|
||||
bin_PROGRAMS = spm_encode spm_decode spm_normalize spm_train spm_export_vocab
|
||||
noinst_PROGRAMS = compile_charsmap
|
||||
|
|
|
@ -168,6 +168,7 @@ class SentencePieceProcessor {
|
|||
// Returns true if |id| is control symbol.
|
||||
virtual bool IsControl(int id) const;
|
||||
|
||||
#ifndef SWIG
|
||||
//////////////////////////////////////////////////////////////
|
||||
// Model management.
|
||||
//
|
||||
|
@ -176,6 +177,7 @@ class SentencePieceProcessor {
|
|||
|
||||
// Allows injection of a normalizer instance. |normalizer| is moved.
|
||||
void SetNormalizer(std::unique_ptr<normalizer::Normalizer> &&normalizer);
|
||||
#endif
|
||||
|
||||
// Returns immutable model proto. Useful to obtain extended
|
||||
// or experimental parameters encoded in model_proto.
|
||||
|
|
Загрузка…
Ссылка в новой задаче