Prepare for 0.4.0 release (#151)
* new CI configuration
* Set up CI with Azure Pipelines [skip ci]
* install numpy in cibuildwheel
* add pyproject.toml
* upgrade vmImage
* update the build python versions
* remove the pytest
* move the wheel build files
* enable sdist setup.py as well
* use git command line
* Update wheels.yml for Azure Pipelines
* disable the pypy package for macos
* fix the external repo code tag
* fix the ctest problem
* fix the unicode 8217
* fix the locale base test
This commit is contained in:
Parent: 98c32dfe4a
Commit: 9f3abe20fd
@@ -0,0 +1,49 @@
+jobs:
+- job: linux
+  pool: {vmImage: 'ubuntu-latest'}
+  steps:
+    - task: UsePythonVersion@0
+    - bash: |
+        set -o errexit
+        python3 -m pip install --upgrade pip
+        pip3 install cibuildwheel==2.1.2
+      displayName: Install dependencies
+    - bash: cibuildwheel --output-dir wheelhouse .
+      displayName: Build wheels
+    - task: PublishBuildArtifacts@1
+      inputs: {pathtoPublish: 'wheelhouse'}
+
+- job: macos
+  pool: {vmImage: 'macOS-latest'}
+  variables:
+    CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
+    # Skip trying to test arm64 builds on Intel Macs
+    # CIBW_TEST_SKIP: "*-macosx_arm64 *-macosx_universal2:arm64"
+    # Disable building PyPy wheels
+    CIBW_SKIP: pp*
+
+  steps:
+    - task: UsePythonVersion@0
+    - bash: |
+        set -o errexit
+        python3 -m pip install --upgrade pip
+        python3 -m pip install cibuildwheel==2.1.2
+      displayName: Install dependencies
+    - bash: cibuildwheel --output-dir wheelhouse .
+      displayName: Build wheels
+    - task: PublishBuildArtifacts@1
+      inputs: {pathtoPublish: wheelhouse}
+
+# - job: windows
+#   pool: {vmImage: 'windows-latest'}
+#   steps:
+#     - task: UsePythonVersion@0
+#     - bash: |
+#         set -o errexit
+#         python -m pip install --upgrade pip
+#         pip install cibuildwheel==2.1.2
+#       displayName: Install dependencies
+#     - bash: cibuildwheel --output-dir wheelhouse .
+#       displayName: Build wheels
+#     - task: PublishBuildArtifacts@1
+#       inputs: {pathtoPublish: 'wheelhouse'}

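Note (not part of this change): the jobs above drive cibuildwheel from the repository root, and cibuildwheel picks up its CIBW_* configuration from environment variables. A minimal Python sketch of reproducing the macOS job locally, with the same values as in the pipeline:

import os
import subprocess

# Mirror the variables set by the macos job above.
env = dict(os.environ,
           CIBW_ARCHS_MACOS="x86_64 universal2 arm64",
           CIBW_SKIP="pp*")

# Same command the pipeline runs; built wheels land in ./wheelhouse.
subprocess.run(["cibuildwheel", "--output-dir", "wheelhouse", "."],
               env=env, check=True)
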
@@ -264,7 +264,7 @@ endif()

 if (OCOS_ENABLE_SPM_TOKENIZER)
   # SentencePiece
-  target_include_directories(ocos_operators PUBLIC ${sentencepieceproject_INCLUDE_DIRS})
+  target_include_directories(ocos_operators PUBLIC ${spm_INCLUDE_DIRS})
   list(APPEND OCOS_COMPILE_DEFINITIONS ENABLE_SPM_TOKENIZER)
   list(APPEND ocos_libraries sentencepiece-static)
 endif()

MANIFEST.in (20 lines changed)
@@ -1,5 +1,15 @@
-prune ci_build
-prune docs
-exclude *.bat
-exclude *.yaml
-exclude *.git*
+include *.txt
+global-include *.def
+recursive-include cmake *.*
+recursive-include includes *.*
+recursive-include operators *.*
+recursive-include pyop *.*
+recursive-include shared *.*
+prune ci_build
+prune docs
+prune test
+prune _subbuild
+prune out
+exclude *.bat
+exclude *.yaml
+exclude *.git*

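Note (not part of this change): one way to check that the include/prune rules above produce the intended source distribution is to list the members of a locally built sdist. A sketch, assuming an sdist named onnxruntime_extensions-0.4.0.tar.gz in dist/ (file name is an assumption):

import tarfile

# Hypothetical sdist path produced by `python setup.py sdist`.
with tarfile.open("dist/onnxruntime_extensions-0.4.0.tar.gz") as sdist:
    names = sdist.getnames()

# Pruned directories should not appear; top-level *.txt files should.
assert not any("/ci_build/" in n or "/docs/" in n for n in names)
assert any(n.endswith("CMakeLists.txt") for n in names)
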
@@ -4,8 +4,8 @@
 set -e -x -u

 OSNAME=android
-if [ -z "$NDK_ROOT" ]; then export NDK_ROOT=`ls -d $HOME/Android/ndk/* 2>/dev/null`; fi
-if [ -z "$NDK_ROOT" ]
+if [[ -z ${NDK_ROOT+x} ]]; then NDK_ROOT=`ls -d $HOME/Android/Sdk/ndk/* 2>/dev/null`; fi
+if [[ -z "${NDK_ROOT}" ]]
 then
   echo "ERROR: cannot find where NDK was installed, using NDK_ROOT to specify it"
   exit 7

@@ -44,7 +44,7 @@ jobs:
     displayName: build the customop library with onnxruntime

   - script: |
-      cd out/Linux
+      cd out/Linux/RelWithDebInfo
       ctest -C RelWithDebInfo
     displayName: Run C++ native tests

@@ -119,7 +119,7 @@ jobs:
     displayName: build the customop library with onnxruntime

   - script: |
-      cd out/Darwin
+      cd out/Darwin/RelWithDebInfo
       ctest -C RelWithDebInfo
     displayName: Run C++ native tests

@@ -1,7 +1,7 @@
 FetchContent_Declare(
   Blingfire
   GIT_REPOSITORY https://github.com/microsoft/BlingFire.git
-  GIT_TAG master
+  GIT_TAG 0831265c1aca95ca02eca5bf1155e4251e545328
 )

@@ -12,4 +12,4 @@ if (NOT blingfire_POPULATED)

   # enable size optimization build
   add_subdirectory(${blingfire_SOURCE_DIR} ${blingfire_BINARY_DIR} EXCLUDE_FROM_ALL)
-endif ()
+endif ()

@@ -1,18 +1,19 @@
 FetchContent_Declare(
-  sentencepieceproject
+  spm
   GIT_REPOSITORY https://github.com/google/sentencepiece.git
   GIT_TAG v0.1.96
 )
+# spm is abbr. of sentencepiece to meet the MAX_PATH compiling requirement on Windows
+FetchContent_GetProperties(spm)

-FetchContent_GetProperties(sentencepieceproject)
-
-if(NOT sentencepieceproject_POPULATED)
-  FetchContent_Populate(sentencepieceproject)
-  add_subdirectory(${sentencepieceproject_SOURCE_DIR} ${sentencepieceproject_BINARY_DIR} EXCLUDE_FROM_ALL)
+if(NOT spm_POPULATED)
+  FetchContent_Populate(spm)
+  add_subdirectory(${spm_SOURCE_DIR} ${spm_BINARY_DIR} EXCLUDE_FROM_ALL)
 endif()

-set(sentencepieceproject_INCLUDE_DIRS
-    ${sentencepieceproject_SOURCE_DIR}/third_party/protobuf-lite
-    ${sentencepieceproject_SOURCE_DIR}/src/builtin_pb
-    ${sentencepieceproject_SOURCE_DIR}/third_party
-    ${sentencepieceproject_SOURCE_DIR}/src
+set(spm_INCLUDE_DIRS
+    ${spm_SOURCE_DIR}/third_party/protobuf-lite
+    ${spm_SOURCE_DIR}/src/builtin_pb
+    ${spm_SOURCE_DIR}/third_party
+    ${spm_SOURCE_DIR}/src
 )

@@ -7,10 +7,9 @@
 The entry point to onnxruntime custom op library
 """

-__version__ = "0.3.2"
 __author__ = "Microsoft"

+from ._version import __version__
 from ._ocos import get_library_path  # noqa
 from ._ocos import Opdef, PyCustomOpDef  # noqa
 from ._ocos import hash_64  # noqa

@@ -121,6 +121,6 @@ class EagerOp:

 def optimize_model(model_or_file, output_file):
     sess_options = EagerOp.get_ort_session_options()
-    sess_options.graph_optimization_level = _ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
+    sess_options.graph_optimization_level = _ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
     sess_options.optimized_model_filepath = output_file
     _ort.InferenceSession(model_or_file if isinstance(model_or_file, str) else model_or_file.SerializeToString(), sess_options)

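Note (not part of this change): a usage sketch for the helper above; the file names are made up and it assumes optimize_model from the patched module has been imported into scope:

optimize_model("model_with_custom_ops.onnx", "model_basic_opt.onnx")
# ONNX Runtime loads the model, applies only the ORT_ENABLE_BASIC graph
# optimizations, and writes the result to the second path via
# sess_options.optimized_model_filepath.
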
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+###############################################################################
+
+__version__ = "0.4.0"

@@ -53,7 +53,9 @@ std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {
       continue;
     }

-    if (tokenize_punctuation_ && ::iswpunct(c)) {
+    // 0x2019 unicode is not punctuation in some Linux platform,
+    // to be consistent, take it as punctatuation always.
+    if (tokenize_punctuation_ && (::iswpunct(c) || c == wint_t(0x2019))) {
       push_current_token_and_clear();
       push_single_char_and_clear(c);
       continue;

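Note (not part of this change): the special case above exists because iswpunct() classifies U+2019 differently depending on platform and locale. A quick Python illustration that Unicode itself treats the character as punctuation, which is why the tokenizer now always splits on it:

import unicodedata

ch = "\u2019"                      # the character 0x2019 from the hunk above
print(unicodedata.name(ch))        # RIGHT SINGLE QUOTATION MARK
print(unicodedata.category(ch))    # 'Pf' -> final punctuation, independent of
                                   # whatever the C library's iswpunct() says.
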
@@ -0,0 +1,7 @@
+[project]
+# since onnxruntime havn't supported Python 3.10 yet
+requires-python = "<3.10"
+
+[build-system]
+# Minimum requirements for the build system to execute.
+requires = ["setuptools", "wheel", "numpy>=1.18.5"]  # PEP 508 specifications.

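Note (not part of this change): a small sketch of how the requires-python bound above behaves, using the packaging library that pip relies on for this kind of check (illustration only):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet("<3.10")       # the bound declared in pyproject.toml above
print(spec.contains("3.9.7"))      # True  -> install allowed
print(spec.contains("3.10.0"))     # False -> pip refuses to install the package
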
setup.py (61 lines changed)
@@ -1,16 +1,11 @@
 # -*- coding: utf-8 -*-

 ###########################################################################
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 ###########################################################################

-from setuptools.command.build_ext import build_ext as _build_ext
-from setuptools.command.develop import develop as _develop
-from setuptools.command.build_py import build_py as _build_py
-from contextlib import contextmanager
 from setuptools import setup, find_packages
+from setuptools.command.build_ext import build_ext as _build_ext

 import os
 import sys

@@ -18,26 +13,13 @@ import setuptools
 import pathlib
 import subprocess

+from textwrap import dedent

-TOP_DIR = os.path.dirname(__file__)
+TOP_DIR = os.path.dirname(__file__) or os.getcwd()
 PACKAGE_NAME = 'onnxruntime_extensions'


-if '--nightly_build' in sys.argv:
-    PACKAGE_NAME = 'ortext_nightly'
-    sys.argv.remove('--nightly_build')
-
-
-@contextmanager
-def chdir(path):
-    orig_path = os.getcwd()
-    os.chdir(str(path))
-    try:
-        yield
-    finally:
-        os.chdir(orig_path)
-
-
 def load_msvcvar():
     if os.environ.get('vcvars'):
         stdout, _ = subprocess.Popen([

@@ -55,6 +37,18 @@ def load_msvcvar():
             "please install one or specify the environement variable VCVARS to the path of VS vcvars64.bat.")


+def read_git_refs(git_args):
+    stdout, _ = subprocess.Popen(
+        ['git'] + git_args,
+        cwd=TOP_DIR,
+        stdout=subprocess.PIPE, universal_newlines=True).communicate()
+    for _ln in stdout.splitlines():
+        _ln = dedent(_ln).strip('\n\r')
+        if _ln:
+            return _ln
+    return ''
+
+
 class BuildCMakeExt(_build_ext):

     def run(self):

@@ -94,10 +88,9 @@ class BuildCMakeExt(_build_ext):
             '--parallel'
         ]

-        with chdir(build_temp):
-            self.spawn(['cmake', str(project_dir)] + cmake_args)
-            if not self.dry_run:
-                self.spawn(['cmake', '--build', '.'] + build_args)
+        self.spawn(['cmake', '-S', str(project_dir), '-B', str(build_temp)] + cmake_args)
+        if not self.dry_run:
+            self.spawn(['cmake', '--build', str(build_temp)] + build_args)

         if sys.platform == "win32":
             self.copy_file(build_temp / config / 'ortcustomops.dll',

@@ -106,19 +99,23 @@ class BuildCMakeExt(_build_ext):

 def read_requirements():
     with open(os.path.join(TOP_DIR, "requirements.txt"), "r") as f:
-        requirements = [_ for _ in [_.strip("\r\n ")
-                                    for _ in f.readlines()] if _ is not None]
+        requirements = [_ for _ in [dedent(_) for _ in f.readlines()] if _ is not None]
     return requirements


 # read version from the package file.
 def read_version():
     version_str = '1.0.0'
-    with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/__init__.py'), "r")) as f:
-        line = [_ for _ in [_.strip("\r\n ")
-                            for _ in f.readlines()] if _.startswith("__version__")]
+    with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/_version.py'), "r")) as f:
+        line = [_ for _ in [dedent(_) for _ in f.readlines()] if _.startswith("__version__")]
     if len(line) > 0:
-        version_str = line[0].split('=')[1].strip('" ')
+        version_str = line[0].split('=')[1].strip('" \n\r')
+
+    # is it a nightly or dev build?
+    if os.path.isdir('.git') and \
+            not read_git_refs(['rev-parse', '--abbrev-ref', 'HEAD']).startswith('rel-'):
+        # append a git commit id from git remote repo, while the local change ids are skipped.
+        version_str += '+' + read_git_refs(['rev-parse', 'HEAD'])[:7]
     return version_str
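
Note (not part of this change): a hypothetical illustration of the local-version suffix produced by read_version() above; the commit id is taken from this change's header, the branch names are invented:

# On a branch such as "main" (does not start with "rel-") the first 7 chars of
# HEAD are appended as a local version; on a "rel-*" branch the suffix is skipped.
print("0.4.0" + "+" + "9f3abe20fd"[:7])   # -> 0.4.0+9f3abe2
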
@@ -6,6 +6,32 @@
 #include "wordpiece_tokenizer.hpp"
 #include "bert_tokenizer.hpp"

+#include <clocale>
+
+
+class LocaleBaseTest : public testing::Test{
+ public:
+  // Remember that SetUp() is run immediately before a test starts.
+  void SetUp() override {
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) && !defined(__GNUC__))
+    default_locale_ = std::locale().name();
+    std::setlocale(LC_CTYPE, "C");
+#else
+    default_locale_ = std::locale("").name();
+    std::setlocale(LC_CTYPE, "en_US.UTF-8");
+#endif
+  }
+  // TearDown() is invoked immediately after a test finishes.
+  void TearDown() override {
+    if (!default_locale_.empty()) {
+      std::setlocale(LC_CTYPE, default_locale_.c_str());
+    }
+  }
+
+ private:
+  std::string default_locale_;
+};
+
+
 TEST(tokenizer, bert_word_split) {
   ustring ind("##");
   ustring text("A AAA B BB");
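
Note (not part of this change): the LocaleBaseTest fixture added above pins LC_CTYPE for each test and restores the saved locale afterwards. For comparison, a minimal sketch of the same save/restore pattern in Python:

import locale
from contextlib import contextmanager

@contextmanager
def pinned_ctype(name="en_US.UTF-8"):
    saved = locale.setlocale(locale.LC_CTYPE)     # query the current LC_CTYPE
    locale.setlocale(locale.LC_CTYPE, name)       # pin a known locale, like SetUp()
    try:
        yield
    finally:
        locale.setlocale(locale.LC_CTYPE, saved)  # restore it, like TearDown()
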
@@ -59,8 +85,8 @@ TEST(tokenizer, wordpiece_basic_tokenizer) {
   std::vector<int32_t> indices;
   std::vector<int64_t> rows;
   KernelWordpieceTokenizer_Tokenizer(vocab, ustring("##"), ustring("[unk]"), text, tokens, indices, rows);
-  //EXPECT_EQ(indices, std::vector<int32_t>({9, 6, 7, 12, 10, 11}));
-  //EXPECT_EQ(rows, std::vector<int64_t>({0, 6}));
+  // EXPECT_EQ(indices, std::vector<int32_t>({9, 6, 7, 12, 10, 11}));
+  // EXPECT_EQ(rows, std::vector<int64_t>({0, 6}));
 }

 std::unordered_map<std::u32string, int32_t> get_vocabulary_wordpiece() {

|
|||
EXPECT_EQ(rows, std::vector<int64_t>({0, 5, 7}));
|
||||
}
|
||||
|
||||
TEST(tokenizer, basic_tokenizer_chinese) {
|
||||
TEST_F(LocaleBaseTest, basic_tokenizer_chinese) {
|
||||
ustring test_case = ustring("ÀÁÂÃÄÅÇÈÉÊËÌÍÎÑÒÓÔÕÖÚÜ\t䗓𨖷虴𨀐辘𧄋脟𩑢𡗶镇伢𧎼䪱轚榶𢑌㺽𤨡!#$%&(Tom@microsoft.com)*+,-./:;<=>?@[\\]^_`{|}~");
|
||||
std::vector<ustring> expect_result = ustring_vector_convertor({"aaaaaaceeeeiiinooooouu", "䗓", "𨖷", "虴", "𨀐", "辘", "𧄋", "脟", "𩑢", "𡗶", "镇", "伢", "𧎼", "䪱", "轚", "榶", "𢑌", "㺽", "𤨡", "!", "#", "$", "%", "&", "(", "tom", "@", "microsoft", ".", "com", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"});
|
||||
std::vector<ustring> expect_result = ustring_vector_convertor({"aaaaaaceeeeiiinooooouu",
|
||||
"䗓", "𨖷", "虴", "𨀐", "辘", "𧄋", "脟", "𩑢", "𡗶", "镇", "伢", "𧎼", "䪱", "轚", "榶", "𢑌", "㺽", "𤨡",
|
||||
"!", "#", "$", "%", "&", "(", "tom", "@", "microsoft", ".", "com", ")", "*", "+", ",", "-", ".", "/", ":",
|
||||
";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"});
|
||||
BasicTokenizer tokenizer(true, true, true, true, true);
|
||||
auto result = tokenizer.Tokenize(test_case);
|
||||
EXPECT_EQ(result, expect_result);
|
||||
}
|
||||
|
||||
TEST(tokenizer, basic_tokenizer_russia) {
|
||||
TEST_F(LocaleBaseTest, basic_tokenizer_russia) {
|
||||
ustring test_case = ustring("A $100,000 price-tag@big>small на русском языке");
|
||||
std::vector<ustring> expect_result = ustring_vector_convertor({"a", "$", "100", ",", "000", "price", "-", "tag", "@", "big", ">", "small", "на", "русском", "языке"});
|
||||
BasicTokenizer tokenizer(true, true, true, true, true);
|
||||
|
@ -143,7 +172,7 @@ TEST(tokenizer, basic_tokenizer_russia) {
|
|||
EXPECT_EQ(result, expect_result);
|
||||
}
|
||||
|
||||
TEST(tokenizer, basic_tokenizer) {
|
||||
TEST_F(LocaleBaseTest, basic_tokenizer) {
|
||||
ustring test_case = ustring("I mean, you’ll need something to talk about next Sunday, right?");
|
||||
std::vector<ustring> expect_result = ustring_vector_convertor({"I", "mean", ",", "you", "’", "ll", "need", "something", "to", "talk", "about", "next", "Sunday", ",", "right", "?"});
|
||||
BasicTokenizer tokenizer(false, true, true, true, true);
|
||||
|
@ -217,4 +246,4 @@ TEST(tokenizer, truncation_longest_first) {
|
|||
truncate.Truncate(test_input1, test_input2, 12);
|
||||
EXPECT_EQ(test_input1, std::vector<int64_t>({1, 2, 3, 4, 5}));
|
||||
EXPECT_EQ(test_input2, std::vector<int64_t>({1, 2, 3, 4, 5, 6 ,7}));
|
||||
}
|
||||
}
|
||||
|
|