Skip to content

Commit

Permalink
Prepare for 0.4.0 release (#151)
Browse files Browse the repository at this point in the history
* new CI configuration

* Set up CI with Azure Pipelines

[skip ci]

* install numpy in cibuildwheel

* add pyproject.toml

* upgrade vmImage

* update the build python versions

* remove the pytest

* move the wheel build files

* enable sdist setup.py as well.

* use git command line

* Update wheels.yml for Azure Pipelines

* disable the pypy package for macos;

* fix the external repo code tag

* fix the ctest problem

* fix the unicode 8217.

* fix the locale base test
  • Loading branch information
wenbingl authored Sep 25, 2021
1 parent 98c32df commit 9f3abe2
Show file tree
Hide file tree
Showing 14 changed files with 166 additions and 66 deletions.
49 changes: 49 additions & 0 deletions .az/wheels.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Azure Pipelines jobs that build Python wheels with cibuildwheel and publish
# the resulting `wheelhouse` directory as a build artifact.
jobs:
- job: linux
  pool: {vmImage: 'ubuntu-latest'}
  steps:
    - task: UsePythonVersion@0
    # pip is always invoked as `python3 -m pip` so that the upgrade and the
    # cibuildwheel install target the same interpreter; a bare `pip3` found on
    # PATH is not guaranteed to belong to that interpreter.
    - bash: |
        set -o errexit
        python3 -m pip install --upgrade pip
        python3 -m pip install cibuildwheel==2.1.2
      displayName: Install dependencies
    - bash: cibuildwheel --output-dir wheelhouse .
      displayName: Build wheels
    - task: PublishBuildArtifacts@1
      inputs: {pathtoPublish: 'wheelhouse'}

- job: macos
  pool: {vmImage: 'macOS-latest'}
  variables:
    CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
    # Skip trying to test arm64 builds on Intel Macs
    # CIBW_TEST_SKIP: "*-macosx_arm64 *-macosx_universal2:arm64"
    # Disable building PyPy wheels
    CIBW_SKIP: pp*

  steps:
    - task: UsePythonVersion@0
    - bash: |
        set -o errexit
        python3 -m pip install --upgrade pip
        python3 -m pip install cibuildwheel==2.1.2
      displayName: Install dependencies
    - bash: cibuildwheel --output-dir wheelhouse .
      displayName: Build wheels
    - task: PublishBuildArtifacts@1
      inputs: {pathtoPublish: wheelhouse}

# - job: windows
# pool: {vmImage: 'windows-latest'}
# steps:
# - task: UsePythonVersion@0
# - bash: |
# set -o errexit
# python -m pip install --upgrade pip
# pip install cibuildwheel==2.1.2
# displayName: Install dependencies
# - bash: cibuildwheel --output-dir wheelhouse .
# displayName: Build wheels
# - task: PublishBuildArtifacts@1
# inputs: {pathtoPublish: 'wheelhouse'}
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ endif()

if (OCOS_ENABLE_SPM_TOKENIZER)
# SentencePiece
target_include_directories(ocos_operators PUBLIC ${sentencepieceproject_INCLUDE_DIRS})
target_include_directories(ocos_operators PUBLIC ${spm_INCLUDE_DIRS})
list(APPEND OCOS_COMPILE_DEFINITIONS ENABLE_SPM_TOKENIZER)
list(APPEND ocos_libraries sentencepiece-static)
endif()
Expand Down
20 changes: 15 additions & 5 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
prune ci_build
prune docs
exclude *.bat
exclude *.yaml
exclude *.git*
include *.txt
global-include *.def
recursive-include cmake *.*
recursive-include includes *.*
recursive-include operators *.*
recursive-include pyop *.*
recursive-include shared *.*
prune ci_build
prune docs
prune test
prune _subbuild
prune out
exclude *.bat
exclude *.yaml
exclude *.git*
4 changes: 2 additions & 2 deletions build.android
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
set -e -x -u

OSNAME=android
if [ -z "$NDK_ROOT" ]; then export NDK_ROOT=`ls -d $HOME/Android/ndk/* 2>/dev/null`; fi
if [ -z "$NDK_ROOT" ]
if [[ -z ${NDK_ROOT+x} ]]; then NDK_ROOT=`ls -d $HOME/Android/Sdk/ndk/* 2>/dev/null`; fi
if [[ -z "${NDK_ROOT}" ]]
then
echo "ERROR: cannot find where NDK was installed, using NDK_ROOT to specify it"
exit 7
Expand Down
4 changes: 2 additions & 2 deletions ci_build/azure-pipelines/mshost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
displayName: build the customop library with onnxruntime
- script: |
cd out/Linux
cd out/Linux/RelWithDebInfo
ctest -C RelWithDebInfo
displayName: Run C++ native tests
Expand Down Expand Up @@ -119,7 +119,7 @@ jobs:
displayName: build the customop library with onnxruntime
- script: |
cd out/Darwin
cd out/Darwin/RelWithDebInfo
ctest -C RelWithDebInfo
displayName: Run C++ native tests
Expand Down
4 changes: 2 additions & 2 deletions cmake/externals/blingfire.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FetchContent_Declare(
Blingfire
GIT_REPOSITORY https://github.com/microsoft/BlingFire.git
GIT_TAG master
GIT_TAG 0831265c1aca95ca02eca5bf1155e4251e545328
)


Expand All @@ -12,4 +12,4 @@ if (NOT blingfire_POPULATED)

# enable size optimization build
add_subdirectory(${blingfire_SOURCE_DIR} ${blingfire_BINARY_DIR} EXCLUDE_FROM_ALL)
endif ()
endif ()
23 changes: 12 additions & 11 deletions cmake/externals/sentencepieceproject.cmake
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
FetchContent_Declare(
sentencepieceproject
spm
GIT_REPOSITORY https://github.com/google/sentencepiece.git
GIT_TAG v0.1.96
)
# "spm" is short for "sentencepiece"; the shorter name keeps build paths within the Windows MAX_PATH limit
FetchContent_GetProperties(spm)

FetchContent_GetProperties(sentencepieceproject)

if(NOT sentencepieceproject_POPULATED)
FetchContent_Populate(sentencepieceproject)
add_subdirectory(${sentencepieceproject_SOURCE_DIR} ${sentencepieceproject_BINARY_DIR} EXCLUDE_FROM_ALL)
if(NOT spm_POPULATED)
FetchContent_Populate(spm)
add_subdirectory(${spm_SOURCE_DIR} ${spm_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()

set(sentencepieceproject_INCLUDE_DIRS
${sentencepieceproject_SOURCE_DIR}/third_party/protobuf-lite
${sentencepieceproject_SOURCE_DIR}/src/builtin_pb
${sentencepieceproject_SOURCE_DIR}/third_party
${sentencepieceproject_SOURCE_DIR}/src
set(spm_INCLUDE_DIRS
${spm_SOURCE_DIR}/third_party/protobuf-lite
${spm_SOURCE_DIR}/src/builtin_pb
${spm_SOURCE_DIR}/third_party
${spm_SOURCE_DIR}/src
)
3 changes: 1 addition & 2 deletions onnxruntime_extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
The entry point to onnxruntime custom op library
"""

__version__ = "0.3.2"
__author__ = "Microsoft"


from ._version import __version__
from ._ocos import get_library_path # noqa
from ._ocos import Opdef, PyCustomOpDef # noqa
from ._ocos import hash_64 # noqa
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime_extensions/_ortapi2.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,6 @@ def __call__(self, *args, **kwargs):

def optimize_model(model_or_file, output_file):
sess_options = EagerOp.get_ort_session_options()
sess_options.graph_optimization_level = _ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
sess_options.graph_optimization_level = _ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
sess_options.optimized_model_filepath = output_file
_ort.InferenceSession(model_or_file if isinstance(model_or_file, str) else model_or_file.SerializeToString(), sess_options)
6 changes: 6 additions & 0 deletions onnxruntime_extensions/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
###############################################################################

__version__ = "0.4.0"
4 changes: 3 additions & 1 deletion operators/tokenizer/basic_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {
continue;
}

if (tokenize_punctuation_ && ::iswpunct(c)) {
// U+2019 (right single quotation mark) is not classified as punctuation by
// iswpunct on some Linux platforms; to be consistent, always treat it as punctuation.
if (tokenize_punctuation_ && (::iswpunct(c) || c == wint_t(0x2019))) {
push_current_token_and_clear();
push_single_char_and_clear(c);
continue;
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[project]
# onnxruntime does not support Python 3.10 yet
requires-python = "<3.10"

[build-system]
# Minimum requirements for the build system to execute.
requires = ["setuptools", "wheel", "numpy>=1.18.5"] # PEP 508 specifications.
61 changes: 29 additions & 32 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,23 @@
# -*- coding: utf-8 -*-

###########################################################################
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
###########################################################################

from setuptools.command.build_ext import build_ext as _build_ext
from setuptools.command.develop import develop as _develop
from setuptools.command.build_py import build_py as _build_py
from contextlib import contextmanager
from setuptools import setup, find_packages
from setuptools.command.build_ext import build_ext as _build_ext

import os
import sys
import setuptools
import pathlib
import subprocess


TOP_DIR = os.path.dirname(__file__)
PACKAGE_NAME = 'onnxruntime_extensions'


if '--nightly_build' in sys.argv:
PACKAGE_NAME = 'ortext_nightly'
sys.argv.remove('--nightly_build')
from textwrap import dedent


@contextmanager
def chdir(path):
orig_path = os.getcwd()
os.chdir(str(path))
try:
yield
finally:
os.chdir(orig_path)
TOP_DIR = os.path.dirname(__file__) or os.getcwd()
PACKAGE_NAME = 'onnxruntime_extensions'


def load_msvcvar():
Expand All @@ -55,6 +37,18 @@ def load_msvcvar():
"please install one or specify the environement variable VCVARS to the path of VS vcvars64.bat.")


def read_git_refs(git_args):
    """Run ``git`` with the given arguments and return its first non-blank output line.

    The command is executed with ``TOP_DIR`` as the working directory and only
    its standard output is captured (as text).

    :param git_args: list of arguments placed after the ``git`` executable name.
    :return: the first stdout line that is non-empty after being passed through
        ``dedent`` and stripped of CR/LF characters, or ``''`` if there is none.
    """
    process = subprocess.Popen(
        ['git'] + git_args,
        cwd=TOP_DIR,
        stdout=subprocess.PIPE, universal_newlines=True)
    output = process.communicate()[0]
    # Lazily clean each line and stop at the first one that still has content.
    cleaned = (dedent(raw).strip('\n\r') for raw in output.splitlines())
    return next((text for text in cleaned if text), '')


class BuildCMakeExt(_build_ext):

def run(self):
Expand Down Expand Up @@ -94,10 +88,9 @@ def build_cmake(self, extension):
'--parallel'
]

with chdir(build_temp):
self.spawn(['cmake', str(project_dir)] + cmake_args)
if not self.dry_run:
self.spawn(['cmake', '--build', '.'] + build_args)
self.spawn(['cmake', '-S', str(project_dir), '-B', str(build_temp)] + cmake_args)
if not self.dry_run:
self.spawn(['cmake', '--build', str(build_temp)] + build_args)

if sys.platform == "win32":
self.copy_file(build_temp / config / 'ortcustomops.dll',
Expand All @@ -106,19 +99,23 @@ def build_cmake(self, extension):

def read_requirements():
with open(os.path.join(TOP_DIR, "requirements.txt"), "r") as f:
requirements = [_ for _ in [_.strip("\r\n ")
for _ in f.readlines()] if _ is not None]
requirements = [_ for _ in [dedent(_) for _ in f.readlines()] if _ is not None]
return requirements


# read version from the package file.
def read_version():
version_str = '1.0.0'
with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/__init__.py'), "r")) as f:
line = [_ for _ in [_.strip("\r\n ")
for _ in f.readlines()] if _.startswith("__version__")]
with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/_version.py'), "r")) as f:
line = [_ for _ in [dedent(_) for _ in f.readlines()] if _.startswith("__version__")]
if len(line) > 0:
version_str = line[0].split('=')[1].strip('" ')
version_str = line[0].split('=')[1].strip('" \n\r')

# is it a nightly or dev build?
if os.path.isdir('.git') and \
not read_git_refs(['rev-parse', '--abbrev-ref', 'HEAD']).startswith('rel-'):
# append a git commit id from git remote repo, while the local change ids are skipped.
version_str += '+' + read_git_refs(['rev-parse', 'HEAD'])[:7]
return version_str


Expand Down
43 changes: 36 additions & 7 deletions test/static_test/test_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@
#include "wordpiece_tokenizer.hpp"
#include "bert_tokenizer.hpp"

#include <clocale>


// Test fixture that switches the C library's LC_CTYPE locale to a fixed value
// for the duration of each test and puts the previous value back afterwards.
class LocaleBaseTest : public testing::Test {
 public:
  // Remember that SetUp() is run immediately before a test starts.
  void SetUp() override {
    // Passing nullptr queries the locale currently in effect without changing
    // it. The returned buffer may be overwritten by later setlocale calls, so
    // it is copied into a std::string before the locale is switched.
    const char* current = std::setlocale(LC_CTYPE, nullptr);
    default_locale_ = (current != nullptr) ? current : "";
    // NOTE(review): && binds tighter than ||, so !defined(__GNUC__) only
    // qualifies the __WIN32__ check below - confirm that this is intended.
#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) && !defined(__GNUC__))
    std::setlocale(LC_CTYPE, "C");
#else
    std::setlocale(LC_CTYPE, "en_US.UTF-8");
#endif
  }

  // TearDown() is invoked immediately after a test finishes.
  void TearDown() override {
    // Restore whatever LC_CTYPE was active before SetUp() ran; skipped when
    // the query in SetUp() yielded nothing.
    if (!default_locale_.empty()) {
      std::setlocale(LC_CTYPE, default_locale_.c_str());
    }
  }

 private:
  // LC_CTYPE locale name captured in SetUp(); empty if it could not be read.
  std::string default_locale_;
};

TEST(tokenizer, bert_word_split) {
ustring ind("##");
ustring text("A AAA B BB");
Expand Down Expand Up @@ -59,8 +85,8 @@ TEST(tokenizer, wordpiece_basic_tokenizer) {
std::vector<int32_t> indices;
std::vector<int64_t> rows;
KernelWordpieceTokenizer_Tokenizer(vocab, ustring("##"), ustring("[unk]"), text, tokens, indices, rows);
//EXPECT_EQ(indices, std::vector<int32_t>({9, 6, 7, 12, 10, 11}));
//EXPECT_EQ(rows, std::vector<int64_t>({0, 6}));
// EXPECT_EQ(indices, std::vector<int32_t>({9, 6, 7, 12, 10, 11}));
// EXPECT_EQ(rows, std::vector<int64_t>({0, 6}));
}

std::unordered_map<std::u32string, int32_t> get_vocabulary_wordpiece() {
Expand Down Expand Up @@ -127,23 +153,26 @@ TEST(tokenizer, bert_wordpiece_tokenizer_rows) {
EXPECT_EQ(rows, std::vector<int64_t>({0, 5, 7}));
}

TEST(tokenizer, basic_tokenizer_chinese) {
TEST_F(LocaleBaseTest, basic_tokenizer_chinese) {
ustring test_case = ustring("ÀÁÂÃÄÅÇÈÉÊËÌÍÎÑÒÓÔÕÖÚÜ\t䗓𨖷虴𨀐辘𧄋脟𩑢𡗶镇伢𧎼䪱轚榶𢑌㺽𤨡!#$%&(Tom@microsoft.com)*+,-./:;<=>?@[\\]^_`{|}~");
std::vector<ustring> expect_result = ustring_vector_convertor({"aaaaaaceeeeiiinooooouu", "", "𨖷", "", "𨀐", "", "𧄋", "", "𩑢", "𡗶", "", "", "𧎼", "", "", "", "𢑌", "", "𤨡", "!", "#", "$", "%", "&", "(", "tom", "@", "microsoft", ".", "com", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"});
std::vector<ustring> expect_result = ustring_vector_convertor({"aaaaaaceeeeiiinooooouu",
"", "𨖷", "", "𨀐", "", "𧄋", "", "𩑢", "𡗶", "", "", "𧎼", "", "", "", "𢑌", "", "𤨡",
"!", "#", "$", "%", "&", "(", "tom", "@", "microsoft", ".", "com", ")", "*", "+", ",", "-", ".", "/", ":",
";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~"});
BasicTokenizer tokenizer(true, true, true, true, true);
auto result = tokenizer.Tokenize(test_case);
EXPECT_EQ(result, expect_result);
}

TEST(tokenizer, basic_tokenizer_russia) {
TEST_F(LocaleBaseTest, basic_tokenizer_russia) {
ustring test_case = ustring("A $100,000 price-tag@big>small на русском языке");
std::vector<ustring> expect_result = ustring_vector_convertor({"a", "$", "100", ",", "000", "price", "-", "tag", "@", "big", ">", "small", "на", "русском", "языке"});
BasicTokenizer tokenizer(true, true, true, true, true);
auto result = tokenizer.Tokenize(test_case);
EXPECT_EQ(result, expect_result);
}

TEST(tokenizer, basic_tokenizer) {
TEST_F(LocaleBaseTest, basic_tokenizer) {
ustring test_case = ustring("I mean, you’ll need something to talk about next Sunday, right?");
std::vector<ustring> expect_result = ustring_vector_convertor({"I", "mean", ",", "you", "", "ll", "need", "something", "to", "talk", "about", "next", "Sunday", ",", "right", "?"});
BasicTokenizer tokenizer(false, true, true, true, true);
Expand Down Expand Up @@ -217,4 +246,4 @@ TEST(tokenizer, truncation_longest_first) {
truncate.Truncate(test_input1, test_input2, 12);
EXPECT_EQ(test_input1, std::vector<int64_t>({1, 2, 3, 4, 5}));
EXPECT_EQ(test_input2, std::vector<int64_t>({1, 2, 3, 4, 5, 6 ,7}));
}
}

0 comments on commit 9f3abe2

Please sign in to comment.