Commit

Add Bert tokenizer in the supported model list and code refinement (#503)

* Add Bert tokenizer in the supported model list and the related code refinement

* unit test fix
wenbingl authored Aug 2, 2023
1 parent 6209804 commit 922b7cc
Showing 13 changed files with 229 additions and 142 deletions.
36 changes: 27 additions & 9 deletions onnxruntime_extensions/__init__.py
@@ -4,24 +4,42 @@
###############################################################################

"""
-The entry point to onnxruntime-extensions package.
+The `onnxruntime-extensions` Python package offers an API that allows users to generate models for pre-processing and
+post-processing tasks. In addition, it also provides an API to register custom operations implemented in Python.
+This enables more flexibility and control over model execution, thus expanding the functionality of the ONNX Runtime.
"""

__author__ = "Microsoft"

+__all__ = [
+    'gen_processing_models',
+    'get_library_path',
+    'Opdef', 'onnx_op', 'PyCustomOpDef', 'PyOp',
+    'enable_py_op',
+    'expand_onnx_inputs',
+    'hook_model_op',
+    'default_opset_domain',
+    'OrtPyFunction', 'PyOrtFunction',
+    'optimize_model',
+    'make_onnx_model',
+    'ONNXRuntimeError',
+    'hash_64',
+    '__version__',
+]

from ._version import __version__
-from ._ocos import get_library_path  # noqa
-from ._ocos import Opdef, PyCustomOpDef  # noqa
-from ._ocos import hash_64  # noqa
-from ._ocos import enable_py_op  # noqa
-from ._ocos import expand_onnx_inputs  # noqa
-from ._ocos import hook_model_op  # noqa
-from ._ocos import default_opset_domain  # noqa
-from ._cuops import *  # noqa
+from ._ocos import get_library_path
+from ._ocos import Opdef, PyCustomOpDef
+from ._ocos import hash_64
+from ._ocos import enable_py_op
+from ._ocos import expand_onnx_inputs
+from ._ocos import hook_model_op
+from ._ocos import default_opset_domain
+from ._cuops import *  # noqa
from ._ortapi2 import OrtPyFunction as PyOrtFunction # backward compatibility
from ._ortapi2 import OrtPyFunction, optimize_model, make_onnx_model, ONNXRuntimeError
from .cvt import gen_processing_models

# rename the implementation with a more formal name
onnx_op = Opdef.declare
PyOp = PyCustomOpDef
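
As a quick illustration of the API described in the updated docstring, the sketch below generates a BERT pre-processing model from a Hugging Face tokenizer and runs it via OrtPyFunction. This is a hedged example, not part of the commit; it assumes the transformers package and the bert-base-uncased checkpoint are available.

import numpy as np
from transformers import AutoTokenizer
from onnxruntime_extensions import gen_processing_models, OrtPyFunction

# use_fast=False yields a classic BertTokenizer, which exposes the .vocab and
# .ids_to_tokens attributes the new converter reads
hf_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
pre_model, _ = gen_processing_models(hf_tok, pre_kwargs={})
encode = OrtPyFunction(pre_model)  # the new path_or_model constructor argument
print(encode(np.asarray(["hello world"])))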
4 changes: 3 additions & 1 deletion onnxruntime_extensions/_cuops.py
@@ -253,7 +253,9 @@ def get_outputs(cls):
    def serialize_attr(cls, attrs):
        attrs_data = {}
        for k_, v_ in attrs.items():
-            if k_ == 'vocab_file':
+            if k_ == 'vocab':
+                attrs_data['vocab_file'] = v_
+            elif k_ == 'vocab_file':
                with open(v_, "r", encoding='utf-8') as model_file:
                    lines = model_file.readlines()
                attrs_data[k_] = '\n'.join(lines)
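
With this change, serialize_attr accepts the vocabulary either as an in-memory string under 'vocab' (what the new bert_tokenizer converter produces) or as a file path under 'vocab_file'. A small hypothetical illustration; the file name is made up:

attrs_inline = {'vocab': '[PAD]\n[UNK]\nhello'}   # stored directly as the 'vocab_file' attribute
attrs_on_disk = {'vocab_file': 'bert_vocab.txt'}  # read from disk and newline-joined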
4 changes: 4 additions & 0 deletions onnxruntime_extensions/_extensions_pydll.pyi
@@ -2,6 +2,8 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
###############################################################################
+from typing import Callable


class PyCustomOpDef:
    undefined: int = ...
@@ -21,6 +23,8 @@ class PyCustomOpDef:
    dt_complex64: int = ...
    dt_complex128: int = ...
    dt_bfloat16: int = ...
+    def install_hooker(self, invocation_handler: Callable) -> None:
+        ...
    ...


98 changes: 63 additions & 35 deletions onnxruntime_extensions/_hf_cvt.py
@@ -9,9 +9,9 @@

import json
import onnx
-import numpy as np
+from numpy import array as nparray
from functools import partial
-from collections import namedtuple
+from collections import namedtuple, OrderedDict

from ._cuops import CustomOpConverter, SingleOpGraph
from .util import read_file
@@ -32,6 +32,25 @@ def bpe_tokenizer(self, **kwargs):
        attrs.update(**kwargs)
        return attrs

+    def bert_tokenizer(self, **kwargs):
+        hf_bert_tokenizer = self.tokenizer
+        # has to be sorted since the token ids were generated automatically.
+        ordered_vocab = OrderedDict(sorted(hf_bert_tokenizer.vocab.items(), key=lambda item: int(item[1])))
+        vocab = '\n'.join(ordered_vocab.keys())
+        attrs = dict(vocab=vocab)
+        init_kwargs = hf_bert_tokenizer.init_kwargs
+        attrs['do_lower_case'] = 1 if 'do_lower_case' in init_kwargs and init_kwargs.get('do_lower_case') else 0
+        attrs['strip_accents'] = 1 if 'strip_accents' in init_kwargs and init_kwargs.get('strip_accents') else 0
+        attrs.update(**kwargs)
+        return attrs
+
+    def bert_decoder(self, **kwargs):
+        hf_bert_tokenizer = self.tokenizer
+        attrs = {'vocab': json.dumps(
+            hf_bert_tokenizer.ids_to_tokens, separators=(',', ':'))}
+        attrs.update(**kwargs)
+        return attrs

    def bpe_decoder(self, **kwargs):
        decoder = self.tokenizer.decoder
        id_vocab = "\n".join([decoder[_idx] for _idx in sorted(decoder)])
@@ -95,22 +114,28 @@ def spm_decoder(self, **kwargs):
"default_inputs"],
defaults=(None, None, None, None, None))

# fmt: off
# @formatter:off
_PROCESSOR_DICT = {
"GPT2Tokenizer": TokenOpParam('Gpt2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder),
"ClipTokenizer": TokenOpParam('ClipTokenizer', HFTokenizerConverter.clip_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder),
"RobertaTokenizer": TokenOpParam("RobertaTokenizer", HFTokenizerConverter.roberta_tokenizer,
None, None),
"T5Tokenizer": TokenOpParam("SentencepieceTokenizer", HFTokenizerConverter.spm_tokenizer,
"SentencepieceDecoder", HFTokenizerConverter.spm_decoder,
"BertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
"DistilBertTokenizer":
TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
"GPT2Tokenizer": TokenOpParam('Gpt2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
"ClipTokenizer": TokenOpParam('ClipTokenizer', HFTokenizerConverter.clip_tokenizer,
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
"RobertaTokenizer": TokenOpParam("RobertaTokenizer", HFTokenizerConverter.roberta_tokenizer,
None, None, None),
"T5Tokenizer": TokenOpParam("SentencepieceTokenizer", HFTokenizerConverter.spm_tokenizer,
"SentencepieceDecoder", HFTokenizerConverter.spm_decoder,
default_inputs={'add_eos': [True]}),
"LlamaTokenizer": TokenOpParam("SentencepieceTokenizer", HFTokenizerConverter.spm_tokenizer,
"SentencepieceDecoder", HFTokenizerConverter.spm_decoder,
"LlamaTokenizer": TokenOpParam("SentencepieceTokenizer", HFTokenizerConverter.spm_tokenizer,
"SentencepieceDecoder", HFTokenizerConverter.spm_decoder,
default_inputs={'add_bos': [True]}),
}
# fmt: on
# @formatter:on
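
For orientation: the table keys are Hugging Face tokenizer class names, so the same BERT op pair now serves both BertTokenizer and DistilBertTokenizer. A sketch of the lookup, using the namedtuple fields referenced elsewhere in this diff:

param = _PROCESSOR_DICT["DistilBertTokenizer"]
cvt_attrs = param.pre_attribute_cvt     # HFTokenizerConverter.bert_tokenizer
assert param.default_inputs is None     # the new fifth field of TokenOpParam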


class HFTokenizerOnnxGraph:

@@ -137,31 +162,34 @@ def pre_processing(self, **kwargs):
        _cvt_func = self.cvt_quadruple.pre_attribute_cvt
        cvt = partial(_cvt_func, self.cvt_obj)
        g = SingleOpGraph.build_graph(_cvt_op, cvt=cvt, **kwargs)
+        default_inputs = []
        if with_default_inputs:
            op_class = SingleOpGraph.get_op_class(_cvt_op)
            default_inputs = op_class.input_default_values()
            if default_inputs is None:
                raise ValueError("The op {} doesn't define default inputs".format(_cvt_op))
-            n_inputs = len(default_inputs)
-            if self.cvt_quadruple.default_inputs is not None:
-                default_inputs.update(self.cvt_quadruple.default_inputs)
-            if len(default_inputs) != n_inputs:
-                raise ValueError("Op: {} does have the inputs from its TokenOpParam.".format(_cvt_op))
-
-            new_initializers = []
-
-            for k, v in default_inputs.items():
-                input_value_info = next((i for i in g.input if i.name == k), None)
-                if input_value_info is None:
-                    raise ValueError("The input {} is not found in the graph".format(k))
-
-                np_dtype = onnx.helper.tensor_dtype_to_np_dtype(input_value_info.type.tensor_type.elem_type)
-                value = np.array(v, np_dtype)
-                new_initializers.append(onnx.numpy_helper.from_array(value, k))
-            g.initializer.extend(new_initializers)
-            new_inputs = [i for i in g.input if i.name not in default_inputs]
-            g.ClearField("input")
-            g.input.extend(new_inputs)
-            return g

+        # add default_inputs into initializers to simplify the model input
+        n_inputs = len(default_inputs)
+        if self.cvt_quadruple.default_inputs is not None:
+            default_inputs.update(self.cvt_quadruple.default_inputs)
+        if len(default_inputs) != n_inputs:
+            raise ValueError("Op: {} does have the inputs from its TokenOpParam.".format(_cvt_op))
+
+        new_initializers = []
+
+        for k, v in default_inputs.items():
+            input_value_info = next((i for i in g.input if i.name == k), None)
+            if input_value_info is None:
+                raise ValueError("The input {} is not found in the graph".format(k))
+
+            np_dtype = onnx.helper.tensor_dtype_to_np_dtype(input_value_info.type.tensor_type.elem_type)
+            value = nparray(v, np_dtype)
+            new_initializers.append(onnx.numpy_helper.from_array(value, k))
+        g.initializer.extend(new_initializers)
+        new_inputs = [i for i in g.input if i.name not in default_inputs]
+        g.ClearField("input")
+        g.input.extend(new_inputs)
+        return g

    def post_processing(self, **kwargs):
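
The key detail in bert_tokenizer above is that the serialized vocabulary must be ordered by token id, so that a token's line number in the attribute string equals its id. A toy sketch with a made-up vocabulary:

from collections import OrderedDict

hf_vocab = {'hello': 2, '[PAD]': 0, '[UNK]': 1}   # HF stores token -> id
ordered = OrderedDict(sorted(hf_vocab.items(), key=lambda item: int(item[1])))
vocab_attr = '\n'.join(ordered.keys())
assert vocab_attr.splitlines()[2] == 'hello'      # line index == token id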
2 changes: 1 addition & 1 deletion onnxruntime_extensions/_ocos.py
@@ -17,7 +17,7 @@
def get_library_path():
    """
    The custom operator library binary path
-    :return: A string of the this library path.
+    :return: A string of this library path.
    """
    mod = sys.modules['onnxruntime_extensions._extensions_pydll']
    return mod.__file__
34 changes: 23 additions & 11 deletions onnxruntime_extensions/_ortapi2.py
@@ -11,11 +11,11 @@
from ._ocos import default_opset_domain, get_library_path  # noqa
from ._cuops import onnx, onnx_proto, SingleOpGraph


_ort_check_passed = False
try:
    from packaging import version as _ver
    import onnxruntime as _ort

    if _ver.parse(_ort.__version__) >= _ver.parse("1.10.0"):
        _ort_check_passed = True
except ImportError:
@@ -37,6 +37,7 @@ def get_opset_version_from_ort():
"1.12": 17,
"1.13": 17,
"1.14": 18,
"1.15": 18
}

ort_ver_string = '.'.join(_ort.__version__.split('.')[0:2])
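
The lookup key keeps only the major.minor portion of the installed onnxruntime version, for example:

ver_key = '.'.join('1.15.1'.split('.')[0:2])   # -> '1.15', mapped to opset 18 by the new entry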
@@ -59,14 +60,21 @@ def make_onnx_model(graph, opset_version=0, extra_domain=default_opset_domain(),


class OrtPyFunction:
+    """
+    OrtPyFunction is a convenience class that serves as a wrapper around the ONNXRuntime InferenceSession,
+    equipped with registered onnxruntime-extensions. This allows execution of an ONNX model as if it were a
+    standard Python function. The order of the function arguments correlates directly with
+    the sequence of the input/output in the ONNX graph.
+    """
+
    def get_ort_session_options(self):
        so = _ort.SessionOptions()
        for k, v in self.extra_session_options.items():
            so.__setattr__(k, v)
        so.register_custom_ops_library(get_library_path())
        return so

-    def __init__(self, cpu_only=None):
+    def __init__(self, path_or_model=None, cpu_only=None):
        self._onnx_model = None
        self.ort_session = None
        self.default_inputs = {}
@@ -75,6 +83,14 @@ def __init__(self, cpu_only=None):
        if _ort.get_device() == 'GPU':
            self.execution_providers = ['CUDAExecutionProvider']
        self.extra_session_options = {}
+        mpath = None
+        if isinstance(path_or_model, str):
+            oxml = onnx.load_model(path_or_model)
+            mpath = path_or_model
+        else:
+            oxml = path_or_model
+        if path_or_model is not None:
+            self._bind(oxml, mpath)

    def create_from_customop(self, op_type, *args, **kwargs):
        graph = SingleOpGraph.build_graph(op_type, *args, **kwargs)
@@ -130,17 +146,13 @@ def _get_kwarg_device(kwargs):

    @classmethod
    def from_customop(cls, op_type, *args, **kwargs):
-        return cls(cls._get_kwarg_device(kwargs)).create_from_customop(op_type, *args, **kwargs)
+        return (cls(cpu_only=cls._get_kwarg_device(kwargs))
+                .create_from_customop(op_type, *args, **kwargs))

    @classmethod
    def from_model(cls, path_or_model, *args, **kwargs):
-        mpath = None
-        if isinstance(path_or_model, str):
-            oxml = onnx.load_model(path_or_model)
-            mpath = path_or_model
-        else:
-            oxml = path_or_model
-        return cls(cls._get_kwarg_device(kwargs))._bind(oxml, mpath)
+        fn = cls(path_or_model, cls._get_kwarg_device(kwargs))
+        return fn

    def _argument_map(self, *args, **kwargs):
        idx = 0
@@ -169,7 +181,7 @@ def __call__(self, *args, **kwargs):


def optimize_model(model_or_file, output_file):
-    sess_options = OrtPyFunction.get_ort_session_options()
+    sess_options = OrtPyFunction().get_ort_session_options()
    sess_options.graph_optimization_level = _ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
    sess_options.optimized_model_filepath = output_file
    _ort.InferenceSession(model_or_file if isinstance(model_or_file, str)
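
Taken together, the _ortapi2.py changes let a model be wrapped in one step and also fix optimize_model, which previously called get_ort_session_options on the class rather than an instance. A hedged sketch; the model file name is hypothetical:

from onnxruntime_extensions import OrtPyFunction, optimize_model

fn = OrtPyFunction("model_with_custom_ops.onnx")   # new path_or_model argument
# positional call arguments follow the ONNX graph's input order, per the new docstring

optimize_model("model_with_custom_ops.onnx", "model_with_custom_ops.opt.onnx")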
2 changes: 1 addition & 1 deletion onnxruntime_extensions/_torch_cvt.py
@@ -241,6 +241,6 @@ def post_processing(self, **kwargs):
        inputs = [onnx.helper.make_tensor_value_info("sequences", onnx.TensorProto.INT32, ['N', 'seq_len', 'ids'])]
        del g.input[:]
        g.input.extend(inputs)
-        g.output[0].type.CopyFrom(onnx.helper.make_tensor_type_proto(onnx.TensorProto.STRING, ['N', 'seq_len', 'text']))
+        g.output[0].type.CopyFrom(onnx.helper.make_tensor_type_proto(onnx.TensorProto.STRING, ['N', 'text']))

        return make_onnx_model(g, opset_version=self.opset_version)
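
For reference, the symbolic dimensions in these onnx helpers are free-form strings, so dropping 'seq_len' simply declares one decoded string per batch row:

import onnx
tp = onnx.helper.make_tensor_type_proto(onnx.TensorProto.STRING, ['N', 'text'])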
3 changes: 2 additions & 1 deletion test/test_audio_codec.py
@@ -50,7 +50,8 @@ def test_mp3_decoder(self):
    def test_decoder_resampling(self):
        test_file = util.get_test_data_file('data', 'jfk.flac')
        blob = bytearray(util.read_file(test_file, mode='rb'))
-        decoder = PyOrtFunction.from_customop('AudioDecoder', cpu_only=True, downsampling_rate=16000, stereo_to_mono=1)
+        decoder = PyOrtFunction.from_customop(
+            'AudioDecoder', cpu_only=True, downsampling_rate=16000, stereo_to_mono=1)
        pcm_tensor = decoder(np.expand_dims(np.asarray(blob), axis=(0,)))
        self.assertEqual(pcm_tensor.shape, (1, 176000))

1 change: 0 additions & 1 deletion test/test_audio_signal.py
@@ -8,7 +8,6 @@
import onnx
from onnx import onnx_pb as onnx_proto

-
_is_torch_available = False
try:
    import torch
