3 changes: 3 additions & 0 deletions bindings/python/Cargo.toml
@@ -15,6 +15,9 @@ serde_json = "1.0"
libc = "0.2"
env_logger = "0.11"
pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] }
pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"] }
tokio = { version = "1.47.1", features = ["rt", "rt-multi-thread", "macros", "signal"] }
once_cell = "1.19.0"
numpy = "0.25"
ndarray = "0.16"
itertools = "0.14"
124 changes: 124 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -725,6 +725,130 @@ class Tokenizer:
"""
pass

def async_decode_batch(self, sequences, skip_special_tokens=True):
"""
Asynchronously decode a batch of ids back to their corresponding strings.

This is an async version of decode_batch that can be awaited in async Python code.

Args:
sequences (:obj:`List` of :obj:`List[int]`):
The batch of sequences we want to decode

skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether the special tokens should be removed from the decoded strings

Returns:
:obj:`List[str]`: A list of decoded strings
"""
pass

def async_encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
"""
Asynchronously encode the given input with character offsets.

This is an async version of encode that can be awaited in async Python code.

Example:
Here are some examples of the inputs that are accepted::

await async_encode("A single sequence")

Args:
sequence (:obj:`~tokenizers.InputSequence`):
The main input sequence we want to encode. This sequence can be either raw
text or pre-tokenized, according to the ``is_pretokenized`` argument:

- If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

pair (:obj:`~tokenizers.InputSequence`, `optional`):
An optional input sequence. The expected format is the same as for ``sequence``.

is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens

Returns:
:class:`~tokenizers.Encoding`: The encoded result

"""
pass

def async_encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Asynchronously encode the given batch of inputs with character offsets.

This is an async version of encode_batch that can be awaited in async Python code.

Example:
Here are some examples of the inputs that are accepted::

await async_encode_batch([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])

Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:

- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens

Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

"""
pass

def async_encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Asynchronously encode the given batch of inputs without tracking character offsets.

This is an async version of encode_batch_fast that can be awaited in async Python code.

Example:
Here are some examples of the inputs that are accepted::

await async_encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])

Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:

- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens

Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

"""
pass

def decode(self, ids, skip_special_tokens=True):
"""
Decode the given list of ids back to a string
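A minimal usage sketch of the async API documented in the stubs above, assuming a tokenizer pulled from the Hub with Tokenizer.from_pretrained (any local tokenizer.json loaded with Tokenizer.from_file works too); the model name is only a placeholder:

import asyncio

from tokenizers import Tokenizer


async def main() -> None:
    # Placeholder model; substitute Tokenizer.from_file("tokenizer.json") for a local file
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

    # Single sequence, awaited instead of blocking the event loop
    encoding = await tokenizer.async_encode("A single sequence")
    print(encoding.tokens)

    # Batch of mixed single/pair inputs, mirroring the docstring example
    encodings = await tokenizer.async_encode_batch(
        [
            "A single sequence",
            ("A tuple with a sequence", "And its pair"),
        ]
    )

    # Round-trip the ids back to strings
    decoded = await tokenizer.async_decode_batch(
        [e.ids for e in encodings], skip_special_tokens=True
    )
    print(decoded)


asyncio.run(main())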
@@ -259,6 +259,47 @@ def encode_batch(

return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

async def async_encode_batch(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""Asynchronously encode a batch (tracks character offsets).

Args:
inputs: A list of single or pair sequences to encode.
is_pretokenized: Whether inputs are already pre-tokenized.
add_special_tokens: Whether to add special tokens.

Returns:
A list of Encoding.
"""
if inputs is None:
raise ValueError("async_encode_batch: `inputs` can't be `None`")
# Exposed by the Rust bindings via pyo3_async_runtimes::tokio::future_into_py
return await self._tokenizer.async_encode_batch(inputs, is_pretokenized, add_special_tokens)

async def async_encode_batch_fast(
self,
inputs: List[EncodeInput],
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
"""Asynchronously encode a batch (no character offsets, faster).

Args:
inputs: A list of single or pair sequences to encode.
is_pretokenized: Whether inputs are already pre-tokenized.
add_special_tokens: Whether to add special tokens.

Returns:
A list of Encoding.
"""
if inputs is None:
raise ValueError("async_encode_batch_fast: `inputs` can't be `None`")
return await self._tokenizer.async_encode_batch_fast(inputs, is_pretokenized, add_special_tokens)

def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the given list of ids to a string sequence

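A sketch of the same wrappers reached through one of the high-level implementations built on this base class; BertWordPieceTokenizer and the vocab path are placeholders, and the fast variant follows the synchronous encode_batch_fast contract of skipping character-offset tracking:

import asyncio

from tokenizers import BertWordPieceTokenizer  # any implementation deriving from BaseTokenizer


async def compare(vocab_path: str) -> None:
    tokenizer = BertWordPieceTokenizer(vocab_path)  # placeholder local vocab file

    batch = ["A single sequence", ("A tuple with a sequence", "And its pair")]

    # Offset-tracking variant: each Encoding carries character spans in .offsets
    with_offsets = await tokenizer.async_encode_batch(batch)

    # Fast variant: same ids and tokens, but offset bookkeeping is skipped
    fast = await tokenizer.async_encode_batch_fast(batch)

    assert [e.ids for e in with_offsets] == [e.ids for e in fast]
    print(with_offsets[0].offsets, fast[0].offsets)


asyncio.run(compare("vocab.txt"))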
2 changes: 1 addition & 1 deletion bindings/python/pyproject.toml
@@ -31,7 +31,7 @@ Source = "https://github.com/huggingface/tokenizers"


[project.optional-dependencies]
testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"]
testing = ["pytest", "pytest-asyncio", "requests", "numpy", "datasets", "black==22.3", "ruff"]
docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
dev = ["tokenizers[testing]"]

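With pytest-asyncio added to the testing extras, the new coroutines can be covered by async tests; a minimal sketch, where the serialized tokenizer.json used by the fixture is a placeholder:

import pytest

from tokenizers import Tokenizer


@pytest.fixture
def tokenizer() -> Tokenizer:
    # Placeholder: any serialized tokenizer file works here
    return Tokenizer.from_file("tokenizer.json")


@pytest.mark.asyncio
async def test_async_encode_matches_sync(tokenizer: Tokenizer):
    text = "A single sequence"
    sync_encoding = tokenizer.encode(text)
    async_encoding = await tokenizer.async_encode(text)
    assert async_encoding.ids == sync_encoding.ids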
13 changes: 13 additions & 0 deletions bindings/python/src/lib.rs
@@ -5,6 +5,19 @@

extern crate tokenizers as tk;

use once_cell::sync::Lazy;
use std::sync::Arc;
use tokio::runtime::Runtime;

// We create a global runtime that will be initialized once when first needed
// This ensures we always have a runtime available for tokio::task::spawn_blocking
static TOKIO_RUNTIME: Lazy<Arc<Runtime>> = Lazy::new(|| {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.expect("Failed to create global Tokio runtime");
Arc::new(rt)
});
mod decoders;
mod encoding;
mod error;
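Because the awaitables returned by the bindings are backed by this shared Tokio runtime, several encode calls can be awaited concurrently from a single asyncio event loop; a sketch under the assumption that the heavy work runs off the loop (the model name is again a placeholder):

import asyncio

from tokenizers import Tokenizer


async def main() -> None:
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # placeholder model
    batches = [[f"sentence {i}"] * 64 for i in range(8)]

    # Each call returns an awaitable driven by the global Tokio runtime,
    # so the asyncio loop is free to interleave them.
    results = await asyncio.gather(
        *(tokenizer.async_encode_batch(batch) for batch in batches)
    )
    print(sum(len(r) for r in results), "encodings")


asyncio.run(main())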