Nested Binary fields improvements (#4)
* Make benchmarks go through the base64 validator/serializer and update benchmark values

* Improve nested Binary support

---------

Signed-off-by: Federico Busetti <729029+febus982@users.noreply.github.com>
febus982 authored Sep 28, 2024
1 parent 9720963 commit ec3079e
Showing 9 changed files with 188 additions and 56 deletions.
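In short, the `Binary` field type gains a validator, so it now accepts base64-encoded strings as well as raw bytes and always serializes back to a base64 string; that is what makes it usable in nested models and in aliased `data_base64` fields. A minimal sketch of the nested case (the `Attachment`/`Order` models are hypothetical, only `Binary` comes from the package, and pydantic v2 semantics are assumed):

```python
from pydantic import BaseModel

from cloudevents_pydantic.events.field_types import Binary


class Attachment(BaseModel):
    payload: Binary  # accepts raw bytes or a base64-encoded string


class Order(BaseModel):
    attachment: Attachment


# Both inputs validate to the same bytes value...
from_bytes = Order(attachment={"payload": b"test"})
from_text = Order(attachment={"payload": "dGVzdA=="})
assert from_bytes.attachment.payload == from_text.attachment.payload == b"test"

# ...and the nested Binary field serializes back to a base64 string.
assert from_bytes.model_dump() == {"attachment": {"payload": "dGVzdA=="}}
```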
15 changes: 7 additions & 8 deletions README.md
@@ -69,17 +69,16 @@ some performance issue in the official serialization using pydantic)
These results come from a Macbook Pro M3 Max on python 3.12. Feel free to run the `benchmark.py`
script yourself.

```shell
```
Timings for HTTP JSON deserialization:
This package: 2.5353065830422565
Official SDK with pydantic model: 12.80780174996471
Official SDK with http model: 11.474249749968294
This package: 3.0855846670019673
Official SDK with pydantic model: 15.35431600001175
Official SDK with http model: 13.728038166998886
Timings for HTTP JSON serialization:
This package: 3.4850796660175547
Official SDK with pydantic model: 39.037468083028216
Official SDK with http model: 7.681282749981619

This package: 4.292417042001034
Official SDK with pydantic model: 44.50933354199515
Official SDK with http model: 8.929204874992138
```


26 changes: 17 additions & 9 deletions benchmark.py
@@ -20,6 +20,7 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER =
# DEALINGS IN THE SOFTWARE. =
# ==============================================================================
import base64
import json
from timeit import timeit

@@ -36,11 +37,18 @@
from cloudevents.pydantic import (
from_json as from_json_pydantic,
)
from pydantic import Field

from cloudevents_pydantic.bindings.http import HTTPHandler
from cloudevents_pydantic.events import CloudEvent
from cloudevents_pydantic.events.field_types import Binary

valid_json = '{"data":null,"source":"https://example.com/event-producer","id":"b96267e2-87be-4f7a-b87c-82f64360d954","type":"com.example.string","specversion":"1.0","time":"2022-07-16T12:03:20.519216+04:00","subject":null,"datacontenttype":null,"dataschema":null}'
valid_json = '{"data_base64":"dGVzdA==","source":"https://example.com/event-producer","id":"b96267e2-87be-4f7a-b87c-82f64360d954","type":"com.example.string","specversion":"1.0","time":"2022-07-16T12:03:20.519216+04:00","subject":null,"datacontenttype":null,"dataschema":null}'
test_iterations = 1000000


class BinaryEvent(CloudEvent):
data: Binary = Field(Binary, alias="data_base64")


def json_deserialization():
@@ -56,19 +64,19 @@ def json_deserialization_official_sdk_cloudevent():


print("Timings for HTTP JSON deserialization:")
print("This package: " + str(timeit(json_deserialization)))
print("This package: " + str(timeit(json_deserialization, number=test_iterations)))
print(
"Official SDK with pydantic model: "
+ str(timeit(json_deserialization_official_sdk_pydantic))
+ str(timeit(json_deserialization_official_sdk_pydantic, number=test_iterations))
)
print(
"Official SDK with http model: "
+ str(timeit(json_deserialization_official_sdk_cloudevent))
+ str(timeit(json_deserialization_official_sdk_cloudevent, number=test_iterations))
)

attributes = json.loads(valid_json)
data = attributes["data"]
del attributes["data"]
data = base64.b64decode(attributes["data_base64"])
del attributes["data_base64"]
event = CloudEvent(**attributes, data=data)
http_handler = HTTPHandler()
official_pydantic_event = PydanticOfficialCloudEvent.create(
@@ -91,12 +99,12 @@ def json_serialization_official_sdk_cloudevent():

print("")
print("Timings for HTTP JSON serialization:")
print("This package: " + str(timeit(json_serialization)))
print("This package: " + str(timeit(json_serialization, number=test_iterations)))
print(
"Official SDK with pydantic model: "
+ str(timeit(json_serialization_official_sdk_pydantic))
+ str(timeit(json_serialization_official_sdk_pydantic, number=test_iterations))
)
print(
"Official SDK with http model: "
+ str(timeit(json_serialization_official_sdk_cloudevent))
+ str(timeit(json_serialization_official_sdk_cloudevent, number=test_iterations))
)
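The `json_serialization`/`json_deserialization` helpers being timed here are defined in hunks not shown above. A self-contained stand-in under the same assumptions (same payload, same explicit iteration count, but timing pydantic's own JSON dump instead of the package's HTTP handler) could look like this:

```python
import base64
import json
from timeit import timeit

from cloudevents_pydantic.events import CloudEvent

valid_json = '{"data_base64":"dGVzdA==","source":"https://example.com/event-producer","id":"b96267e2-87be-4f7a-b87c-82f64360d954","type":"com.example.string","specversion":"1.0","time":"2022-07-16T12:03:20.519216+04:00","subject":null,"datacontenttype":null,"dataschema":null}'
test_iterations = 1000000

# Build the event the same way the benchmark does: decode the base64 payload
# and pass the raw bytes as `data`.
attributes = json.loads(valid_json)
data = base64.b64decode(attributes["data_base64"])
del attributes["data_base64"]
event = CloudEvent(**attributes, data=data)

# Total seconds for `test_iterations` calls (the README figures use the same unit).
print("JSON serialization: " + str(timeit(event.model_dump_json, number=test_iterations)))
```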
41 changes: 21 additions & 20 deletions cloudevents_pydantic/events/_event.py
@@ -22,8 +22,7 @@
# ==============================================================================
import base64
import datetime
import typing
from typing import Union
from typing import Any, Dict, Optional, Union

from cloudevents.pydantic.fields_docs import FIELD_DESCRIPTIONS
from pydantic import (
@@ -33,14 +32,18 @@
model_serializer,
model_validator,
)
from pydantic.fields import FieldInfo
from pydantic_core.core_schema import ValidationInfo
from ulid import ULID

from .field_types import URI, DateTime, SpecVersion, String, URIReference
from .field_types import URI, Binary, DateTime, SpecVersion, String, URIReference

DEFAULT_SPECVERSION = SpecVersion.v1_0


_binary_field_metadata = FieldInfo.from_annotation(Binary).metadata


class CloudEvent(BaseModel): # type: ignore
"""
A Python-friendly CloudEvent representation backed by Pydantic-modeled fields.
@@ -49,9 +52,9 @@ class CloudEvent(BaseModel):  # type: ignore
@classmethod
def event_factory(
cls,
id: typing.Optional[str] = None,
specversion: typing.Optional[SpecVersion] = None,
time: typing.Optional[Union[datetime.datetime, str]] = None,
id: Optional[str] = None,
specversion: Optional[SpecVersion] = None,
time: Optional[Union[datetime.datetime, str]] = None,
**kwargs,
) -> "CloudEvent":
"""
@@ -74,7 +77,7 @@ def event_factory(
**kwargs,
)

data: typing.Optional[typing.Any] = Field(
data: Any = Field(
title=FIELD_DESCRIPTIONS["data"].get("title"),
description=FIELD_DESCRIPTIONS["data"].get("description"),
examples=[FIELD_DESCRIPTIONS["data"].get("example")],
@@ -104,25 +107,25 @@ def event_factory(
)

# Optional fields
time: typing.Optional[DateTime] = Field(
time: Optional[DateTime] = Field(
title=FIELD_DESCRIPTIONS["time"].get("title"),
description=FIELD_DESCRIPTIONS["time"].get("description"),
examples=[FIELD_DESCRIPTIONS["time"].get("example")],
default=None,
)
subject: typing.Optional[String] = Field(
subject: Optional[String] = Field(
title=FIELD_DESCRIPTIONS["subject"].get("title"),
description=FIELD_DESCRIPTIONS["subject"].get("description"),
examples=[FIELD_DESCRIPTIONS["subject"].get("example")],
default=None,
)
datacontenttype: typing.Optional[String] = Field(
datacontenttype: Optional[String] = Field(
title=FIELD_DESCRIPTIONS["datacontenttype"].get("title"),
description=FIELD_DESCRIPTIONS["datacontenttype"].get("description"),
examples=[FIELD_DESCRIPTIONS["datacontenttype"].get("example")],
default=None,
)
dataschema: typing.Optional[URI] = Field(
dataschema: Optional[URI] = Field(
title=FIELD_DESCRIPTIONS["dataschema"].get("title"),
description=FIELD_DESCRIPTIONS["dataschema"].get("description"),
examples=[FIELD_DESCRIPTIONS["dataschema"].get("example")],
@@ -154,7 +157,7 @@ def event_factory(
"""

@model_serializer(when_used="json")
def base64_json_serializer(self) -> typing.Dict[str, typing.Any]:
def base64_json_serializer(self) -> Dict[str, Any]:
"""Takes care of handling binary data serialization into `data_base64`
attribute.
@@ -164,20 +167,18 @@ def base64_json_serializer(self) -> typing.Dict[str, typing.Any]:
data handled.
"""
model_dict = self.model_dump() # type: ignore

if isinstance(self.data, (bytes, bytearray, memoryview)):
model_dict["data_base64"] = (
base64.b64encode(self.data)
if isinstance(self.data, (bytes, bytearray, memoryview))
else self.data
)
if _binary_field_metadata == self.model_fields["data"].metadata:
model_dict["data_base64"] = model_dict["data"]
del model_dict["data"]
elif isinstance(model_dict["data"], (bytes, bytearray, memoryview)):
model_dict["data_base64"] = base64.b64encode(model_dict["data"])
del model_dict["data"]

return model_dict

@model_validator(mode="before")
@classmethod
def base64_data_parser(cls, data: typing.Any, info: ValidationInfo) -> typing.Any:
def base64_json_validator(cls, data: dict, info: ValidationInfo) -> Any:
"""Takes care of handling binary data deserialization from `data_base64`
attribute.
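The net effect of `base64_json_serializer` is that binary `data` never reaches the JSON output as a raw `data` attribute; it is moved to `data_base64` instead. A hedged round-trip sketch (the `type` and `source` values are placeholders; `event_factory` fills `id`, `specversion` and `time` as shown above):

```python
import json

from cloudevents_pydantic.events import CloudEvent

event = CloudEvent.event_factory(
    type="com.example.binary",
    source="https://example.com/event-producer",
    data=b"test",
)

payload = json.loads(event.model_dump_json())

# The `when_used="json"` model serializer replaces `data` with `data_base64`,
# so the bytes travel as base64 text ("dGVzdA==" for b"test").
assert "data_base64" in payload
assert "data" not in payload
```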
22 changes: 18 additions & 4 deletions cloudevents_pydantic/events/field_types/_canonic_types.py
@@ -23,6 +23,7 @@
import base64
from datetime import datetime
from enum import Enum
from typing import Annotated, Union
from urllib.parse import ParseResult, urlparse, urlunparse

from annotated_types import Ge, Le
@@ -31,7 +32,6 @@
PlainValidator,
StringConstraints,
)
from typing_extensions import Annotated


def bool_serializer(value: bool) -> str:
@@ -42,6 +42,16 @@ def binary_serializer(value: bytes) -> str:
return base64.b64encode(value).decode()


def binary_validator(value: Union[str, bytes, bytearray, memoryview]) -> bytes:
if isinstance(value, (bytes, bytearray, memoryview)):
return value

if isinstance(value, str):
return base64.b64decode(value, validate=True)

raise ValueError(f"Unsupported value type: {type(value)} - {value}")


def url_serializer(value: ParseResult) -> str:
return urlunparse(value)

@@ -98,10 +108,14 @@ def generic_uri_validator(value: str) -> ParseResult:
Sequence of allowable Unicode characters
"""

# bytearray is coerced to bytes, memoryview is not supported
Binary = Annotated[bytes, PlainSerializer(binary_serializer)]
Binary = Annotated[
bytes,
PlainValidator(binary_validator),
PlainSerializer(binary_serializer),
]
"""
Sequence of bytes supporting base64 serialization/deserialization
Sequence of bytes that accepts both bytes and base64 encoded strings as input
and is serialized to a base64 encoded string.
"""

URI = Annotated[
Expand Down
4 changes: 2 additions & 2 deletions cloudevents_pydantic/formats/json.py
@@ -20,10 +20,10 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER =
# DEALINGS IN THE SOFTWARE. =
# ==============================================================================
from typing import List, Type
from typing import List, Type, overload

from pydantic import TypeAdapter
from typing_extensions import TypeVar, overload
from typing_extensions import TypeVar

from ..events import CloudEvent

2 changes: 1 addition & 1 deletion docs/event_class.md
@@ -105,7 +105,7 @@ When you create event types in your app you will want to make sure to follow the
will be compliant with the [CloudEvents spec](https://github.com/cloudevents/spec/tree/main).

```python
from typing_extensions import TypedDict, Literal
from typing import TypedDict, Literal
from cloudevents_pydantic.events import CloudEvent, field_types

class OrderCreatedData(TypedDict):
41 changes: 36 additions & 5 deletions tests/events/test_field_types_serialization.py
@@ -20,6 +20,8 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER =
# DEALINGS IN THE SOFTWARE. =
# ==============================================================================
from typing import Union

import pytest
from pydantic import BaseModel

@@ -45,18 +47,47 @@


@pytest.mark.parametrize(
["data", "expected_value"],
["data", "serialized_output"],
[
pytest.param("test", "dGVzdA==", id="string"),
pytest.param(b"test", "dGVzdA==", id="bytes"),
pytest.param(bytearray([2, 3, 5, 7]), "AgMFBw==", id="bytearray"),
pytest.param(b"\x02\x03\x05\x07", "AgMFBw==", id="bytearray"),
],
)
def test_binary_data_is_b64encoded(data, expected_value):
def test_binary_serialization(
data: Union[bytes, str],
serialized_output: str,
):
class BinaryModel(BaseModel):
value: Binary

assert BinaryModel(value=data).model_dump()["value"] == expected_value
model = BinaryModel(value=data)

serialized_value = model.model_dump()["value"]

assert serialized_value == serialized_output
assert isinstance(serialized_value, str)


@pytest.mark.parametrize(
["data", "serialized_output"],
[
pytest.param(b"test", "dGVzdA==", id="bytes"),
pytest.param(b"\x02\x03\x05\x07", "AgMFBw==", id="bytearray"),
],
)
def test_nested_binary_serialization(
data: Union[bytes, str],
serialized_output: str,
):
class BinaryModel(BaseModel):
value: Binary

model = BinaryModel(value=data)

serialized_value = model.model_dump()["value"]

assert serialized_value == serialized_output
assert isinstance(serialized_value, str)


@pytest.mark.parametrize(