Skip to content

Commit d7bfe90

Browse files
authored
Add ensure_ascii option (#1689)
1 parent 52e9a53 commit d7bfe90

File tree

7 files changed

+164
-11
lines changed

7 files changed

+164
-11
lines changed

python/pydantic_core/_pydantic_core.pyi

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ class SchemaSerializer:
344344
value: Any,
345345
*,
346346
indent: int | None = None,
347+
ensure_ascii: bool = False,
347348
include: _IncEx | None = None,
348349
exclude: _IncEx | None = None,
349350
by_alias: bool | None = None,
@@ -362,6 +363,8 @@ class SchemaSerializer:
362363
Arguments:
363364
value: The Python object to serialize.
364365
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
366+
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
367+
If `False` (the default), these characters will be output as-is.
365368
include: A set of fields to include, if `None` all fields are included.
366369
exclude: A set of fields to exclude, if `None` no fields are excluded.
367370
by_alias: Whether to use the alias names of fields.
@@ -389,6 +392,7 @@ def to_json(
389392
value: Any,
390393
*,
391394
indent: int | None = None,
395+
ensure_ascii: bool = False,
392396
include: _IncEx | None = None,
393397
exclude: _IncEx | None = None,
394398
# Note: In Pydantic 2.11, the default value of `by_alias` on `SchemaSerializer` was changed from `True` to `None`,
@@ -413,6 +417,8 @@ def to_json(
413417
Arguments:
414418
value: The Python object to serialize.
415419
indent: If `None`, the JSON will be compact, otherwise it will be pretty-printed with the indent provided.
420+
ensure_ascii: If `True`, the output is guaranteed to have all incoming non-ASCII characters escaped.
421+
If `False` (the default), these characters will be output as-is.
416422
include: A set of fields to include, if `None` all fields are included.
417423
exclude: A set of fields to exclude, if `None` no fields are excluded.
418424
by_alias: Whether to use the alias names of fields.

src/serializers/mod.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,15 @@ impl SchemaSerializer {
155155
}
156156

157157
#[allow(clippy::too_many_arguments)]
158-
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = None,
158+
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, by_alias = None,
159159
exclude_unset = false, exclude_defaults = false, exclude_none = false, round_trip = false, warnings = WarningsArg::Bool(true),
160160
fallback = None, serialize_as_any = false, context = None))]
161161
pub fn to_json(
162162
&self,
163163
py: Python,
164164
value: &Bound<'_, PyAny>,
165165
indent: Option<usize>,
166+
ensure_ascii: Option<bool>,
166167
include: Option<&Bound<'_, PyAny>>,
167168
exclude: Option<&Bound<'_, PyAny>>,
168169
by_alias: Option<bool>,
@@ -203,6 +204,7 @@ impl SchemaSerializer {
203204
exclude,
204205
&extra,
205206
indent,
207+
ensure_ascii.unwrap_or(false),
206208
self.expected_json_size.load(Ordering::Relaxed),
207209
)?;
208210

@@ -238,14 +240,15 @@ impl SchemaSerializer {
238240

239241
#[allow(clippy::too_many_arguments)]
240242
#[pyfunction]
241-
#[pyo3(signature = (value, *, indent = None, include = None, exclude = None, by_alias = true,
243+
#[pyo3(signature = (value, *, indent = None, ensure_ascii = false, include = None, exclude = None, by_alias = true,
242244
exclude_none = false, round_trip = false, timedelta_mode = "iso8601", bytes_mode = "utf8",
243245
inf_nan_mode = "constants", serialize_unknown = false, fallback = None, serialize_as_any = false,
244246
context = None))]
245247
pub fn to_json(
246248
py: Python,
247249
value: &Bound<'_, PyAny>,
248250
indent: Option<usize>,
251+
ensure_ascii: Option<bool>,
249252
include: Option<&Bound<'_, PyAny>>,
250253
exclude: Option<&Bound<'_, PyAny>>,
251254
by_alias: bool,
@@ -271,7 +274,16 @@ pub fn to_json(
271274
serialize_as_any,
272275
context,
273276
);
274-
let bytes = to_json_bytes(value, AnySerializer::get(), include, exclude, &extra, indent, 1024)?;
277+
let bytes = to_json_bytes(
278+
value,
279+
AnySerializer::get(),
280+
include,
281+
exclude,
282+
&extra,
283+
indent,
284+
ensure_ascii.unwrap_or(false),
285+
1024,
286+
)?;
275287
state.final_check(py)?;
276288
let py_bytes = PyBytes::new(py, &bytes);
277289
Ok(py_bytes.into())

src/serializers/shared.rs

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::borrow::Cow;
22
use std::fmt::Debug;
3+
use std::io::{self, Write};
34

45
use pyo3::exceptions::PyTypeError;
56
use pyo3::prelude::*;
@@ -9,7 +10,7 @@ use pyo3::{intern, PyTraverseError, PyVisit};
910

1011
use enum_dispatch::enum_dispatch;
1112
use serde::Serialize;
12-
use serde_json::ser::PrettyFormatter;
13+
use serde_json::ser::{Formatter, PrettyFormatter};
1314

1415
use crate::build_tools::py_schema_err;
1516
use crate::build_tools::py_schema_error_type;
@@ -432,6 +433,87 @@ impl Serialize for PydanticSerializer<'_> {
432433
}
433434
}
434435

436+
struct EscapeNonAsciiFormatter;
437+
438+
impl Formatter for EscapeNonAsciiFormatter {
439+
fn write_string_fragment<W: ?Sized + Write>(&mut self, writer: &mut W, fragment: &str) -> io::Result<()> {
440+
let mut input = fragment;
441+
442+
while let Some((idx, non_ascii_char)) = input.chars().enumerate().find(|(_, c)| !c.is_ascii()) {
443+
if idx > 0 {
444+
// write all ascii characters before the non-ascii one
445+
let ascii_run = &input[..idx];
446+
writer.write_all(ascii_run.as_bytes()).unwrap();
447+
}
448+
449+
let codepoint = non_ascii_char as u32;
450+
if codepoint < 0xFFFF {
451+
// write basic codepoint as single escape
452+
write!(writer, "\\u{codepoint:04x}").unwrap();
453+
} else {
454+
// encode extended plane character as utf16 pair
455+
for escape in non_ascii_char.encode_utf16(&mut [0; 2]) {
456+
write!(writer, "\\u{escape:04x}").unwrap();
457+
}
458+
}
459+
460+
input = &input[(idx + non_ascii_char.len_utf8())..];
461+
}
462+
463+
// write any ascii trailer
464+
writer.write_all(input.as_bytes())?;
465+
Ok(())
466+
}
467+
}
468+
469+
struct EscapeNonAsciiPrettyFormatter<'a> {
470+
pretty: PrettyFormatter<'a>,
471+
escape_non_ascii: EscapeNonAsciiFormatter,
472+
}
473+
474+
impl<'a> EscapeNonAsciiPrettyFormatter<'a> {
475+
pub fn with_indent(indent: &'a [u8]) -> Self {
476+
Self {
477+
pretty: PrettyFormatter::with_indent(indent),
478+
escape_non_ascii: EscapeNonAsciiFormatter,
479+
}
480+
}
481+
}
482+
483+
// Generates a method that forwards to the same-named method on one of
// `self`'s fields.
//
// `defer!(field, method)` forwards a call that only takes the writer;
// `defer!(field, method, Type)` forwards a call that takes the writer plus one
// extra argument of type `Type`.
macro_rules! defer {
    ($field:ident, $method:ident) => {
        fn $method<W: ?Sized + io::Write>(&mut self, writer: &mut W) -> io::Result<()> {
            self.$field.$method(writer)
        }
    };
    ($field:ident, $method:ident, $arg_ty:ty) => {
        fn $method<W: ?Sized + io::Write>(&mut self, writer: &mut W, arg: $arg_ty) -> io::Result<()> {
            self.$field.$method(writer, arg)
        }
    };
}
501+
502+
#[allow(clippy::needless_lifetimes)]
503+
impl Formatter for EscapeNonAsciiPrettyFormatter<'_> {
504+
defer!(escape_non_ascii, write_string_fragment, &str);
505+
defer!(pretty, begin_array);
506+
defer!(pretty, end_array);
507+
defer!(pretty, begin_array_value, bool);
508+
defer!(pretty, end_array_value);
509+
defer!(pretty, begin_object);
510+
defer!(pretty, end_object);
511+
defer!(pretty, begin_object_key, bool);
512+
defer!(pretty, end_object_key);
513+
defer!(pretty, begin_object_value);
514+
defer!(pretty, end_object_value);
515+
}
516+
435517
#[allow(clippy::too_many_arguments)]
436518
pub(crate) fn to_json_bytes(
437519
value: &Bound<'_, PyAny>,
@@ -440,25 +522,40 @@ pub(crate) fn to_json_bytes(
440522
exclude: Option<&Bound<'_, PyAny>>,
441523
extra: &Extra,
442524
indent: Option<usize>,
525+
ensure_ascii: bool,
443526
expected_json_size: usize,
444527
) -> PyResult<Vec<u8>> {
445528
let serializer = PydanticSerializer::new(value, serializer, include, exclude, extra);
446529

447530
let writer: Vec<u8> = Vec::with_capacity(expected_json_size);
448-
let bytes = match indent {
449-
Some(indent) => {
531+
532+
let bytes = match (indent, ensure_ascii) {
533+
(Some(indent), true) => {
534+
let indent = vec![b' '; indent];
535+
let formatter = EscapeNonAsciiPrettyFormatter::with_indent(&indent);
536+
let mut ser = PythonSerializer::with_formatter(writer, formatter);
537+
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
538+
ser.into_inner()
539+
}
540+
(Some(indent), false) => {
450541
let indent = vec![b' '; indent];
451542
let formatter = PrettyFormatter::with_indent(&indent);
452543
let mut ser = PythonSerializer::with_formatter(writer, formatter);
453544
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
454545
ser.into_inner()
455546
}
456-
None => {
547+
(None, true) => {
548+
let mut ser = PythonSerializer::with_formatter(writer, EscapeNonAsciiFormatter);
549+
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
550+
ser.into_inner()
551+
}
552+
(None, false) => {
457553
let mut ser = PythonSerializer::new(writer);
458554
serializer.serialize(&mut ser).map_err(se_err_py_err)?;
459555
ser.into_inner()
460556
}
461557
};
558+
462559
Ok(bytes)
463560
}
464561

src/serializers/type_serializers/json.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ impl TypeSerializer for JsonSerializer {
5454
extra: &Extra,
5555
) -> PyResult<PyObject> {
5656
if extra.round_trip {
57-
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0)?;
57+
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)?;
5858
let py = value.py();
5959
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
6060
Ok(PyString::new(py, s).into())
@@ -65,7 +65,7 @@ impl TypeSerializer for JsonSerializer {
6565

6666
fn json_key<'a>(&self, key: &'a Bound<'_, PyAny>, extra: &Extra) -> PyResult<Cow<'a, str>> {
6767
if extra.round_trip {
68-
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, 0)?;
68+
let bytes = to_json_bytes(key, &self.serializer, None, None, extra, None, false, 0)?;
6969
let py = key.py();
7070
let s = from_utf8(&bytes).map_err(|e| utf8_py_error(py, e, &bytes))?;
7171
Ok(Cow::Owned(s.to_string()))
@@ -83,8 +83,8 @@ impl TypeSerializer for JsonSerializer {
8383
extra: &Extra,
8484
) -> Result<S::Ok, S::Error> {
8585
if extra.round_trip {
86-
let bytes =
87-
to_json_bytes(value, &self.serializer, include, exclude, extra, None, 0).map_err(py_err_se_err)?;
86+
let bytes = to_json_bytes(value, &self.serializer, include, exclude, extra, None, false, 0)
87+
.map_err(py_err_se_err)?;
8888
match from_utf8(&bytes) {
8989
Ok(s) => serializer.serialize_str(s),
9090
Err(e) => Err(Error::custom(e.to_string())),

tests/serializers/test_string.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,38 @@ def test_str():
2323
assert json.loads(json_emoji) == 'emoji 💩'
2424

2525

26+
# Cases adapted from CPython's own JSON test-suite:
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_encode_basestring_ascii.py
# - https://github.com/python/cpython/blob/d87e7f35/Lib/test/test_json/test_unicode.py
@pytest.mark.parametrize(
    ['input', 'expected'],
    [
        (
            '/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?',
            '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"',
        ),
        ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
        ('controls', '"controls"'),
        ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
        (
            '{"object with 1 member":["array with 1 element"]}',
            '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"',
        ),
        (' s p a c e d ', '" s p a c e d "'),
        ('\U0001d120', '"\\ud834\\udd20"'),
        ('\u03b1\u03a9', '"\\u03b1\\u03a9"'),
        ("`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
        ('\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
        ('\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
        ('\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}', '"\\u03b1\\u03a9"'),
        ('\U0001d120', '"\\ud834\\udd20"'),
    ],
)
def test_str_ensure_ascii(input: str, expected: str) -> None:
    """Serializing a `str` with `ensure_ascii=True` yields the expected ASCII-only JSON text."""
    serializer = SchemaSerializer(core_schema.str_schema())
    output = serializer.to_json(input, ensure_ascii=True)
    assert output.decode('utf-8') == expected
56+
57+
2658
def test_huge_str():
2759
v = SchemaSerializer(core_schema.int_schema())
2860
msg = r"Expected `int` - serialized value may not be as expected \[input_value='123456789012345678901234...89012345678901234567890', input_type=str\]"

tests/test.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ a = A()
9393
py,
9494
&a,
9595
None,
96+
Some(false),
9697
None,
9798
None,
9899
Some(true),
@@ -201,6 +202,7 @@ dump_json_input_2 = {'a': 'something'}
201202
py,
202203
&dump_json_input_1,
203204
None,
205+
Some(false),
204206
None,
205207
None,
206208
Some(false),
@@ -222,6 +224,7 @@ dump_json_input_2 = {'a': 'something'}
222224
py,
223225
&dump_json_input_2,
224226
None,
227+
Some(false),
225228
None,
226229
None,
227230
Some(false),

tests/test_json.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ def test_to_json():
218218
assert to_json([1, 2]) == b'[1,2]'
219219
assert to_json([1, 2], indent=2) == b'[\n 1,\n 2\n]'
220220
assert to_json([1, b'x']) == b'[1,"x"]'
221+
assert to_json(['à', 'é']).decode('utf-8') == '["à","é"]'
222+
assert to_json(['à', 'é'], indent=2).decode('utf-8') == '[\n "à",\n "é"\n]'
223+
assert to_json(['à', 'é'], indent=2, ensure_ascii=True).decode('utf-8') == '[\n "\\u00e0",\n "\\u00e9"\n]'
221224

222225
# kwargs required
223226
with pytest.raises(TypeError, match=r'to_json\(\) takes 1 positional arguments but 2 were given'):

0 commit comments

Comments
 (0)