Skip to content

[mypyc] Speed up generator allocation by using a per-type freelist #19316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions mypyc/codegen/emit.py
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,31 @@ def emit_gc_clear(self, target: str, rtype: RType) -> None:
else:
assert False, "emit_gc_clear() not implemented for %s" % repr(rtype)

def emit_reuse_clear(self, target: str, rtype: RType) -> None:
    """Emit C that resets one attribute before the object goes into the freelist.

    'target' is expected to be a C expression naming a struct member, for
    example 'self->x'.

    In contrast to emit_gc_clear(), the member is left holding the value that
    a freshly allocated object would have.
    """
    # Tuple items are addressed as '<target>.f<i>'; reset each one recursively.
    if isinstance(rtype, RTuple):
        for index, field_type in enumerate(rtype.types):
            self.emit_reuse_clear(f"{target}.f{index}", field_type)
        return

    # No reference to release: just store the undefined marker.
    if not rtype.is_refcounted:
        self.emit_line(f"{target} = {rtype.c_undefined};")
        return

    if isinstance(rtype, RPrimitive) and rtype.name == "builtins.int":
        reset_stmt = f"{target} = {self.c_undefined_value(rtype)};"
        # The emitted C copies the old value aside and resets the member
        # *before* dropping the reference (presumably so the member never
        # refers to an object that is being released -- TODO confirm).
        c_lines = (
            f"if (CPyTagged_CheckLong({target})) {{",
            f"CPyTagged __tmp = {target};",
            reset_stmt,
            "Py_XDECREF(CPyTagged_LongAsObject(__tmp));",
            "} else {",
            reset_stmt,
            "}",
        )
        for c_line in c_lines:
            self.emit_line(c_line)
        return

    # Every other refcounted type is handled by the regular GC clear logic.
    self.emit_gc_clear(target, rtype)

def emit_traceback(
self, source_path: str, module_name: str, traceback_entry: tuple[str, int]
) -> None:
Expand Down
98 changes: 86 additions & 12 deletions mypyc/codegen/emitclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,29 @@ def generate_class_type_decl(
)


def generate_class_reuse(
    cl: ClassIR, c_emitter: Emitter, external_emitter: Emitter, emitter: Emitter
) -> None:
    """Declare the per-class slot that holds a single freed instance for reuse.

    The slot acts as a free "list" of at most one object. It speeds up object
    allocation and freeing when there are many short-lived objects.

    A header declaration named '<class prefix>_free_instance' is registered in
    c_emitter's context and marked as needing export. 'external_emitter' and
    'emitter' are accepted but not used by this function.

    TODO: Generalize to support a free list with up to N objects.
    """
    assert cl.reuse_freed_instance

    # The free list implementation doesn't support class hierarchies
    assert cl.is_final_class or cl.children == []

    names = c_emitter.names
    slot_name = cl.name_prefix(names) + "_free_instance"
    struct = cl.struct_name(names)
    declaration = HeaderDeclaration(f"CPyThreadLocal {struct} *{slot_name};", needs_export=True)
    c_emitter.context.declarations[slot_name] = declaration


def generate_class(cl: ClassIR, module: str, emitter: Emitter) -> None:
"""Generate C code for a class.

Expand Down Expand Up @@ -557,7 +580,22 @@ def generate_setup_for_class(
emitter.emit_line("static PyObject *")
emitter.emit_line(f"{func_name}(PyTypeObject *type)")
emitter.emit_line("{")
emitter.emit_line(f"{cl.struct_name(emitter.names)} *self;")
struct_name = cl.struct_name(emitter.names)
emitter.emit_line(f"{struct_name} *self;")

prefix = cl.name_prefix(emitter.names)
if cl.reuse_freed_instance:
# Attempt to use a per-type free list first (a free "list" with up to one object only).
emitter.emit_line(f"if ({prefix}_free_instance != NULL) {{")
emitter.emit_line(f"self = {prefix}_free_instance;")
emitter.emit_line(f"{prefix}_free_instance = NULL;")
emitter.emit_line("Py_SET_REFCNT(self, 1);")
emitter.emit_line("PyObject_GC_Track(self);")
if defaults_fn is not None:
emit_attr_defaults_func_call(defaults_fn, "self", emitter)
emitter.emit_line("return (PyObject *)self;")
emitter.emit_line("}")

emitter.emit_line(f"self = ({cl.struct_name(emitter.names)} *)type->tp_alloc(type, 0);")
emitter.emit_line("if (self == NULL)")
emitter.emit_line(" return NULL;")
Expand All @@ -571,9 +609,7 @@ def generate_setup_for_class(
else:
emitter.emit_line(f"self->vtable = {vtable_name};")

for i in range(0, len(cl.bitmap_attrs), BITMAP_BITS):
field = emitter.bitmap_field(i)
emitter.emit_line(f"self->{field} = 0;")
emit_clear_bitmaps(cl, emitter)

if cl.has_method("__call__"):
name = cl.method_decl("__call__").cname(emitter.names)
Expand All @@ -590,19 +626,34 @@ def generate_setup_for_class(

# Initialize attributes to default values, if necessary
if defaults_fn is not None:
emitter.emit_lines(
"if ({}{}((PyObject *)self) == 0) {{".format(
NATIVE_PREFIX, defaults_fn.cname(emitter.names)
),
"Py_DECREF(self);",
"return NULL;",
"}",
)
emit_attr_defaults_func_call(defaults_fn, "self", emitter)

emitter.emit_line("return (PyObject *)self;")
emitter.emit_line("}")


def emit_clear_bitmaps(cl: ClassIR, emitter: Emitter) -> None:
    """Emit C code zeroing every bitmap field that tracks attribute assignment.

    One 'self-><field> = 0;' statement is emitted per group of BITMAP_BITS
    entries in cl.bitmap_attrs.
    """
    group_starts = range(0, len(cl.bitmap_attrs), BITMAP_BITS)
    for start in group_starts:
        bitmap_name = emitter.bitmap_field(start)
        emitter.emit_line(f"self->{bitmap_name} = 0;")


def emit_attr_defaults_func_call(defaults_fn: FuncIR, self_name: str, emitter: Emitter) -> None:
    """Emit C code to initialize attribute defaults by calling defaults_fn.

    Args:
        defaults_fn: Function whose native C name is called with the object
            as its only argument.
        self_name: C expression for the object being initialized. It is used
            both as the call argument and as the operand of the cleanup
            Py_DECREF, so the two always refer to the same object.
        emitter: Emitter that receives the generated lines.

    The emitted code releases the object and returns NULL from the enclosing
    C function when the call evaluates to 0 (i.e. on a raised exception).
    """
    native_call = f"{NATIVE_PREFIX}{defaults_fn.cname(emitter.names)}((PyObject *){self_name})"
    emitter.emit_lines(
        f"if ({native_call} == 0) {{",
        # Use self_name here instead of a hard-coded 'self' so the cleanup
        # matches the object actually passed to the defaults function.
        f"Py_DECREF({self_name});",
        "return NULL;",
        "}",
    )


def generate_constructor_for_class(
cl: ClassIR,
fn: FuncDecl,
Expand Down Expand Up @@ -787,6 +838,8 @@ def generate_dealloc_for_class(
emitter.emit_line("Py_TYPE(self)->tp_finalize((PyObject *)self);")
emitter.emit_line("}")
emitter.emit_line("PyObject_GC_UnTrack(self);")
if cl.reuse_freed_instance:
emit_reuse_dealloc(cl, emitter)
# The trashcan is needed to handle deep recursive deallocations
emitter.emit_line(f"CPy_TRASHCAN_BEGIN(self, {dealloc_func_name})")
emitter.emit_line(f"{clear_func_name}(self);")
Expand All @@ -795,6 +848,27 @@ def generate_dealloc_for_class(
emitter.emit_line("}")


def emit_reuse_dealloc(cl: ClassIR, emitter: Emitter) -> None:
    """Emit the dealloc fast path that parks the object in the per-type free list.

    The free "list" currently can have up to one object: the emitted code only
    takes this path when the '<class prefix>_free_instance' slot is NULL, and
    returns right after resetting the object.
    """
    slot = f"{cl.name_prefix(emitter.names)}_free_instance"

    emitter.emit_line(f"if ({slot} == NULL) {{")
    emitter.emit_line(f"{slot} = self;")

    # Reset the attribute bitmaps, then clear attributes and free referenced
    # objects, walking the base MRO in reverse order.
    emit_clear_bitmaps(cl, emitter)
    for klass in reversed(cl.base_mro):
        for attr_name, attr_type in klass.attributes.items():
            member = f"self->{emitter.attr(attr_name)}"
            emitter.emit_reuse_clear(member, attr_type)

    emitter.emit_line("return;")
    emitter.emit_line("}")


def generate_finalize_for_class(
del_method: FuncIR, finalize_func_name: str, emitter: Emitter
) -> None:
Expand Down
4 changes: 3 additions & 1 deletion mypyc/codegen/emitmodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from mypy.util import hash_digest, json_dumps
from mypyc.codegen.cstring import c_string_initializer
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
from mypyc.codegen.emitclass import generate_class, generate_class_reuse, generate_class_type_decl
from mypyc.codegen.emitfunc import generate_native_function, native_function_header
from mypyc.codegen.emitwrapper import (
generate_legacy_wrapper_function,
Expand Down Expand Up @@ -609,6 +609,8 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
self.declare_finals(module_name, module.final_names, declarations)
for cl in module.classes:
generate_class_type_decl(cl, emitter, ext_declarations, declarations)
if cl.reuse_freed_instance:
generate_class_reuse(cl, emitter, ext_declarations, declarations)
self.declare_type_vars(module_name, module.type_var_names, declarations)
for fn in module.functions:
generate_function_declaration(fn, declarations)
Expand Down
8 changes: 8 additions & 0 deletions mypyc/ir/class_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,12 @@ def __init__(
# If this is a generator environment class, what is the actual method for it
self.env_user_function: FuncIR | None = None

# If True, keep one freed, cleared instance available for immediate reuse to
# speed up allocations. This helps if many objects are freed quickly, before
# other instances of the same class are allocated. This is effectively a
# per-type free "list" of up to length 1.
self.reuse_freed_instance = False

def __repr__(self) -> str:
return (
"ClassIR("
Expand Down Expand Up @@ -403,6 +409,7 @@ def serialize(self) -> JsonDict:
"_sometimes_initialized_attrs": sorted(self._sometimes_initialized_attrs),
"init_self_leak": self.init_self_leak,
"env_user_function": self.env_user_function.id if self.env_user_function else None,
"reuse_freed_instance": self.reuse_freed_instance,
}

@classmethod
Expand Down Expand Up @@ -458,6 +465,7 @@ def deserialize(cls, data: JsonDict, ctx: DeserMaps) -> ClassIR:
ir.env_user_function = (
ctx.functions[data["env_user_function"]] if data["env_user_function"] else None
)
ir.reuse_freed_instance = data["reuse_freed_instance"]

return ir

Expand Down
1 change: 1 addition & 0 deletions mypyc/irbuild/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def setup_generator_class(builder: IRBuilder) -> ClassIR:
name = f"{builder.fn_info.namespaced_name()}_gen"

generator_class_ir = ClassIR(name, builder.module_name, is_generated=True, is_final_class=True)
generator_class_ir.reuse_freed_instance = True
if builder.fn_info.can_merge_generator_and_env_classes():
builder.fn_info.env_class = generator_class_ir
else:
Expand Down
25 changes: 25 additions & 0 deletions mypyc/lib-rt/mypyc_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,31 @@
#define CPy_NOINLINE
#endif

// CPyThreadLocal is a qualifier placed in front of a global declaration.
// It expands to nothing unless Py_GIL_DISABLED is defined; otherwise it
// expands to the first thread-local storage keyword the compiler is
// detected to support. An unrecognized compiler is a hard build error.
#ifndef Py_GIL_DISABLED

// Everything is running in the same thread, so no need for thread locals
#define CPyThreadLocal

#else

// 1. Use C11 standard thread_local storage, if available
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
#define CPyThreadLocal _Thread_local

// 2. Microsoft Visual Studio fallback
#elif defined(_MSC_VER)
#define CPyThreadLocal __declspec(thread)

// 3. GNU thread local storage for GCC/Clang targets that still need it
#elif defined(__GNUC__) || defined(__clang__)
#define CPyThreadLocal __thread

#else
#error "Cannot define CPyThreadLocal for this compiler/target"
#endif

#endif // Py_GIL_DISABLED

// INCREF and DECREF that assert the pointer is not NULL.
// asserts are disabled in release builds so there shouldn't be a perf hit.
// I'm honestly kind of surprised that this isn't done by default.
Expand Down
Loading