Remove internal server object pointer overhead in small strings by rainsupreme · Pull Request #2516 · valkey-io/valkey

rainsupreme · 2025-08-20T00:24:12Z

Achieved 20-30% reduction in overhead.

This was accomplished by reusing the 8B robj->ptr memory to store embedded data. For 16B key and 16B value, this reduces the per-item memory overhead by ~20%. Overall performance was not measurably changed. Raising the threshold for embedding strings produces ~30% reduction in per-item overhead for affected value sizes.

Here's what I did:

Modify all robj->ptr access to use get/set methods. Most of this was done with a script (code listed below), with a few manual touches. Manual and programmatic changes are in separate commits to make review easier.
Next I changed objectGetVal() to calculate the location instead of using robj->ptr, and modified the embedding code to start writing embedded data 8B earlier, overwriting the ptr location. I changed objectSetVal() to assert that no value is embedded - all of the code that assigned o->ptr would have been incorrect for embedded strings anyway. (Except for one instance in module.c which I made a separate method for.)

Memory efficiency comparison:

Performance is not significantly changed (0-2% faster):

For each test I collected 65 minutes of data and discarded the first 5 minutes as warmup. I did not use cluster mode, iothreads=1, pipelining=100, and key size was 16B. Note that for 64B values valkey:unstable did not embed and my branch did embed because I raised the size threshold.

Appendix 1 - refactoring script

# parse robj->ptr references and replace them with objectGetVal(robj) and objectSetVal(robj)

from dataclasses import dataclass
from pathlib import Path
import re

@dataclass
class Ref:
    path: Path
    line: int
    char: int
    note: str

def load(base_path: Path, ref_listing_path: Path):
    with open(ref_listing_path, "r") as f:
        data = f.readlines()

    file = ""
    reflist: list[Ref] = []
    for line in data:
        if line.strip() == "":
            continue
        if not line.startswith(' '):
            file = line.strip()
            continue

        # parse lines like "  2443, 39: sdscatsds(user, descr->ptr);\n"
        # Extract line number, char position, and note
        try:
            parts = line.strip().split(":", 1)
            loc, note = parts[0], parts[1].strip()
            line_num, char_num = [int(x.strip()) for x in loc.split(",")]
            ref = Ref(base_path / file, line_num, char_num, note)
            reflist.append(ref)
        except Exception as e:
            print(f"Failed to parse line: {line.strip()} ({e})")

    return reflist

def get_obj_var(line: str):
    # extract the variable name from a line like "sdsdup(c->argv[argpos]->ptr"
    assert line.endswith("->ptr"), f"Line does not end with '->ptr': {line}"
    line = line[:-5].strip()  # exclude "->ptr"

    pairs = {
        ')': '(',
        ']': '[',
        '}': '{',
        }
    pairends = "([{"

    i = len(line) - 1
    unclosed = ""
    if line[i] == ')': # only start new parenthesis when no other unclosed pairs if it's an outer parenthesis wrapping the whole variable
        unclosed += '('
        i -= 1

    for i in range(i, -1, -1):
        char = line[i]
        if unclosed:
            if char in pairends:
                assert char == unclosed[-1]
                unclosed = unclosed[:-1]
        else:
            # no unclosed pairs, check stop conditions
            if char in pairends or char.isspace() or char in ')!':
                i += 1 # this char was invalid so don't include it
                break

        if char in pairs:
            unclosed += pairs[char]

    var = line[i:]
    stripped_var = var.strip()
    while stripped_var.startswith('(') and stripped_var.endswith(')'):
        stripped_var = stripped_var[1:-1].strip()
    return var, stripped_var

assert get_obj_var("sdsdup(c->argv[argpos]->ptr")[0] == "c->argv[argpos]"
assert get_obj_var("sdsdup(user->descr->ptr")[0] == "user->descr"
assert get_obj_var('addReplyErrorFormat(c, "Unknown node %s", (char *)c->argv[2]->ptr')[0] == "c->argv[2]"
assert get_obj_var("        sds key->ptr = argv[result.keys[i].pos]->ptr")[0] == "argv[result.keys[i].pos]"
assert get_obj_var("     sds err = getAclErrorMessage(result, u, cmd, c->argv[idx + 3]->ptr")[0] == "c->argv[idx + 3]"
assert get_obj_var("batch->keys[i] = ((robj *)batch->keys[i])->ptr")[0] == "((robj *)batch->keys[i])"
assert get_obj_var("if (!client->name || !client->name->ptr")[0] == "client->name"
assert get_obj_var("    set->ptr")[0] == "set"
assert get_obj_var("set->ptr")[0] == "set"

def get_assignment(line) -> str:
    assert line.startswith("ptr = "), f"Line does not start with 'ptr = ': {line}"
    first_semicolon = line.find(';')
    if first_semicolon == -1:
        raise ValueError(f"Line does not contain a semicolon")
    assignment = line[5:first_semicolon].strip()  # exclude "ptr = "
    return assignment

def replace_read_ref(ref: Ref, line) -> str:
    var, stripped_var = get_obj_var(line[:ref.char + 2])  # include "->ptr"


    new_line = line[:ref.char - 1 - (len(var) + 2)] + f"objectGetVal({stripped_var})" + line[ref.char - 1 + 3:]
    return new_line

def replace_write_ref(ref: Ref, line_num: int, lines: list[str]) -> str:
    line = lines[ref.line - 1]

    var, stripped_var = get_obj_var(line[:ref.char + 2])  # include "->ptr"
    assignment = get_assignment(line[ref.char-1:]) # include "ptr = "
    new_line = line[:ref.char - 1 - (len(var) + 2)] + f"objectSetVal({stripped_var}, {assignment})" + line[ref.char + 5 + len(assignment):]
    if " = " in assignment:
        print(f"Follow up on {ref.path}:{ref.line}:{ref.char} - assignment contains '=': {assignment}")
        print("  old:", line[:-1])
        print("  new:", new_line[:-1])
    return new_line

def replace_all(refs: list[Ref], dry_run=False):
    to_replace = refs.copy()
    skipped: list[Ref] = []

    i = 0
    file = None
    data: list[str] = []
    while to_replace:
        ref = to_replace.pop() # backwards in case there are multiple refs in the same line
        if file != ref.path:
            if file and not dry_run:
                assert data, f"No data loaded for {file}"
                # write previous file
                with open(file, "w") as f:
                    f.writelines(data)
            file = ref.path
            with open(file, 'r') as f:
                data = f.readlines()

        assert ref.line - 1 < len(data), f"Line {ref.line} out of range for {file}"
        line = data[ref.line - 1]
        assert ref.char + 2 <= len(line), f"Char {ref.char} out of range for {file} at line {ref.line}"

        if line[ref.char - 3:ref.char - 1] != "->":
            print(f"Skipping {ref.path}:{ref.line}:{ref.char} - not pointer deref")
            print(f"  line: {line[:-1]}")
            skipped.append(ref)
            continue
        if line[ref.char - 1:ref.char + 5] == "ptr = ":
            try:
                data[ref.line - 1] = replace_write_ref(ref, ref.line, data)
            except ValueError as e:
                print(f"Skipping {ref.path}:{ref.line}:{ref.char} - {e}")
                print(f"  line: {line[:-1]}")
                skipped.append(ref)
                continue
        else:
            # read ref
            data[ref.line - 1] = replace_read_ref(ref, line)

        i += 1

    if file and not dry_run:
        assert data, f"No data loaded for {file}"
        # write previous file
        with open(file, "w") as f:
            f.writelines(data)

    print(f"Replaced {i} references, skipped {len(skipped)} references")


reflist = load(Path("/home/ubuntu/rainsupreme"), Path("robj-ptr-refs.txt"))
replace_all(reflist, dry_run=False)

Script output (I addressed all of these manually in the following manual commit):

~/rainsupreme$ python ptr-refactor-script.py 
Skipping /home/ubuntu/rainsupreme/src/t_list.c:171:22 - Line does not contain a semicolon
  line:             subject->ptr = (where == LIST_HEAD) ? lpPrepend(objectGetVal(subject), objectGetVal(value), sdslen(objectGetVal(value)))
Skipping /home/ubuntu/rainsupreme/src/t_list.c:168:22 - Line does not contain a semicolon
  line:             subject->ptr = (where == LIST_HEAD) ? lpPrependInteger(objectGetVal(subject), (long)objectGetVal(value))
Skipping /home/ubuntu/rainsupreme/src/server.h:816:11 - not pointer deref
  line:     void *ptr;
Follow up on /home/ubuntu/rainsupreme/src/module.c:14278:16 - assignment contains '=': mv = newmv
  old:         value->ptr = mv = newmv;
  new:         objectSetVal(value, mv = newmv);
Follow up on /home/ubuntu/rainsupreme/src/defrag.c:646:44 - assignment contains '=': s = news
  old:     if ((news = activeDefragAlloc(s))) ob->ptr = s = news;
  new:     if ((news = activeDefragAlloc(s))) objectSetVal(ob, s = news);
Follow up on /home/ubuntu/rainsupreme/src/defrag.c:460:46 - assignment contains '=': zs = newzs
  old:     if ((newzs = activeDefragAlloc(zs))) ob->ptr = zs = newzs;
  new:     if ((newzs = activeDefragAlloc(zs))) objectSetVal(ob, zs = newzs);
Follow up on /home/ubuntu/rainsupreme/src/defrag.c:446:46 - assignment contains '=': ql = newql
  old:     if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql;
  new:     if ((newql = activeDefragAlloc(ql))) objectSetVal(ob, ql = newql);
Replaced 1773 references, skipped 3 references

Appendix 2 - micro benchmarking results

ubuntu@ip-10-0-145-171:~/robj-microbench$ ./robj-bench
2025-08-18T23:54:11+00:00
Running ./robj-bench
Run on (4 X 2300.18 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 46080 KiB (x1)
Load Average: 0.35, 0.18, 0.06
-----------------------------------------------------------------------
Benchmark                             Time             CPU   Iterations
-----------------------------------------------------------------------
BM_objectGetExpire/1               1.88 ns         1.88 ns    372201373
BM_objectGetExpire/2               1.89 ns         1.89 ns    371970617
BM_objectGetExpire/4               1.89 ns         1.89 ns    369780061
BM_objectGetExpire/8               1.90 ns         1.90 ns    366829968
BM_objectGetExpire/16              1.90 ns         1.90 ns    367467478
BM_objectGetExpire/32              1.88 ns         1.88 ns    369002953
BM_objectGetExpire/64              1.91 ns         1.91 ns    367496136
BM_objectGetExpire/128             1.90 ns         1.90 ns    372766529
BM_objectGetExpire/256             1.89 ns         1.88 ns    371629344
BM_objectGetExpire/512             1.90 ns         1.90 ns    369819056
BM_objectGetExpire/1024            1.90 ns         1.90 ns    372576954
BM_objectGetExpire/2048            1.89 ns         1.89 ns    367325743
BM_objectGetExpire/4096            1.88 ns         1.88 ns    371473964
BM_objectGetExpire_BigO            1.89 (1)        1.89 (1)
BM_objectGetExpire_RMS                0 %             0 %
BM_objectGetKey/1                  2.88 ns         2.88 ns    243820017
BM_objectGetKey/2                  2.91 ns         2.91 ns    242765864
BM_objectGetKey/4                  2.91 ns         2.91 ns    241573647
BM_objectGetKey/8                  2.89 ns         2.89 ns    242279472
BM_objectGetKey/16                 2.90 ns         2.90 ns    240966931
BM_objectGetKey/32                 2.92 ns         2.91 ns    241347563
BM_objectGetKey/64                 2.92 ns         2.91 ns    239829973
BM_objectGetKey/128                2.92 ns         2.92 ns    240860846
BM_objectGetKey/256                2.89 ns         2.89 ns    239168435
BM_objectGetKey/512                2.91 ns         2.90 ns    240518452
BM_objectGetKey/1024               2.91 ns         2.91 ns    242862430
BM_objectGetKey/2048               2.91 ns         2.91 ns    240383966
BM_objectGetKey/4096               2.91 ns         2.91 ns    240724659
BM_objectGetKey_BigO               2.90 (1)        2.90 (1)
BM_objectGetKey_RMS                   0 %             0 %
BM_objectGetVal/1                  3.75 ns         3.75 ns    186373286
BM_objectGetVal/2                  3.75 ns         3.75 ns    186349886
BM_objectGetVal/4                  3.76 ns         3.76 ns    186069434
BM_objectGetVal/8                  3.77 ns         3.77 ns    185854886
BM_objectGetVal/16                 3.76 ns         3.76 ns    185938601
BM_objectGetVal/32                 3.77 ns         3.77 ns    185350054
BM_objectGetVal/64                 3.76 ns         3.76 ns    185015799
BM_objectGetVal/128                1.88 ns         1.88 ns    372679151
BM_objectGetVal/256                1.88 ns         1.88 ns    372541933
BM_objectGetVal/512                1.88 ns         1.88 ns    372547356
BM_objectGetVal/1024               1.88 ns         1.88 ns    372967463
BM_objectGetVal/2048               1.88 ns         1.88 ns    371578466
BM_objectGetVal/4096               1.88 ns         1.88 ns    372867744
BM_objectGetVal_BigO               2.89 (1)        2.89 (1)
BM_objectGetVal_RMS                  32 %            32 %
BM_objectGetVal_Using_ptr/1        1.50 ns         1.50 ns    466016074
BM_objectGetVal_Using_ptr/2        1.51 ns         1.51 ns    465033531
BM_objectGetVal_Using_ptr/4        1.51 ns         1.51 ns    465690307
BM_objectGetVal_Using_ptr/8        1.50 ns         1.50 ns    465292160
BM_objectGetVal_Using_ptr/16       1.50 ns         1.50 ns    466284090
BM_objectGetVal_Using_ptr/32       1.50 ns         1.50 ns    465706801
BM_objectGetVal_Using_ptr/64       1.50 ns         1.50 ns    466260051
BM_objectGetVal_Using_ptr/128      1.50 ns         1.50 ns    465387679
BM_objectGetVal_Using_ptr/256      1.50 ns         1.50 ns    464241882
BM_objectGetVal_Using_ptr/512      1.50 ns         1.50 ns    466336830
BM_objectGetVal_Using_ptr/1024     1.51 ns         1.50 ns    465435503
BM_objectGetVal_Using_ptr/2048     1.50 ns         1.50 ns    466323005
BM_objectGetVal_Using_ptr/4096     1.50 ns         1.50 ns    465025662
BM_objectGetVal_Using_ptr_BigO     1.50 (1)        1.50 (1)
BM_objectGetVal_Using_ptr_RMS         0 %             0 %

Appendix 3 - rebasing
I cherry picked and merged each commit in sequence. For the programmatic commit I regenerated it instead of cherry picking. (So it contains no manual edits still.)

Copilot

Pull Request Overview

This PR implements a memory optimization for small strings by reusing the 8-byte robj->ptr memory to store embedded data. This change reduces per-item memory overhead by approximately 20-30% for affected value sizes while maintaining performance.

Key changes include:

Replacing direct robj->ptr access with getter/setter methods throughout the codebase
Modifying the embedding logic to overwrite the ptr location with embedded data
Increasing embedding size thresholds to maximize memory savings

Reviewed Changes

Copilot reviewed 47 out of 47 changed files in this pull request and generated no comments.

Show a summary per file

File	Description
src/server.h	Updates serverObject struct to support embedded values and adds accessor function declarations
src/unit/test_object.c	Adds comprehensive tests for embedded string functionality with keys, expiration, and value operations
tests/unit/type/string.tcl	Updates memory usage tests to reflect new embedding behavior and memory layout
src/unit/test_files.h	Registers new test functions for embedded string functionality
Multiple t_*.c files	Replaces direct ptr access with objectGetVal/objectSetVal calls throughout data type implementations
src/server.c	Updates hash and comparison functions to use new object accessor methods
src/tracking.c	Updates tracking functionality to use new object accessor methods
src/valkey-check-rdb.c	Updates RDB checking code to use new object accessor methods
src/sort.c	Updates sorting functionality to use new object accessor methods

Comments suppressed due to low confidence (6)

_{Tip: Customize your code reviews with copilot-instructions.md. Create the file or learn how to get started.}

madolson · 2025-08-20T03:12:13Z

We probably also want a performance test for IO threading as well either that or a deep pipeline. It will tease out a performance regression more clearly.

zuiderkwast

Great! I'm aligned with this change. We have been chatting about it before.

I posted a few random comments.

src/object.c

codecov · 2025-08-21T21:52:54Z

Codecov Report

❌ Patch coverage is 85.52392% with 239 lines in your changes missing coverage. Please review.
✅ Project coverage is 74.31%. Comparing base (9f7cfc7) to head (8227e4f).
⚠️ Report is 1 commits behind head on unstable.

Files with missing lines	Patch %	Lines
src/sentinel.c	0.00%	74 Missing ⚠️
src/module.c	20.65%	73 Missing ⚠️
src/debug.c	80.73%	21 Missing ⚠️
src/cluster_legacy.c	83.82%	11 Missing ⚠️
src/object.c	94.73%	9 Missing ⚠️
src/defrag.c	76.92%	6 Missing ⚠️
src/rdb.c	93.25%	6 Missing ⚠️
src/scripting_engine.c	58.33%	5 Missing ⚠️
src/t_list.c	91.22%	5 Missing ⚠️
src/replication.c	90.90%	4 Missing ⚠️
... and 15 more

Additional details and impacted files

@@             Coverage Diff              @@
##           unstable    #2516      +/-   ##
============================================
+ Coverage     74.27%   74.31%   +0.04%     
============================================
  Files           129      129              
  Lines         70923    70974      +51     
============================================
+ Hits          52675    52745      +70     
+ Misses        18248    18229      -19

Files with missing lines	Coverage Δ
src/bitops.c	`96.22% <100.00%> (ø)`
src/cluster_migrateslots.c	`92.09% <100.00%> (+<0.01%)`	⬆️
src/cluster_slot_stats.c	`93.75% <100.00%> (ø)`
src/commandlog.c	`96.58% <100.00%> (ø)`
src/eval.c	`91.42% <100.00%> (+0.02%)`	⬆️
src/expire.c	`97.31% <100.00%> (ø)`
src/functions.c	`96.57% <100.00%> (+4.24%)`	⬆️
src/latency.c	`83.29% <100.00%> (ø)`
src/lazyfree.c	`85.41% <100.00%> (+6.25%)`	⬆️
src/logreqres.c	`72.72% <ø> (ø)`
... and 36 more

... and 3 files with indirect coverage changes

🚀 New features to boost your workflow:

❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.
📦 JS Bundle Analysis: Save yourself from yourself by tracking and limiting bundle sizes in JS merges.

rainsupreme · 2025-08-21T22:34:12Z

I updated the test results using a pipeline depth of 100, and I've also rebased the branch, appended some minor fixes, and addressed PR feedback. :)

zuiderkwast

Not a full review; just follow-up on previous comments.

Also take a look at the failing test on 32-bit x86:

*** [err]: Memory usage of embedded string value in tests/unit/type/string.tcl
Expected '6' to be equal to '10' (context: type eval line 24 cmd {assert_equal 6 $sds_overhead} proc ::test)

src/unit/test_object.c

src/server.h

src/robj-ptr-refs.txt

src/server.h

src/object.c

tests/unit/type/string.tcl

src/debug.c

src/replace_obj_refs.py

src/t_hash.c

github-actions · 2025-08-26T02:47:46Z

Benchmark ran on this commit: d8400e3

Benchmark Comparison by Command

GET | cluster disabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	988142.31	977517.12	-10625.19	-1.08%
latency_avg_ms	0.45	0.45	0.00	+0.89%
latency_p50_ms	0.43	0.43	0.00	+0.00%
latency_p95_ms	0.60	0.61	0.01	+1.34%
latency_p99_ms	0.71	0.71	0.00	+0.00%

GET | cluster enabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	855432.00	843170.31	-12261.69	-1.43%
latency_avg_ms	0.53	0.54	0.01	+1.70%
latency_p50_ms	0.52	0.54	0.02	+3.08%
latency_p95_ms	0.69	0.69	0.01	+1.16%
latency_p99_ms	0.74	0.74	0.00	+0.00%

MGET | cluster disabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	319795.31	312891.09	-6904.22	-2.16%
latency_avg_ms	1.39	1.43	0.03	+2.52%
latency_p50_ms	1.38	1.42	0.03	+2.31%
latency_p95_ms	1.74	1.77	0.03	+1.84%
latency_p99_ms	1.80	1.83	0.03	+1.78%

MSET | cluster disabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	220070.42	213949.52	-6120.90	-2.78%
latency_avg_ms	2.18	2.25	0.07	+3.03%
latency_p50_ms	2.28	2.35	0.07	+3.16%
latency_p95_ms	2.50	2.58	0.07	+2.88%
latency_p99_ms	2.59	2.67	0.08	+3.09%

SET | cluster disabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	850340.12	840336.12	-10004.00	-1.18%
latency_avg_ms	0.53	0.54	0.01	+1.32%
latency_p50_ms	0.53	0.53	0.00	+0.00%
latency_p95_ms	0.69	0.70	0.02	+2.33%
latency_p99_ms	0.74	0.75	0.01	+1.08%

SET | cluster enabled | tls disabled

Metric	Baseline	PR	Diff	% Change
rps	669792.38	670690.81	898.43	+0.13%
latency_avg_ms	0.69	0.69	0.00	+0.00%
latency_p50_ms	0.69	0.69	0.00	+0.00%
latency_p95_ms	0.88	0.87	-0.01	-0.91%
latency_p99_ms	1.09	1.06	-0.02	-2.21%

src/object.c

zuiderkwast · 2025-09-08T13:17:11Z

"Reduce small string memory overhead by ~20-30%"

This PR title isn't perfect. It describes the benefit, which is important, but a PR title should instead primarily summarize the change itself.

I mean, reducing memory overhead can be done in other ways too, so it's not immediately clear from the title what this change is.

Naming isn't easy. We also want to avoid going into code details, because these titles will end up in release notes and be read by users. How about something like "Remove internal pointer overhead in small strings"?

ranshid · 2025-09-11T12:37:56Z

Small note: we should definitely take that, but it would probably impact our ability to simply backport fixed to older versions. Maybe we could just backport the function objectGetVal as a macro or something which will use the ptr in older versions?

(These are the cases I miss c++ operator overloading :))

rainsupreme · 2025-12-19T21:23:29Z

For anyone wondering what I'm waiting for: In my latest tests I'm seeing a 4-5% regression with and without LTO for SET with 512B values on ARM architecture. So I'm investigating further to see if this came from adding the pure attribute and how smaller data sizes were affected.

madolson

Curious about the performance delta, but besides that I think all the code is fine.

tests/unit/type/string.tcl

src/server.h

rainsupreme · 2025-12-22T02:43:44Z

So here's the result of the tests I ran for various commits that are part of this change. Many mysteries here that I'm not sure what to think of. Strange that O2 is faster for get and slower for set, and strange that -O2 SET was unaffected by switching the 64B size from raw encoding to embstr encoding. I guess adding the pure attributes didn't help though, so I should revert that bit.

Error bars indicate 1 standard deviation:

This version is percent change relative to unstable as a baseline (error bars = 1 std dev):

Signed-off-by: Rain Valentine <rsg000@gmail.com>

…ds. No manual edits. Signed-off-by: Rain Valentine <rsg000@gmail.com>

Signed-off-by: Rain Valentine <rsg000@gmail.com>

…->ptr to val_ptr to ensure any code not updated fails to compile Signed-off-by: Rain Valentine <rsg000@gmail.com>

Signed-off-by: Rain Valentine <rsg000@gmail.com>

rainsupreme · 2026-01-06T00:10:38Z

I've rebased again. This time I've omitted the pure markers because it did not improve performance.

My conclusions from the performance tests:

The added complexity in determining the location of the embedded value data caused a ~4% performance decrease for SET performance, and no clear change to GET performance. I'd argue losing ~4% SET perf to eliminate ~25% of memory overhead cost is worth it. (Note: after this change it would be pretty easy to maintain a fork with different behavior or change it here. It's easy to undo this tradeoff. 😉)
There are different performance patterns for embedded vs non-embedded values. The changes to the 64B value size reflect the change from non-embedded to embedded caused by raising the size threshold.
For some reason when compiled with -O2, SET commands perform up to ~70% faster. I am surprised, curious, excited, and I plan to investigate this further in the future. Feel free to ping me if you want to chat about that finding 😀

This reverts commit e737021.

rainsupreme · 2026-01-06T00:30:22Z

After discussion with @madolson I've reverted the increase in size threshold for embedding values for now, since for the sizes affected we see 25% worse SET performance, which might break some customer use cases. (Though we do see 20% better GET.) I'll investigate the difference in SET between O2 and O3/LTO compiler options and perhaps we can revisit the threshold change in the future. :)

Signed-off-by: Rain Valentine <rsg000@gmail.com>

madolson · 2026-01-06T02:03:44Z

Merging this so that it stops accumulating merge conflicts. There was a comment about doing this in 2 commits, but I think 1 is OK.

enjoy-binbin

Another memory optimization!
Just leave the comment, good job!

zuiderkwast · 2026-01-06T19:31:56Z

Great, finally!

After discussion with @madolson I've reverted the increase in size threshold for embedding values for now

So the memory usage is the green line in the "Memory efficiency comparison" diagram. Makes sense.

For a 16B key, it means we only embed values up to 33 bytes, instead of up to 97 bytes.

+--------+-----------------+------------+------------+
| header | key header size | key sds5   | val sds8   |
| 8      | 1               | 1 + 16 + 1 | 3 + 33 + 1 |  = 64 bytes
+--------+-----------------+------------+------------+

+--------+-----------------+------------+------------+
| header | key header size | key sds5   | val sds8   |
| 8      | 1               | 1 + 16 + 1 | 3 + 97 + 1 |  = 128 bytes
+--------+-----------------+------------+------------+

Many string values fall in the range 34-97 bytes, so it a little sad that we can't apply this memory saving to these values. Hopefully we can solve the performence issue in the future and increase the threshold later.

…ey-io#2516) Achieved 20-30% reduction in overhead. This was accomplished by reusing the 8B robj->ptr memory to store embedded data. For 16B key and 16B value, this reduces the per-item memory overhead by ~20%. Overall performance was not measurably changed. Raising the threshold for embedding strings produces ~30% reduction in per-item overhead for affected value sizes. Here's what I did: 1. Modify all `robj->ptr` access to use get/set methods. Most of this was done with a script (code listed below), with a few manual touches. Manual and programmatic changes are in separate commits to make review easier. 2. Next I changed `objectGetVal()` to calculate the location instead of using `robj->ptr`, and modified the embedding code to start writing embedded data 8B earlier, overwriting the ptr location. I changed `objectSetVal()` to assert that no value is embedded - all of the code that assigned `o->ptr` would have been incorrect for embedded strings anyway. (Except for one instance in module.c which I made a separate method for.) --------- Signed-off-by: Rain Valentine <rsg000@gmail.com>

…urn type (#3184) This fixes a regression where PFADD returned 1 instead of INVALIDOBJ for corrupted sparse HLL payloads. In small strings refactor (#2516) we incorrectly changed the datatype from int to bool. That function still uses -1 as its error signal. Signed-off-by: Sarthak Aggarwal <sarthagg@amazon.com>

…urn type (valkey-io#3184) This fixes a regression where PFADD returned 1 instead of INVALIDOBJ for corrupted sparse HLL payloads. In small strings refactor (valkey-io#2516) we incorrectly changed the datatype from int to bool. That function still uses -1 as its error signal. Signed-off-by: Sarthak Aggarwal <sarthagg@amazon.com>

github-actions bot assigned rainsupreme Aug 20, 2025

rainsupreme requested review from madolson and zuiderkwast August 20, 2025 00:26

madolson requested a review from Copilot August 20, 2025 03:11

Copilot AI reviewed Aug 20, 2025

View reviewed changes

madolson added the run-extra-tests Run extra tests on this PR (Runs all tests from daily except valgrind and RESP) label Aug 20, 2025

zuiderkwast reviewed Aug 21, 2025

View reviewed changes

src/object.c Outdated Show resolved Hide resolved

src/object.c Outdated Show resolved Hide resolved

src/object.c Outdated Show resolved Hide resolved

rainsupreme force-pushed the robj-reuse-ptr branch from 4710276 to db2e800 Compare August 21, 2025 21:33

rainsupreme force-pushed the robj-reuse-ptr branch from db2e800 to 91568dc Compare August 21, 2025 22:31

zuiderkwast reviewed Aug 22, 2025

View reviewed changes

src/unit/test_object.c Outdated Show resolved Hide resolved

src/unit/test_object.c Outdated Show resolved Hide resolved

rainsupreme force-pushed the robj-reuse-ptr branch 2 times, most recently from fcc304a to 7d41e05 Compare August 22, 2025 21:26

zuiderkwast reviewed Aug 23, 2025

View reviewed changes

rainsupreme force-pushed the robj-reuse-ptr branch from 7d41e05 to d8400e3 Compare August 25, 2025 20:16

zuiderkwast added the run-benchmark label Aug 26, 2025

github-actions bot removed the run-benchmark label Aug 26, 2025

zuiderkwast reviewed Aug 26, 2025

View reviewed changes

src/object.c Show resolved Hide resolved

madolson added this to Valkey 9.1 Aug 26, 2025

madolson moved this to Todo in Valkey 9.1 Aug 26, 2025

madolson removed the status in Valkey 9.1 Aug 26, 2025

madolson moved this to In Progress in Valkey 9.1 Aug 26, 2025

madolson mentioned this pull request Sep 5, 2025

Opaque robj: add get/set methods and remove all robj->ptr refs #1658

Closed

rainsupreme changed the title ~~Reduce small string memory overhead by ~20-30%~~ Reduce small string memory overhead by ~20-30% for embstr encoding by reusing robj->ptr memory Sep 29, 2025

madolson added the release-notes This issue should get a line item in the release notes label Nov 10, 2025

JimB123 approved these changes Dec 19, 2025

View reviewed changes

madolson approved these changes Dec 19, 2025

View reviewed changes

tests/unit/type/string.tcl Show resolved Hide resolved

tests/unit/type/string.tcl Show resolved Hide resolved

src/server.h Show resolved Hide resolved

src/server.h Show resolved Hide resolved

rainsupreme added 6 commits January 5, 2026 22:09

add ptr get/set, consistently refer to robj as 'o'

16c36cb

Signed-off-by: Rain Valentine <rsg000@gmail.com>

programmatically replace references to robj->ptr with interface metho…

3c6d76c

…ds. No manual edits. Signed-off-by: Rain Valentine <rsg000@gmail.com>

manual followup - replacing robj->ptr references with interface calls

8e9c275

Signed-off-by: Rain Valentine <rsg000@gmail.com>

reuse pointer memory when value embedded, saving 8 bytes; rename robj…

9a2bfe8

…->ptr to val_ptr to ensure any code not updated fails to compile Signed-off-by: Rain Valentine <rsg000@gmail.com>

formatting, test fixes/updates, PR feedback

9bed90e

Signed-off-by: Rain Valentine <rsg000@gmail.com>

increase value embedding threshold to 128 (2 cache lines)

e737021

Signed-off-by: Rain Valentine <rsg000@gmail.com>

rainsupreme force-pushed the robj-reuse-ptr branch from f8bc389 to e737021 Compare January 5, 2026 23:47

Revert "increase value embedding threshold to 128 (2 cache lines)"

d749c8c

This reverts commit e737021.

update test for embstr threshold

8227e4f

Signed-off-by: Rain Valentine <rsg000@gmail.com>

madolson merged commit 0ee4234 into valkey-io:unstable Jan 6, 2026
24 checks passed

github-project-automation bot moved this from In Progress to Done in Valkey 9.1 Jan 6, 2026

enjoy-binbin reviewed Jan 6, 2026

View reviewed changes

rainsupreme mentioned this pull request Jan 6, 2026

[Performance] SET performance is up to ~70% faster when compiled with O2 vs default args (O3, LTO) #3016

Open

rainsupreme deleted the robj-reuse-ptr branch January 6, 2026 22:44

rainsupreme mentioned this pull request Jan 6, 2026

Make serverObject (robj) opaque #3017

Open

roshkhatri mentioned this pull request Jan 8, 2026

[PERFORMANCE] Potential regression due to Removal of internal server object pointer overhead #3025

Open

AdityaTeltia mentioned this pull request Feb 2, 2026

chore: Make serverObject (robj) Opaque #3149

Open

sarthakaggarwal97 mentioned this pull request Feb 10, 2026

Fix PFADD corrupted sparse HLL handling by restoring hllSparseSet return type #3184

Merged

Nikhil-Manglore mentioned this pull request Feb 12, 2026

Reducing calls to objectGetVal() in hot paths #3194

Open

Comments

Conversation

rainsupreme commented Aug 20, 2025 • edited by madolson Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull Request Overview

Reviewed Changes

Uh oh!

madolson commented Aug 20, 2025

Uh oh!

zuiderkwast left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

codecov bot commented Aug 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Codecov Report

Uh oh!

rainsupreme commented Aug 21, 2025

Uh oh!

zuiderkwast left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

github-actions bot commented Aug 26, 2025

Benchmark Comparison by Command

GET | cluster disabled | tls disabled

GET | cluster enabled | tls disabled

MGET | cluster disabled | tls disabled

MSET | cluster disabled | tls disabled

SET | cluster disabled | tls disabled

SET | cluster enabled | tls disabled

Uh oh!

Uh oh!

zuiderkwast commented Sep 8, 2025

Uh oh!

ranshid commented Sep 11, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

rainsupreme commented Dec 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

madolson left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

rainsupreme commented Dec 22, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

rainsupreme commented Jan 6, 2026

Uh oh!

rainsupreme commented Jan 6, 2026

Uh oh!

Uh oh!

madolson commented Jan 6, 2026

Uh oh!

enjoy-binbin left a comment

Choose a reason for hiding this comment

Uh oh!

zuiderkwast commented Jan 6, 2026

Uh oh!

Reviewers

Assignees

rainsupreme commented Aug 20, 2025 •

edited by madolson

Loading

codecov bot commented Aug 21, 2025 •

edited

Loading

ranshid commented Sep 11, 2025 •

edited

Loading

rainsupreme commented Dec 19, 2025 •

edited

Loading

rainsupreme commented Dec 22, 2025 •

edited

Loading