2 changes: 2 additions & 0 deletions attach/nv_attach_impl/nv_attach_impl_patcher.cpp
@@ -122,6 +122,8 @@ generate_ptx_for_ebpf(const std::vector<ebpf_inst> &inst,
vm.register_external_function(503, "get_block_idx", (void *)test_func);
vm.register_external_function(504, "get_block_dim", (void *)test_func);
vm.register_external_function(505, "get_thread_idx", (void *)test_func);
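// The IDs below are stub-registered for PTX generation only; at run time
// the calls resolve to the matching _bpf_helper_ext_05xx trampolines.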
vm.register_external_function(507, "cuda_exit", (void *)test_func);
vm.register_external_function(508, "get_grid_dim", (void *)test_func);

vm.load_code(inst.data(), inst.size() * 8);
llvm_bpf_jit_context ctx(vm);
16 changes: 16 additions & 0 deletions attach/nv_attach_impl/trampoline/default_trampoline.cu
@@ -404,6 +404,22 @@ _bpf_helper_ext_0506(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t)
asm("membar.sys; \n\t");
return 0;
}
extern "C" __noinline__ __device__ uint64_t
_bpf_helper_ext_0507(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t)
{
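// A PTX `exit` terminates the calling thread; when every thread in a block
// executes it at kernel entry, the block retires without running the
// original kernel body.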
asm("exit; \n\t");
return 0;
}
extern "C" __noinline__ __device__ uint64_t
_bpf_helper_ext_0508(uint64_t x, uint64_t y, uint64_t z, uint64_t, uint64_t)
{
// Write gridDim.{x, y, z} through the caller-provided pointers
*(uint64_t *)(uintptr_t)x = gridDim.x;
*(uint64_t *)(uintptr_t)y = gridDim.y;
*(uint64_t *)(uintptr_t)z = gridDim.z;

return 0;
}

extern "C" __global__ void bpf_main(void *mem, size_t sz)
{
51 changes: 51 additions & 0 deletions attach/nv_attach_impl/trampoline_ptx.h
@@ -1384,5 +1384,56 @@ static const char TRAMPOLINE_PTX[] = R"(
st.param.b64 [func_retval0+0], %rd1;
ret;
// -- End function
}
// .globl _bpf_helper_ext_0507 // -- Begin function _bpf_helper_ext_0507
.visible .func (.param .b64 func_retval0) _bpf_helper_ext_0507(
.param .b64 _bpf_helper_ext_0507_param_0,
.param .b64 _bpf_helper_ext_0507_param_1,
.param .b64 _bpf_helper_ext_0507_param_2,
.param .b64 _bpf_helper_ext_0507_param_3,
.param .b64 _bpf_helper_ext_0507_param_4
) // @_bpf_helper_ext_0507
{
.reg .b64 %rd<2>;

// %bb.0:
// begin inline asm
exit;

// end inline asm
mov.u64 %rd1, 0;
st.param.b64 [func_retval0+0], %rd1;
ret;
// -- End function
}
// .globl _bpf_helper_ext_0508 // -- Begin function _bpf_helper_ext_0508
.visible .func (.param .b64 func_retval0) _bpf_helper_ext_0508(
.param .b64 _bpf_helper_ext_0508_param_0,
.param .b64 _bpf_helper_ext_0508_param_1,
.param .b64 _bpf_helper_ext_0508_param_2,
.param .b64 _bpf_helper_ext_0508_param_3,
.param .b64 _bpf_helper_ext_0508_param_4
) // @_bpf_helper_ext_0508
{
.reg .b32 %r<4>;
.reg .b64 %rd<8>;

// %bb.0:
ld.param.u64 %rd1, [_bpf_helper_ext_0508_param_0];
ld.param.u64 %rd2, [_bpf_helper_ext_0508_param_1];
mov.u32 %r1, %nctaid.x;
cvt.u64.u32 %rd3, %r1;
ld.param.u64 %rd4, [_bpf_helper_ext_0508_param_2];
st.u64 [%rd1], %rd3;
mov.u32 %r2, %nctaid.y;
cvt.u64.u32 %rd5, %r2;
st.u64 [%rd2], %rd5;
mov.u32 %r3, %nctaid.z;
cvt.u64.u32 %rd6, %r3;
st.u64 [%rd4], %rd6;
mov.u64 %rd7, 0;
st.param.b64 [func_retval0+0], %rd7;
ret;
// -- End function
}
)";
6 changes: 6 additions & 0 deletions example/gpu/atomizer/.gitignore
@@ -0,0 +1,6 @@
/atomizer
/.output
/victim*
/vec_add.cpp
/vec_add
/vec_add-new.cpp
145 changes: 145 additions & 0 deletions example/gpu/atomizer/Makefile
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
OUTPUT := .output
CLANG ?= clang
LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/arm.*/arm/' \
| sed 's/aarch64/arm64/' \
| sed 's/ppc64le/powerpc/' \
| sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/')
VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)

APPS = atomizer # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall

CARGO ?= $(shell which cargo)
ifeq ($(strip $(CARGO)),)
BZS_APPS :=
else
BZS_APPS := # profile
APPS += $(BZS_APPS)
# Required by libblazesym
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
endif

# Get Clang's default includes on this system. We'll explicitly add these dirs
# to the includes list when compiling with `-target bpf` because otherwise some
# architecture-specific dirs will be "missing" on some architectures/distros -
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
# sys/cdefs.h etc. might be missing.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')

ifeq ($(V),1)
Q =
msg =
else
Q = @
msg = @printf ' %-8s %s%s\n' \
"$(1)" \
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
"$(if $(3), $(3))";
MAKEFLAGS += --no-print-directory
endif

define allow-override
$(if $(or $(findstring environment,$(origin $(1))),\
$(findstring command line,$(origin $(1)))),,\
$(eval $(1) = $(2)))
endef

$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)

.PHONY: all
all: $(APPS) vec_add

vec_add: vec_add.cu
@if command -v nvcc >/dev/null 2>&1; then \
nvcc -cudart shared vec_add.cu -o vec_add -g; \
else \
echo "Warning: CUDA not found, skipping vec_add build"; \
fi

.PHONY: clean
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) $(APPS) vec_add

$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@

# Build libbpf
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
$(call msg,LIB,$@)
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
INCLUDEDIR= LIBDIR= UAPIDIR= \
install

# Build bpftool
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap


$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release

$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB, $@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@

$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB,$@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@

# Build BPF code
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
$(call msg,BPF,$@)
$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)

# Generate BPF skeletons
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton $< > $@

# Build user-space code
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h

$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
$(call msg,CC,$@)
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@

$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)

$(BZS_APPS): $(LIBBLAZESYM_OBJ)

# Build application binary
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@

# delete failed targets
.DELETE_ON_ERROR:

# keep intermediate (.skel.h, .bpf.o, etc) targets
.SECONDARY:
131 changes: 131 additions & 0 deletions example/gpu/atomizer/README.md
@@ -0,0 +1,131 @@

# CUDA eBPF Early Exit Example

This example demonstrates how to use `bpftime` to exit CUDA kernels before they are even entered, similar to the role XDP plays for network packets, allowing you to:

- Implement the kernel atomizer proposed in [LithOS](https://dl.acm.org/doi/10.1145/3731569.3764818)
- Configure the CUDA thread block partition from the host side.

## Overview

The example consists of two main components:

1. **Vector Addition CUDA Application** (`vec_add`): A simple CUDA application that repeatedly performs vector addition on the GPU.

2. **eBPF CUDA Atomizer** (`cuda_probe`): An eBPF program that attaches to CUDA kernel entry points and decides whether each thread block should exit before the real kernel body runs.

## How It Works

This example leverages bpftime's CUDA attachment implementation to:

1. Intercept CUDA binary loading via the CUDA runtime API
2. Insert eBPF code (converted to PTX) at the entry point of the `vectorAdd` kernel
3. Execute the eBPF program whenever the kernel is called
4. Exit the thread block early if the configured predicate holds (a minimal sketch of such a program follows below)
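
Below is a minimal sketch of what such an early-exit program could look like. The helper IDs (503 and 507) follow the registrations added in `nv_attach_impl_patcher.cpp`; the helper signatures, the `partition` map layout, and the `SEC()` name are illustrative assumptions, not the literal contents of `atomizer.bpf.c`:

```c
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* bpftime dispatches its CUDA helpers by fixed helper ID (see the patcher
 * diff above). The pointer-triple signature mirrors the get_grid_dim
 * trampoline; treat both signatures as assumptions. */
static long (*bpf_get_block_idx)(u64 *x, u64 *y, u64 *z) = (void *)503;
static long (*bpf_cuda_exit)(void) = (void *)507;

/* Host-configured partition bounds: slot 0 = L, slot 1 = H. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 2);
	__type(key, u32);
	__type(value, u64);
} partition SEC(".maps");

SEC("kprobe/_Z9vectorAddPKfS0_Pf") /* hypothetical attach target */
int probe__cuda(void *ctx)
{
	u64 bx = 0, by = 0, bz = 0;
	u32 lo_key = 0, hi_key = 1;
	u64 *lo, *hi;

	bpf_get_block_idx(&bx, &by, &bz);
	lo = bpf_map_lookup_elem(&partition, &lo_key);
	hi = bpf_map_lookup_elem(&partition, &hi_key);
	if (!lo || !hi)
		return 0;

	/* Retire this thread block unless it falls inside [L, H). */
	if (bx < *lo || bx >= *hi)
		bpf_cuda_exit();
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
```

`bpf_get_grid_dim()` (ID 508) can be declared the same way if the predicate also needs the total number of blocks.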

## Building the Example

```bash
# Navigate to the bpftime root directory
cd bpftime

# Build the main bpftime project first
cmake -Bbuild -DBPFTIME_ENABLE_CUDA_ATTACH=1 -DBPFTIME_CUDA_ROOT=/usr/local/cuda-12.8 .
make -C build -j$(nproc)

# Build the example (from the bpftime root)
make -C example/gpu/atomizer
```

## Running the Example

You need to start two processes:

### 1. Launch the eBPF Program (Server)

```bash
BPFTIME_LOG_OUTPUT=console LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so \
example/gpu/atomizer/atomizer
```

This process loads the eBPF program and waits for CUDA events.

### 2. Run the CUDA Application (Client)

In another terminal:

```bash
BPFTIME_LOG_OUTPUT=console LD_PRELOAD=build/runtime/agent/libbpftime-agent.so \
example/gpu/atomizer/vec_add
```

This runs the vector addition program with the bpftime agent, which connects to the first process for eBPF execution.

## Understanding the Output

When running successfully, you'll see output like the following (printed by the application itself):

```
Exited _Z9vectorAddPKfS0_Pf block_id=1, L=5, H=10
Enter _Z9vectorAddPKfS0_Pf block_id=7, L=5, H=10
C[1] = 0 (expected 0)
C[7] = 21 (expected 21)
```

This shows:
- Log lines marking which thread blocks entered the kernel and which exited early.
- The vector-addition results: of the 10 thread blocks in total, only the upper half ran, so their elements were computed (e.g. `C[7] = 21`) while early-exited blocks left theirs at zero (e.g. `C[1] = 0`).

## Code Components

### CUDA Vector Addition (`vec_add.cu`)

A simple CUDA application that:
- Allocates memory on GPU and CPU
- Executes a basic vector addition kernel in a loop
- Uses constant memory for vector size

### eBPF Program (`atomizer.bpf.c`)

Contains an eBPF program:
- `probe__cuda`: Runs at CUDA kernel entry
- Loads the partition configuration from a BPF map
- Checks the early-exit predicate and exits the thread block when it holds

### Userspace Loader (`atomizer.c`)

Manages the eBPF program lifecycle:
- Loads the compiled eBPF code
- Attaches to the CUDA kernel functions
- Handles signals for clean termination (a sketch of this flow follows below)
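
As a rough illustration of that lifecycle, assuming the bpftool-generated skeleton `atomizer.skel.h` (produced by the Makefile's GEN-SKEL rule) and the illustrative `partition` map from the sketch above; the real `atomizer.c` may be structured differently:

```c
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "atomizer.skel.h" /* generated by `bpftool gen skeleton` */

static volatile sig_atomic_t exiting;
static void sig_handler(int sig) { (void)sig; exiting = 1; }

int main(void)
{
	struct atomizer_bpf *skel = atomizer_bpf__open_and_load();
	__u32 k0 = 0, k1 = 1;
	__u64 lo = 5, hi = 10; /* the L=5, H=10 seen in the sample output */
	int err = 1;

	if (!skel) {
		fprintf(stderr, "failed to open/load BPF skeleton\n");
		return 1;
	}

	/* Configure the thread-block partition from the host side. */
	bpf_map__update_elem(skel->maps.partition, &k0, sizeof(k0),
			     &lo, sizeof(lo), BPF_ANY);
	bpf_map__update_elem(skel->maps.partition, &k1, sizeof(k1),
			     &hi, sizeof(hi), BPF_ANY);

	if (atomizer_bpf__attach(skel))
		goto cleanup;

	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	while (!exiting)
		pause(); /* bpftime serves the CUDA client in the background */
	err = 0;

cleanup:
	atomizer_bpf__destroy(skel);
	return err;
}
```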

## Advanced Features

This example demonstrates several advanced bpftime capabilities:

1. **Custom CUDA Helpers**: Special eBPF helper functions for CUDA:
- `bpf_cuda_exit()` - Exit the thread block
- `bpf_get_block_idx()` - Get current block indices
- `bpf_get_grid_dim()` - Get current grid sizes

2. **Interprocess Communication**: The eBPF program runs in a separate process from the CUDA application, communicating through shared memory.

3. **Dynamic Binary Modification**: The CUDA binary is intercepted, modified, and recompiled at runtime.

## Troubleshooting

If you encounter issues:

- Ensure CUDA is properly installed and on your `PATH`
- Check that both processes are running and can communicate
- Verify the PTX modification succeeded in the logs
- If you see CUDA errors, try simplifying the vector addition kernel

## Further Exploration

Try modifying this example to:
- Track memory access patterns in CUDA kernels
- Measure specific operations within kernels
- Apply eBPF programs to more complex CUDA applications
- Implement performance optimizations based on eBPF insights