2 changes: 2 additions & 0 deletions attach/nv_attach_impl/nv_attach_impl_patcher.cpp
@@ -122,6 +122,8 @@ generate_ptx_for_ebpf(const std::vector<ebpf_inst> &inst,
vm.register_external_function(503, "get_block_idx", (void *)test_func);
vm.register_external_function(504, "get_block_dim", (void *)test_func);
vm.register_external_function(505, "get_thread_idx", (void *)test_func);
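// The IDs below are stub-registered for PTX generation only; at run time
// the calls resolve to the matching _bpf_helper_ext_05xx trampolines.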
vm.register_external_function(507, "cuda_exit", (void *)test_func);
vm.register_external_function(508, "get_grid_dim", (void *)test_func);

vm.load_code(inst.data(), inst.size() * 8);
llvm_bpf_jit_context ctx(vm);
16 changes: 16 additions & 0 deletions attach/nv_attach_impl/trampoline/default_trampoline.cu
@@ -404,6 +404,22 @@ _bpf_helper_ext_0506(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t)
asm("membar.sys; \n\t");
return 0;
}
extern "C" __noinline__ __device__ uint64_t
_bpf_helper_ext_0507(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t)
{
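// A PTX `exit` terminates the calling thread; when every thread in a block
// executes it at kernel entry, the block retires without running the
// original kernel body.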
asm("exit; \n\t");
return 0;
}
extern "C" __noinline__ __device__ uint64_t
_bpf_helper_ext_0508(uint64_t x, uint64_t y, uint64_t z, uint64_t, uint64_t)
{
// Write gridDim.{x, y, z} through the caller-provided pointers
*(uint64_t *)(uintptr_t)x = gridDim.x;
*(uint64_t *)(uintptr_t)y = gridDim.y;
*(uint64_t *)(uintptr_t)z = gridDim.z;

return 0;
}

extern "C" __global__ void bpf_main(void *mem, size_t sz)
{
51 changes: 51 additions & 0 deletions attach/nv_attach_impl/trampoline_ptx.h
@@ -1384,5 +1384,56 @@ static const char TRAMPOLINE_PTX[] = R"(
st.param.b64 [func_retval0+0], %rd1;
ret;
// -- End function
}
// .globl _bpf_helper_ext_0507 // -- Begin function _bpf_helper_ext_0507
.visible .func (.param .b64 func_retval0) _bpf_helper_ext_0507(
.param .b64 _bpf_helper_ext_0507_param_0,
.param .b64 _bpf_helper_ext_0507_param_1,
.param .b64 _bpf_helper_ext_0507_param_2,
.param .b64 _bpf_helper_ext_0507_param_3,
.param .b64 _bpf_helper_ext_0507_param_4
) // @_bpf_helper_ext_0507
{
.reg .b64 %rd<2>;

// %bb.0:
// begin inline asm
exit;

// end inline asm
mov.u64 %rd1, 0;
st.param.b64 [func_retval0+0], %rd1;
ret;
// -- End function
}
// .globl _bpf_helper_ext_0508 // -- Begin function _bpf_helper_ext_0508
.visible .func (.param .b64 func_retval0) _bpf_helper_ext_0508(
.param .b64 _bpf_helper_ext_0508_param_0,
.param .b64 _bpf_helper_ext_0508_param_1,
.param .b64 _bpf_helper_ext_0508_param_2,
.param .b64 _bpf_helper_ext_0508_param_3,
.param .b64 _bpf_helper_ext_0508_param_4
) // @_bpf_helper_ext_0508
{
.reg .b32 %r<4>;
.reg .b64 %rd<8>;

// %bb.0:
ld.param.u64 %rd1, [_bpf_helper_ext_0508_param_0];
ld.param.u64 %rd2, [_bpf_helper_ext_0508_param_1];
mov.u32 %r1, %nctaid.x;
cvt.u64.u32 %rd3, %r1;
ld.param.u64 %rd4, [_bpf_helper_ext_0508_param_2];
st.u64 [%rd1], %rd3;
mov.u32 %r2, %nctaid.y;
cvt.u64.u32 %rd5, %r2;
st.u64 [%rd2], %rd5;
mov.u32 %r3, %nctaid.z;
cvt.u64.u32 %rd6, %r3;
st.u64 [%rd4], %rd6;
mov.u64 %rd7, 0;
st.param.b64 [func_retval0+0], %rd7;
ret;
// -- End function
}
)";
6 changes: 6 additions & 0 deletions example/gpu/atomizer/.gitignore
@@ -0,0 +1,6 @@
/atomizer
/.output
/victim*
/vec_add.cpp
/vec_add
/vec_add-new.cpp
145 changes: 145 additions & 0 deletions example/gpu/atomizer/Makefile
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
OUTPUT := .output
CLANG ?= clang
LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/arm.*/arm/' \
| sed 's/aarch64/arm64/' \
| sed 's/ppc64le/powerpc/' \
| sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/')
VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)

APPS = atomizer # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall

CARGO ?= $(shell which cargo)
ifeq ($(strip $(CARGO)),)
BZS_APPS :=
else
BZS_APPS := # profile
APPS += $(BZS_APPS)
# Required by libblazesym
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
endif

# Get Clang's default includes on this system. We'll explicitly add these dirs
# to the includes list when compiling with `-target bpf` because otherwise some
# architecture-specific dirs will be "missing" on some architectures/distros -
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
# sys/cdefs.h etc. might be missing.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')

ifeq ($(V),1)
Q =
msg =
else
Q = @
msg = @printf ' %-8s %s%s\n' \
"$(1)" \
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
"$(if $(3), $(3))";
MAKEFLAGS += --no-print-directory
endif

define allow-override
$(if $(or $(findstring environment,$(origin $(1))),\
$(findstring command line,$(origin $(1)))),,\
$(eval $(1) = $(2)))
endef

$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)

.PHONY: all
all: $(APPS) vec_add

vec_add: vec_add.cu
@if command -v nvcc >/dev/null 2>&1; then \
nvcc -cudart shared vec_add.cu -o vec_add -g; \
else \
echo "Warning: CUDA not found, skipping vec_add build"; \
fi

.PHONY: clean
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) $(APPS) vec_add

$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@

# Build libbpf
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
$(call msg,LIB,$@)
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
INCLUDEDIR= LIBDIR= UAPIDIR= \
install

# Build bpftool
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap


$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release

$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB, $@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@

$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB,$@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@

# Build BPF code
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
$(call msg,BPF,$@)
$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)

# Generate BPF skeletons
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton $< > $@

# Build user-space code
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h

$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
$(call msg,CC,$@)
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@

$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)

$(BZS_APPS): $(LIBBLAZESYM_OBJ)

# Build application binary
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@

# delete failed targets
.DELETE_ON_ERROR:

# keep intermediate (.skel.h, .bpf.o, etc) targets
.SECONDARY:
131 changes: 131 additions & 0 deletions example/gpu/atomizer/README.md
@@ -0,0 +1,131 @@

# CUDA eBPF Early Exit Example

This example demonstrates how to use `bpftime` to exit CUDA kernels before they are even entered, similar to the role XDP plays for network packets, allowing you to:

- Implement the kernel atomizer proposed in [LithOS](https://dl.acm.org/doi/10.1145/3731569.3764818)
- Configure the CUDA thread block partition from the host side.

## Overview

The example consists of two main components:

1. **Vector Addition CUDA Application** (`vec_add`): A simple CUDA application that repeatedly performs vector addition on the GPU.

2. **eBPF CUDA Atomizer** (`cuda_probe`): An eBPF program that attaches to CUDA kernel entry points and decides whether each thread block should exit before the real kernel body runs.

## How It Works

This example leverages bpftime's CUDA attachment implementation to:

1. Intercept CUDA binary loading via the CUDA runtime API
2. Insert eBPF code (converted to PTX) at the entry point of the `vectorAdd` kernel
3. Execute the eBPF program whenever the kernel is called
4. Exit the thread block early if the configured predicate holds (a minimal sketch of such a program follows below)
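
Below is a minimal sketch of what such an early-exit program could look like. The helper IDs (503 and 507) follow the registrations added in `nv_attach_impl_patcher.cpp`; the helper signatures, the `partition` map layout, and the `SEC()` name are illustrative assumptions, not the literal contents of `atomizer.bpf.c`:

```c
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* bpftime dispatches its CUDA helpers by fixed helper ID (see the patcher
 * diff above). The pointer-triple signature mirrors the get_grid_dim
 * trampoline; treat both signatures as assumptions. */
static long (*bpf_get_block_idx)(u64 *x, u64 *y, u64 *z) = (void *)503;
static long (*bpf_cuda_exit)(void) = (void *)507;

/* Host-configured partition bounds: slot 0 = L, slot 1 = H. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 2);
	__type(key, u32);
	__type(value, u64);
} partition SEC(".maps");

SEC("kprobe/_Z9vectorAddPKfS0_Pf") /* hypothetical attach target */
int probe__cuda(void *ctx)
{
	u64 bx = 0, by = 0, bz = 0;
	u32 lo_key = 0, hi_key = 1;
	u64 *lo, *hi;

	bpf_get_block_idx(&bx, &by, &bz);
	lo = bpf_map_lookup_elem(&partition, &lo_key);
	hi = bpf_map_lookup_elem(&partition, &hi_key);
	if (!lo || !hi)
		return 0;

	/* Retire this thread block unless it falls inside [L, H). */
	if (bx < *lo || bx >= *hi)
		bpf_cuda_exit();
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
```

`bpf_get_grid_dim()` (ID 508) can be declared the same way if the predicate also needs the total number of blocks.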

## Building the Example

```bash
# Navigate to the bpftime root directory
cd bpftime

# Build the main bpftime project first
cmake -Bbuild -DBPFTIME_ENABLE_CUDA_ATTACH=1 -DBPFTIME_CUDA_ROOT=/usr/local/cuda-12.8 .
make -C build -j$(nproc)

# Build the example (from the bpftime root)
make -C example/gpu/atomizer
```

## Running the Example

You need to start two processes:

### 1. Launch the eBPF Program (Server)

```bash
BPFTIME_LOG_OUTPUT=console LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so \
example/gpu/atomizer/atomizer
```

This process loads the eBPF program and waits for CUDA events.

### 2. Run the CUDA Application (Client)

In another terminal:

```bash
BPFTIME_LOG_OUTPUT=console LD_PRELOAD=build/runtime/agent/libbpftime-agent.so \
example/gpu/atomizer/vec_add
```

This runs the vector addition program with the bpftime agent, which connects to the first process for eBPF execution.

## Understanding the Output

When running successfully, you'll see output like the following (printed by the application itself):

```
Exited _Z9vectorAddPKfS0_Pf block_id=1, L=5, H=10
Enter _Z9vectorAddPKfS0_Pf block_id=7, L=5, H=10
C[1] = 0 (expected 0)
C[7] = 21 (expected 21)
```

This shows:
- Log lines marking which thread blocks entered the kernel and which exited early.
- The vector-addition results: of the 10 thread blocks in total, only the upper half ran, so their elements were computed (e.g. `C[7] = 21`) while early-exited blocks left theirs at zero (e.g. `C[1] = 0`).

## Code Components

### CUDA Vector Addition (`vec_add.cu`)

A simple CUDA application that:
- Allocates memory on GPU and CPU
- Executes a basic vector addition kernel in a loop
- Uses constant memory for vector size

### eBPF Program (`atomizer.bpf.c`)

Contains an eBPF program:
- `probe__cuda`: Runs at CUDA kernel entry
- Loads the partition configuration from a BPF map
- Checks the early-exit predicate and exits the thread block when it holds

### Userspace Loader (`atomizer.c`)

Manages the eBPF program lifecycle:
- Loads the compiled eBPF code
- Attaches to the CUDA kernel functions
- Handles signals for clean termination (a sketch of this flow follows below)
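
As a rough illustration of that lifecycle, assuming the bpftool-generated skeleton `atomizer.skel.h` (produced by the Makefile's GEN-SKEL rule) and the illustrative `partition` map from the sketch above; the real `atomizer.c` may be structured differently:

```c
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "atomizer.skel.h" /* generated by `bpftool gen skeleton` */

static volatile sig_atomic_t exiting;
static void sig_handler(int sig) { (void)sig; exiting = 1; }

int main(void)
{
	struct atomizer_bpf *skel = atomizer_bpf__open_and_load();
	__u32 k0 = 0, k1 = 1;
	__u64 lo = 5, hi = 10; /* the L=5, H=10 seen in the sample output */
	int err = 1;

	if (!skel) {
		fprintf(stderr, "failed to open/load BPF skeleton\n");
		return 1;
	}

	/* Configure the thread-block partition from the host side. */
	bpf_map__update_elem(skel->maps.partition, &k0, sizeof(k0),
			     &lo, sizeof(lo), BPF_ANY);
	bpf_map__update_elem(skel->maps.partition, &k1, sizeof(k1),
			     &hi, sizeof(hi), BPF_ANY);

	if (atomizer_bpf__attach(skel))
		goto cleanup;

	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	while (!exiting)
		pause(); /* bpftime serves the CUDA client in the background */
	err = 0;

cleanup:
	atomizer_bpf__destroy(skel);
	return err;
}
```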

## Advanced Features

This example demonstrates several advanced bpftime capabilities:

1. **Custom CUDA Helpers**: Special eBPF helper functions for CUDA:
- `bpf_cuda_exit()` - Exit the thread block
- `bpf_get_block_idx()` - Get current block indices
- `bpf_get_grid_dim()` - Get current grid sizes

2. **Interprocess Communication**: The eBPF program runs in a separate process from the CUDA application, communicating through shared memory.

3. **Dynamic Binary Modification**: The CUDA binary is intercepted, modified, and recompiled at runtime.

## Troubleshooting

If you encounter issues:

- Ensure CUDA is properly installed and on your `PATH`
- Check that both processes are running and can communicate
- Verify the PTX modification succeeded in the logs
- If you see CUDA errors, try simplifying the vector addition kernel

## Further Exploration

Try modifying this example to:
- Track memory access patterns in CUDA kernels
- Measure specific operations within kernels
- Apply eBPF programs to more complex CUDA applications
- Implement performance optimizations based on eBPF insights