Skip to content

DTO async user-level function API #17

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ install:
cp libdto.so.1.0 /usr/lib64/
ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1
ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so
cp dto.h /usr/include/
Copy link
Preview

Copilot AI Jun 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Avoid hardcoding /usr/include; use $(PREFIX)/include or DESTDIR to respect custom install paths and cross-compilation scenarios.

Suggested change
cp dto.h /usr/include/
cp dto.h $(DESTDIR)$(PREFIX)/include/

Copilot uses AI. Check for mistakes.


install-local:
ln -sf ./libdto.so.1.0 ./libdto.so.1
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS.

DTO can also be used to learn certain application characteristics by building a histogram of various API types and sizes. The histogram can be built using the environment variable DTO_COLLECT_STATS.

Finally, DTO offers an API that lets an application pass a function pointer to be called while DSA performs the operation, so the application can do other work instead of idling until DSA completes. The function signature is:

```c
void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args);
```
where callback_t cb is a function pointer in the calling application. If the callback terminates before DSA completes the operation, the specified wait method is used to complete the waiting.




```bash
dto.c: DSA Transparent Offload shared library
dto-test.c: Sample multi-threaded test application
Expand Down Expand Up @@ -179,4 +189,4 @@ When linking DTO using LD_PRELOAD environment variable special care is required
in the script.
- When the application is started by a script with #!<location of shell> which invokes another script with #!<location of shell>, for
unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum
DTO size above 8K, or by avoiding this invocation sequence.
DTO size above 8K, or by avoiding this invocation sequence.
76 changes: 76 additions & 0 deletions dto.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <accel-config/libaccel_config.h>
#include <numaif.h>
#include <numa.h>
#include "dto.h"

#define likely(x) __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)
Expand All @@ -48,6 +49,10 @@
#define DTO_INITIALIZED 0
#define DTO_INITIALIZING 1

// Time-unit conversion factors: nanoseconds per second, milliseconds per
// second, and (derived from the two) nanoseconds per millisecond.
#define NSEC_PER_SEC (1000000000)
#define MSEC_PER_SEC (1000)
#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC)

// thread specific variables
static __thread struct dsa_hw_desc thr_desc;
static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32)));
Expand Down Expand Up @@ -107,6 +112,7 @@ static enum numa_aware is_numa_aware;
static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
static int wait_method = WAIT_YIELD;
static size_t cpu_size_fraction; // range of values is 0 to 99
// Starts out in nanoseconds; init_dto() later overwrites it with a value
// it logs as cycles.
static uint64_t wait_time = 100000; // 100000 ns = 100 us. NOTE(review): comment previously said "10K nanoseconds" - confirm intended value

static uint8_t dto_dsa_memcpy = 1;
static uint8_t dto_dsa_memmove = 1;
Expand All @@ -122,6 +128,7 @@ static uint8_t fork_handler_registered;
enum memop {
MEMSET = 0x0,
MEMCOPY,
MEMCOPY_ASYNC,
MEMMOVE,
MEMCMP,
MAX_MEMOP,
Expand All @@ -130,6 +137,7 @@ enum memop {
/* Short text label for each operation type, indexed by enum memop value. */
static const char * const memop_names[] = {
	[MEMSET]        = "set",
	[MEMCOPY]       = "cpy",
	[MEMCOPY_ASYNC] = "cpy_async",
	[MEMMOVE]       = "mov",
	[MEMCMP]        = "cmp",
};
Expand Down Expand Up @@ -557,6 +565,7 @@ static void print_stats(void)
clock_gettime(CLOCK_BOOTTIME, &dto_end_time);

LOG_TRACE("DTO Run Time: %ld ms\n", TS_NS(dto_start_time, dto_end_time)/1000000);
LOG_TRACE("DTO CPU Fraction: %.2f \n", cpu_size_fraction/100.0);

// display stats
for (int t = 0; t < 2; ++t) {
Expand Down Expand Up @@ -1340,6 +1349,23 @@ static int init_dto(void)
LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n");
use_std_lib_calls = 1;
}
unsigned int num, den, freq;
unsigned int unused;
unsigned long long tmp;
__get_cpuid( 0x15, &den, &num, &freq, &unused );
freq /= 1000;
LOG_TRACE( "Core Freq = %u kHz\n", freq );
LOG_TRACE( "TSC Mult = %u\n", num );
LOG_TRACE( "TSC Den = %u\n", den );
freq *= num;
freq /= den;
LOG_TRACE( "CPU freq = %u kHz\n", freq );
LOG_TRACE( "Requested wait: %llu nsec\n", wait_time );
tmp = wait_time;
tmp *= freq;
wait_time = tmp / NSEC_PER_MSEC;
LOG_TRACE( "Requested wait duration: %llu cycles\n", wait_time );


// display configuration
LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
Expand Down Expand Up @@ -1484,6 +1510,56 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
return true;
}

/*
 * dto_memcpy_async - copy n bytes from src to dest via DSA while running a
 * caller-supplied callback on the calling thread.
 *
 * A memmove descriptor is submitted to the work queue selected for dest. If
 * the submission succeeds, cb(args) is invoked once and dsa_wait() is then
 * called to wait for the completion record. Any bytes not reported as
 * completed are copied on the CPU with orig_memcpy() before returning.
 *
 * dest, src: destination and source buffers.
 * n:         number of bytes to copy.
 * cb:        callback run after a successful submission; may be NULL, in
 *            which case no callback is run.
 * args:      opaque pointer handed to cb unchanged.
 *
 * NOTE(review): cb is NOT invoked when no work queue is available or when
 * dsa_submit() fails; callers must not rely on the callback always running.
 * NOTE(review): unlike the interposed memcpy path, this entry point does not
 * consult use_std_lib_calls/dsa_min_size before offloading - confirm whether
 * it should.
 */
__attribute__((visibility("default"))) void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args) {
	int result = 0;
	struct dto_wq *wq = get_wq(dest);
#ifdef DTO_STATS_SUPPORT
	struct timespec st, et;
	size_t orig_n = n;
#endif

	/* Without a work queue there is nothing to offload to; copy on the CPU. */
	if (!wq) {
		orig_memcpy(dest, src, n);
		return;
	}

#ifdef DTO_STATS_SUPPORT
	DTO_COLLECT_STATS_START(collect_stats, st);
#endif

	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
		thr_desc.flags |= IDXD_OP_FLAG_CC;
	thr_desc.completion_addr = (uint64_t)(uintptr_t)&thr_comp;

	thr_bytes_completed = 0;

	thr_desc.src_addr = (uint64_t)(uintptr_t)src;
	thr_desc.dst_addr = (uint64_t)(uintptr_t)dest;
	thr_desc.xfer_size = (uint32_t)n;
	/* Clear the status before submitting so a stale value is not seen by the wait. */
	thr_comp.status = 0;

	result = dsa_submit(wq, &thr_desc);
	if (result == SUCCESS) {
		/* Let the caller do useful work while the descriptor is in flight. */
		if (cb)
			cb(args);
		result = dsa_wait(wq, &thr_desc, &thr_comp.status);
	}
#ifdef DTO_STATS_SUPPORT
	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result);
#endif

	if (thr_bytes_completed != n) {
		/* fallback to std call if job is only partially completed */
		n -= thr_bytes_completed;
		/*
		 * Skip past the completed prefix only when the completion record's
		 * result field is 0; otherwise the remaining bytes are copied from
		 * the original dest/src addresses.
		 */
		if (thr_comp.result == 0) {
			dest = (char *)dest + thr_bytes_completed;
			src = (const char *)src + thr_bytes_completed;
		}
#ifdef DTO_STATS_SUPPORT
		DTO_COLLECT_STATS_START(collect_stats, st);
#endif

		orig_memcpy(dest, src, n);

#ifdef DTO_STATS_SUPPORT
		/* Account the CPU remainder under the same op as the DSA part. */
		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY_ASYNC, n, orig_n);
#endif
	}
}

static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
{
struct dto_wq *wq = get_wq(dest);
Expand Down
18 changes: 18 additions & 0 deletions dto.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

#ifndef DTO_H
#define DTO_H

#include <stddef.h> /* size_t */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Application callback type: receives the opaque pointer that was passed as
 * the args parameter of dto_memcpy_async().
 */
typedef void(*callback_t)(void*);

/*
 * Copy n bytes from src to dest using DSA, invoking cb(args) on the calling
 * thread after the copy has been submitted and before waiting for it to
 * complete. The function returns only after the wait (and any CPU copy of
 * bytes the DSA did not complete) has finished.
 *
 * cb is only invoked when the submission to DSA succeeds.
 */
void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args);

#ifdef __cplusplus
}
#endif

#endif