Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Fix GPU checkpointing #19

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ clean:
$(MAKE) -C cpu clean
@echo -e "\033[31m----> Cleaning up test kernels\033[0m"
$(MAKE) -C tests clean
@echo -e "\033[31m----> Removing bin...\033[0m"
rm -rf bin
@echo -e "\033[31m All done!\033[0m"

cuda-gdb:
@echo -e "\033[36m----> Building submodules\033[0m"
Expand Down Expand Up @@ -43,8 +46,9 @@ bin:
mkdir bin

# Expose the test binaries as bin/tests via a relative symlink to tests/bin.
# The existence check must happen when the recipe runs, not when the makefile
# is parsed: "$(test -f ...)" is a make variable reference (always empty), so
# wrapping the recipe in ifneq silently removed it. "ln -sfn" is idempotent:
# -f replaces an existing link and -n prevents descending into the directory
# an existing bin/tests link points to.
bin/tests: bin tests
	ln -sfn ../tests/bin $@

bin/cricket-client.so: bin
$(MAKE) -C cpu cricket-client.so
cp cpu/cricket-client.so bin
Expand Down
21 changes: 11 additions & 10 deletions cpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,17 @@ SRC_SERVER = $(RPC_XDR) \
cpu-server-driver-hidden.c \
log.c \
cpu-libwrap.c \
cpu-server-cusolver.c \
cpu-server-cublas.c \
cpu-server-cusolver.c \
cpu-server-cublas.c \
list.c \
api-recorder.c \
resource-mg.c \
cr.c \
gsched_none.c \
oob.c \
mt-memcpy.c \
cpu-elf2.c
api-recorder.c \
resource-mg.c \
cr.c \
gsched_none.c \
oob.c \
mt-memcpy.c \
cpu-elf2.c \
gpu/ckp-kernel.c

SRC_SERVER_LIB = server-library.c
SRC_SERVER_EXE = server-exe.c
Expand Down Expand Up @@ -80,7 +81,7 @@ LIB_FLAGS += -L$(CUDA_SRC)/lib64
CC_FLAGS += -std=gnu99 $(INC_FLAGS) -O2
# TODO: use extern in header files instead of direct definition e.g. in cpu-common.h to remove -fcommon flag
CC_FLAGS += -fcommon
LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf
LD_FLAGS = $(LIB_FLAGS) -ltirpc -ldl -lcrypto -lelf

ifdef WITH_DEBUG
# use ASAN_OPTIONS=protect_shadow_gap=0 LSAN_OPTIONS=fast_unwind_on_malloc=0 when running
Expand Down
6 changes: 6 additions & 0 deletions cpu/cpu-server.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "cpu-server-driver.h"
#include "rpc/xdr.h"
#include "cr.h"
#include "gpu/ckp-kernel.h"
#include "cpu-elf2.h"
#ifdef WITH_IB
#include "cpu-ib.h"
Expand Down Expand Up @@ -90,6 +91,11 @@ int cricket_server_checkpoint(int dump_memory)
goto error;
}

if ((ret = gpu_checkpoint(/*TODO*/)) != 0) {
LOGE(LOG_ERROR, "gpu_checkpoint returned %d", ret);
goto error;
}

LOG(LOG_INFO, "checkpoint successfully created.");
return 0;
error:
Expand Down
7 changes: 7 additions & 0 deletions cpu/gpu/ckp-kernel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#include "ckp-kernel.h"
#include <stdio.h>

/*
 * Placeholder entry point for GPU checkpointing.
 *
 * Takes no arguments. Writes a fixed test marker line to stdout and
 * always returns 0.
 */
int gpu_checkpoint(void)
{
    const int status = 0;

    fputs("TESTING...\n", stdout);
    return status;
}
6 changes: 6 additions & 0 deletions cpu/gpu/ckp-kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/*
 * Include guard. Identifiers containing a double underscore are reserved for
 * the C implementation, so the guard macro uses a plain name.
 */
#ifndef CKP_KERNEL_H
#define CKP_KERNEL_H

/*
 * Run the GPU checkpoint step.
 *
 * Takes no arguments and returns an int status code; the definition in
 * ckp-kernel.c returns 0.
 */
int gpu_checkpoint(void);

#endif /* CKP_KERNEL_H */
2 changes: 1 addition & 1 deletion gpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ INC_DIRS := -I$(CUDA_GDB_PATH)/bfd \
LIB_DIR := ../submodules/lib
BUILD_DIR := build

DLIBS = -lncurses -lpthread -lm -lz -ldl -lexpat -llzma -Wl,--dynamic-list=utils/proc-service.list
DLIBS = -lncurses -lpthread -lm -lz -ldl -lexpat -llzma -lmpfr -Wl,--dynamic-list=utils/proc-service.list
# Order of .a files is important!
SLIBS = libgdb.a libbfd.a libiberty.a libreadline.a libdecnumber.a libcudacore.a libopcodes.a libgnu.a
SLIBS:= $(addprefix $(LIB_DIR)/, $(SLIBS))
Expand Down
5 changes: 5 additions & 0 deletions gpu/src/cricket-cr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,7 @@ bool cricket_cr_rst_params(CUDBGAPI cudbgAPI, const char *ckp_dir,
cudbgGetErrorString(res));
goto cleanup;
}
/*
for (int i = 0; i != elf_info->param_num; ++i) {
if (elf_info->params[i].size != 8)
continue;
Expand Down Expand Up @@ -1150,6 +1151,7 @@ bool cricket_cr_rst_params(CUDBGAPI cudbgAPI, const char *ckp_dir,
free(param_data);
param_data = NULL;
}
*/
ret = true;
cleanup:
free(param_mem);
Expand All @@ -1170,6 +1172,8 @@ bool cricket_cr_ckp_params(CUDBGAPI cudbgAPI, const char *ckp_dir,
/* Parameters are the same for all warps so just use warp 0
* TODO: use first valid warp, because warp 0 may not be in use (is that
* possible?)
*
* This seems to cause issues right now. Needs a solution.
*/
if ((param_mem = (uint8_t*)malloc(elf_info->param_size)) == NULL)
return false;
Expand Down Expand Up @@ -1482,6 +1486,7 @@ bool cricket_cr_ckp_globals(CUDBGAPI cudbgAPI, const char *ckp_dir)
if (res != CUDBG_SUCCESS) {
LOGE(LOG_ERROR, "cuda error: %s",
cudbgGetErrorString(res));
LOGE(LOG_DEBUG, "encountered in iteration %d of %d\n", i, globals_num);
goto cleanup;
}
offset += globals[i].size;
Expand Down
12 changes: 11 additions & 1 deletion gpu/src/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,12 @@ void now_time(char* buf)

const char* to_string(log_level level)
{
#ifdef NOCOLORS
static const char* const buffer[] = {"ERROR", "WARNING", "INFO", "DEBUG"};
#else
static const char* const buffer[] = {"\033[1m\033[31mERROR\033[0m", "\033[33mWARNING\033[0m", "\033[34mINFO\033[0m", "\033[32mDEBUG\033[0m"};
#endif //NOCOLORS

if(level > LOG_DEBUG){
return buffer[LOG_DEBUG];
}
Expand Down Expand Up @@ -90,5 +95,10 @@ void loggfe(log_level level, int line, const char* file, const char* formatstr,
char stripped[64];
strcpy(stripped, file);
str_strip(stripped, get_log_data()->project_offset);
printf("\tin %s(%d)\n", stripped, line);
#ifdef NOCOLORS
printf("\tin %s:%d\n", stripped, line);
#else
printf("\tin \033[4m%s:%d\033[0m\n", stripped, line);
#endif //NOCOLORS

}
30 changes: 23 additions & 7 deletions gpu/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include "gdb.h"

#ifndef LOG_LEVEL
#define LOG_LEVEL LOG_INFO
#define LOG_LEVEL LOG_DEBUG
#endif

#define CRICKET_PROFILE 1
Expand Down Expand Up @@ -124,7 +124,7 @@ bool cricket_all_warps_broken(CUDBGAPI cudbgAPI, CricketDeviceProp *dev_prop)
int cricket_analyze(int argc, char *argv[])
{
if (argc != 3) {
LOG(LOG_ERROR, "wrong number of arguments, use: %s <executable>", argv[0]);
LOG(LOG_ERROR, "wrong number of arguments, use: %s analyze <executable>", argv[0]);
return -1;
}
LOG(LOG_INFO, "Analyzing \"%s\"", argv[2]);
Expand Down Expand Up @@ -155,7 +155,7 @@ int cricket_restore(int argc, char *argv[])
double bt, ct, dt, et, ft, gt, comt;
#endif
if (argc != 3) {
LOG(LOG_ERROR, "wrong number of arguments, use: %s <executable>", argv[0]);
LOG(LOG_ERROR, "wrong number of arguments, use: %s restore <executable>", argv[0]);
return -1;
}

Expand Down Expand Up @@ -827,14 +827,15 @@ int cricket_checkpoint(int argc, char *argv[])
#endif

if (argc != 3) {
printf("wrong number of arguments, use: %s <pid>\n", argv[0]);
printf("wrong number of arguments, use: %s checkpoint <pid>\n", argv[0]);
return -1;
}

printf("Initializing GDB!\n\n");
gdb_init(argc, argv, NULL, argv[2]);

/* attach to process (both CPU and GPU) */
// printf("attaching...\n");
printf("attaching...\n");
// attach_command(argv[2], !batch_flag);

if (cuda_api_get_state() != CUDA_API_STATE_INITIALIZED) {
Expand All @@ -849,6 +850,9 @@ int cricket_checkpoint(int argc, char *argv[])
#ifdef CRICKET_PROFILE
gettimeofday(&b, NULL);
#endif

printf("attached!\n\n");
printf("trying to get CUDA debugger API\n");

/* get CUDA debugger API */
res = cudbgGetAPI(CUDBG_API_VERSION_MAJOR, CUDBG_API_VERSION_MINOR,
Expand All @@ -858,7 +862,7 @@ int cricket_checkpoint(int argc, char *argv[])
goto cuda_error;
}
printf("got API\n");

printf("enumerating devices...\n");

if (!cricket_device_get_num(cudbgAPI, &numDev)) {
printf("error getting device num\n");
Expand Down Expand Up @@ -1042,11 +1046,14 @@ int cricket_checkpoint(int argc, char *argv[])
//cricket_focus_kernel(!batch_flag);


/// TODO: There is a loop to determine the first warp, however
/// cricket_cr_ckp_params still causes errors over invalid warps...
if (!cricket_cr_ckp_params(cudbgAPI, ckp_dir, &elf_info, 0, 0,
first_warp)) {
printf("cricket_cr_ckp_params unsuccessful\n");
}

/// TODO: work out globals
if (!cricket_cr_ckp_globals(cudbgAPI, ckp_dir)) {
printf("cricket_cr_ckp_globals unsuccessful\n");
}
Expand Down Expand Up @@ -1092,6 +1099,8 @@ int cricket_checkpoint(int argc, char *argv[])

int cricket_start(int argc, char *argv[])
{
char* cricket_path;
char cmd_str[1024];
struct cmd_list_element *alias = NULL;
struct cmd_list_element *prefix_cmd = NULL;
struct cmd_list_element *cmd = NULL;
Expand All @@ -1101,12 +1110,19 @@ int cricket_start(int argc, char *argv[])
return -1;
}

cricket_path = getenv("CRICKET_PATH");
if (cricket_path == NULL) {
LOG(LOG_DEBUG, "no cricket path specified. assuming /usr/local/cricket\n");
cricket_path = "/usr/local/cricket";
}

gdb_init(argc, argv, argv[2], NULL);

/* load files */
//exec_file_attach(argv[2], !batch_flag);
//
execute_command("set exec-wrapper env 'LD_PRELOAD=/home/eiling/projects/cricket/bin/libtirpc.so.3:/home/eiling/projects/cricket/cpu/cricket-server.so'", !batch_flag);
snprintf(cmd_str, 1024, "set exec-wrapper env 'LD_PRELOAD=%s/bin/libtirpc.so.3:%s/cpu/cricket-server.so'", cricket_path, cricket_path);
execute_command(cmd_str, !batch_flag);
//execute_command("break main", !batch_flag);
execute_command("starti", !batch_flag);
//execute_command("unset exec-wrapper", !batch_flag);
Expand Down
2 changes: 2 additions & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ bin: cpu samples test_apps
cp test_apps/*.testapp bin
cp samples/matrixMul/matrixMul bin
cp samples/bandwidthTest/bandwidthTest bin
# nbody is optional: copy it only if the sample binary exists. The check is a
# shell test executed when the recipe runs; a make-level "$(test -f ...)" is
# just an undefined variable reference, so the former ifneq never let the copy
# happen.
	if [ -f samples/nbody/nbody ]; then cp samples/nbody/nbody bin; fi

clean:
@echo -e "\033[31m----> Cleaning up tests/test_apps\033[0m"
Expand Down
7 changes: 7 additions & 0 deletions tests/cpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Makefile for the tests/cpu directory.
# Both targets are declared phony, so make always treats them as out of date
# and never confuses them with files named "all" or "clean".
.PHONY: all clean

# Default target: it has no prerequisites and no recipe, so it does nothing.
all:

# Clean by delegating to the "clean" targets of the cubin and unit
# subdirectories through recursive $(MAKE) invocations.
clean:
$(MAKE) -C cubin clean
$(MAKE) -C unit clean
2 changes: 1 addition & 1 deletion tests/gpu/checkpoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ echo "using $CUDA_APP"
CUDA_APP_NAME="$(basename -- $CUDA_APP)"
CRICKET_CLIENT=${CRICKET_PATH}/cpu/cricket-client.so
CRICKET_SERVER=${CRICKET_PATH}/cpu/cricket-server.so
CRIU=/home/eiling/tmp/criu/criu/criu
CRIU=${HOME}/tmp/criu/criu/criu

export REMOTE_GPU_ADDRESS=localhost
export CUDA_VISIBLE_DEVICES=0
Expand Down
2 changes: 1 addition & 1 deletion tests/gpu/restore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ echo "using $CUDA_APP"
CUDA_APP_NAME="$(basename -- $CUDA_APP)"
CRICKET_CLIENT=${CRICKET_PATH}/cpu/cricket-client.so
CRICKET_SERVER=${CRICKET_PATH}/cpu/cricket-server.so
CRIU=/home/eiling/tmp/criu/criu/criu
CRIU=${HOME}/tmp/criu/criu/criu

export REMOTE_GPU_ADDRESS=localhost
export CUDA_VISIBLE_DEVICES=0
Expand Down