Skip to content

first set of memkind optimizations #13095

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions ompi/communicator/comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ int ompi_comm_create_w_info (ompi_communicator_t *comm, ompi_group_t *group, opa
if (info) {
opal_info_dup(info, &(newcomp->super.s_info));
}
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

/* Set name for debugging purposes */
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %s CREATE FROM %s",
Expand Down Expand Up @@ -705,8 +709,11 @@ int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
if (info) {
opal_info_dup(info, &(newcomp->super.s_info));
}
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info);

ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}
/* Activate the communicator and init coll-component */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);

Expand Down Expand Up @@ -997,7 +1004,11 @@ static int ompi_comm_split_type_core(ompi_communicator_t *comm,
if (info) {
opal_infosubscribe_change_info(&newcomp->super, info);
}
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

/* Activate the communicator and init coll-component */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
Expand Down Expand Up @@ -1351,7 +1362,11 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
if (info) {
opal_infosubscribe_change_info(&newcomp->super, info);
}
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
Expand Down Expand Up @@ -1442,7 +1457,12 @@ static int ompi_comm_idup_internal (ompi_communicator_t *comm, ompi_group_t *gro
if (info) {
opal_info_dup(info, &(newcomp->super.s_info));
}
ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info);

ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}
}

ompi_comm_request_schedule_append (request, ompi_comm_idup_getcid, subreq, subreq[0] ? 1 : 0);
Expand Down Expand Up @@ -1594,7 +1614,11 @@ int ompi_comm_create_from_group (ompi_group_t *group, const char *tag, opal_info
if (NULL == newcomp->super.s_info) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
ompi_info_memkind_copy_or_set (&group->grp_instance->super, &newcomp->super, info);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&group->grp_instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

/* activate communicator and init coll-module. use the group allreduce implementation as
* no collective module has yet been selected. the tag does not matter as any tag will
Expand Down Expand Up @@ -1736,7 +1760,12 @@ int ompi_intercomm_create (ompi_communicator_t *local_comm, int local_leader, om

// Copy info if there is one.
newcomp->super.s_info = OBJ_NEW(opal_info_t);
ompi_info_memkind_copy_or_set (&local_comm->instance->super, &newcomp->super, &ompi_mpi_info_null.info.super);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&local_comm->instance->super, &newcomp->super,
&ompi_mpi_info_null.info.super, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

*newintercomm = newcomp;

Expand Down Expand Up @@ -1900,7 +1929,11 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
if (info) {
opal_info_dup(info, &(newcomp->super.s_info));
}
ompi_info_memkind_copy_or_set (&local_group->grp_instance->super, &newcomp->super, info);
ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&local_group->grp_instance->super, &newcomp->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
newcomp->c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}

/* activate communicator and init coll-module */
rc = ompi_comm_activate (&newcomp, local_comm, leader_comm, &local_leader, &leader_comm_remote_leader,
Expand Down
7 changes: 6 additions & 1 deletion ompi/communicator/comm_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,16 @@ int ompi_comm_init_mpi3 (void)
char *memkind_requested = getenv ("OMPI_MCA_mpi_memory_alloc_kinds");
if (NULL != memkind_requested) {
char *memkind_provided;
ompi_info_memkind_assert_type type;

ompi_info_memkind_process (memkind_requested, &memkind_provided);
ompi_info_memkind_process (memkind_requested, &memkind_provided, &type);
opal_infosubscribe_subscribe (&ompi_mpi_comm_world.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb);
opal_infosubscribe_subscribe (&ompi_mpi_comm_self.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb);
opal_infosubscribe_subscribe (&ompi_mpi_comm_world.comm.instance->super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
ompi_mpi_comm_world.comm.c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
ompi_mpi_comm_self.comm.c_assertions |= OMPI_COMM_ASSERT_NO_ACCEL_BUF;
}
free (memkind_provided);
}

Expand Down
2 changes: 2 additions & 0 deletions ompi/communicator/communicator.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t);
#define OMPI_COMM_ASSERT_ALLOW_OVERTAKE 0x00000008
#define OMPI_COMM_ASSERT_LAZY_BARRIER 0x00000010
#define OMPI_COMM_ASSERT_ACTIVE_POLL 0x00000020
#define OMPI_COMM_ASSERT_NO_ACCEL_BUF 0x00000040

#define OMPI_COMM_CHECK_ASSERT(comm, flag) !!((comm)->c_assertions & flag)
#define OMPI_COMM_CHECK_ASSERT_NO_ANY_TAG(comm) OMPI_COMM_CHECK_ASSERT(comm, OMPI_COMM_ASSERT_NO_ANY_TAG)
Expand All @@ -114,6 +115,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t);
#define OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm) OMPI_COMM_CHECK_ASSERT(comm, OMPI_COMM_ASSERT_ALLOW_OVERTAKE)
#define OMPI_COMM_CHECK_ASSERT_LAZY_BARRIER(comm) OMPI_COMM_CHECK_ASSERT(comm, OMPI_COMM_ASSERT_LAZY_BARRIER)
#define OMPI_COMM_CHECK_ASSERT_ACTIVE_POLL(comm) OMPI_COMM_CHECK_ASSERT(comm, OMPI_COMM_ASSERT_ACTIVE_POLL)
#define OMPI_COMM_CHECK_ASSERT_NO_ACCEL_BUF(comm) OMPI_COMM_CHECK_ASSERT(comm, OMPI_COMM_ASSERT_NO_ACCEL_BUF)

/**
* Modes required for acquiring the new comm-id.
Expand Down
9 changes: 6 additions & 3 deletions ompi/file/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2024 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -124,8 +124,11 @@ int ompi_file_open(struct ompi_communicator_t *comm, const char *filename,
if (info) {
opal_info_dup(info, &(file->super.s_info));
}
ompi_info_memkind_copy_or_set (&comm->instance->super, &file->super, info);

ompi_info_memkind_assert_type type;
ompi_info_memkind_copy_or_set (&comm->instance->super, &file->super, info, &type);
if (OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL == type) {
file->f_flags |= OMPI_FILE_ASSERT_NO_ACCEL_BUF;
}
file->f_amode = amode;
file->f_filename = strdup(filename);
if (NULL == file->f_filename) {
Expand Down
6 changes: 3 additions & 3 deletions ompi/file/file.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
/*
* Flags
*/
#define OMPI_FILE_ISCLOSED 0x00000001
#define OMPI_FILE_HIDDEN 0x00000002

#define OMPI_FILE_ISCLOSED 0x00000001
#define OMPI_FILE_HIDDEN 0x00000002
#define OMPI_FILE_ASSERT_NO_ACCEL_BUF 0x00000004
BEGIN_C_DECLS

/**
Expand Down
54 changes: 50 additions & 4 deletions ompi/info/info_memkind.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ static void ompi_info_memkind_extract (const char* memkind_str, int *num_memkind
*/

/* Separate requested_str into an array of individual entries */
int current_max = 0;
char **memkind_combos = opal_argv_split(memkind_str, ',');
int max_num_memkinds = opal_argv_count(memkind_combos);

Expand All @@ -71,7 +72,6 @@ static void ompi_info_memkind_extract (const char* memkind_str, int *num_memkind

int iter = 0;
char *m = memkind_combos[iter];
int current_max = 0;
while (m != NULL) {
bool name_found = false;
char **tmp_str = opal_argv_split (m, ':');
Expand Down Expand Up @@ -419,19 +419,52 @@ static bool ompi_info_memkind_validate (const char *assert_str, const char *pare
return ret;
}

static bool ompi_info_memkind_check_no_accel (int num_memkinds, ompi_memkind_t *memkinds)
{
bool result = true;

for (int i = 0; i < num_memkinds; i++) {
if (!strncmp(memkinds[i].im_name, "system", strlen("system"))) {
continue;
}
if (!strncmp(memkinds[i].im_name, "mpi", strlen("mpi"))) {
continue;
}
result = false;
break;
}

return result;
}

int ompi_info_memkind_process (const char* requested_str, char **provided_str)
static bool ompi_info_memkind_check_no_accel_from_string (char *mstring)
{
bool ret = false;
int num_memkinds;
ompi_memkind_t *memkinds = NULL;

ompi_info_memkind_extract (mstring, &num_memkinds, &memkinds);
if (NULL != memkinds) {
ret = ompi_info_memkind_check_no_accel (num_memkinds, memkinds);
ompi_info_memkind_free(num_memkinds, memkinds);
}

return ret;
}
int ompi_info_memkind_process (const char* requested_str, char **provided_str,
ompi_info_memkind_assert_type *type)
{
int err;
char *tmp_str = NULL;

int num_requested_memkinds, num_available_memkinds, num_provided_memkinds;
ompi_memkind_t *requested_memkinds = NULL ;
ompi_memkind_t *available_memkinds = NULL;
ompi_memkind_t *provided_memkinds = NULL;
ompi_info_memkind_assert_type assert_type = OMPI_INFO_MEMKIND_ASSERT_UNDEFINED;

if (NULL == requested_str) {
*provided_str = NULL;
*type = assert_type;
return OMPI_SUCCESS;
}

Expand All @@ -448,6 +481,10 @@ int ompi_info_memkind_process (const char* requested_str, char **provided_str)
goto exit;
}

if (ompi_info_memkind_check_no_accel (num_provided_memkinds, provided_memkinds)) {
assert_type = OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL;
}

ompi_info_memkind_str_create(num_provided_memkinds, provided_memkinds, &tmp_str);

exit:
Expand All @@ -459,6 +496,7 @@ int ompi_info_memkind_process (const char* requested_str, char **provided_str)
}
// Don't free the available_memkinds, they will be released in info_finalize;

*type = assert_type;
*provided_str = tmp_str;
return err;
}
Expand Down Expand Up @@ -504,15 +542,17 @@ const char *ompi_info_memkind_cb (opal_infosubscriber_t *obj, const char *key, c
** value of another info key (mpi_memory_alloc_kinds).
*/
int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, opal_infosubscriber_t *child,
opal_info_t *info)
opal_info_t *info, ompi_info_memkind_assert_type *type)
{
opal_cstring_t *parent_val;
opal_cstring_t *assert_val;
ompi_info_memkind_assert_type assert_type = OMPI_INFO_MEMKIND_ASSERT_UNDEFINED;
char *final_str = NULL;
int flag;

opal_info_get(parent->s_info, "mpi_memory_alloc_kinds", &parent_val, &flag);
if (0 == flag) {
*type = assert_type;
return OMPI_SUCCESS;
}
final_str = (char*) parent_val->string;
Expand All @@ -539,6 +579,12 @@ int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, opal_infosubsc
opal_infosubscribe_subscribe (child, "mpi_memory_alloc_kinds", final_str,
ompi_info_memkind_cb);
OBJ_RELEASE(parent_val);

if (ompi_info_memkind_check_no_accel_from_string(final_str)) {
assert_type = OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL;
}

*type = assert_type;
return OMPI_SUCCESS;
}

Expand Down
15 changes: 13 additions & 2 deletions ompi/info/info_memkind.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,26 @@ struct ompi_memkind_t {
};
typedef struct ompi_memkind_t ompi_memkind_t;

typedef enum {
OMPI_INFO_MEMKIND_ASSERT_UNDEFINED = 0, // no statement on memkind usage
OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL, // no accelerator memory is used
OMPI_INFO_MEMKIND_ASSERT_ACCEL_DEVICE_ONLY, // only accelerator device memory used
OMPI_INFO_MEMKIND_ASSERT_ACCEL_ALL // only accelerator memory (no restrictors) used
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will include the bits for OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL, probably not what we want when we check flag & OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a bit pattern, but an enum. I was contemplating whether to use a bitpatter instead (in which case your argument would have been correct), but that would have required additional processing afterwards on whether we have only mpi+system enabled or also a gpu one. With an enum we don't have that problem on this layer. Hence, we cannot do flag & OMPI_INFO_MEMKIND_ASSERT_NO_ACCEL, but have to do flag == MPI_INFO_MEMKIND_ASSERT_NO_ACCEL instead, see e.g. https://github.com/open-mpi/ompi/pull/13095/files#diff-9ab2a505b980955c4d5e74feeb016cb470239023de667666fafe9662f6884498R453

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh right, I got confused by the assertion bit patterns.

} ompi_info_memkind_assert_type;

/*
** Given a string of user requested memory alloc kinds, create
** a string with the actually support memory kinds by the library.
**
** @param[IN]: requested_str input string
** @param[OUT]: provided_str result string
** @param[OUT]: type guarantuees given on memkind utilization
**
** @return: OMPI_SUCCESS or error on failure
*/
OMPI_DECLSPEC int ompi_info_memkind_process (const char* requested_str,
char **provided_str);
char **provided_str,
ompi_info_memkind_assert_type *type);
/*
** Set the memory_alloc_kind info object on the child object, either
** by copying it from the parent object, or adjusting it based
Expand All @@ -46,12 +55,14 @@ OMPI_DECLSPEC int ompi_info_memkind_process (const char* requested_str,
** @param [INOUT]: child child object
** @param[IN]: info info object provided by code during object creation
** (e.g. MPI_Comm_dup_with_info, MPI_File_open, etc.)
** @param[OUT]: type guarantuees given on memkind utilization
**
** @return: OMPI_SUCCESS or error on failure
*/
OMPI_DECLSPEC int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent,
opal_infosubscriber_t *child,
opal_info_t *info);
opal_info_t *info,
ompi_info_memkind_assert_type *type);

/*
** free the array of available memkinds when shutting down the info
Expand Down
3 changes: 2 additions & 1 deletion ompi/instance/instance.c
Original file line number Diff line number Diff line change
Expand Up @@ -859,13 +859,14 @@ int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t
/* Copy info if there is one. */
if (OPAL_UNLIKELY(NULL != info)) {
opal_cstring_t *memkind_requested;
ompi_info_memkind_assert_type type;
int flag;

new_instance->super.s_info = OBJ_NEW(opal_info_t);
opal_info_get(info, "mpi_memory_alloc_kinds", &memkind_requested, &flag);
if (1 == flag) {
char *memkind_provided;
ompi_info_memkind_process (memkind_requested->string, &memkind_provided);
ompi_info_memkind_process (memkind_requested->string, &memkind_provided, &type);
opal_infosubscribe_subscribe (&new_instance->super, "mpi_memory_alloc_kinds",
memkind_provided, ompi_info_memkind_cb);
free (memkind_provided);
Expand Down
6 changes: 6 additions & 0 deletions ompi/mca/coll/accelerator/coll_accelerator_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ mca_coll_accelerator_comm_query(struct ompi_communicator_t *comm,
return NULL;
}

if (OMPI_COMM_CHECK_ASSERT_NO_ACCEL_BUF(comm)) {
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
"coll:accelerator:comm_query: NO_ACCEL_BUF assertion set: disqualifying myself");
return NULL;
}

accelerator_module = OBJ_NEW(mca_coll_accelerator_module_t);
if (NULL == accelerator_module) {
return NULL;
Expand Down
Loading