UCP/PROTO: Filter single net device by min distance instead of max BW#11493
UCP/PROTO: Filter single net device by min distance instead of max BW#11493guy-ealey-morag wants to merge 3 commits into
Conversation
Signed-off-by: Guy Ealey Morag <gealeymorag@nvidia.com>
8347526 to
99f5143
Compare
Signed-off-by: Guy Ealey Morag <gealeymorag@nvidia.com>
257cc34 to
27871fe
Compare
Signed-off-by: Guy Ealey Morag <gealeymorag@nvidia.com>
| UCP_PROTO_LANE_ARG(params, lane, &tl_perfs[lane])); | ||
| } else { | ||
| lanes[num_filtered_lanes++] = lane; | ||
| continue; |
There was a problem hiding this comment.
the trace log could still be useful?
| const uct_tl_resource_desc_t *tl_rsc; | ||
| int cmp; | ||
|
|
||
| ucs_trace("single net dev: proto %s node_local_id %lu num_lanes %u " |
There was a problem hiding this comment.
all logs would need = sign like proto=%s node_local_id=%lu..
| }; | ||
|
|
||
| UCS_TEST_P(test_ucp_proto_mock_rcx_trio_unsorted, | ||
| sort_local_id_0_picks_lowest_sys_dev, "IB_NUM_PATHS?=2", |
There was a problem hiding this comment.
maybe name test as single_net_dev_... without sort?
| iface_attr.latency.m = 1e-9; | ||
| }; | ||
|
|
||
| add_mock_iface("mock_0:1", iface_attr_slow); /* sys_dev = 1 */ |
There was a problem hiding this comment.
min_distance/lane_map is populated by sys_latency and bandwidth. since we use host memory, all lanes are added to the lane_map. any simple way to have few more interfaces that show higher sys_latency, so to have them excluded from the list? maybe emulating get_distance?
There was a problem hiding this comment.
providing sample below if it could help, as I think it might be reproducing the issue mentioned above regarding sys_latency:
diff --git a/test/gtest/ucp/test_ucp_proto_mock.cc b/test/gtest/ucp/test_ucp_proto_mock.cc
index 5eff351e9..c9cf7f8fc 100644
--- a/test/gtest/ucp/test_ucp_proto_mock.cc
+++ b/test/gtest/ucp/test_ucp_proto_mock.cc
@@ -1259,3 +1259,135 @@ UCS_TEST_P(test_ucp_proto_mock_rcx_trio_unsorted,
UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_proto_mock_rcx_trio_unsorted, rcx,
"rc_x")
+
+class test_ucp_proto_mock_rcx_trio_local_distance_get :
+ public test_ucp_proto_mock_rcx_trio_unsorted {
+public:
+ test_ucp_proto_mock_rcx_trio_local_distance_get() :
+ m_topo_provider_added(false)
+ {
+ }
+
+ virtual void init() override
+ {
+ test_ucp_proto_mock_rcx_trio_unsorted::init();
+ ucs_list_add_head(&ucs_sys_topo_providers_list,
+ &m_topo_provider.list);
+ m_topo_provider_added = true;
+ modify_config("TOPO_PRIO", "proto_mock");
+ ucs_sys_topo_reset_provider();
+ }
+
+ virtual void cleanup() override
+ {
+ if (m_topo_provider_added) {
+ modify_config("TOPO_PRIO", "default");
+ ucs_sys_topo_reset_provider();
+ ucs_list_del(&m_topo_provider.list);
+ m_topo_provider_added = false;
+ }
+ test_ucp_proto_mock_rcx_trio_unsorted::cleanup();
+ }
+
+protected:
+ void check_get_picks(const std::string &expected_mock)
+ {
+ uint8_t remote = 42;
+ auto memh = mem_map(receiver(), &remote, sizeof(remote));
+ auto rkey_packed = rkey_pack(receiver(), memh);
+ auto rkey = rkey_unpack(sender().ep(), rkey_packed);
+ uint8_t local = 0;
+ auto local_memh = mem_map(sender(), &local, sizeof(local));
+
+ local_memh->sys_dev = 2; /* mock_0:1 and mock_2:1 are closest */
+
+ ucp_request_param_t req_param;
+ req_param.op_attr_mask = UCP_OP_ATTR_FIELD_MEMH;
+ req_param.memh = local_memh;
+
+ auto status = ucp_get_nbx(sender().ep(), &local, sizeof(local),
+ (uint64_t)&remote, rkey, &req_param);
+ request_wait(status);
+ ASSERT_EQ(local, 42);
+
+ ucp_proto_select_key_t key = any_key();
+ key.param.op_id_flags = UCP_OP_ID_GET;
+ key.param.op_attr = 0;
+
+ const std::string config = "rc_mlx5/" + expected_mock + "/path0";
+ check_rkey_config(sender(), {{1, INF, "zero-copy", config}}, key,
+ rkey->cfg_index);
+ }
+
+ struct topo_provider {
+ const char *name;
+ struct {
+ ucs_topo_get_distance_func_t get_distance;
+ void (*get_memory_distance)(ucs_sys_device_t,
+ ucs_sys_dev_distance_t*);
+ } ops;
+ ucs_list_link_t list;
+ };
+
+ static ucs_status_t
+ get_distance(ucs_sys_device_t device1, ucs_sys_device_t device2,
+ ucs_sys_dev_distance_t *distance)
+ {
+ *distance = ucs_topo_default_distance;
+ if ((device1 != UCS_SYS_DEVICE_ID_UNKNOWN) &&
+ (device2 != UCS_SYS_DEVICE_ID_UNKNOWN) &&
+ (sys_dev_delta(device1, device2) != 1)) {
+ distance->latency = 100e-9;
+ }
+
+ return UCS_OK;
+ }
+
+ static void get_memory_distance(ucs_sys_device_t,
+ ucs_sys_dev_distance_t *distance)
+ {
+ *distance = ucs_topo_default_distance;
+ }
+
+private:
+ static ucs_sys_device_t
+ sys_dev_delta(ucs_sys_device_t device1, ucs_sys_device_t device2)
+ {
+ return (device1 > device2) ? (device1 - device2) :
+ (device2 - device1);
+ }
+
+ static topo_provider m_topo_provider;
+ bool m_topo_provider_added;
+};
+
+test_ucp_proto_mock_rcx_trio_local_distance_get::topo_provider
+ test_ucp_proto_mock_rcx_trio_local_distance_get::m_topo_provider = {
+ "proto_mock", {get_distance, get_memory_distance}, {}};
+
+UCS_TEST_P(test_ucp_proto_mock_rcx_trio_local_distance_get,
+ local_id_0_picks_lowest_adjacent_sys_dev,
+ "IB_NUM_PATHS?=2", "SINGLE_NET_DEVICE=y", "NODE_LOCAL_ID=0",
+ "ZCOPY_THRESH=0")
+{
+ check_get_picks("mock_0:1");
+}
+
+UCS_TEST_P(test_ucp_proto_mock_rcx_trio_local_distance_get,
+ local_id_1_picks_highest_adjacent_sys_dev,
+ "IB_NUM_PATHS?=2", "SINGLE_NET_DEVICE=y", "NODE_LOCAL_ID=1",
+ "ZCOPY_THRESH=0")
+{
+ check_get_picks("mock_2:1");
+}
+
+UCS_TEST_P(test_ucp_proto_mock_rcx_trio_local_distance_get,
+ local_id_2_wraps_to_lowest_adjacent_sys_dev,
+ "IB_NUM_PATHS?=2", "SINGLE_NET_DEVICE=y", "NODE_LOCAL_ID=2",
+ "ZCOPY_THRESH=0")
+{
+ check_get_picks("mock_0:1");
+}
+
+UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_proto_mock_rcx_trio_local_distance_get,
+ rcx, "rc_x")| { | ||
| ucs_sys_dev_distance_t d; | ||
|
|
||
| d.latency = tl_perf->sys_latency; |
There was a problem hiding this comment.
Using sys_latency seems right, just that it could be an issue with get/bcopy proto that does not set reg_mem_info, so single net device configuration will not be respected? maybe there other protocols in that case? maybe we could fix this super.reg_mem_info = ucp_mem_info_unknown, in get/bcopy probing function?
| { | ||
| ucs_sys_dev_distance_t d; | ||
|
|
||
| d.latency = tl_perf->sys_latency; |
There was a problem hiding this comment.
sys_latency seems derived from memory sys_dev/mem_type (host/cuda...). Instead of memory sys_dev that might change (or be missing), could we use the sys_dev of the gpu derived from local_id, or from cuda context? this would provide stable per-process net device usage, since it seems the purpose of the pr?
What?
UCX_SINGLE_NET_DEVICE=y, filter net devices by min distance instead of max BW to make sure that the nearest NIC is always selected for every rank.masterbut pass on the current branch.Why?
gaiacluster (8 GPUs, 8 ranks) every rank saw 4 GPUs with the same maximal BW.One of the 4 GPUs has lower latency than the others (the one that should be naturally assigned to the rank), but the current code considered all 4 candidates as identical, leading to wrong NIC selection.