3030
3131#include "common_ofi.h"
3232#include "opal/constants.h"
33+ #include "opal/mca/accelerator/accelerator.h"
3334#include "opal/mca/base/mca_base_framework.h"
3435#include "opal/mca/base/mca_base_var.h"
3536#include "opal/mca/hwloc/base/base.h"
3839#include "opal/util/argv.h"
3940#include "opal/util/show_help.h"
4041
42+ extern opal_accelerator_base_module_t opal_accelerator ;
4143opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL ,
4244 .prov_exclude = NULL ,
4345 .output = -1 };
4446static const char default_prov_exclude_list [] = "shm,sockets,tcp,udp,rstream,usnic,net" ;
4547static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT ;
4648static int opal_common_ofi_verbose_level = 0 ;
4749static int opal_common_ofi_init_ref_cnt = 0 ;
50+ static bool opal_common_ofi_gpu_aware_provider_selection = false;
4851#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
4952static bool opal_common_ofi_installed_memory_monitor = false;
5053#endif
@@ -324,6 +327,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
324327 static int include_index = -1 ;
325328 static int exclude_index = -1 ;
326329 static int verbose_index = -1 ;
330+ static int gpu_aware_provider_selection = -1 ;
327331 int ret ;
328332
329333 if (fi_version () < FI_VERSION (1 , 0 )) {
@@ -389,6 +393,19 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
389393 }
390394 }
391395
396+ if (0 > gpu_aware_provider_selection ) {
397+ gpu_aware_provider_selection
398+ = mca_base_var_register ("opal" , "opal_common" , "ofi" , "gpu_aware_provider_selection" ,
399+ "Enable GPU-aware provider selection" , MCA_BASE_VAR_TYPE_BOOL ,
400+ NULL , 0 , MCA_BASE_VAR_FLAG_SETTABLE , OPAL_INFO_LVL_1 ,
401+ MCA_BASE_VAR_SCOPE_LOCAL ,
402+ & opal_common_ofi_gpu_aware_provider_selection );
403+ if (0 > gpu_aware_provider_selection ) {
404+ ret = gpu_aware_provider_selection ;
405+ goto err ;
406+ }
407+ }
408+
392409 if (component ) {
393410 ret = mca_base_var_register_synonym (include_index ,
394411 component -> mca_project_name ,
@@ -414,6 +431,15 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
414431 if (0 > ret ) {
415432 goto err ;
416433 }
434+
435+ ret = mca_base_var_register_synonym (gpu_aware_provider_selection ,
436+ component -> mca_project_name ,
437+ component -> mca_type_name ,
438+ component -> mca_component_name ,
439+ "gpu_aware_provider_selection" , 0 );
440+ if (0 > ret ) {
441+ goto err ;
442+ }
417443 }
418444
419445 /* The frameworks initialize their output streams during
@@ -915,18 +941,192 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915941 return (uint32_t ) process_info -> myprocid .rank ;
916942}
917943
944+ static int get_parent_distance (hwloc_obj_t parent , hwloc_obj_t child , int * distance )
945+ {
946+ int dist = 0 ;
947+
948+ while (child != parent ) {
949+ if (!child ) {
950+ return OPAL_ERROR ;
951+ }
952+ child = child -> parent ;
953+ ++ dist ;
954+ }
955+
956+ * distance = dist ;
957+ return OPAL_SUCCESS ;
958+ }
959+
#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
 * @brief Attempt to find the provider nearest to the accelerator.
 *
 * Looks up the accelerator's PCI device in opal_hwloc_topology and walks the
 * provider list. For every provider that passes check_provider_attr() against
 * the head of the list and for which get_provider_nic_pci() succeeds, the
 * distance is the number of parent hops from the accelerator device to the
 * common ancestor plus the number of parent hops from the provider device to
 * that same ancestor. The provider with the shortest distance is selected.
 *
 * Special cases:
 * 1. If the accelerator PCI attributes cannot be queried, or the accelerator
 *    PCI device cannot be found in the topology, returns OPAL_ERROR.
 * 2. If no provider has usable PCI attributes (or num_providers is 0), we do
 *    not attempt to make a selection, and return OPAL_ERR_NOT_AVAILABLE.
 * 3. If more than one provider is at the same shortest distance, break the
 *    tie using a modulo, i.e.
 *    (local rank on the same accelerator) % (number of nearest providers)
 *
 * @param[in]  provider_list linked list of providers
 * @param[in]  num_providers number of providers in provider_list; at most this
 *                           many list entries are examined
 * @param[in]  accl_id       Accelerator id
 * @param[in]  device_rank   local rank on the accelerator
 * @param[out] provider      pointer to the selected provider; written only on
 *                           success
 * @return OPAL_SUCCESS if a provider is successfully selected
 *         OPAL_ERR_NOT_AVAILABLE if no provider exposes usable PCI information
 *         OPAL_ERROR if a topology lookup or distance computation failed
 */
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
                                                  size_t num_providers,
                                                  int accl_id,
                                                  uint32_t device_rank,
                                                  struct fi_info **provider)
{
    hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
    int ret = -1, accl_distance = -1, prov_distance = -1, min_distance = INT_MAX;
    int total_distance = 0;
    opal_accelerator_pci_attr_t accl_pci_attr = {0};
    struct fi_info *current_provider = NULL;
    struct fi_pci_attr pci = {0};
    uint32_t near_provider_count = 0, provider_rank = 0;
    size_t idx = 0;

    /* A zero-length variable length array is undefined behaviour, and there
     * is nothing to choose from anyway. */
    if (0 == num_providers) {
        return OPAL_ERR_NOT_AVAILABLE;
    }

    /* One slot per provider. UINT32_MAX marks "no usable PCI distance"; it can
     * never equal min_distance, which is bounded by INT_MAX. */
    uint32_t distances[num_providers];

    ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
    if (OPAL_SUCCESS != ret) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
                                         accl_pci_attr.bus_id, accl_pci_attr.device_id,
                                         accl_pci_attr.function_id);
    if (NULL == accl_dev) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    opal_output_verbose(1, opal_common_ofi.output,
                        "%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
                        __FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
                        accl_pci_attr.device_id, accl_pci_attr.function_id,
                        accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);

    /* Pass 1: record each provider's distance and track the minimum together
     * with how many providers share it. The idx bound keeps writes inside
     * distances[] even if the list is longer than num_providers. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {
        distances[idx] = UINT32_MAX;

        /* Skip providers that do not match the head of the list or that do
         * not expose NIC PCI attributes. */
        if (0 != check_provider_attr(provider_list, current_provider)
            || OPAL_SUCCESS != get_provider_nic_pci(current_provider, &pci)) {
            continue;
        }

        prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
                                             pci.device_id, pci.function_id);
        if (NULL == prov_dev) {
            opal_output_verbose(1, opal_common_ofi.output,
                                "%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
            return OPAL_ERROR;
        }

        common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
                                                        prov_dev);
        if (!common_ancestor) {
            opal_output_verbose(
                1, opal_common_ofi.output,
                "%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
                __FILE__, __LINE__);
            /**
             * Return error because any 2 PCI devices should share at least one common ancestor,
             * i.e. root
             */
            return OPAL_ERROR;
        }

        ret = get_parent_distance(common_ancestor, accl_dev, &accl_distance);
        if (OPAL_SUCCESS != ret) {
            opal_output_verbose(
                1, opal_common_ofi.output,
                "%s:%d:Failed to get distance between common ancestor and accelerator device",
                __FILE__, __LINE__);
            return OPAL_ERROR;
        }

        ret = get_parent_distance(common_ancestor, prov_dev, &prov_distance);
        if (OPAL_SUCCESS != ret) {
            opal_output_verbose(
                1, opal_common_ofi.output,
                "%s:%d:Failed to get distance between common ancestor and provider device",
                __FILE__, __LINE__);
            return OPAL_ERROR;
        }

        total_distance = accl_distance + prov_distance;
        distances[idx] = (uint32_t) total_distance;

        if (min_distance > total_distance) {
            min_distance = total_distance;
            near_provider_count = 1;
        } else if (min_distance == total_distance) {
            ++near_provider_count;
        }
    }

    if (0 == near_provider_count) {
        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
                            __FILE__, __LINE__);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    provider_rank = device_rank % near_provider_count;

    /* Pass 2: near_provider_count is decremented at every provider that sits
     * at the minimum distance; the provider at which it reaches provider_rank
     * is the one selected. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {
        if ((uint32_t) min_distance == distances[idx]
            && provider_rank == --near_provider_count) {
            *provider = current_provider;
            return OPAL_SUCCESS;
        }
    }

    /* Not expected to be reached: pass 1 counted at least one nearest provider. */
    assert(0 == near_provider_count);

    return OPAL_ERROR;
}
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1100+
1101+
9181102struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
9191103 opal_process_info_t * process_info )
9201104{
921- int ret , num_providers = 0 ;
1105+ int ret , num_providers = 0 , accel_id = -1 ;
9221106 struct fi_info * provider = NULL ;
923- uint32_t package_rank = process_info -> my_local_rank ;
1107+ uint32_t package_rank ;
9241108
1109+ /* Current process' local rank on the same package(socket) */
1110+ package_rank = process_info -> proc_is_bound ? get_package_rank (process_info )
1111+ : process_info -> my_local_rank ;
9251112 num_providers = count_providers (provider_list );
926- if (!process_info -> proc_is_bound || 2 > num_providers ) {
1113+
1114+ #if OPAL_OFI_PCI_DATA_AVAILABLE
1115+ if (opal_common_ofi_gpu_aware_provider_selection ) {
1116+ ret = opal_accelerator .get_device (& accel_id );
1117+ if (OPAL_SUCCESS != ret ) {
1118+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Accelerator is not available" ,
1119+ __FILE__ , __LINE__ );
1120+ accel_id = -1 ;
1121+ }
1122+ }
1123+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1124+
1125+ if ((!process_info -> proc_is_bound && 0 > accel_id ) || 2 > num_providers ) {
9271126 goto round_robin ;
9281127 }
9291128
1129+ #if OPAL_OFI_PCI_DATA_AVAILABLE
9301130 /* Initialize opal_hwloc_topology if it is not already */
9311131 ret = opal_hwloc_base_get_topology ();
9321132 if (0 > ret ) {
@@ -935,9 +1135,27 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9351135 __FILE__ , __LINE__ );
9361136 }
9371137
938- package_rank = get_package_rank (process_info );
1138+ if (0 <= accel_id ) {
1139+ /**
1140+ * If accelerator is enabled, select the closest provider to the accelerator.
1141+ * Note: the function expects a local rank on the accelerator to break ties if there are
1142+ * multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
1143+ */
1144+ ret = find_nearest_provider_from_accelerator (provider_list , num_providers , accel_id ,
1145+ package_rank , & provider );
1146+ if (OPAL_SUCCESS == ret ) {
1147+ goto out ;
1148+ }
1149+
1150+ opal_output_verbose (1 , opal_common_ofi .output ,
1151+ "%s:%d:Failed to find a provider close to the accelerator. Error: %d" ,
1152+ __FILE__ , __LINE__ , ret );
1153+
1154+ if (!process_info -> proc_is_bound ) {
1155+ goto round_robin ;
1156+ }
1157+ }
9391158
940- #if OPAL_OFI_PCI_DATA_AVAILABLE
9411159 /**
9421160 * If provider PCI BDF information is available, we calculate its physical distance
9431161 * to the current process, and select the provider with the shortest distance.
0 commit comments