3030
3131#include "common_ofi.h"
3232#include "opal/constants.h"
33+ #include "opal/mca/accelerator/accelerator.h"
3334#include "opal/mca/base/mca_base_framework.h"
3435#include "opal/mca/base/mca_base_var.h"
3536#include "opal/mca/hwloc/base/base.h"
3839#include "opal/util/argv.h"
3940#include "opal/util/show_help.h"
4041
/* Accelerator module object. It is only declared here (extern), so its
 * definition lives in another translation unit; this file calls its
 * get_device() and get_device_pci_attr() entry points. */
42+ extern opal_accelerator_base_module_t opal_accelerator ;
/* File-wide common OFI module state. It starts with no provider include list,
 * no provider exclude list, and output set to -1.
 * NOTE(review): -1 presumably means "no output stream opened yet" - confirm. */
4143opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL ,
4244 .prov_exclude = NULL ,
4345 .output = -1 };
@@ -915,15 +917,184 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915917 return (uint32_t ) process_info -> myprocid .rank ;
916918}
917919
920+ static int get_obj_depth (hwloc_obj_t obj , int * depth )
921+ {
922+ hwloc_obj_t parent = NULL ;
923+ int depth_from_obj = 0 ;
924+
925+ /* For hwloc < 2.0, depth is unsigned type, but it could store a negative value */
926+ if (0 <= (int ) obj -> depth ) {
927+ * depth = obj -> depth ;
928+ return OPAL_SUCCESS ;
929+ }
930+
931+ parent = obj -> parent ;
932+ while (parent ) {
933+ ++ depth_from_obj ;
934+ if (0 <= (int ) parent -> depth ) {
935+ * depth = parent -> depth + depth_from_obj ;
936+ return OPAL_SUCCESS ;
937+ }
938+ parent = obj -> parent ;
939+ }
940+
941+ return OPAL_ERROR ;
942+ }
943+
#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
 * @brief Attempt to find the provider nearest to the accelerator.
 *
 * Looks up the accelerator's PCI device in the hwloc topology, then, for every
 * provider that passes check_provider_attr() against the head of the list and
 * exposes PCI information, computes the depth of the common ancestor shared by
 * the accelerator and the provider's NIC. The deepest common ancestor is
 * treated as the shortest distance.
 *
 * Special cases:
 * 1. If the accelerator PCI information cannot be obtained, or a PCI device or
 *    common ancestor cannot be located in the topology, returns OPAL_ERROR.
 * 2. If no provider has usable PCI attributes, we do not attempt to make a
 *    selection, and return OPAL_ERR_NOT_AVAILABLE.
 * 3. If more than one provider shares the same shortest distance, the tie is
 *    broken using a modulo, i.e.
 *    (local rank on the same accelerator) % (number of nearest providers)
 *
 * @param[in]  provider_list  linked list of providers
 * @param[in]  num_providers  number of providers; at most this many list
 *                            entries are examined
 * @param[in]  accl_id        Accelerator id
 * @param[in]  device_rank    local rank on the accelerator
 * @param[out] provider       pointer to the selected provider; only written on
 *                            OPAL_SUCCESS
 * @return OPAL_SUCCESS if a provider is successfully selected
 *         OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
 *         OPAL_ERROR if a fatal error happened
 */
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
                                                  size_t num_providers,
                                                  int accl_id,
                                                  uint32_t device_rank,
                                                  struct fi_info **provider)
{
    hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
    int ret = -1, depth = -1, max_common_ancestor_depth = -1;
    opal_accelerator_pci_attr_t accl_pci_attr = {0};
    struct fi_info *current_provider = NULL;
    struct fi_pci_attr pci = {0};
    uint32_t near_provider_count = 0, provider_rank = 0;
    size_t idx = 0;

    /* A zero-length variable-length array is undefined behavior; with no
     * providers there is nothing to select anyway. */
    if (0 == num_providers) {
        return OPAL_ERR_NOT_AVAILABLE;
    }

    /* Per-provider common-ancestor depth. -1 marks a provider that was not
     * evaluated; a real depth is never negative, so the marker cannot be
     * confused with a provider whose common ancestor is the root (depth 0). */
    int distances[num_providers];

    ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
    if (OPAL_SUCCESS != ret) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
                                         accl_pci_attr.bus_id, accl_pci_attr.device_id,
                                         accl_pci_attr.function_id);
    if (NULL == accl_dev) {
        opal_output_verbose(1, opal_common_ofi.output,
                            "%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
        return OPAL_ERROR;
    }

    opal_output_verbose(1, opal_common_ofi.output,
                        "%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
                        __FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
                        accl_pci_attr.device_id, accl_pci_attr.function_id,
                        accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);

    /* Pass 1: record each provider's common-ancestor depth and track how many
     * providers share the deepest (i.e. nearest) one. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {
        distances[idx] = -1;

        if (0 != check_provider_attr(provider_list, current_provider)
            || OPAL_SUCCESS != get_provider_nic_pci(current_provider, &pci)) {
            continue;
        }

        prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
                                             pci.device_id, pci.function_id);
        if (NULL == prov_dev) {
            opal_output_verbose(1, opal_common_ofi.output,
                                "%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
            return OPAL_ERROR;
        }

        common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev, prov_dev);
        if (NULL == common_ancestor) {
            opal_output_verbose(
                1, opal_common_ofi.output,
                "%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
                __FILE__, __LINE__);
            /**
             * Return error because any 2 PCI devices should share at least one common ancestor,
             * i.e. root
             */
            return OPAL_ERROR;
        }

        ret = get_obj_depth(common_ancestor, &depth);
        if (OPAL_SUCCESS != ret) {
            opal_output_verbose(1, opal_common_ofi.output,
                                "%s:%d:Failed to get common ancestor depth", __FILE__,
                                __LINE__);
            return OPAL_ERROR;
        }

        if (max_common_ancestor_depth < depth) {
            max_common_ancestor_depth = depth;
            near_provider_count = 1;
        } else if (max_common_ancestor_depth == depth) {
            ++near_provider_count;
        }

        distances[idx] = depth;
    }

    if (0 == near_provider_count || 0 > max_common_ancestor_depth) {
        opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
                            __FILE__, __LINE__);
        return OPAL_ERR_NOT_AVAILABLE;
    }

    provider_rank = device_rank % near_provider_count;

    /* Pass 2: walk the nearest providers in list order, counting down from
     * near_provider_count, and take the one whose countdown value equals
     * provider_rank. */
    for (current_provider = provider_list, idx = 0;
         NULL != current_provider && idx < num_providers;
         current_provider = current_provider->next, ++idx) {
        if (max_common_ancestor_depth == distances[idx]
            && provider_rank == --near_provider_count) {
            *provider = current_provider;
            return OPAL_SUCCESS;
        }
    }

    assert(0 == near_provider_count);

    return OPAL_ERROR;
}
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1074+
1075+
9181076struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
9191077 opal_process_info_t * process_info )
9201078{
921- int ret , num_providers = 0 ;
1079+ int ret , num_providers = 0 , accel_id = -1 ;
9221080 struct fi_info * provider = NULL ;
923- uint32_t package_rank = process_info -> my_local_rank ;
1081+ uint32_t package_rank ;
9241082
1083+ /* Current process' local rank on the same package(socket) */
1084+ package_rank = process_info -> proc_is_bound ? get_package_rank (process_info )
1085+ : process_info -> my_local_rank ;
9251086 num_providers = count_providers (provider_list );
926- if (!process_info -> proc_is_bound || 2 > num_providers ) {
1087+
1088+ #if OPAL_OFI_PCI_DATA_AVAILABLE
1089+ ret = opal_accelerator .get_device (& accel_id );
1090+ if (OPAL_SUCCESS != ret ) {
1091+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Accelerator is not available" ,
1092+ __FILE__ , __LINE__ );
1093+ accel_id = -1 ;
1094+ }
1095+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1096+
1097+ if ((!process_info -> proc_is_bound && 0 > accel_id ) || 2 > num_providers ) {
9271098 goto round_robin ;
9281099 }
9291100
@@ -935,9 +1106,28 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9351106 __FILE__ , __LINE__ );
9361107 }
9371108
938- package_rank = get_package_rank (process_info );
939-
9401109#if OPAL_OFI_PCI_DATA_AVAILABLE
1110+ if (0 <= accel_id ) {
1111+ /**
1112+ * If accelerator is enabled, select the closest provider to the accelerator.
1113+ * Note: the function expects a local rank on the accelerator to break ties if there are
1114+ * multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
1115+ */
1116+ ret = find_nearest_provider_from_accelerator (provider_list , num_providers , accel_id ,
1117+ package_rank , & provider );
1118+ if (OPAL_SUCCESS == ret ) {
1119+ goto out ;
1120+ }
1121+
1122+ opal_output_verbose (1 , opal_common_ofi .output ,
1123+ "%s:%d:Failed to find a provider close to the accelerator. Error: %d" ,
1124+ __FILE__ , __LINE__ , ret );
1125+
1126+ if (!process_info -> proc_is_bound ) {
1127+ goto round_robin ;
1128+ }
1129+ }
1130+
9411131 /**
9421132 * If provider PCI BDF information is available, we calculate its physical distance
9431133 * to the current process, and select the provider with the shortest distance.
0 commit comments