From 2f5c521edd3f86935bae4f1b07377566e50ba051 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 9 Jan 2024 13:41:04 -0800 Subject: [PATCH] src: Implement Dave's feedback --- src/init.c | 16 +++++----- src/shmem_internal.h | 2 +- src/transport_ofi.c | 74 ++++++++++++++++++++++++-------------------- 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/init.c b/src/init.c index 532e388c..83bfab64 100644 --- a/src/init.c +++ b/src/init.c @@ -100,7 +100,7 @@ unsigned int shmem_internal_rand_seed; #ifdef USE_HWLOC #include -hwloc_topology_t shmem_topology; +hwloc_topology_t shmem_internal_topology; #endif #ifdef ENABLE_THREADS @@ -384,17 +384,17 @@ shmem_internal_heap_postinit(void) #ifdef HAVE_SCHED_GETAFFINITY #ifdef USE_HWLOC - ret = hwloc_topology_init(&shmem_topology); + ret = hwloc_topology_init(&shmem_internal_topology); if (ret < 0) { RETURN_ERROR_MSG("hwloc_topology_init failed (%s)\n", strerror(errno)); } - ret = hwloc_topology_set_io_types_filter(shmem_topology, HWLOC_TYPE_FILTER_KEEP_ALL); + ret = hwloc_topology_set_io_types_filter(shmem_internal_topology, HWLOC_TYPE_FILTER_KEEP_ALL); if (ret < 0) { RETURN_ERROR_MSG("hwloc_topology_set_io_types_filter failed (%s)\n", strerror(errno)); } - ret = hwloc_topology_load(shmem_topology); + ret = hwloc_topology_load(shmem_internal_topology); if (ret < 0) { RETURN_ERROR_MSG("hwloc_topology_load failed (%s)\n", strerror(errno)); } @@ -403,20 +403,20 @@ shmem_internal_heap_postinit(void) hwloc_bitmap_t bindset_all = hwloc_bitmap_alloc(); hwloc_bitmap_t bindset_socket = hwloc_bitmap_alloc(); - ret = hwloc_get_proc_last_cpu_location(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_last_cpu_location(shmem_internal_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RETURN_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); } - ret = hwloc_get_proc_cpubind(shmem_topology, getpid(), bindset_all, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_cpubind(shmem_internal_topology, getpid(), bindset_all, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RETURN_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); } - hwloc_obj_t socket = hwloc_get_next_obj_covering_cpuset_by_type(shmem_topology, bindset, HWLOC_OBJ_PACKAGE, NULL); + hwloc_obj_t socket = hwloc_get_next_obj_covering_cpuset_by_type(shmem_internal_topology, bindset, HWLOC_OBJ_PACKAGE, NULL); if (!socket) RETURN_ERROR_MSG("hwloc_get_next_obj_covering_cpuset_by_type failed (could not detect object of type 'HWLOC_OBJ_PACKAGE' in provided cpuset)\n"); hwloc_bitmap_and(bindset_socket, bindset_all, socket->cpuset); - hwloc_set_proc_cpubind(shmem_topology, getpid(), bindset_socket, HWLOC_CPUBIND_PROCESS); //Include HWLOC_CPUBIND_STRICT in flags? + hwloc_set_proc_cpubind(shmem_internal_topology, getpid(), bindset_socket, HWLOC_CPUBIND_PROCESS); //Include HWLOC_CPUBIND_STRICT in flags? hwloc_bitmap_free(bindset); hwloc_bitmap_free(bindset_all); diff --git a/src/shmem_internal.h b/src/shmem_internal.h index d92f9d34..74559ef2 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -53,7 +53,7 @@ extern unsigned int shmem_internal_rand_seed; #ifdef USE_HWLOC #include -extern hwloc_topology_t shmem_topology; +extern hwloc_topology_t shmem_internal_topology; #endif #define SHMEM_INTERNAL_HEAP_OVERHEAD (1024*1024) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 4d8a9c6e..f4ac98d9 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1338,7 +1338,7 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p int ret = 0; hwloc_bitmap_t bindset = hwloc_bitmap_alloc(); - ret = hwloc_get_proc_last_cpu_location(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_last_cpu_location(shmem_internal_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RAISE_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); } @@ -1352,10 +1352,16 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p if (cur_prov->nic->bus_attr->bus_type != FI_BUS_PCI) continue; struct fi_pci_attr pci = cur_prov->nic->bus_attr->attr.pci; - hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); - if (!io_device) RAISE_ERROR_MSG("hwloc_get_pcidev_by_busid failed\n"); - hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_topology, io_device); - if (!first_non_io) RAISE_ERROR_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_internal_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); + if (!io_device) { + RAISE_WARN_MSG("hwloc_get_pcidev_by_busid failed\n"); + return provs[shmem_internal_my_pe % num_nics]; + }; + hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_internal_topology, io_device); + if (!first_non_io) { + RAISE_WARN_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + return provs[shmem_internal_my_pe % num_nics]; + } if (hwloc_bitmap_isincluded(bindset, first_non_io->cpuset) || hwloc_bitmap_isincluded(first_non_io->cpuset, bindset)) { @@ -1372,7 +1378,6 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p RAISE_WARN_MSG("Could not detect any NICs with affinity to the process\n"); /* If no 'close' NICs, select from list of all NICs using round-robin assignment */ - //return provs[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; return provs[shmem_internal_my_pe % num_nics]; } @@ -1386,7 +1391,6 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p hwloc_bitmap_free(bindset); - //struct fi_info *provider = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; struct fi_info *provider = prov_list[shmem_internal_my_pe % num_close_nics]; free(prov_list); @@ -1512,7 +1516,8 @@ int query_for_fabric(struct fabric_info *info) info->prov_name != NULL ? info->prov_name : ""); /* If the user supplied a fabric or domain name, use it to select the - * fabric. Otherwise, select the first fabric in the list. */ + * fabrics that may be chosen. Otherwise, consider all available + * fabrics */ int num_nics = 0; struct fi_info *fallback = NULL; struct fi_info *filtered_fabrics_list_head = NULL; @@ -1544,38 +1549,41 @@ int query_for_fabric(struct fabric_info *info) info->p_info = NULL; - for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - if (!fallback) fallback = cur_fabric; - if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { - num_nics += 1; - if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; - if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; - multirail_fabric_last_added = cur_fabric; - } - } - - DEBUG_MSG("Total num. NICs detected: %d\n", num_nics); - if ((num_nics == 0) || (shmem_internal_params.DISABLE_MULTIRAIL)) { - info->p_info = fallback; + if (shmem_internal_params.DISABLE_MULTIRAIL) { + info->p_info = filtered_fabrics_list_head; } else { - int idx = 0; - struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); - for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + /* Generate a linked list of all fabrics with a non-null nic value */ + for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + if (!fallback) fallback = cur_fabric; + if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { + num_nics += 1; + if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; + if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; + multirail_fabric_last_added = cur_fabric; + } } - qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); - //DEBUG_MSG("[%d]: local_pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); + + DEBUG_MSG("Total num. NICs detected: %d\n", num_nics); + if (num_nics == 0) { + info->p_info = fallback; + } + else { + int idx = 0; + struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + prov_list[idx++] = cur_fabric; + } + qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC - info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); + info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); #else - /* Round-robin assignment of NICs to PEs */ - //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; - info->p_info = prov_list[shmem_internal_my_pe % num_nics]; + /* Round-robin assignment of NICs to PEs */ + info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif - free(prov_list); + free(prov_list); + } } - if (NULL == info->p_info) { RAISE_WARN_MSG("OFI transport, no valid fabric (prov=%s, fabric=%s, domain=%s)\n", info->prov_name != NULL ? info->prov_name : "",