diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index 6dd662cd6c4..58ae1950b8b 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -65,6 +65,7 @@ bin_PROGRAMS = \ unit/fi_getinfo_test \ unit/fi_setopt_test \ unit/fi_check_hmem \ + unit/fi_nic_affinity_test \ ubertest/fi_ubertest \ multinode/fi_multinode \ multinode/fi_multinode_coll \ @@ -496,6 +497,11 @@ unit_fi_check_hmem_LDADD = libfabtests.la unit_fi_check_hmem_CFLAGS = \ $(AM_CFLAGS) +unit_fi_nic_affinity_test_SOURCES = \ + unit/nic_affinity_test.c \ + $(unit_srcs) +unit_fi_nic_affinity_test_LDADD = libfabtests.la + ubertest_fi_ubertest_SOURCES = \ ubertest/fabtest.h \ ubertest/uber.c \ @@ -678,6 +684,7 @@ dummy_man_pages = \ man/man1/fi_eq_test.1 \ man/man1/fi_getinfo_test.1 \ man/man1/fi_mr_test.1 \ + man/man1/fi_nic_affinity_test.1 \ man/man1/fi_flood.1 \ man/man1/fi_rdm_multi_client.1 \ man/man1/fi_ubertest.1 \ diff --git a/fabtests/include/windows/osd.h b/fabtests/include/windows/osd.h index bc9fd781977..fec51a696b3 100644 --- a/fabtests/include/windows/osd.h +++ b/fabtests/include/windows/osd.h @@ -802,4 +802,16 @@ OFI_COMPLEX_OPS(float) OFI_COMPLEX_OPS(double) OFI_COMPLEX_OPS(long_double) +static inline int setenv(const char *name, const char *value, int overwrite) /* setenv() shim: returns 0 without touching an existing variable when overwrite is 0, otherwise returns the result of _putenv_s(name, value) */ +{ + if (!overwrite && getenv(name) != NULL) + return 0; + return _putenv_s(name, value); +} +/* unsetenv() shim: hands an empty value to _putenv_s(); NOTE(review): presumably the CRT treats an empty value as removal of the variable - confirm against the _putenv_s documentation */ +static inline int unsetenv(const char *name) +{ + return _putenv_s(name, ""); +} + #endif /* _WINDOWS_OSD_H_ */ diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index e5cb8f5636e..e59edb20662 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -226,6 +226,9 @@ testing scope is limited. *fi_mr_cache_evict* : Tests provider MR cache eviction capabilities. +*fi_nic_affinity_test* +: Validates that fi_getinfo returns correct output when the GPU-NIC affinity feature is enabled. 
+ ## Multinode This test runs a series of tests over multiple formats and patterns to help diff --git a/fabtests/man/man1/fi_nic_affinity_test.1 b/fabtests/man/man1/fi_nic_affinity_test.1 new file mode 100644 index 00000000000..3f6ccf96f11 --- /dev/null +++ b/fabtests/man/man1/fi_nic_affinity_test.1 @@ -0,0 +1 @@ +.so man7/fabtests.7 diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index c1987b4a303..b6def6dfcc5 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -246,6 +246,7 @@ unit_tests=( "fi_mr_test" "fi_cntr_test" "fi_setopt_test" + "fi_nic_affinity_test" ) regression_tests=( diff --git a/fabtests/unit/nic_affinity_test.c b/fabtests/unit/nic_affinity_test.c new file mode 100644 index 00000000000..3106a2cac23 --- /dev/null +++ b/fabtests/unit/nic_affinity_test.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +#include "shared.h" +#include "unit_common.h" + +#define OFI_CORE_PROV_ONLY (1ULL << 59) +#define TEST_PCI_ADDR "0000:00:00.0" +#define TEST_CONFIG_FILE "/tmp/verbs_test_config.conf" + +#define TEST_ENTRY_NIC_AFFINITY(name) TEST_ENTRY(nic_affinity_ ## name,\ + nic_affinity_ ## name ## _desc) +/* Callback types used by nic_affinity_unit_test(): init prepares one test case, test runs its checks; a non-zero return is treated as failure. */ +typedef int (*ft_nic_affinity_init)(struct fi_info *); +typedef int (*ft_nic_affinity_test)(struct fi_info *); +/* Failure text written by the helpers below; main() hands this buffer to run_tests(). */ +static char err_buf[512]; +/* Returns the device name of an fi_info entry, or NULL when info->nic, its device_attr or the name itself is missing. */ +static const char *get_nic_name(struct fi_info *info) +{ + if (info->nic && info->nic->device_attr && info->nic->device_attr->name) + return info->nic->device_attr->name; + return NULL; +} +/* Calls fi_getinfo() with the global hints and copies the first NIC name found into nic_name (bounded by len). Returns 0 on success, the fi_getinfo() error, or -FI_ENODATA when no entry carries a name; err_buf describes the failure. */ +static int get_arbitrary_nic_name(char *nic_name, size_t len) +{ + struct fi_info *info = NULL; + int ret; + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &info); + if (ret) { + sprintf(err_buf, "fi_getinfo failed to discover NICs: %s", fi_strerror(-ret)); + return ret; + } + + if (!info) { + sprintf(err_buf, "No provider info returned"); + return -FI_ENODATA; + } + + for (struct fi_info *cur = info; cur; cur = cur->next) { + const char *name = get_nic_name(cur); + if (name) { + snprintf(nic_name, len, "%s", name); + fi_freeinfo(info); + return 0; + } + } + + fi_freeinfo(info); + sprintf(err_buf, "No NIC names found in provider info"); + return -FI_ENODATA; +} +/* Writes TEST_CONFIG_FILE with one line mapping TEST_PCI_ADDR to a NIC name discovered on this host. */ +static int create_valid_test_config_file(void) +{ + FILE *fp; + char nic_name[64]; + int ret; + + ret = 
get_arbitrary_nic_name(nic_name, sizeof(nic_name)); + if (ret) return ret; + + fp = fopen(TEST_CONFIG_FILE, "w"); + if (!fp) { + sprintf(err_buf, "Failed to open config file for writing"); + return -FI_EIO; + } + + fprintf(fp, "%s %s\n", TEST_PCI_ADDR, nic_name); + fclose(fp); + + return 0; +} + +static int create_invalid_test_config_file(void) +{ + FILE *fp; + + fp = fopen(TEST_CONFIG_FILE, "w"); + if (!fp) { + sprintf(err_buf, "Failed to open config file for writing"); + return -FI_EIO; + } + + fprintf(fp, "invalid_config_line\n"); + fclose(fp); + + return 0; +} + +static void cleanup_verbs_affinity_test(void) +{ + unlink(TEST_CONFIG_FILE); + unsetenv("FI_VERBS_NIC_AFFINITY_POLICY"); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + unsetenv("FI_VERBS_NIC_AFFINITY_CONFIG"); +} + +/* + * Verbs GPU/NIC affinity init functions + */ +static int init_verbs_affinity_manual(struct fi_info *hints) +{ + int ret; + + ret = create_valid_test_config_file(); + if (ret) return ret; + + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "manual", 1); + setenv("FI_VERBS_NIC_AFFINITY_CONFIG", TEST_CONFIG_FILE, 1); + return 0; +} + +static int init_verbs_affinity_manual_no_device(struct fi_info *hints) +{ + int ret; + + ret = create_valid_test_config_file(); + if (ret) return ret; + + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "manual", 1); + setenv("FI_VERBS_NIC_AFFINITY_CONFIG", TEST_CONFIG_FILE, 1); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + return 0; +} + +static int init_verbs_affinity_manual_invalid_device(struct fi_info *hints) +{ + int ret; + + ret = create_valid_test_config_file(); + if (ret) return ret; + + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "manual", 1); + setenv("FI_VERBS_NIC_AFFINITY_CONFIG", TEST_CONFIG_FILE, 1); + setenv("FI_VERBS_AFFINITY_DEVICE", "invalid:pci:format:bad", 1); + return 0; +} + +static int init_verbs_affinity_manual_missing_config(struct fi_info *hints) +{ + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "manual", 1); + setenv("FI_VERBS_NIC_AFFINITY_CONFIG", 
"/nonexistent/path/to/config.conf", 1); + return 0; +} + +static int init_verbs_affinity_manual_malformed_config(struct fi_info *hints) +{ + int ret; + + ret = create_invalid_test_config_file(); + if (ret) return ret; + + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "manual", 1); + setenv("FI_VERBS_NIC_AFFINITY_CONFIG", TEST_CONFIG_FILE, 1); + return 0; +} + +static int init_verbs_affinity_auto(struct fi_info *hints) +{ + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "auto", 1); + return 0; +} + +static int init_verbs_affinity_auto_no_device(struct fi_info *hints) +{ + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "auto", 1); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + return 0; +} + +static int init_verbs_affinity_auto_invalid_device(struct fi_info *hints) +{ + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "auto", 1); + setenv("FI_VERBS_AFFINITY_DEVICE", "invalid:pci:format:bad", 1); + return 0; +} + +static int init_verbs_affinity_invalid(struct fi_info *hints) +{ + setenv("FI_VERBS_NIC_AFFINITY_POLICY", "invalid_garbage_policy", 1); + return 0; +} + +/* + * Verbs GPU/NIC affinity check functions + */ +static int check_count_and_grouping(struct fi_info *original_info, struct fi_info *policy_info) +{ + struct fi_info *original_cur; + struct fi_info *affinity_cur; + const char *nic_to_find; + size_t original_count; + size_t policy_count; + + original_cur = original_info; + while (original_cur) { + nic_to_find = get_nic_name(original_cur); + if (!nic_to_find) { + original_cur = original_cur->next; + continue; + } + + original_count = 0; + while (original_cur && get_nic_name(original_cur) && + strcmp(get_nic_name(original_cur), nic_to_find) == 0) { + original_count++; + original_cur = original_cur->next; + } + + for (affinity_cur = policy_info; affinity_cur; affinity_cur = affinity_cur->next) { + if (get_nic_name(affinity_cur) && + strcmp(get_nic_name(affinity_cur), nic_to_find) == 0) + break; + } + + policy_count = 0; + while (affinity_cur && get_nic_name(affinity_cur) && + 
strcmp(get_nic_name(affinity_cur), nic_to_find) == 0) { + policy_count++; + affinity_cur = affinity_cur->next; + } + + if (original_count != policy_count) { + sprintf(err_buf, "NIC %s: original has %zu entries, policy has %zu consecutive entries", + nic_to_find, original_count, policy_count); + return EXIT_FAILURE; + } + } + + return 0; +} +/* Walks both lists in step and fails (EXIT_FAILURE, reason in err_buf) on the first differing NIC name or on a length mismatch; positions where either entry has no NIC name are not compared. */ +static int compare_lists_same_order(struct fi_info *list1, struct fi_info *list2) +{ + struct fi_info *cur1; + struct fi_info *cur2; + const char *name1; + const char *name2; + + cur1 = list1; + cur2 = list2; + while (cur1 && cur2) { + name1 = get_nic_name(cur1); + name2 = get_nic_name(cur2); + + if (name1 && name2 && strcmp(name1, name2) != 0) { + sprintf(err_buf, "Order mismatch: %s != %s", name1, name2); + return EXIT_FAILURE; + } + + cur1 = cur1->next; + cur2 = cur2->next; + } + + if (cur1 || cur2) { + sprintf(err_buf, "Different number of entries"); + return EXIT_FAILURE; + } + + return 0; +} +/* Queries fi_getinfo twice with the policy environment active and requires the same NIC order both times, then clears the policy and device variables, fetches a baseline list and checks per-NIC counts and grouping against it. Every exit goes through the cleanup label so the lists are freed and the test environment is reset. */ +static int check_verbs_no_interference(struct fi_info *hints) +{ + struct fi_info *original_info = NULL; + struct fi_info *policy_info1 = NULL; + struct fi_info *policy_info2 = NULL; + int ret; + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, OFI_CORE_PROV_ONLY, hints, &policy_info1); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_getinfo with affinity policy failed", ret); + goto cleanup; + } + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, OFI_CORE_PROV_ONLY, hints, &policy_info2); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_getinfo with affinity policy (second call) failed", ret); + /* cleanup releases policy_info1 and resets the test environment */ + goto cleanup; + } + + ret = compare_lists_same_order(policy_info1, policy_info2); + if (ret) + goto cleanup; + + unsetenv("FI_VERBS_NIC_AFFINITY_POLICY"); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, OFI_CORE_PROV_ONLY, hints, &original_info); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_getinfo with policy=none failed", ret); + goto cleanup; + } + + ret = 
check_count_and_grouping(original_info, policy_info1); + +cleanup: + fi_freeinfo(original_info); + fi_freeinfo(policy_info1); + fi_freeinfo(policy_info2); + + cleanup_verbs_affinity_test(); + + return ret; +} +/* Fetches the list with the policy environment active, clears the policy and device variables, fetches a baseline and requires both lists to be in the same NIC order. Calls cleanup_verbs_affinity_test() on every exit path. */ +static int check_verbs_identical_list(struct fi_info *hints) +{ + struct fi_info *original_info = NULL; + struct fi_info *policy_info = NULL; + int ret; + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, OFI_CORE_PROV_ONLY, hints, &policy_info); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_getinfo with affinity policy failed", ret); + cleanup_verbs_affinity_test(); + return ret; + } + + unsetenv("FI_VERBS_NIC_AFFINITY_POLICY"); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, OFI_CORE_PROV_ONLY, hints, &original_info); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_getinfo with policy=none failed", ret); + fi_freeinfo(policy_info); + cleanup_verbs_affinity_test(); + return ret; + } + + ret = compare_lists_same_order(original_info, policy_info); + + fi_freeinfo(original_info); + fi_freeinfo(policy_info); + + cleanup_verbs_affinity_test(); + + return ret; +} + +/* + * nic_affinity test: duplicates the global hints, runs the optional init callback, then either the test callback or a plain fi_getinfo() call; returns 0 on success or the first non-zero result. + */ +static int nic_affinity_unit_test(ft_nic_affinity_init init, + ft_nic_affinity_test test) +{ + struct fi_info *info = NULL, *test_hints = NULL; + int ret; + + test_hints = fi_dupinfo(hints); + if (!test_hints) + return -FI_ENOMEM; + + if (init) { + ret = init(test_hints); + if (ret) + goto out; + } + + if (test) { + /* The test callback records its own diagnostic in err_buf; + * do not replace it with a generic fi_getinfo message. */ + ret = test(test_hints); + } else { + ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, + test_hints, &info); + if (ret) + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); + } + +out: + fi_freeinfo(test_hints); + fi_freeinfo(info); + return ret; +} + +#define nic_affinity_test(name, desc, init, test) \ +char *nic_affinity_ ## name ## _desc = desc; \ +static int nic_affinity_ ## name(void) \ +{ \ + int ret, testret = FAIL; \ + ret = nic_affinity_unit_test(init, test); \ + if (ret) \ + goto 
fail; \ + testret = PASS; \ +fail: \ + return TEST_RET_VAL(ret, testret); \ +} + +/* + * Tests: + */ +/* Verbs GPU/NIC affinity tests: each entry pairs an init callback that sets up the environment with the check callback that validates the fi_getinfo output */ +nic_affinity_test(verbs_manual, "Test verbs manual", + init_verbs_affinity_manual, + check_verbs_no_interference) +nic_affinity_test(verbs_manual_no_device, "Test verbs manual without device", + init_verbs_affinity_manual_no_device, + check_verbs_identical_list) +nic_affinity_test(verbs_manual_invalid_device, "Test verbs manual with invalid device", + init_verbs_affinity_manual_invalid_device, + check_verbs_identical_list) +nic_affinity_test(verbs_manual_missing_config, "Test verbs manual with missing config file", + init_verbs_affinity_manual_missing_config, + check_verbs_identical_list) +nic_affinity_test(verbs_manual_malformed_config, "Test verbs manual with malformed config", + init_verbs_affinity_manual_malformed_config, + check_verbs_identical_list) +nic_affinity_test(verbs_auto, "Test verbs auto", + init_verbs_affinity_auto, + check_verbs_no_interference) +nic_affinity_test(verbs_auto_no_device, "Test verbs auto without device", + init_verbs_affinity_auto_no_device, + check_verbs_identical_list) +nic_affinity_test(verbs_auto_invalid_device, "Test verbs auto with invalid device", + init_verbs_affinity_auto_invalid_device, + check_verbs_identical_list) +nic_affinity_test(verbs_invalid_policy, "Test verbs invalid fallback to none", + init_verbs_affinity_invalid, + check_verbs_identical_list) +/* Prints the usage text through the shared unit-test helper. */ +static void usage(char *name) +{ + ft_unit_usage(name, "Unit tests for GPU-NIC affinity"); +} +/* Parses the common info options into the global hints, forces the provider hint to verbs, then runs every test below with FI_VERBS_AFFINITY_DEVICE preset to TEST_PCI_ADDR and prints a summary. */ +int main(int argc, char **argv) +{ + int failed, cleanup_ret; + int op; + + struct test_entry verbs_nic_affinity_tests[] = { + TEST_ENTRY_NIC_AFFINITY(verbs_manual), + TEST_ENTRY_NIC_AFFINITY(verbs_manual_no_device), + TEST_ENTRY_NIC_AFFINITY(verbs_manual_invalid_device), + TEST_ENTRY_NIC_AFFINITY(verbs_manual_missing_config), + TEST_ENTRY_NIC_AFFINITY(verbs_manual_malformed_config), + TEST_ENTRY_NIC_AFFINITY(verbs_auto), 
TEST_ENTRY_NIC_AFFINITY(verbs_auto_no_device), + TEST_ENTRY_NIC_AFFINITY(verbs_auto_invalid_device), + TEST_ENTRY_NIC_AFFINITY(verbs_invalid_policy), + { NULL, "" } + }; + + opts = INIT_OPTS; + + hints = fi_allocinfo(); + if (!hints) + return EXIT_FAILURE; + + while ((op = getopt(argc, argv, INFO_OPTS "h")) != -1) { + switch (op) { + default: + ft_parseinfo(op, optarg, hints, &opts); + break; + case '?': + case 'h': + usage(argv[0]); + return EXIT_FAILURE; + } + } + + if (!hints->fabric_attr->prov_name || + strcmp(hints->fabric_attr->prov_name, "verbs") != 0) { + printf("GPU-NIC affinity is a verbs-specific feature; " + "overriding requested provider with 'verbs'.\n"); + free(hints->fabric_attr->prov_name); + hints->fabric_attr->prov_name = strdup("verbs"); + if (!hints->fabric_attr->prov_name) { + fi_freeinfo(hints); + return EXIT_FAILURE; + } + } + + hints->mode = ~0; + + setenv("FI_VERBS_AFFINITY_DEVICE", TEST_PCI_ADDR, 1); + failed = run_tests(verbs_nic_affinity_tests, err_buf); + unsetenv("FI_VERBS_AFFINITY_DEVICE"); + + if (failed > 0) { + printf("\nSummary: %d tests failed\n", failed); + } else { + printf("\nSummary: all tests passed\n"); + } + + cleanup_ret = ft_free_res(); + return cleanup_ret ? ft_exit_code(cleanup_ret) : + (failed > 0) ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/man/fi_verbs.7.md b/man/fi_verbs.7.md index a85323eccac..2b6396a80d0 100644 --- a/man/fi_verbs.7.md +++ b/man/fi_verbs.7.md @@ -209,6 +209,15 @@ The verbs provider checks for the following environment variables. testing the functionality of the dmabuf_peer_mem hooking provider and the corresponding kernel driver. (default: yes) +*FI_VERBS_NIC_AFFINITY_POLICY* +: NIC affinity policy for ordering NICs in fi_getinfo results. Valid values are 'none', 'manual', and 'auto'; 'auto' requires a build with hwloc support, and unrecognized values fall back to 'none'. Reordering is applied only when FI_VERBS_AFFINITY_DEVICE is also set. (default: none) + +*FI_VERBS_AFFINITY_DEVICE* +: PCI address of the device for NIC affinity, in the form xxxx:xx:xx.x (hexadecimal domain:bus:device.function). + +*FI_VERBS_NIC_AFFINITY_CONFIG* +: Path to NIC affinity configuration file for 'manual' policy. Each line holds a device PCI address and a NIC name separated by whitespace; lines starting with '#' are ignored. 
+ ### Variables specific to MSG endpoints *FI_VERBS_IFACE* diff --git a/prov/verbs/configure.m4 b/prov/verbs/configure.m4 index 6217b804c07..20eec21cc41 100644 --- a/prov/verbs/configure.m4 +++ b/prov/verbs/configure.m4 @@ -42,6 +42,18 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ [verbs_rdmacm_ex_happy=1], [verbs_rdmacm_ex_happy=0]) + hwloc_prefix="" + AS_IF([test "x$HWLOC_DIR" != "x"], [hwloc_prefix="$HWLOC_DIR"]) + FI_CHECK_PACKAGE([verbs_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$hwloc_prefix], + [], + [verbs_hwloc_happy=1], + [verbs_hwloc_happy=0]) + ]) AC_CHECK_HEADERS([asm/types.h]) @@ -95,6 +107,15 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ AC_DEFINE_UNQUOTED([VERBS_HAVE_DMABUF_MR],[$VERBS_HAVE_DMABUF_MR], [Whether infiniband/verbs.h has ibv_reg_dmabuf_mr() support or not]) + #See if we have hwloc support (optional, for NIC-GPU affinity) + AS_IF([test $verbs_hwloc_happy -eq 1],[ + AC_DEFINE([HAVE_HWLOC], [1], [Define to 1 if hwloc is available]) + ],[ + verbs_hwloc_CPPFLAGS="" + verbs_hwloc_LDFLAGS="" + verbs_hwloc_LIBS="" + ]) + CPPFLAGS=$fi_verbs_configure_save_CPPFLAGS # Technically, verbs_ibverbs_CPPFLAGS and @@ -102,9 +123,9 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ # unlikely that they ever will be. So only list # verbs_ibverbs_CPPFLAGS here. Same with verbs_*_LDFLAGS, # below. 
- verbs_CPPFLAGS=$verbs_ibverbs_CPPFLAGS - verbs_LDFLAGS=$verbs_ibverbs_LDFLAGS - verbs_LIBS="$verbs_rdmacm_LIBS $verbs_ibverbs_LIBS" + verbs_CPPFLAGS="$verbs_ibverbs_CPPFLAGS $verbs_hwloc_CPPFLAGS" + verbs_LDFLAGS="$verbs_ibverbs_LDFLAGS $verbs_hwloc_LDFLAGS" + verbs_LIBS="$verbs_rdmacm_LIBS $verbs_ibverbs_LIBS $verbs_hwloc_LIBS" AC_SUBST(verbs_CPPFLAGS) AC_SUBST(verbs_LDFLAGS) AC_SUBST(verbs_LIBS) diff --git a/prov/verbs/src/verbs_info.c b/prov/verbs/src/verbs_info.c index dacfe591bd7..d0244fca1cf 100644 --- a/prov/verbs/src/verbs_info.c +++ b/prov/verbs/src/verbs_info.c @@ -41,6 +41,9 @@ #include "verbs_ofi.h" #include "verbs_osd.h" +#ifdef HAVE_HWLOC +#include <hwloc.h> +#endif #define VERBS_IB_PREFIX "IB-0x" #define VERBS_IWARP_FABRIC "Ethernet-iWARP" @@ -72,6 +75,20 @@ be64toh((ib_ud_addr)->gid.global.subnet_prefix), \ (ib_ud_addr)->lid, (ib_ud_addr)->service) +/* PCIe proximity levels for NIC-GPU affinity */ +enum vrb_pcie_proximity { + VRB_PROXIMITY_BRIDGE = 0, /* Same PCIe switch/bridge (best) */ + VRB_PROXIMITY_PACKAGE = 1, /* Same CPU socket (good) */ + VRB_PROXIMITY_MACHINE = 2, /* Different sockets (worst) */ + VRB_PROXIMITY_UNKNOWN = 3 /* Cannot determine */ +}; + +/* Per-NIC info with proximity data for auto policy */ +struct vrb_nic_proximity_cache { + struct fi_info *info; + enum vrb_pcie_proximity proximity; +}; + const struct fi_fabric_attr verbs_fabric_attr = { .prov_version = OFI_VERSION_DEF_PROV, }; @@ -1967,6 +1984,333 @@ void vrb_devs_free(struct dlist_entry *verbs_devs) } } +#define VRB_PCI_ADDR_COMPONENTS 4 +#define VRB_PCI_DOMAIN_MAX 0xFFFF /* 16 bits */ +#define VRB_PCI_BUS_MAX 0xFF /* 8 bits */ +#define VRB_PCI_DEVICE_MAX 0x1F /* 5 bits */ +#define VRB_PCI_FUNCTION_MAX 0x7 /* 3 bits */ +/* Parses a PCI address written as domain:bus:device.function (hexadecimal fields) into pci_attr. Returns FI_SUCCESS, or -FI_EINVAL when fewer than four fields parse or a field exceeds its range. */ +static int vrb_parse_pci_address(const char *pci_str, struct fi_pci_attr *pci_attr) +{ + unsigned int domain; + unsigned int bus; + unsigned int device; + unsigned int function; + int ret; + + ret = sscanf(pci_str, "%x:%x:%x.%x", &domain, &bus, &device, 
&function); + if (ret != VRB_PCI_ADDR_COMPONENTS || + domain > VRB_PCI_DOMAIN_MAX || bus > VRB_PCI_BUS_MAX || + device > VRB_PCI_DEVICE_MAX || function > VRB_PCI_FUNCTION_MAX) { + VRB_WARN(FI_LOG_CORE, + "Invalid PCI address format: '%s' " + "(expected xxxx:xx:xx.x with valid ranges)\n", + pci_str); + return -FI_EINVAL; + } + + pci_attr->domain_id = (uint16_t)domain; + pci_attr->bus_id = (uint8_t)bus; + pci_attr->device_id = (uint8_t)device; + pci_attr->function_id = (uint8_t)function; + + return FI_SUCCESS; +} +/* True when all four PCI address components of a and b match. */ +static bool vrb_pci_addr_equal(const struct fi_pci_attr *a, const struct fi_pci_attr *b) +{ + return a->domain_id == b->domain_id && + a->bus_id == b->bus_id && + a->device_id == b->device_id && + a->function_id == b->function_id; +} + +#define VRB_MAX_CONFIG_LINE_LEN 256 +/* Scans the file named by vrb_gl_data.nic_affinity_config for the first line whose PCI address equals device_pci and copies that line's NIC name into nic_name. Malformed lines are logged and skipped. Returns FI_SUCCESS, -FI_ENODATA (no config path or no matching line) or -FI_ENOENT (file cannot be opened). */ +static int vrb_parse_manual_affinity_config(const struct fi_pci_attr *device_pci, + char *nic_name, size_t nic_name_len) +{ + FILE *fp; + char line[VRB_MAX_CONFIG_LINE_LEN]; + char pci_str[VRB_MAX_CONFIG_LINE_LEN]; + char nic[VRB_MAX_CONFIG_LINE_LEN]; + struct fi_pci_attr pci_attr; + int ret; + + if (NULL == vrb_gl_data.nic_affinity_config) { + VRB_WARN(FI_LOG_CORE, "FI_VERBS_NIC_AFFINITY_CONFIG not set\n"); + return -FI_ENODATA; + } + + fp = fopen(vrb_gl_data.nic_affinity_config, "r"); + if (!fp) { + VRB_WARN(FI_LOG_CORE, "Failed to open config file: %s\n", + vrb_gl_data.nic_affinity_config); + return -FI_ENOENT; + } + + while (fgets(line, sizeof(line), fp)) { + /* Skip comments and empty lines */ + if (line[0] == '#' || line[0] == '\n') continue; + + ret = sscanf(line, "%s %s", pci_str, nic); /* both tokens come from line, so neither can exceed its equally sized buffer */ + if (ret != 2) { + VRB_WARN(FI_LOG_CORE, "Malformed config line (expected " + "'<pci_address> <nic_name>'): %s", line); + continue; + } + + ret = vrb_parse_pci_address(pci_str, &pci_attr); + if (ret) { + VRB_WARN(FI_LOG_CORE, "Invalid PCI address format '%s' " + "(expected xxxx:xx:xx.x), skipping line: %s", + pci_str, line); + continue; + } + + if (vrb_pci_addr_equal(&pci_attr, device_pci)) { + fclose(fp); 
snprintf(nic_name, nic_name_len, "%s", nic); + VRB_INFO(FI_LOG_CORE, "Found mapping: %04x:%02x:%02x.%x -> %s\n", + device_pci->domain_id, device_pci->bus_id, + device_pci->device_id, device_pci->function_id, + nic_name); + return FI_SUCCESS; + } + } + + fclose(fp); + VRB_WARN(FI_LOG_CORE, "No mapping found for device %04x:%02x:%02x.%x in config\n", + device_pci->domain_id, device_pci->bus_id, + device_pci->device_id, device_pci->function_id); + return -FI_ENODATA; +} +/* hwloc helpers for the auto policy; compiled only when HAVE_HWLOC is defined */ +#ifdef HAVE_HWLOC +static hwloc_obj_t vrb_find_pci_device(hwloc_topology_t topology, + const struct fi_pci_attr *pci) /* linear scan of the PCI device objects for one whose domain, bus, dev and func equal pci; NULL when none matches */ +{ + hwloc_obj_t obj = NULL; + + obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, obj); + while (obj != NULL) { + if (obj->attr->pcidev.domain == pci->domain_id && + obj->attr->pcidev.bus == pci->bus_id && + obj->attr->pcidev.dev == pci->device_id && + obj->attr->pcidev.func == pci->function_id) + return obj; + obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, obj); + } + + return NULL; +} +/* Classifies two devices by the type of their common ancestor: bridge, package, anything else is machine, no ancestor is unknown. NOTE(review): every bridge-type ancestor counts as BRIDGE, which presumably includes host bridges; also confirm hwloc_get_common_ancestor_obj() is intended for I/O objects. */ +static enum vrb_pcie_proximity +vrb_calc_pcie_proximity(hwloc_topology_t topology, + hwloc_obj_t dev1, hwloc_obj_t dev2) +{ + hwloc_obj_t ancestor; + + ancestor = hwloc_get_common_ancestor_obj(topology, dev1, dev2); + if (!ancestor) + return VRB_PROXIMITY_UNKNOWN; + + if (ancestor->type == HWLOC_OBJ_BRIDGE) + return VRB_PROXIMITY_BRIDGE; + + if (ancestor->type == HWLOC_OBJ_PACKAGE) + return VRB_PROXIMITY_PACKAGE; + + return VRB_PROXIMITY_MACHINE; +} +/* Comparator over struct vrb_nic_proximity_cache: orders by ascending proximity value and returns 0 for equal proximity. */ +static int vrb_proximity_compare(const void *a, const void *b) +{ + const struct vrb_nic_proximity_cache *nic_a; + const struct vrb_nic_proximity_cache *nic_b; + + nic_a = (const struct vrb_nic_proximity_cache *)a; + nic_b = (const struct vrb_nic_proximity_cache *)b; + + if (nic_a->proximity < nic_b->proximity) + return -1; + if (nic_a->proximity > nic_b->proximity) + return 1; + return 0; +} +#endif /* HAVE_HWLOC */ +/* Manual policy handler: looks up the NIC mapped to device_pci in the config file and moves every fi_info entry of that NIC to the front of *info, keeping relative order. Always returns FI_SUCCESS; the list is left unchanged when no mapping or no matching entry exists. */ +int vrb_nic_affinity_manual(struct fi_info **info, const struct fi_pci_attr *device_pci) +{ + char 
target_nic[VRB_MAX_CONFIG_LINE_LEN]; + struct fi_info *cur; + struct fi_info *prev; + struct fi_info *next; + struct fi_info *target_head; + struct fi_info *target_tail; + int ret; + + target_head = NULL; + target_tail = NULL; + + ret = vrb_parse_manual_affinity_config(device_pci, target_nic, + sizeof(target_nic)); + if (ret) + return FI_SUCCESS; + + VRB_DBG(FI_LOG_CORE, "Manual policy: device %04x:%02x:%02x.%x -> NIC %s\n", + device_pci->domain_id, device_pci->bus_id, + device_pci->device_id, device_pci->function_id, target_nic); + + prev = NULL; + cur = *info; + while (cur) { + next = cur->next; + if (cur->nic && cur->nic->device_attr && cur->nic->device_attr->name && + !strncmp(cur->nic->device_attr->name, target_nic, VRB_MAX_CONFIG_LINE_LEN)) { + if (prev) + prev->next = next; + else + *info = next; + + cur->next = NULL; + if (!target_head) { + target_head = cur; + target_tail = cur; + } else { + target_tail->next = cur; + target_tail = cur; + } + + cur = next; + } else { + prev = cur; + cur = next; + } + } + + if (target_head) { + target_tail->next = *info; + *info = target_head; + VRB_DBG(FI_LOG_CORE, "Moved fi_info entries for NIC %s to front of list\n", target_nic); + } else { + VRB_DBG(FI_LOG_CORE, "Target NIC %s not found in provider list, list unchanged\n", target_nic); + } + + return FI_SUCCESS; +} +/* Auto policy handler: loads the hwloc topology, rates every fi_info entry by the PCIe proximity of its NIC to device_pci and relinks *info with the closest NICs first. Entries of equal proximity keep their original relative order. Always returns FI_SUCCESS; on any hwloc or allocation failure the list is left unchanged. */ +#ifdef HAVE_HWLOC +int vrb_nic_affinity_auto(struct fi_info **info, const struct fi_pci_attr *device_pci) +{ + hwloc_topology_t topology; + hwloc_obj_t gpu_obj; + hwloc_obj_t nic_pci_obj; + struct fi_info *cur; + struct vrb_nic_proximity_cache *nic_array; + int entries_count; + int i; + int ret; + + entries_count = 0; + + topology = NULL; + gpu_obj = NULL; + nic_array = NULL; + + for (cur = *info; cur; cur = cur->next) + entries_count++; + if (entries_count == 0) + return FI_SUCCESS; + + ret = hwloc_topology_init(&topology); + if (ret) { + VRB_WARN(FI_LOG_CORE, "hwloc_topology_init failed, " + "falling back to none policy\n"); + return FI_SUCCESS; + } + + 
ret = hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); + if (ret) { + VRB_WARN(FI_LOG_CORE, "hwloc_topology_set_io_types_filter failed, " + "falling back to none policy\n"); + hwloc_topology_destroy(topology); + return FI_SUCCESS; + } + + ret = hwloc_topology_load(topology); + if (ret) { + VRB_WARN(FI_LOG_CORE, "hwloc_topology_load failed, " + "falling back to none policy\n"); + hwloc_topology_destroy(topology); + return FI_SUCCESS; + } + + gpu_obj = vrb_find_pci_device(topology, device_pci); + if (!gpu_obj) { + VRB_DBG(FI_LOG_CORE, "Device %04x:%02x:%02x.%x not found in topology, " + "list unchanged\n", device_pci->domain_id, device_pci->bus_id, + device_pci->device_id, device_pci->function_id); + hwloc_topology_destroy(topology); + return FI_SUCCESS; + } + + /* Allocate array for NIC info with proximity data */ + nic_array = calloc(entries_count, sizeof(*nic_array)); + if (!nic_array) { + VRB_WARN(FI_LOG_CORE, "Failed to allocate NIC array, " + "list unchanged\n"); + hwloc_topology_destroy(topology); + return FI_SUCCESS; + } + + /* Calculate proximity for each NIC */ + i = 0; + for (cur = *info; cur; cur = cur->next, i++) { + nic_array[i].info = cur; + nic_array[i].proximity = VRB_PROXIMITY_UNKNOWN; + + if (!cur->nic || !cur->nic->bus_attr || + cur->nic->bus_attr->bus_type != FI_BUS_PCI) + continue; + + nic_pci_obj = vrb_find_pci_device(topology, &cur->nic->bus_attr->attr.pci); + if (!nic_pci_obj) + continue; + + nic_array[i].proximity = vrb_calc_pcie_proximity(topology, gpu_obj, nic_pci_obj); + + VRB_DBG(FI_LOG_CORE, "NIC %s proximity to GPU: %s\n", + cur->nic->device_attr ? cur->nic->device_attr->name : "unknown", + nic_array[i].proximity == VRB_PROXIMITY_BRIDGE ? "bridge" : + nic_array[i].proximity == VRB_PROXIMITY_PACKAGE ? "package" : + nic_array[i].proximity == VRB_PROXIMITY_MACHINE ? 
"machine" : "unknown"); + } + + /* Sort NICs by proximity (best first). qsort() leaves the order of equal + * keys unspecified, so use a stable insertion sort: entries with the same + * proximity keep their original relative order. */ + for (i = 1; i < entries_count; i++) { + struct vrb_nic_proximity_cache key = nic_array[i]; + int j; + + for (j = i; j > 0 && vrb_proximity_compare(&nic_array[j - 1], &key) > 0; j--) + nic_array[j] = nic_array[j - 1]; + nic_array[j] = key; + } + + /* Rebuild fi_info list in sorted order */ + for (i = 0; i < entries_count; i++) + nic_array[i].info->next = (i + 1 < entries_count) ? nic_array[i + 1].info : NULL; + *info = nic_array[0].info; + + free(nic_array); + hwloc_topology_destroy(topology); + + return FI_SUCCESS; +} +#endif /* HAVE_HWLOC */ + int vrb_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -1974,6 +2318,7 @@ int vrb_getinfo(uint32_t version, const char *node, const char *service, static bool init_done = false; struct dlist_entry tmp_devs; struct fi_info *tmp_info = NULL; + struct fi_pci_attr device_pci; int ret; vrb_prof_func_start(__func__); @@ -2029,6 +2374,24 @@ int vrb_getinfo(uint32_t version, const char *node, const char *service, if (hints) vrb_filter_info_by_addr_format(info, hints->addr_format); + + /* Apply NIC affinity reordering policy */ + if (*info && vrb_gl_data.nic_affinity_handler && vrb_gl_data.affinity_device) { + ret = vrb_parse_pci_address(vrb_gl_data.affinity_device, &device_pci); + if (ret) { + VRB_WARN(FI_LOG_CORE, + "Failed to parse FI_VERBS_AFFINITY_DEVICE, list unchanged\n"); + ret = 0; + } else { + ret = vrb_gl_data.nic_affinity_handler(info, &device_pci); + if (ret) { + VRB_WARN(FI_LOG_CORE, + "NIC affinity handler failed: %s (%d)\n", + fi_strerror(-ret), ret); + ret = 0; + } + } + } out: vrb_prof_func_end(__func__); if (!ret || ret == -FI_ENOMEM || ret == -FI_ENODEV) diff --git a/prov/verbs/src/verbs_init.c b/prov/verbs/src/verbs_init.c index 3d1d6f4ec97..83aaebe18d4 100644 --- a/prov/verbs/src/verbs_init.c +++ b/prov/verbs/src/verbs_init.c @@ -66,6 +66,10 @@ struct vrb_gl_data vrb_gl_data = { .prefer_xrc = 0, .xrcd_filename 
= "/tmp/verbs_xrcd", }, + + .nic_affinity_policy = "none", + .affinity_device = NULL, + .nic_affinity_config = NULL, }; struct vrb_dev_preset { @@ -766,11 +770,67 @@ static int vrb_read_params(void) return -FI_EINVAL; } + /* NIC affinity parameters */ + if (vrb_get_param_str("nic_affinity_policy", + "NIC affinity policy for ordering NICs in fi_getinfo results. " + "(Default: 'none').", + &vrb_gl_data.nic_affinity_policy)) { + VRB_WARN(FI_LOG_CORE, "Invalid value of nic_affinity_policy\n"); + return -FI_EINVAL; + } + if (vrb_get_param_str("affinity_device", + "PCI address of device for NIC affinity. ", + &vrb_gl_data.affinity_device)) { + VRB_WARN(FI_LOG_CORE, "Invalid value of affinity_device\n"); + return -FI_EINVAL; + } + if (vrb_get_param_str("nic_affinity_config", + "Path to NIC affinity configuration file for 'manual' policy. ", + &vrb_gl_data.nic_affinity_config)) { + VRB_WARN(FI_LOG_CORE, "Invalid value of nic_affinity_config\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} +/* Selects vrb_gl_data.nic_affinity_handler from the nic_affinity_policy string: manual and, when built with hwloc, auto install a handler; none, a NULL policy, auto without hwloc and unknown values leave it NULL. Always returns FI_SUCCESS. */ +static int vrb_nic_affinity_init(void) +{ + vrb_gl_data.nic_affinity_handler = NULL; + + if (!vrb_gl_data.nic_affinity_policy) { + VRB_INFO(FI_LOG_CORE, "NIC affinity policy not set, " + "using default 'none' (no reordering)\n"); + return FI_SUCCESS; + } + + if (!strcmp(vrb_gl_data.nic_affinity_policy, "none")) { + VRB_INFO(FI_LOG_CORE, "NIC affinity policy 'none': " + "no reordering\n"); + } else if (!strcmp(vrb_gl_data.nic_affinity_policy, "manual")) { + VRB_INFO(FI_LOG_CORE, "NIC affinity policy 'manual' enabled\n"); + vrb_gl_data.nic_affinity_handler = vrb_nic_affinity_manual; + } else if (!strcmp(vrb_gl_data.nic_affinity_policy, "auto")) { +#ifdef HAVE_HWLOC + VRB_INFO(FI_LOG_CORE, "NIC affinity policy 'auto' enabled\n"); + vrb_gl_data.nic_affinity_handler = vrb_nic_affinity_auto; +#else + VRB_WARN(FI_LOG_CORE, "NIC affinity policy 'auto' requested but hwloc " + "support not available, falling back to 'none'\n"); +#endif + } else { + VRB_WARN(FI_LOG_CORE, "Invalid NIC 
affinity policy '%s', " + "falling back to 'none'. Valid values: none, manual, auto\n", + vrb_gl_data.nic_affinity_policy); + } + return FI_SUCCESS; } int vrb_init() { + int ret; + if (vrb_os_ini()) { FI_WARN(&vrb_prov, FI_LOG_FABRIC, "failed in OS specific device initialization\n"); @@ -787,6 +847,13 @@ int vrb_init() return -FI_ENODATA; } + /* Initialize NIC affinity. */ + ret = vrb_nic_affinity_init(); + if (ret) { + VRB_INFO(FI_LOG_FABRIC, "failed to initialize NIC affinity\n"); + return -FI_ENODATA; + } + return FI_SUCCESS; } diff --git a/prov/verbs/src/verbs_ofi.h b/prov/verbs/src/verbs_ofi.h index befd53ce4e0..69c193bf72a 100644 --- a/prov/verbs/src/verbs_ofi.h +++ b/prov/verbs/src/verbs_ofi.h @@ -176,6 +176,9 @@ extern ofi_mutex_t vrb_info_mutex; extern ofi_mutex_t vrb_init_mutex; extern struct dlist_entry vrb_devs; +typedef int (*vrb_nic_affinity_handler_t)(struct fi_info **info, /* reorders the list at *info in place for the given device */ + const struct fi_pci_attr *device_pci); + extern struct vrb_gl_data { int def_tx_size; int def_rx_size; @@ -210,6 +213,11 @@ extern struct vrb_gl_data { bool peer_mem_support; bool dmabuf_support; + + vrb_nic_affinity_handler_t nic_affinity_handler; /* chosen by vrb_nic_affinity_init(); NULL means no reordering */ + char *nic_affinity_policy; /* nic_affinity_policy parameter; initialized to none */ + char *affinity_device; /* affinity_device parameter: PCI address string, NULL when unset */ + char *nic_affinity_config; /* nic_affinity_config parameter: config file path for the manual policy, NULL when unset */ } vrb_gl_data; struct verbs_addr { @@ -1134,5 +1142,10 @@ void vrb_prof_init(); int vrb_prof_create(vrb_profile_t **prof); +/* NIC affinity policy handlers */ +int vrb_nic_affinity_manual(struct fi_info **info, const struct fi_pci_attr *device_pci); +#ifdef HAVE_HWLOC +int vrb_nic_affinity_auto(struct fi_info **info, const struct fi_pci_attr *device_pci); +#endif #endif /* VERBS_OFI_H */