diff --git a/fabtests/prov/efa/Makefile.include b/fabtests/prov/efa/Makefile.include index 94cdc181fc8..eb7aed54e0c 100644 --- a/fabtests/prov/efa/Makefile.include +++ b/fabtests/prov/efa/Makefile.include @@ -39,7 +39,8 @@ bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \ prov/efa/src/fi_efa_implicit_av_test \ prov/efa/src/fi_efa_multi_ep_stress \ prov/efa/src/fi_efa_mmap_test \ - prov/efa/src/fi_efa_mr_test + prov/efa/src/fi_efa_mr_test \ + prov/efa/src/fi_efa_runt_read_no_handshake if HAVE_VERBS_DEVEL if HAVE_EFA_DV @@ -101,6 +102,11 @@ prov_efa_src_fi_efa_mr_test_SOURCES = \ prov/efa/src/efa_mr_test.c prov_efa_src_fi_efa_mr_test_LDADD = libfabtests.la +prov_efa_src_fi_efa_runt_read_no_handshake_SOURCES = \ + prov/efa/src/efa_runt_read_no_handshake.c \ + $(benchmarks_srcs) +prov_efa_src_fi_efa_runt_read_no_handshake_LDADD = libfabtests.la + if HAVE_VERBS_DEVEL if HAVE_EFA_DV efa_exhaust_mr_reg_srcs = \ diff --git a/fabtests/prov/efa/src/efa_runt_read_no_handshake.c b/fabtests/prov/efa/src/efa_runt_read_no_handshake.c new file mode 100644 index 00000000000..d8d616498dd --- /dev/null +++ b/fabtests/prov/efa/src/efa_runt_read_no_handshake.c @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + +#include +#include +#include +#include +#include +#include +#include "benchmarks/benchmark_shared.h" + +/* + * Mirrors ft_init_fabric() but inserts fi_setopt(HOMOGENEOUS_PEERS) + * before fi_enable, since the option must be set before enabling the EP + * to avoid the requirement for a handshake before using a read based + * protocol. + */ +static int init_fabric_with_homogeneous_peers(void) +{ + int ret; + bool homogeneous = true; + + ret = ft_init(); + if (ret) + return ret; + + ret = ft_init_oob(); + if (ret) + return ret; + + if (oob_sock >= 0 && opts.dst_addr) { + ret = ft_sock_sync(oob_sock, 0); + if (ret) + return ret; + } + + ret = ft_getinfo(hints, &fi); + if (ret) + return ret; + + ret = ft_open_fabric_res(); + if (ret) + return ret; + + ret = ft_alloc_active_res(fi); + if (ret) + return ret; + + /* Must be set before fi_enable to skip handshake requirement before + * using read based protocol */ + ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, + FI_OPT_EFA_HOMOGENEOUS_PEERS, + &homogeneous, sizeof(homogeneous)); + if (ret) { + FT_PRINTERR("fi_setopt(HOMOGENEOUS_PEERS)", ret); + return ret; + } + + ret = ft_enable_ep_recv(); + if (ret) + return ret; + + if (oob_sock >= 0 && !opts.dst_addr) { + ret = ft_sock_sync(oob_sock, 0); + if (ret) + return ret; + } + + ret = ft_init_av(); + if (ret) + return ret; + + return 0; +} + +static int run(void) +{ + int ret; + char test_name[64] = ""; + + ret = init_fabric_with_homogeneous_peers(); + if (ret) + return ret; + + init_test(&opts, test_name, sizeof(test_name)); + ret = bandwidth(); + if (ret) + return ret; + + return ft_finalize(); +} + +int main(int argc, char **argv) +{ + int op, ret, cleanup_ret; + + opts = INIT_OPTS; + opts.options |= FT_OPT_BW; + opts.iterations = 1; + opts.warmup_iterations = 0; + opts.transfer_size = 262144; /* 256 KB */ + opts.window_size = 1; + + hints = fi_allocinfo(); + if (!hints) + return EXIT_FAILURE; + + while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS, + long_opts, &lopt_idx)) != -1) { + switch (op) { + default: + if (!ft_parse_long_opts(op, optarg)) + continue; + ft_parse_benchmark_opts(op, optarg); + ft_parseinfo(op, optarg, hints, &opts); + ft_parsecsopts(op, optarg, &opts); + break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; + case '?': + case 'h': + ft_csusage(argv[0], + "Verify runt read protocol transfers " + "data with both RTM packets and RDMA read"); + ft_benchmark_usage(); + ft_longopts_usage(); + return EXIT_FAILURE; + } + } + + if (optind < argc) + opts.dst_addr = argv[optind]; + + hints->ep_attr->type = FI_EP_RDM; + hints->caps = FI_MSG; + hints->mode |= FI_CONTEXT | FI_CONTEXT2; + hints->domain_attr->mr_mode = opts.mr_mode; + hints->addr_format = opts.address_format; + + ret = run(); + + cleanup_ret = ft_free_res(); + return ft_exit_code(ret ? ret : cleanup_ret); +} diff --git a/fabtests/pytest/efa/test_runt.py b/fabtests/pytest/efa/test_runt.py index cc6ebaec4a5..10d0a76c07b 100644 --- a/fabtests/pytest/efa/test_runt.py +++ b/fabtests/pytest/efa/test_runt.py @@ -14,23 +14,33 @@ pytest.param("neuron_to_neuron", None, marks=pytest.mark.neuron_memory)]) def test_runt_read_functional(cmdline_args, memory_type, copy_method): """ - Verify runt reading protocol is working as expected by sending 1 message of 256 KB. - 64 KB of the message will be transfered using EFA device's send capability - The remainer will be tranfered using EFA device's RDMA read capability + Verify runt read protocol works with FI_OPT_EFA_HOMOGENEOUS_PEERS set, + which skips the handshake. This sends 1 message of 256 KB using + fi_efa_runt_read_no_handshake (a unidirectional bandwidth test). + + 64 KB of the message is transferred using EFA device's send capability (runt). + The remainder (192 KB) is transferred using EFA device's RDMA read capability. + + The test uses FI_OPT_EFA_HOMOGENEOUS_PEERS to skip the handshake, verifying + that the runt read protocol can be selected without prior handshake exchange. """ if cmdline_args.server_id == cmdline_args.client_id: pytest.skip("no runting for intra-node communication") - cmdline_args.append_environ("FI_EFA_RUNT_SIZE=65536") + additional_env = "FI_EFA_RUNT_SIZE=65536 FI_EFA_ENABLE_SHM_TRANSFER=0" + + if memory_type == "host_to_host": + # For host memory, min_read_message_size defaults to 1MB. + # Lower it so the 256KB message triggers the read-based path. + additional_env += " FI_EFA_INTER_MIN_READ_MESSAGE_SIZE=65536" if copy_method == "gdrcopy": if not has_gdrcopy(cmdline_args.server_id) or not has_gdrcopy(cmdline_args.client_id): pytest.skip("No gdrcopy") - - cmdline_args.append_environ("FI_HMEM_CUDA_USE_GDRCOPY=1") + additional_env += " FI_HMEM_CUDA_USE_GDRCOPY=1" elif copy_method == "localread": assert memory_type == "cuda_to_cuda" - cmdline_args.append_environ("FI_HMEM_CUDA_USE_GDRCOPY=0") + additional_env += " FI_HMEM_CUDA_USE_GDRCOPY=0" # wrs stands for work requests server_read_wrs_before_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs") @@ -41,16 +51,17 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method): client_send_bytes_before_test = efa_retrieve_hw_counter_value(cmdline_args.client_id, "send_bytes") efa_run_client_server_test(cmdline_args, - "fi_rdm_tagged_bw", + "fi_efa_runt_read_no_handshake", iteration_type="1", completion_semantic="transmit_complete", memory_type=memory_type, message_size="262144", warmup_iteration_type="0", - fabric="efa") + fabric="efa", + additional_env=additional_env) server_read_wrs_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs") - server_read_bytes_after_test =efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes") + server_read_bytes_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes") client_send_bytes_after_test = efa_retrieve_hw_counter_value(cmdline_args.client_id, "send_bytes") print("server_read_bytes_before_test: {}".format(server_read_bytes_before_test)) @@ -61,32 +72,32 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method): # return here for those that does not support return - # fi_rdm_tagged_bw is a uni-diretional test. client is the sender and server is the receiver. - # In a RDMA read based message tranfer protocol, the receiver will submit work request for read + # fi_efa_runt_read_no_handshake is a unidirectional test. + # Client is the sender and server is the receiver. + # In the runt read protocol, the receiver issues RDMA read work requests. server_read_wrs = server_read_wrs_after_test - server_read_wrs_before_test server_read_bytes = server_read_bytes_after_test - server_read_bytes_before_test client_send_bytes = client_send_bytes_after_test - client_send_bytes_before_test - # Among the 256 KB of data, 64 KB data will be sent via 8 RUNTREAD RTM packets. + # Among the 256 KB of data, 64 KB data will be sent via RUNTREAD RTM packets. # The total number of send bytes will be larger than 64K because: # a. each packet has a header - # b. when runing on single node, server will use the same EFA device to send control packets + # b. when running on single node, server will use the same EFA device to send control packets assert client_send_bytes > 65536 if copy_method == "localread": - # when local read copy is used, server issue RDMA requests to copy received data + # when local read copy is used, server issues RDMA requests to copy received data # # so in this case, total read wr is at least 9, which is # 1 remote read of 192k - # 8 local read for the 64k data transfer by send + # 8 local read for the 64k data transferred by send # More local reads for fabtests control messages # # and total read_bytes will be >= 256K including the control messages - # assert server_read_wrs >= 9 assert server_read_bytes >= 262144 else: - # The other 192 KB is transfer by RDMA read + # The other 192 KB is transferred by RDMA read # for which the server (receiver) will issue 1 read request. assert server_read_wrs == 1 assert server_read_bytes == 196608 diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 799bf142744..0419d5ee7a0 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -323,7 +323,7 @@ Table: 2.1 a list of extra features/requests | ID | Name | Type | Introduced since | Described in | |---|---|---|---|---| | 0 | RDMA-Read based data transfer | extra feature | libfabric 1.10.0 | Section 4.1 | -| 1 | delivery complete | extra feature | libfabric 1.12.0 | Section 4.2 | +| 1 | delivery complete | extra feature | libfabric 1.12.0 | Section 4.2 _(baseline since 2.6)_ | | 2 | keep packet header length constant | extra request | libfabric 1.13.0 | Section 4.3 | | 3 | sender connection id in packet header | extra request | libfabric 1.14.0 | Section 4.4 | | 4 | runting read message protocol | extra feature | libfabric 1.16.0 | Section 4.5 | @@ -379,6 +379,29 @@ If receiver is in zero copy receive mode, it will have the extra request for sender to ignore the request, and send packets with different header length. It is receiver's responsibility to react accordingly. (section 4.3) +#### Baseline promotion of extra features and requests (libfabric 2.6+) + +Starting with libfabric 2.6, backwards compatibility is only guaranteed to libfabric 2.0. +Any extra feature or request that was introduced before libfabric 2.0 is considered to be +universally supported by all peers, since every supported peer (v2.0+) already implements it. + +When an extra feature is promoted to baseline (marked as _(baseline since X.Y)_ in table 2.1), +the following applies: + +- The flag is still set in `extra_info` during the handshake, because v2.0 peers check for + it and would reject operations if the flag is missing. +- However, the local endpoint no longer checks the peer's `extra_info` for that flag and + assumes the peer always supports the feature. A handshake is no longer required before + using the feature. + +As of libfabric 2.6, the following features/requests has been baselined: + +- **Delivery complete (ID 1)**: Baseline since 2.6. The handshake requirement for DC packets + has been removed from the send, write, and atomic paths. See section 4.2. + +The remaining extra features and requests are candidates for the same transition in future +releases. + This concludes the workflow of the handshake subprotocol. The binary format of a HANDSHAKE packet is listed in table 2.2. @@ -1287,6 +1310,11 @@ not necessary for the responder to keep the progress engine running. ### 4.2 Delivery complete +**Since libfabric 2.6, delivery complete is treated as a baseline feature. All supported +peers (v2.0+) support it, so a handshake is no longer required before sending DC packets. +The `EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE` flag is still advertised in `extra_info` +for backwards compatibility with v2.0 peers that check for it.** + The extra feature "delivery complete" was introduced with libfabric 1.12.0 and was assigned ID 1. Delivery complete is a requirement the application can impose on an endpoint when opening the endpoint. diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index ac28e983b84..81f52979d8a 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -103,27 +103,6 @@ ssize_t efa_rdm_atomic_post_atomic(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm }; delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE; - if (delivery_complete_requested && !(txe->peer->is_local)) { - /* - * Because delivery complete is defined as an extra - * feature, the receiver might not support it. - * The sender cannot send with FI_DELIVERY_COMPLETE - * if the peer is not able to handle it. - * If the sender does not know whether the peer - * can handle it, it needs to trigger - * a handshake packet from the peer. - * The handshake packet contains - * the information whether the peer - * support it or not. - */ - if (!efa_rdm_ep->homogeneous_peers) { - if (!(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) - return efa_rdm_ep_enforce_handshake_for_txe(efa_rdm_ep, txe); - - if (!(txe->peer->is_self) && !efa_rdm_peer_support_delivery_complete(txe->peer)) - return -FI_EOPNOTSUPP; - } - } if (delivery_complete_requested && txe->op == ofi_op_atomic) { return efa_rdm_ope_post_send(txe, EFA_RDM_DC_WRITE_RTA_PKT); diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index dd01ff48ec7..9a3e0f2e988 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -146,8 +146,11 @@ ssize_t efa_rdm_msg_post_rtm(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) * * Check handshake packet from peer to verify support status. */ - if (!ep->homogeneous_peers && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) - return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); + if (!ep->homogeneous_peers && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) { + int ex_feature = EFA_RDM_PKT_TYPE_REQ_INFO_VEC[rtm_type].ex_feature_flag; + if (ex_feature) + return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); + } if (!ep->homogeneous_peers && !efa_rdm_pkt_type_is_supported_by_peer(rtm_type, txe->peer)) return -FI_EOPNOTSUPP; diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 5928519ea40..43478b26179 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -176,19 +176,6 @@ bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); } -static inline -bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) -{ - /* FI_DELIVERY_COMPLETE is an extra feature defined - * in version 4 (the base version). - * Because it is an extra feature, - * an EP will assume the peer does not support - * it before a handshake packet was received. - */ - return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && - (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE); -} - static inline bool efa_rdm_peer_support_read_nack(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_pkt_type.c b/prov/efa/src/rdm/efa_rdm_pkt_type.c index b918fac5416..c0b66cd7051 100644 --- a/prov/efa/src/rdm/efa_rdm_pkt_type.c +++ b/prov/efa/src/rdm/efa_rdm_pkt_type.c @@ -19,21 +19,21 @@ struct efa_rdm_pkt_type_req_info EFA_RDM_PKT_TYPE_REQ_INFO_VEC[] = { [EFA_RDM_LONGCTS_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_longcts_tagrtm_hdr), 0}, [EFA_RDM_LONGREAD_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_longread_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_RDMA_READ}, [EFA_RDM_LONGREAD_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_longread_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_RDMA_READ}, - [EFA_RDM_DC_EAGER_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_eager_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, - [EFA_RDM_DC_EAGER_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_eager_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, - [EFA_RDM_DC_MEDIUM_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_medium_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, - [EFA_RDM_DC_MEDIUM_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_medium_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, - [EFA_RDM_DC_LONGCTS_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_longcts_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, - [EFA_RDM_DC_LONGCTS_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_longcts_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, + [EFA_RDM_DC_EAGER_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_eager_msgrtm_hdr), 0}, + [EFA_RDM_DC_EAGER_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_eager_tagrtm_hdr), 0}, + [EFA_RDM_DC_MEDIUM_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_medium_msgrtm_hdr), 0}, + [EFA_RDM_DC_MEDIUM_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_dc_medium_tagrtm_hdr), 0}, + [EFA_RDM_DC_LONGCTS_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_longcts_msgrtm_hdr), 0}, + [EFA_RDM_DC_LONGCTS_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_longcts_tagrtm_hdr), 0}, [EFA_RDM_RUNTCTS_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_runtcts_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_RUNT}, [EFA_RDM_RUNTCTS_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_runtcts_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_RUNT}, [EFA_RDM_RUNTREAD_MSGRTM_PKT] = {0, sizeof(struct efa_rdm_runtread_msgrtm_hdr), EFA_RDM_EXTRA_FEATURE_RUNT | EFA_RDM_EXTRA_FEATURE_RDMA_READ}, [EFA_RDM_RUNTREAD_TAGRTM_PKT] = {0, sizeof(struct efa_rdm_runtread_tagrtm_hdr), EFA_RDM_EXTRA_FEATURE_RUNT | EFA_RDM_EXTRA_FEATURE_RDMA_READ}, /* rtw header */ [EFA_RDM_EAGER_RTW_PKT] = {0, sizeof(struct efa_rdm_eager_rtw_hdr), 0}, - [EFA_RDM_DC_EAGER_RTW_PKT] = {0, sizeof(struct efa_rdm_dc_eager_rtw_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, + [EFA_RDM_DC_EAGER_RTW_PKT] = {0, sizeof(struct efa_rdm_dc_eager_rtw_hdr), 0}, [EFA_RDM_LONGCTS_RTW_PKT] = {0, sizeof(struct efa_rdm_longcts_rtw_hdr), 0}, - [EFA_RDM_DC_LONGCTS_RTW_PKT] = {0, sizeof(struct efa_rdm_longcts_rtw_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, + [EFA_RDM_DC_LONGCTS_RTW_PKT] = {0, sizeof(struct efa_rdm_longcts_rtw_hdr), 0}, [EFA_RDM_LONGREAD_RTW_PKT] = {0, sizeof(struct efa_rdm_longread_rtw_hdr), EFA_RDM_EXTRA_FEATURE_RDMA_READ}, [EFA_RDM_RUNTCTS_RTW_PKT] = {0, sizeof(struct efa_rdm_runtcts_rtw_hdr), EFA_RDM_EXTRA_FEATURE_RUNT}, [EFA_RDM_RUNTREAD_RTW_PKT] = {0, sizeof(struct efa_rdm_runtread_rtw_hdr), EFA_RDM_EXTRA_FEATURE_RUNT}, @@ -43,7 +43,7 @@ struct efa_rdm_pkt_type_req_info EFA_RDM_PKT_TYPE_REQ_INFO_VEC[] = { [EFA_RDM_READ_RTR_PKT] = {0, sizeof(struct efa_rdm_base_hdr), EFA_RDM_EXTRA_FEATURE_RDMA_READ}, /* rta header */ [EFA_RDM_WRITE_RTA_PKT] = {0, sizeof(struct efa_rdm_rta_hdr), 0}, - [EFA_RDM_DC_WRITE_RTA_PKT] = {0, sizeof(struct efa_rdm_rta_hdr), EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE}, + [EFA_RDM_DC_WRITE_RTA_PKT] = {0, sizeof(struct efa_rdm_rta_hdr), 0}, [EFA_RDM_FETCH_RTA_PKT] = {0, sizeof(struct efa_rdm_rta_hdr), 0}, [EFA_RDM_COMPARE_RTA_PKT] = {0, sizeof(struct efa_rdm_rta_hdr), 0}, }; @@ -64,11 +64,20 @@ struct efa_rdm_pkt_type_req_info EFA_RDM_PKT_TYPE_REQ_INFO_VEC[] = { */ bool efa_rdm_pkt_type_is_supported_by_peer(int req_pkt_type, struct efa_rdm_peer *peer) { + uint64_t ex_feature_flag; + + ex_feature_flag = EFA_RDM_PKT_TYPE_REQ_INFO_VEC[req_pkt_type].ex_feature_flag; + + /* No extra feature required means always supported */ + if (!ex_feature_flag) + return true; + + /* If an extra feature is needed, handshake should have been enforced */ assert(peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED); int extra_info_id = EFA_RDM_PKT_TYPE_REQ_INFO_VEC[req_pkt_type].extra_info_id; - return peer->extra_info[extra_info_id] & EFA_RDM_PKT_TYPE_REQ_INFO_VEC[req_pkt_type].ex_feature_flag; + return peer->extra_info[extra_info_id] & ex_feature_flag; } /** diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index d361dd564b5..f959d26eb86 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -373,25 +373,10 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) } delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE; - if (delivery_complete_requested) { - /* - * Because delivery complete is defined as an extra - * feature, the receiver might not support it. - * - * The sender cannot send with FI_DELIVERY_COMPLETE - * if the peer is not able to handle it. - * - * handshake is already made now since we enforce - * handshake for write earlier. - */ - - if (!ep->homogeneous_peers && !(txe->peer->is_self) && !efa_rdm_peer_support_delivery_complete(txe->peer)) - return -FI_EOPNOTSUPP; - + if (delivery_complete_requested) max_eager_rtw_data_size = efa_rdm_txe_max_req_data_capacity(ep, txe, EFA_RDM_DC_EAGER_RTW_PKT); - } else { + else max_eager_rtw_data_size = efa_rdm_txe_max_req_data_capacity(ep, txe, EFA_RDM_EAGER_RTW_PKT); - } iface = txe->desc[0] ? ((struct efa_mr*) txe->desc[0])->iface : FI_HMEM_SYSTEM; diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index ec9daf55b50..0b6f8ff942a 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -417,189 +417,6 @@ void test_efa_rdm_pke_get_available_copy_methods_align128(struct efa_resource ** * * @param[in] state struct efa_resource that is managed by the framework */ -void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **state) -{ - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_ioc ioc = {0}; - struct fi_rma_ioc rma_ioc = {0}; - struct fi_msg_atomic msg = {0}; - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int buf[1] = {0}, err, numaddr; - struct efa_rdm_ope *txe; - - /* disable shm to force using efa device to send */ - efa_unit_test_resource_construct_rdm_shm_disabled(resource); - - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - assert_int_equal(numaddr, 1); - - msg.addr = peer_addr; - - ioc.addr = buf; - ioc.count = 1; - msg.msg_iov = &ioc; - msg.iov_count = 1; - - msg.rma_iov = &rma_ioc; - msg.rma_iov_count = 1; - msg.datatype = FI_INT32; - msg.op = FI_SUM; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think - * a REQ packet has been sent to the peer (so no need to send again) - * handshake has not been received, so we do not know whether the peer support DC - */ - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - peer->flags = EFA_RDM_PEER_REQ_SENT; - peer->is_local = false; - - assert_false(efa_rdm_ep->homogeneous_peers); - assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - err = fi_atomicmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - /* DC has been reuquested, but ep do not know whether peer supports it, therefore - * the ope has been queued to domain->ope_queued_list - */ - assert_int_equal(err, 0); - assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1); - assert_int_equal(efa_unit_test_get_dlist_length(&(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list)), 1); - txe = container_of(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list.next, struct efa_rdm_ope, queued_entry); - assert_true((txe->op == ofi_op_atomic)); - assert_true(txe->internal_flags & EFA_RDM_OPE_QUEUED_BEFORE_HANDSHAKE); -} - -/** - * @brief when delivery complete send was used and handshake packet has not been received - * verify the txe is queued - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) -{ - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_msg msg = {0}; - struct iovec iov; - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int err, numaddr; - struct efa_rdm_ope *txe; - - /* disable shm to force using efa device to send */ - efa_unit_test_resource_construct_rdm_shm_disabled(resource); - - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - assert_int_equal(numaddr, 1); - - msg.addr = peer_addr; - msg.iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 0; - msg.msg_iov = &iov; - msg.desc = NULL; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think - * a REQ packet has been sent to the peer (so no need to send again) - * handshake has not been received, so we do not know whether the peer support DC - */ - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - peer->flags = EFA_RDM_PEER_REQ_SENT; - peer->is_local = false; - - assert_false(efa_rdm_ep->homogeneous_peers); - assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - /* DC has been reuquested, but ep do not know whether peer supports it, therefore - * the ope has been queued to domain->ope_queued_list - */ - assert_int_equal(err, 0); - assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1); - assert_int_equal(efa_unit_test_get_dlist_length(&(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list)), 1); - txe = container_of(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list.next, struct efa_rdm_ope, queued_entry); - assert_true((txe->op == ofi_op_msg)); - assert_true(txe->internal_flags & EFA_RDM_OPE_QUEUED_BEFORE_HANDSHAKE); -} - -/** - * @brief when delivery complete send was used and handshake packet has not been received - * verify the txes are queued before the number of requests reach EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE. - * After reaching the limit, fi_send should return -FI_EAGAIN - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource **state) -{ - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_msg msg = {0}; - struct iovec iov; - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int err, numaddr; - int i; - - /* disable shm to force using efa device to send */ - efa_unit_test_resource_construct_rdm_shm_disabled(resource); - - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - assert_int_equal(numaddr, 1); - - msg.addr = peer_addr; - msg.iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 0; - msg.msg_iov = &iov; - msg.desc = NULL; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think - * a REQ packet has been sent to the peer (so no need to send again) - * handshake has not been received, so we do not know whether the peer support DC - */ - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - peer->flags = EFA_RDM_PEER_REQ_SENT; - peer->is_local = false; - - assert_false(efa_rdm_ep->homogeneous_peers); - assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - - for (i = 0; i < EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE; i++) { - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - assert_int_equal(err, 0); - } - - assert_true(efa_rdm_ep->ope_queued_before_handshake_cnt == EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE); - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - assert_int_equal(err, -FI_EAGAIN); -} - /** * @brief verify tx entry is queued for rma (read or write) request before handshake is made. * diff --git a/prov/efa/test/efa_unit_test_ope.c b/prov/efa/test/efa_unit_test_ope.c index f4fb1c1db8d..b69063812c7 100644 --- a/prov/efa/test/efa_unit_test_ope.c +++ b/prov/efa/test/efa_unit_test_ope.c @@ -1113,7 +1113,6 @@ void test_efa_rdm_atomic_compare_desc_persistence(struct efa_resource **state) ret = fi_av_insert(resource->av, &raw_addr, 1, &addr, 0, NULL); assert_int_equal(ret, 1); - /* Set peer flags to simulate handshake state */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); peer = efa_rdm_ep_get_peer(efa_rdm_ep, addr); peer->flags = EFA_RDM_PEER_REQ_SENT; @@ -1134,7 +1133,6 @@ void test_efa_rdm_atomic_compare_desc_persistence(struct efa_resource **state) compare_desc_array[0] = fi_mr_desc(compare_buff.mr); void *original_desc_value = compare_desc_array[0]; - /* Setup atomic message with FI_DELIVERY_COMPLETE to force queuing */ ioc.addr = send_buff.buff; ioc.count = 1; compare_ioc.addr = compare_buff.buff; @@ -1152,26 +1150,23 @@ void test_efa_rdm_atomic_compare_desc_persistence(struct efa_resource **state) msg.op = FI_CSWAP; /* - * Call fi_compare_atomicmsg with FI_DELIVERY_COMPLETE. - * This forces the operation to be queued when handshake is not complete. - * The old buggy code would store a pointer to compare_desc_array, - * which becomes dangling when this function returns. - * The fix copies the array contents into txe->atomic_ex.compare_desc[]. + * Mock efa_qp_post_send to succeed so the compare atomic completes. + * The test verifies that compare_desc is properly copied into the txe + * (not just a pointer to the caller's stack array). */ - ret = fi_compare_atomicmsg(resource->ep, &msg, &compare_ioc, compare_desc_array, 1, - &result_ioc, result_desc_array, 1, FI_DELIVERY_COMPLETE); + g_efa_unit_test_mocks.efa_qp_post_send = &efa_mock_efa_qp_post_send_return_mock; + will_return_int(efa_mock_efa_qp_post_send_return_mock, 0); - /* Operation should succeed (queued) */ + ret = fi_compare_atomicmsg(resource->ep, &msg, &compare_ioc, compare_desc_array, 1, + &result_ioc, result_desc_array, 1, 0); assert_int_equal(ret, 0); /* Destroy stack array to simulate function return */ compare_desc_array[0] = (void *)(uintptr_t)0xDEADBEEF; - /* Retrieve queued txe from ope_queued_list */ - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - assert_false(dlist_empty(&efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list)); - txe = container_of(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list.next, - struct efa_rdm_ope, queued_entry); + /* Retrieve the txe from the txe_list */ + assert_false(dlist_empty(&efa_rdm_ep->txe_list)); + txe = container_of(efa_rdm_ep->txe_list.next, struct efa_rdm_ope, ep_entry); /* Verify compare_desc was copied, not just pointer stored */ assert_ptr_equal(txe->atomic_ex.compare_desc[0], original_desc_value); diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index b94af056906..83fc5294980 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -179,9 +179,6 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_tx_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_page_alignment, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_atomic_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_send_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_send_queue_limit_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_read_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_write_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_trigger_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index da9011be0bb..c042486fbeb 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -142,9 +142,6 @@ void test_efa_rdm_ep_getopt_oversized_optlen(); void test_efa_rdm_ep_tx_pkt_pool_flags(); void test_efa_rdm_ep_rx_pkt_pool_flags(); void test_efa_rdm_ep_pkt_pool_page_alignment(); -void test_efa_rdm_ep_dc_atomic_queue_before_handshake(); -void test_efa_rdm_ep_dc_send_queue_before_handshake(); -void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(); void test_efa_rdm_ep_write_queue_before_handshake(); void test_efa_rdm_ep_read_queue_before_handshake(); void test_efa_rdm_ep_trigger_handshake();