Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion fabtests/prov/efa/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ bin_PROGRAMS += prov/efa/src/fi_efa_rnr_read_cq_error \
prov/efa/src/fi_efa_implicit_av_test \
prov/efa/src/fi_efa_multi_ep_stress \
prov/efa/src/fi_efa_mmap_test \
prov/efa/src/fi_efa_mr_test
prov/efa/src/fi_efa_mr_test \
prov/efa/src/fi_efa_runt_read_no_handshake
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I would name it generally like fi_efa_rdm_bw_homogeneous_peer


if HAVE_VERBS_DEVEL
if HAVE_EFA_DV
Expand Down Expand Up @@ -101,6 +102,11 @@ prov_efa_src_fi_efa_mr_test_SOURCES = \
prov/efa/src/efa_mr_test.c
prov_efa_src_fi_efa_mr_test_LDADD = libfabtests.la

prov_efa_src_fi_efa_runt_read_no_handshake_SOURCES = \
prov/efa/src/efa_runt_read_no_handshake.c \
$(benchmarks_srcs)
prov_efa_src_fi_efa_runt_read_no_handshake_LDADD = libfabtests.la

if HAVE_VERBS_DEVEL
if HAVE_EFA_DV
efa_exhaust_mr_reg_srcs = \
Expand Down
145 changes: 145 additions & 0 deletions fabtests/prov/efa/src/efa_runt_read_no_handshake.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_ext.h>
#include <shared.h>
#include "benchmarks/benchmark_shared.h"

/*
* Mirrors ft_init_fabric() but inserts fi_setopt(HOMOGENEOUS_PEERS)
* before fi_enable, since the option must be set before enabling the EP
* to avoid the requirement for a handshake before using a read based
* protocol.
*/
static int init_fabric_with_homogeneous_peers(void)
{
int ret;
bool homogeneous = true;

ret = ft_init();
if (ret)
return ret;

ret = ft_init_oob();
if (ret)
return ret;

if (oob_sock >= 0 && opts.dst_addr) {
ret = ft_sock_sync(oob_sock, 0);
if (ret)
return ret;
}

ret = ft_getinfo(hints, &fi);
if (ret)
return ret;

ret = ft_open_fabric_res();
if (ret)
return ret;

ret = ft_alloc_active_res(fi);
if (ret)
return ret;

/* Must be set before fi_enable to skip handshake requirement before
* using read based protocol */
ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT,
FI_OPT_EFA_HOMOGENEOUS_PEERS,
&homogeneous, sizeof(homogeneous));
if (ret) {
FT_PRINTERR("fi_setopt(HOMOGENEOUS_PEERS)", ret);
return ret;
}

ret = ft_enable_ep_recv();
if (ret)
return ret;

if (oob_sock >= 0 && !opts.dst_addr) {
ret = ft_sock_sync(oob_sock, 0);
if (ret)
return ret;
}

ret = ft_init_av();
if (ret)
return ret;

return 0;
}

static int run(void)
{
int ret;
char test_name[64] = "";

ret = init_fabric_with_homogeneous_peers();
if (ret)
return ret;

init_test(&opts, test_name, sizeof(test_name));
ret = bandwidth();
if (ret)
return ret;

return ft_finalize();
}

int main(int argc, char **argv)
{
int op, ret, cleanup_ret;

opts = INIT_OPTS;
opts.options |= FT_OPT_BW;
opts.iterations = 1;
opts.warmup_iterations = 0;
opts.transfer_size = 262144; /* 256 KB */
opts.window_size = 1;

hints = fi_allocinfo();
if (!hints)
return EXIT_FAILURE;

while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS,
long_opts, &lopt_idx)) != -1) {
switch (op) {
default:
if (!ft_parse_long_opts(op, optarg))
continue;
ft_parse_benchmark_opts(op, optarg);
ft_parseinfo(op, optarg, hints, &opts);
ft_parsecsopts(op, optarg, &opts);
break;
case 'U':
hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE;
break;
case '?':
case 'h':
ft_csusage(argv[0],
"Verify runt read protocol transfers "
"data with both RTM packets and RDMA read");
ft_benchmark_usage();
ft_longopts_usage();
return EXIT_FAILURE;
}
}

if (optind < argc)
opts.dst_addr = argv[optind];

hints->ep_attr->type = FI_EP_RDM;
hints->caps = FI_MSG;
hints->mode |= FI_CONTEXT | FI_CONTEXT2;
hints->domain_attr->mr_mode = opts.mr_mode;
hints->addr_format = opts.address_format;

ret = run();

cleanup_ret = ft_free_res();
return ft_exit_code(ret ? ret : cleanup_ret);
}
47 changes: 29 additions & 18 deletions fabtests/pytest/efa/test_runt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,33 @@
pytest.param("neuron_to_neuron", None, marks=pytest.mark.neuron_memory)])
def test_runt_read_functional(cmdline_args, memory_type, copy_method):
"""
Verify runt reading protocol is working as expected by sending 1 message of 256 KB.
64 KB of the message will be transfered using EFA device's send capability
The remainer will be tranfered using EFA device's RDMA read capability
Verify runt read protocol works with FI_OPT_EFA_HOMOGENEOUS_PEERS set,
which skips the handshake. This sends 1 message of 256 KB using
fi_efa_runt_read_no_handshake (a unidirectional bandwidth test).

64 KB of the message is transferred using EFA device's send capability (runt).
The remainder (192 KB) is transferred using EFA device's RDMA read capability.

The test uses FI_OPT_EFA_HOMOGENEOUS_PEERS to skip the handshake, verifying
that the runt read protocol can be selected without prior handshake exchange.
"""
if cmdline_args.server_id == cmdline_args.client_id:
pytest.skip("no runting for intra-node communication")

cmdline_args.append_environ("FI_EFA_RUNT_SIZE=65536")
additional_env = "FI_EFA_RUNT_SIZE=65536 FI_EFA_ENABLE_SHM_TRANSFER=0"

if memory_type == "host_to_host":
# For host memory, min_read_message_size defaults to 1MB.
# Lower it so the 256KB message triggers the read-based path.
additional_env += " FI_EFA_INTER_MIN_READ_MESSAGE_SIZE=65536"

if copy_method == "gdrcopy":
if not has_gdrcopy(cmdline_args.server_id) or not has_gdrcopy(cmdline_args.client_id):
pytest.skip("No gdrcopy")

cmdline_args.append_environ("FI_HMEM_CUDA_USE_GDRCOPY=1")
additional_env += " FI_HMEM_CUDA_USE_GDRCOPY=1"
elif copy_method == "localread":
assert memory_type == "cuda_to_cuda"
cmdline_args.append_environ("FI_HMEM_CUDA_USE_GDRCOPY=0")
additional_env += " FI_HMEM_CUDA_USE_GDRCOPY=0"

# wrs stands for work requests
server_read_wrs_before_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs")
Expand All @@ -41,16 +51,17 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method):
client_send_bytes_before_test = efa_retrieve_hw_counter_value(cmdline_args.client_id, "send_bytes")

efa_run_client_server_test(cmdline_args,
"fi_rdm_tagged_bw",
"fi_efa_runt_read_no_handshake",
iteration_type="1",
completion_semantic="transmit_complete",
memory_type=memory_type,
message_size="262144",
warmup_iteration_type="0",
fabric="efa")
fabric="efa",
additional_env=additional_env)

server_read_wrs_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_wrs")
server_read_bytes_after_test =efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes")
server_read_bytes_after_test = efa_retrieve_hw_counter_value(cmdline_args.server_id, "rdma_read_bytes")
client_send_bytes_after_test = efa_retrieve_hw_counter_value(cmdline_args.client_id, "send_bytes")

print("server_read_bytes_before_test: {}".format(server_read_bytes_before_test))
Expand All @@ -61,32 +72,32 @@ def test_runt_read_functional(cmdline_args, memory_type, copy_method):
# return here for those that does not support
return

# fi_rdm_tagged_bw is a uni-diretional test. client is the sender and server is the receiver.
# In a RDMA read based message tranfer protocol, the receiver will submit work request for read
# fi_efa_runt_read_no_handshake is a unidirectional test.
# Client is the sender and server is the receiver.
# In the runt read protocol, the receiver issues RDMA read work requests.
server_read_wrs = server_read_wrs_after_test - server_read_wrs_before_test
server_read_bytes = server_read_bytes_after_test - server_read_bytes_before_test
client_send_bytes = client_send_bytes_after_test - client_send_bytes_before_test

# Among the 256 KB of data, 64 KB data will be sent via 8 RUNTREAD RTM packets.
# Among the 256 KB of data, 64 KB data will be sent via RUNTREAD RTM packets.
# The total number of send bytes will be larger than 64K because:
# a. each packet has a header
# b. when runing on single node, server will use the same EFA device to send control packets
# b. when running on single node, server will use the same EFA device to send control packets
assert client_send_bytes > 65536

if copy_method == "localread":
# when local read copy is used, server issue RDMA requests to copy received data
# when local read copy is used, server issues RDMA requests to copy received data
#
# so in this case, total read wr is at least 9, which is
# 1 remote read of 192k
# 8 local read for the 64k data transfer by send
# 8 local read for the 64k data transferred by send
# More local reads for fabtests control messages
#
# and total read_bytes will be >= 256K including the control messages
#
assert server_read_wrs >= 9
assert server_read_bytes >= 262144
else:
# The other 192 KB is transfer by RDMA read
# The other 192 KB is transferred by RDMA read
# for which the server (receiver) will issue 1 read request.
assert server_read_wrs == 1
assert server_read_bytes == 196608
30 changes: 29 additions & 1 deletion prov/efa/docs/efa_rdm_protocol_v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ Table: 2.1 a list of extra features/requests
| ID | Name | Type | Introduced since | Described in |
|---|---|---|---|---|
| 0 | RDMA-Read based data transfer | extra feature | libfabric 1.10.0 | Section 4.1 |
| 1 | delivery complete | extra feature | libfabric 1.12.0 | Section 4.2 |
| 1 | delivery complete | extra feature | libfabric 1.12.0 | Section 4.2 _(baseline since 2.6)_ |
| 2 | keep packet header length constant | extra request | libfabric 1.13.0 | Section 4.3 |
| 3 | sender connection id in packet header | extra request | libfabric 1.14.0 | Section 4.4 |
| 4 | runting read message protocol | extra feature | libfabric 1.16.0 | Section 4.5 |
Expand Down Expand Up @@ -379,6 +379,29 @@ If receiver is in zero copy receive mode, it will have the extra request
for sender to ignore the request, and send packets with different header length.
It is receiver's responsibility to react accordingly. (section 4.3)

#### Baseline promotion of extra features and requests (libfabric 2.6+)

Starting with libfabric 2.6, backwards compatibility is only guaranteed to libfabric 2.0.
Any extra feature or request that was introduced before libfabric 2.0 is considered to be
universally supported by all peers, since every supported peer (v2.0+) already implements it.

When an extra feature is promoted to baseline (marked as _(baseline since X.Y)_ in table 2.1),
the following applies:

- The flag is still set in `extra_info` during the handshake, because v2.0 peers check for
it and would reject operations if the flag is missing.
- However, the local endpoint no longer checks the peer's `extra_info` for that flag and
assumes the peer always supports the feature. A handshake is no longer required before
using the feature.

As of libfabric 2.6, the following features/requests has been baselined:

- **Delivery complete (ID 1)**: Baseline since 2.6. The handshake requirement for DC packets
has been removed from the send, write, and atomic paths. See section 4.2.

The remaining extra features and requests are candidates for the same transition in future
releases.

This concludes the workflow of the handshake subprotocol.

The binary format of a HANDSHAKE packet is listed in table 2.2.
Expand Down Expand Up @@ -1287,6 +1310,11 @@ not necessary for the responder to keep the progress engine running.

### 4.2 Delivery complete

**Since libfabric 2.6, delivery complete is treated as a baseline feature. All supported
peers (v2.0+) support it, so a handshake is no longer required before sending DC packets.
The `EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE` flag is still advertised in `extra_info`
for backwards compatibility with v2.0 peers that check for it.**

The extra feature "delivery complete" was introduced with libfabric 1.12.0 and was assigned ID 1.

Delivery complete is a requirement the application can impose on an endpoint when opening the endpoint.
Expand Down
21 changes: 0 additions & 21 deletions prov/efa/src/rdm/efa_rdm_atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,27 +103,6 @@ ssize_t efa_rdm_atomic_post_atomic(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm
};

delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE;
if (delivery_complete_requested && !(txe->peer->is_local)) {
/*
* Because delivery complete is defined as an extra
* feature, the receiver might not support it.
* The sender cannot send with FI_DELIVERY_COMPLETE
* if the peer is not able to handle it.
* If the sender does not know whether the peer
* can handle it, it needs to trigger
* a handshake packet from the peer.
* The handshake packet contains
* the information whether the peer
* support it or not.
*/
if (!efa_rdm_ep->homogeneous_peers) {
if (!(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED))
return efa_rdm_ep_enforce_handshake_for_txe(efa_rdm_ep, txe);

if (!(txe->peer->is_self) && !efa_rdm_peer_support_delivery_complete(txe->peer))
return -FI_EOPNOTSUPP;
}
}

if (delivery_complete_requested && txe->op == ofi_op_atomic) {
return efa_rdm_ope_post_send(txe, EFA_RDM_DC_WRITE_RTA_PKT);
Expand Down
7 changes: 5 additions & 2 deletions prov/efa/src/rdm/efa_rdm_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,11 @@ ssize_t efa_rdm_msg_post_rtm(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe)
*
* Check handshake packet from peer to verify support status.
*/
if (!ep->homogeneous_peers && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED))
return efa_rdm_ep_enforce_handshake_for_txe(ep, txe);
if (!ep->homogeneous_peers && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) {
int ex_feature = EFA_RDM_PKT_TYPE_REQ_INFO_VEC[rtm_type].ex_feature_flag;
if (ex_feature)
return efa_rdm_ep_enforce_handshake_for_txe(ep, txe);
}

if (!ep->homogeneous_peers && !efa_rdm_pkt_type_is_supported_by_peer(rtm_type, txe->peer))
return -FI_EOPNOTSUPP;
Expand Down
13 changes: 0 additions & 13 deletions prov/efa/src/rdm/efa_rdm_peer.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,19 +176,6 @@ bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer)
(peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV);
}

static inline
bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer)
{
/* FI_DELIVERY_COMPLETE is an extra feature defined
* in version 4 (the base version).
* Because it is an extra feature,
* an EP will assume the peer does not support
* it before a handshake packet was received.
*/
return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) &&
(peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE);
}

static inline
bool efa_rdm_peer_support_read_nack(struct efa_rdm_peer *peer)
{
Expand Down
Loading
Loading