Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions fabtests/benchmarks/benchmark_shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,10 @@ int pingpong(void)
return ret;
}

if (inject_size_set)
inject_size = opts.inject_size;
if (inject_size_set && inject_size < opts.inject_size) {
FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size);
return -FI_EINVAL;
}

if (opts.options & FT_OPT_ENABLE_HMEM)
inject_size = 0;
Expand Down Expand Up @@ -305,8 +307,10 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;
}

if (inject_size_set)
inject_size = opts.inject_size;
if (inject_size_set && inject_size < opts.inject_size) {
FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size);
return -FI_EINVAL;
}

if (ft_check_opts(FT_OPT_ENABLE_HMEM))
inject_size = 0;
Expand Down Expand Up @@ -400,8 +404,10 @@ int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;
}

if (inject_size_set)
inject_size = opts.inject_size;
if (inject_size_set && inject_size < opts.inject_size) {
FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size);
return -FI_EINVAL;
}

if (ft_check_opts(FT_OPT_ENABLE_HMEM))
inject_size = 0;
Expand Down Expand Up @@ -529,8 +535,10 @@ int bandwidth(void)
return ret;
}

if (inject_size_set)
inject_size = opts.inject_size;
if (inject_size_set && inject_size < opts.inject_size) {
FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size);
return -FI_EINVAL;
}

if (opts.options & FT_OPT_ENABLE_HMEM)
inject_size = 0;
Expand Down Expand Up @@ -671,8 +679,10 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;
}

if (inject_size_set)
inject_size = opts.inject_size;
if (inject_size_set && inject_size < opts.inject_size) {
FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size);
return -FI_EINVAL;
}

if (ft_check_opts(FT_OPT_ENABLE_HMEM))
inject_size = 0;
Expand Down
2 changes: 1 addition & 1 deletion fabtests/benchmarks/rma_pingpong.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,5 +155,5 @@ int main(int argc, char **argv)
ret = run();

cleanup_ret = ft_free_res();
return -(ret ? ret : cleanup_ret);
return ft_exit_code(ret ? ret : cleanup_ret);
}
23 changes: 4 additions & 19 deletions fabtests/common/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -1365,16 +1365,16 @@ int ft_init_fabric(void)
if (ret)
return ret;

ret = ft_getinfo(hints, &fi);
if (ret)
return ret;

if (oob_sock >= 0 && opts.dst_addr) {
ret = ft_sock_sync(oob_sock, 0);
if (ret)
return ret;
}

ret = ft_getinfo(hints, &fi);
if (ret)
return ret;

ret = ft_open_fabric_res();
if (ret)
return ret;
Expand Down Expand Up @@ -1500,21 +1500,6 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av *
}
}

if (opts.inject_size) {
ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE,
&opts.inject_size, sizeof opts.inject_size);
if (ret && ret != -FI_EOPNOTSUPP) {
FT_PRINTERR("fi_setopt(FI_OPT_INJECT_MSG_SIZE)", ret);
return ret;
}
ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE,
&opts.inject_size, sizeof opts.inject_size);
if (ret && ret != -FI_EOPNOTSUPP) {
FT_PRINTERR("fi_setopt(FI_OPT_INJECT_RMA_SIZE)", ret);
return ret;
}
}

if (opts.min_multi_recv_size) {
ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
&opts.min_multi_recv_size, sizeof opts.min_multi_recv_size);
Expand Down
10 changes: 10 additions & 0 deletions fabtests/pytest/efa/test_rma_pingpong.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,13 @@ def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, rma_bw_compl
command = command + " -o " + operation_type
efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic,
memory_type_bi_dir, rma_pingpong_message_size, fabric=rma_fabric)


@pytest.mark.functional
@pytest.mark.parametrize("operation_type", ["writedata"])
@pytest.mark.parametrize("inject_size", [64])
def test_rma_pingpong_wide_wqe(cmdline_args, operation_type, inject_size):
"""Test RMA pingpong with wide WQE inject."""
command = "fi_rma_pingpong -e rdm -E -o {} -j {} -S {} --expect-error 61".format(operation_type, inject_size, inject_size)
efa_run_client_server_test(cmdline_args, command, "short", "delivery_complete","host_to_host", "all", fabric="efa-direct")
pytest.xfail("fi_info is expected to return FI_ENODATA on hardware without wide WQE support. Remove --expect-error and this line once FW deployed")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the planned behavior?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, fi_info returns ENODATA for hints->tx_attr->inject_size > 32 if rdma-core or/and FW are old. Therefore the test will repot XFAIL until everything was deployed.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then if firmware is deployed and we are not changing it, the runfabtests.py will fail?

8 changes: 8 additions & 0 deletions prov/efa/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
[have_ibv_get_cq_event=1],
[have_ibv_get_cq_event=0],
[[#include <infiniband/verbs.h>]])

AC_CHECK_MEMBER([struct efadv_device_attr.inline_buf_size_ex],
[have_inline_buf_size_ex=1],
[have_inline_buf_size_ex=0],
[[#include <infiniband/efadv.h>]])
])

AC_DEFINE_UNQUOTED([HAVE_RDMA_SIZE],
Expand Down Expand Up @@ -263,6 +268,9 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
AC_DEFINE_UNQUOTED([HAVE_EFADV_CQ_ATTR_DB],
[$have_efadv_cq_attr_db],
[Indicates if efadv_cq_attr struct has doorbell field])
AC_DEFINE_UNQUOTED([HAVE_INLINE_BUF_SIZE_EX],
[$have_inline_buf_size_ex],
[Indicates if efadv_device_attr has inline_buf_size_ex field for wide WQE])
AS_IF([test "$have_efadv_query_qp_wqs" = "1" -a "$have_efadv_query_cq" = "1"],
[have_efa_data_path_direct=1],
[have_efa_data_path_direct=0])
Expand Down
34 changes: 25 additions & 9 deletions prov/efa/src/efa_base_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,8 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep,
* @return int 0 on success, negative integer on failure
*/
int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex,
uint32_t tclass, bool use_unsolicited_write_recv)
uint32_t tclass, bool use_unsolicited_write_recv,
bool use_inline_write)
{
struct efadv_qp_init_attr efa_attr = { 0 };

Expand All @@ -310,6 +311,10 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex,
#if HAVE_CAPS_UNSOLICITED_WRITE_RECV
if (use_unsolicited_write_recv)
efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV;
#endif
#if HAVE_INLINE_BUF_SIZE_EX
if (use_inline_write)
efa_attr.flags |= EFADV_QP_FLAGS_INLINE_WRITE;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, this is a new flags beyond the existing IBV_QP_*INLINE_WRITE because we already use it in released libfabric and it cannot be reused for the real inline write support.

Right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@YonatanNachum is adding this flag in RDMA-Core PR: linux-rdma/rdma-core@88917d9
Libfabric has to pass it to enable RDMA write with iinline

#endif
efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD;
#if HAVE_EFADV_SL
Expand Down Expand Up @@ -382,7 +387,9 @@ void efa_base_ep_construct_ibv_qp_init_attr_ex(struct efa_base_ep *ep,
attr_ex->cap.max_send_sge = device_info->tx_attr->iov_limit;
attr_ex->cap.max_recv_wr = efa_base_ep_get_rx_pool_size(ep);
attr_ex->cap.max_recv_sge = device_info->rx_attr->iov_limit;
attr_ex->cap.max_inline_data = ep->domain->device->efa_attr.inline_buf_size;
attr_ex->cap.max_inline_data = EFA_INFO_TYPE_IS_DIRECT(ep->info) ?
ep->info->tx_attr->inject_size :
ep->domain->device->efa_attr.inline_buf_size;

EFA_INFO(FI_LOG_EP_CTRL,
"QP cap max_send_wr=%u max_recv_wr=%u max_send_sge=%u "
Expand Down Expand Up @@ -416,6 +423,14 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep,
int ret;
struct ibv_qp_init_attr_ex attr_ex = { 0 };
bool use_unsolicited_write_recv = true;
/*
* Inline RDMA write is only supported with 128-byte wide WQE,
* which is enabled when the requested inject size exceeds
* inline_buf_size.
*/
bool use_inline_write = EFA_INFO_TYPE_IS_DIRECT(base_ep->info) &&
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? Why not enable inline RDMA write for smaller inject sizes?

Is it because inline RDMA write is only possible with wide WQE? In that case, need a comment e.g.

Inline RDMA write is only supported with 128 byte wide WQE
and we only enable wide WQE when the user requested inject size
is larger than inline_buf_size.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it was a design choice. Will add a comment.

base_ep->info->tx_attr->inject_size >
base_ep->domain->device->efa_attr.inline_buf_size;

efa_base_ep_construct_ibv_qp_init_attr_ex(base_ep, &attr_ex, tx_cq->ibv_cq_ex, rx_cq->ibv_cq_ex);

Expand All @@ -434,12 +449,12 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep,
}
EFA_INFO(FI_LOG_EP_CTRL, "creating QP with unsolicited write recv status: %d\n", use_unsolicited_write_recv);
ret = efa_qp_create(&base_ep->qp, &attr_ex, base_ep->info->tx_attr->tclass,
use_unsolicited_write_recv);
use_unsolicited_write_recv, use_inline_write);
if (ret)
return ret;

if (create_user_recv_qp) {
ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled);
ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled, use_inline_write);
if (ret) {
efa_base_ep_destruct_qp_unsafe(base_ep);
return ret;
Expand Down Expand Up @@ -603,10 +618,11 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep,
/* Use device's native limit as the default value of base ep*/
base_ep->max_msg_size = (size_t) base_ep->domain->device->ibv_port_attr.max_msg_sz;
base_ep->max_rma_size = (size_t) base_ep->domain->device->max_rdma_size;
base_ep->inject_msg_size = (size_t) base_ep->domain->device->efa_attr.inline_buf_size;
/* TODO: update inject_rma_size to inline size after firmware
* supports inline rdma write */
base_ep->inject_rma_size = 0;
base_ep->inject_msg_size = info->tx_attr->inject_size;
if (info->tx_attr->inject_size > base_ep->domain->device->efa_attr.inline_buf_size)
base_ep->inject_rma_size = info->tx_attr->inject_size;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should u combine this commit with the last commit? I see it is calling fi_getopt which should also be impacted by this change?

else
base_ep->inject_rma_size = 0;
base_ep->use_unsolicited_write_recv = true;
return 0;
}
Expand Down Expand Up @@ -801,7 +817,7 @@ int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *ep,
/* Create a dummy qp for query only */
efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq.ibv_cq_ex, ibv_cq.ibv_cq_ex);

ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled);
ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled, false);
if (ret)
goto out;

Expand Down
3 changes: 2 additions & 1 deletion prov/efa/src/efa_base_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info,
struct fid_ep **ep_fid, void *context);

int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex,
uint32_t tclass, bool enable_unsolicited_write_recv);
uint32_t tclass, bool enable_unsolicited_write_recv,
bool use_inline_write);

void efa_qp_destruct(struct efa_qp *qp);

Expand Down
22 changes: 16 additions & 6 deletions prov/efa/src/efa_data_path_direct_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ static inline int efa_data_path_direct_post_send(
uint32_t qkey)
{
struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq;
struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */
struct efa_io_tx_wqe_128 local_wqe = {0};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to remove the comment /* Stack variable - can be in registers */?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I did. I think this comment is misleading (not mentioning it's a classic example of code smell).
Although struct itself would fit in a register, we're taking addresses of the variable and structure members many times in the very same function. I do not believe any compiler is genius enough to produce equivalent code without using memory.

struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta;
int err = 0;

Expand Down Expand Up @@ -498,7 +498,7 @@ static inline int efa_data_path_direct_post_read(
uint32_t qkey)
{
struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq;
struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */
struct efa_io_tx_wqe_128 local_wqe = {0};
struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta;
struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem;
int err;
Expand Down Expand Up @@ -578,6 +578,8 @@ efa_data_path_direct_post_write(
struct efa_qp *qp,
const struct ibv_sge *sge_list,
size_t sge_count,
const struct ibv_data_buf *inline_data_list,
bool use_inline,
uint32_t remote_key,
uint64_t remote_addr,
uintptr_t wr_id,
Expand All @@ -588,7 +590,7 @@ efa_data_path_direct_post_write(
uint32_t qkey)
{
struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq;
struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */
struct efa_io_tx_wqe_128 local_wqe = {0};
struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta;
struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem;
int err;
Expand Down Expand Up @@ -644,10 +646,18 @@ efa_data_path_direct_post_write(

/* Set remote memory information */
efa_send_wr_set_rdma_addr(remote_mem, remote_key, remote_addr);
remote_mem->length = efa_sge_total_bytes(sge_list, sge_count);

/* Set local SGE list - caller has prepared sge_list */
efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count);
if (use_inline) {
assert(sge_count == 1);
memcpy(local_wqe.data.rdma_req.inline_data,
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe don't really need memcpy here

inline_data_list[0].addr, inline_data_list[0].length);
remote_mem->length = inline_data_list[0].length;
EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1);
meta_desc->length = inline_data_list[0].length;
} else {
remote_mem->length = efa_sge_total_bytes(sge_list, sge_count);
efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count);
}

efa_data_path_direct_send_wr_post(qp, sq, &local_wqe);

Expand Down
18 changes: 11 additions & 7 deletions prov/efa/src/efa_data_path_direct_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -594,19 +594,23 @@ EFA_ALWAYS_INLINE void
efa_data_path_direct_send_wr_post(
struct efa_qp *qp,
struct efa_data_path_direct_sq *sq,
struct efa_io_tx_wqe *wqe)
struct efa_io_tx_wqe_128 *wqe)
{
uint32_t sq_desc_idx;
uint64_t *src, *dst;

/* Calculate target address in write-combined memory */
/* Calculate target address in write-combined memory.
* Use byte-level arithmetic since wqe_size may be 64 or 128 bytes. */
sq_desc_idx = sq->wq.pc & sq->wq.desc_mask;
src = (uint64_t *)wqe;
dst = (uint64_t *)((struct efa_io_tx_wqe *)sq->desc + sq_desc_idx);
dst = (uint64_t *)((uint8_t *)sq->desc + sq_desc_idx * sq->wq.wqe_size);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better to have a comment here to explain the math... I understand you changed the offset unit from 64 byte to 1 byte, so we need a comment


/* Copy 64-byte WQE using 8 uint64_t stores */
for (int i = 0; i < 8; i++)
dst[i] = src[i];
/*
* Use mmio_memcpy_x64 to copy the WQE to write-combined memory
* with proper 8-byte atomic stores. The wqe_size is either 64 or
* 128 bytes depending on whether wide WQE is enabled.
*/
mmio_memcpy_x64(dst, src, sq->wq.wqe_size);

#if HAVE_LTTNG
efa_data_path_direct_tracepoint_post_send(qp, sq, &wqe->meta);
Expand Down Expand Up @@ -644,7 +648,7 @@ EFA_ALWAYS_INLINE void efa_data_path_direct_set_ud_addr(struct efa_io_tx_meta_de
* @param num_buf Number of data buffers
* @param buf_list Array of data buffers
*/
EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe *wqe,
EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe_128 *wqe,
size_t num_buf,
const struct ibv_data_buf *buf_list)
{
Expand Down
13 changes: 9 additions & 4 deletions prov/efa/src/efa_data_path_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ int efa_qp_post_read(struct efa_qp *qp, const struct ibv_sge *sge_list,
uint64_t remote_addr, uintptr_t wr_id, uint64_t flags,
struct efa_ah *ah, uint32_t qpn, uint32_t qkey);
int efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list,
size_t sge_count, uint32_t remote_key,
uint64_t remote_addr, uintptr_t wr_id, uint64_t data,
uint64_t flags, struct efa_ah *ah, uint32_t qpn,
uint32_t qkey);
size_t sge_count,
const struct ibv_data_buf *inline_data_list,
bool use_inline,
uint32_t remote_key, uint64_t remote_addr,
uintptr_t wr_id, uint64_t data, uint64_t flags,
struct efa_ah *ah, uint32_t qpn, uint32_t qkey);
int efa_ibv_cq_start_poll(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr);
int efa_ibv_cq_next_poll(struct efa_ibv_cq *ibv_cq);
enum ibv_wc_opcode efa_ibv_cq_wc_read_opcode(struct efa_ibv_cq *ibv_cq);
Expand Down Expand Up @@ -271,6 +273,8 @@ static inline int
efa_qp_post_write(struct efa_qp *qp,
const struct ibv_sge *sge_list,
size_t sge_count,
const struct ibv_data_buf *inline_data_list,
bool use_inline,
uint32_t remote_key,
uint64_t remote_addr,
uintptr_t wr_id,
Expand All @@ -285,6 +289,7 @@ efa_qp_post_write(struct efa_qp *qp,
#if HAVE_EFA_DATA_PATH_DIRECT
if (qp->data_path_direct_enabled)
return efa_data_path_direct_post_write(qp, sge_list, sge_count,
inline_data_list, use_inline,
remote_key, remote_addr, wr_id, data, flags, ah, qpn, qkey);
#endif
return efa_ibv_post_write(qp, sge_list, sge_count,
Expand Down
Loading
Loading