diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index 896950fa909..9efb5a205b3 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -214,8 +214,10 @@ int pingpong(void) return ret; } - if (inject_size_set) - inject_size = opts.inject_size; + if (inject_size_set && inject_size < opts.inject_size) { + FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size); + return -FI_EINVAL; + } if (opts.options & FT_OPT_ENABLE_HMEM) inject_size = 0; @@ -305,8 +307,10 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) return ret; } - if (inject_size_set) - inject_size = opts.inject_size; + if (inject_size_set && inject_size < opts.inject_size) { + FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size); + return -FI_EINVAL; + } if (ft_check_opts(FT_OPT_ENABLE_HMEM)) inject_size = 0; @@ -400,8 +404,10 @@ int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) return ret; } - if (inject_size_set) - inject_size = opts.inject_size; + if (inject_size_set && inject_size < opts.inject_size) { + FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size); + return -FI_EINVAL; + } if (ft_check_opts(FT_OPT_ENABLE_HMEM)) inject_size = 0; @@ -529,8 +535,10 @@ int bandwidth(void) return ret; } - if (inject_size_set) - inject_size = opts.inject_size; + if (inject_size_set && inject_size < opts.inject_size) { + FT_ERR("Provider does not support inject size %zu (max size %zu)", opts.inject_size, inject_size); + return -FI_EINVAL; + } if (opts.options & FT_OPT_ENABLE_HMEM) inject_size = 0; @@ -671,8 +679,10 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) return ret; } - if (inject_size_set) - inject_size = opts.inject_size; + if (inject_size_set && inject_size < opts.inject_size) { + FT_ERR("Provider does not 
support inject size %zu (max size %zu)", opts.inject_size, inject_size); + return -FI_EINVAL; + } if (ft_check_opts(FT_OPT_ENABLE_HMEM)) inject_size = 0; diff --git a/fabtests/benchmarks/rma_pingpong.c b/fabtests/benchmarks/rma_pingpong.c index a736a673d8c..cd2c2a7b87e 100644 --- a/fabtests/benchmarks/rma_pingpong.c +++ b/fabtests/benchmarks/rma_pingpong.c @@ -155,5 +155,5 @@ int main(int argc, char **argv) ret = run(); cleanup_ret = ft_free_res(); - return -(ret ? ret : cleanup_ret); + return ft_exit_code(ret ? ret : cleanup_ret); } diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 98a31a3f7e1..0ecfbd9cd51 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -1365,16 +1365,16 @@ int ft_init_fabric(void) if (ret) return ret; + ret = ft_getinfo(hints, &fi); + if (ret) + return ret; + if (oob_sock >= 0 && opts.dst_addr) { ret = ft_sock_sync(oob_sock, 0); if (ret) return ret; } - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - ret = ft_open_fabric_res(); if (ret) return ret; @@ -1500,21 +1500,6 @@ int ft_enable_ep(struct fid_ep *bind_ep, struct fid_eq *bind_eq, struct fid_av * } } - if (opts.inject_size) { - ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, - &opts.inject_size, sizeof opts.inject_size); - if (ret && ret != -FI_EOPNOTSUPP) { - FT_PRINTERR("fi_setopt(FI_OPT_INJECT_MSG_SIZE)", ret); - return ret; - } - ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, - &opts.inject_size, sizeof opts.inject_size); - if (ret && ret != -FI_EOPNOTSUPP) { - FT_PRINTERR("fi_setopt(FI_OPT_INJECT_RMA_SIZE)", ret); - return ret; - } - } - if (opts.min_multi_recv_size) { ret = fi_setopt(&bind_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, &opts.min_multi_recv_size, sizeof opts.min_multi_recv_size); diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py index e68bb33fe65..aef9fb6ff74 100644 --- a/fabtests/pytest/efa/test_rma_pingpong.py +++ 
b/fabtests/pytest/efa/test_rma_pingpong.py @@ -40,3 +40,13 @@ def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, rma_bw_compl command = command + " -o " + operation_type efa_run_client_server_test(cmdline_args, command, "short", rma_bw_completion_semantic, memory_type_bi_dir, rma_pingpong_message_size, fabric=rma_fabric) + + +@pytest.mark.functional +@pytest.mark.parametrize("operation_type", ["writedata"]) +@pytest.mark.parametrize("inject_size", [64]) +def test_rma_pingpong_wide_wqe(cmdline_args, operation_type, inject_size): + """Test RMA pingpong with wide WQE inject.""" + command = "fi_rma_pingpong -e rdm -E -o {} -j {} -S {} --expect-error 61".format(operation_type, inject_size, inject_size) + efa_run_client_server_test(cmdline_args, command, "short", "delivery_complete","host_to_host", "all", fabric="efa-direct") + pytest.xfail("fi_info is expected to return FI_ENODATA on hardware without wide WQE support. Remove --expect-error and this line once FW deployed") diff --git a/prov/efa/configure.m4 b/prov/efa/configure.m4 index 260a6358573..b12b12e3a85 100644 --- a/prov/efa/configure.m4 +++ b/prov/efa/configure.m4 @@ -216,6 +216,11 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ [have_ibv_get_cq_event=1], [have_ibv_get_cq_event=0], [[#include ]]) + + AC_CHECK_MEMBER([struct efadv_device_attr.inline_buf_size_ex], + [have_inline_buf_size_ex=1], + [have_inline_buf_size_ex=0], + [[#include ]]) ]) AC_DEFINE_UNQUOTED([HAVE_RDMA_SIZE], @@ -263,6 +268,9 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ AC_DEFINE_UNQUOTED([HAVE_EFADV_CQ_ATTR_DB], [$have_efadv_cq_attr_db], [Indicates if efadv_cq_attr struct has doorbell field]) + AC_DEFINE_UNQUOTED([HAVE_INLINE_BUF_SIZE_EX], + [$have_inline_buf_size_ex], + [Indicates if efadv_device_attr has inline_buf_size_ex field for wide WQE]) AS_IF([test "$have_efadv_query_qp_wqs" = "1" -a "$have_efadv_query_cq" = "1"], [have_efa_data_path_direct=1], [have_efa_data_path_direct=0]) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c 
index 5f179fe12ff..98a55437533 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -284,7 +284,8 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep, * @return int 0 on success, negative integer on failure */ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, - uint32_t tclass, bool use_unsolicited_write_recv) + uint32_t tclass, bool use_unsolicited_write_recv, + bool use_inline_write) { struct efadv_qp_init_attr efa_attr = { 0 }; @@ -310,6 +311,10 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, #if HAVE_CAPS_UNSOLICITED_WRITE_RECV if (use_unsolicited_write_recv) efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; +#endif +#if HAVE_INLINE_BUF_SIZE_EX + if (use_inline_write) + efa_attr.flags |= EFADV_QP_FLAGS_INLINE_WRITE; #endif efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; #if HAVE_EFADV_SL @@ -382,7 +387,9 @@ void efa_base_ep_construct_ibv_qp_init_attr_ex(struct efa_base_ep *ep, attr_ex->cap.max_send_sge = device_info->tx_attr->iov_limit; attr_ex->cap.max_recv_wr = efa_base_ep_get_rx_pool_size(ep); attr_ex->cap.max_recv_sge = device_info->rx_attr->iov_limit; - attr_ex->cap.max_inline_data = ep->domain->device->efa_attr.inline_buf_size; + attr_ex->cap.max_inline_data = EFA_INFO_TYPE_IS_DIRECT(ep->info) ? + ep->info->tx_attr->inject_size : + ep->domain->device->efa_attr.inline_buf_size; EFA_INFO(FI_LOG_EP_CTRL, "QP cap max_send_wr=%u max_recv_wr=%u max_send_sge=%u " @@ -416,6 +423,14 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep, int ret; struct ibv_qp_init_attr_ex attr_ex = { 0 }; bool use_unsolicited_write_recv = true; + /* + * Inline RDMA write is only supported with 128-byte wide WQE, + * which is enabled when the requested inject size exceeds + * inline_buf_size. 
+ */ + bool use_inline_write = EFA_INFO_TYPE_IS_DIRECT(base_ep->info) && + base_ep->info->tx_attr->inject_size > + base_ep->domain->device->efa_attr.inline_buf_size; efa_base_ep_construct_ibv_qp_init_attr_ex(base_ep, &attr_ex, tx_cq->ibv_cq_ex, rx_cq->ibv_cq_ex); @@ -434,12 +449,12 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep, } EFA_INFO(FI_LOG_EP_CTRL, "creating QP with unsolicited write recv status: %d\n", use_unsolicited_write_recv); ret = efa_qp_create(&base_ep->qp, &attr_ex, base_ep->info->tx_attr->tclass, - use_unsolicited_write_recv); + use_unsolicited_write_recv, use_inline_write); if (ret) return ret; if (create_user_recv_qp) { - ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled); + ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled, use_inline_write); if (ret) { efa_base_ep_destruct_qp_unsafe(base_ep); return ret; @@ -603,10 +618,11 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, /* Use device's native limit as the default value of base ep*/ base_ep->max_msg_size = (size_t) base_ep->domain->device->ibv_port_attr.max_msg_sz; base_ep->max_rma_size = (size_t) base_ep->domain->device->max_rdma_size; - base_ep->inject_msg_size = (size_t) base_ep->domain->device->efa_attr.inline_buf_size; - /* TODO: update inject_rma_size to inline size after firmware - * supports inline rdma write */ - base_ep->inject_rma_size = 0; + base_ep->inject_msg_size = info->tx_attr->inject_size; + if (info->tx_attr->inject_size > base_ep->domain->device->efa_attr.inline_buf_size) + base_ep->inject_rma_size = info->tx_attr->inject_size; + else + base_ep->inject_rma_size = 0; base_ep->use_unsolicited_write_recv = true; return 0; } @@ -801,7 +817,7 @@ int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *ep, /* Create a dummy qp for query only */ efa_base_ep_construct_ibv_qp_init_attr_ex(ep, 
&attr_ex, ibv_cq.ibv_cq_ex, ibv_cq.ibv_cq_ex); - ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled); + ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled, false); if (ret) goto out; diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 7144984c4ff..dd405d18dcc 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -119,7 +119,8 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *user_info, struct fid_ep **ep_fid, void *context); int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, - uint32_t tclass, bool enable_unsolicited_write_recv); + uint32_t tclass, bool enable_unsolicited_write_recv, + bool use_inline_write); void efa_qp_destruct(struct efa_qp *qp); diff --git a/prov/efa/src/efa_data_path_direct_entry.h b/prov/efa/src/efa_data_path_direct_entry.h index ea70155478d..d9311cd44b8 100644 --- a/prov/efa/src/efa_data_path_direct_entry.h +++ b/prov/efa/src/efa_data_path_direct_entry.h @@ -417,7 +417,7 @@ static inline int efa_data_path_direct_post_send( uint32_t qkey) { struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; - struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ + struct efa_io_tx_wqe_128 local_wqe = {0}; struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; int err = 0; @@ -498,7 +498,7 @@ static inline int efa_data_path_direct_post_read( uint32_t qkey) { struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; - struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ + struct efa_io_tx_wqe_128 local_wqe = {0}; struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem; int err; @@ -578,6 +578,8 @@ efa_data_path_direct_post_write( struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, + const struct ibv_data_buf *inline_data_list, 
+ bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, @@ -588,7 +590,7 @@ efa_data_path_direct_post_write( uint32_t qkey) { struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; - struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ + struct efa_io_tx_wqe_128 local_wqe = {0}; struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem; int err; @@ -644,10 +646,18 @@ efa_data_path_direct_post_write( /* Set remote memory information */ efa_send_wr_set_rdma_addr(remote_mem, remote_key, remote_addr); - remote_mem->length = efa_sge_total_bytes(sge_list, sge_count); - /* Set local SGE list - caller has prepared sge_list */ - efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count); + if (use_inline) { + assert(sge_count == 1); + memcpy(local_wqe.data.rdma_req.inline_data, + inline_data_list[0].addr, inline_data_list[0].length); + remote_mem->length = inline_data_list[0].length; + EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + meta_desc->length = inline_data_list[0].length; + } else { + remote_mem->length = efa_sge_total_bytes(sge_list, sge_count); + efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count); + } efa_data_path_direct_send_wr_post(qp, sq, &local_wqe); diff --git a/prov/efa/src/efa_data_path_direct_internal.h b/prov/efa/src/efa_data_path_direct_internal.h index 3a0610e235f..9137d62f96c 100644 --- a/prov/efa/src/efa_data_path_direct_internal.h +++ b/prov/efa/src/efa_data_path_direct_internal.h @@ -594,19 +594,23 @@ EFA_ALWAYS_INLINE void efa_data_path_direct_send_wr_post( struct efa_qp *qp, struct efa_data_path_direct_sq *sq, - struct efa_io_tx_wqe *wqe) + struct efa_io_tx_wqe_128 *wqe) { uint32_t sq_desc_idx; uint64_t *src, *dst; - /* Calculate target address in write-combined memory */ + /* Calculate target address in 
write-combined memory. + * Use byte-level arithmetic since wqe_size may be 64 or 128 bytes. */ sq_desc_idx = sq->wq.pc & sq->wq.desc_mask; src = (uint64_t *)wqe; - dst = (uint64_t *)((struct efa_io_tx_wqe *)sq->desc + sq_desc_idx); + dst = (uint64_t *)((uint8_t *)sq->desc + sq_desc_idx * sq->wq.wqe_size); - /* Copy 64-byte WQE using 8 uint64_t stores */ - for (int i = 0; i < 8; i++) - dst[i] = src[i]; + /* + * Use mmio_memcpy_x64 to copy the WQE to write-combined memory + * with proper 8-byte atomic stores. The wqe_size is either 64 or + * 128 bytes depending on whether wide WQE is enabled. + */ + mmio_memcpy_x64(dst, src, sq->wq.wqe_size); #if HAVE_LTTNG efa_data_path_direct_tracepoint_post_send(qp, sq, &wqe->meta); @@ -644,7 +648,7 @@ EFA_ALWAYS_INLINE void efa_data_path_direct_set_ud_addr(struct efa_io_tx_meta_de * @param num_buf Number of data buffers * @param buf_list Array of data buffers */ -EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe *wqe, +EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe_128 *wqe, size_t num_buf, const struct ibv_data_buf *buf_list) { diff --git a/prov/efa/src/efa_data_path_ops.h b/prov/efa/src/efa_data_path_ops.h index b62d9ce936d..3109d01155e 100644 --- a/prov/efa/src/efa_data_path_ops.h +++ b/prov/efa/src/efa_data_path_ops.h @@ -43,10 +43,12 @@ int efa_qp_post_read(struct efa_qp *qp, const struct ibv_sge *sge_list, uint64_t remote_addr, uintptr_t wr_id, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, - size_t sge_count, uint32_t remote_key, - uint64_t remote_addr, uintptr_t wr_id, uint64_t data, - uint64_t flags, struct efa_ah *ah, uint32_t qpn, - uint32_t qkey); + size_t sge_count, + const struct ibv_data_buf *inline_data_list, + bool use_inline, + uint32_t remote_key, uint64_t remote_addr, + uintptr_t wr_id, uint64_t data, uint64_t flags, + struct efa_ah *ah, uint32_t qpn, 
uint32_t qkey); int efa_ibv_cq_start_poll(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr); int efa_ibv_cq_next_poll(struct efa_ibv_cq *ibv_cq); enum ibv_wc_opcode efa_ibv_cq_wc_read_opcode(struct efa_ibv_cq *ibv_cq); @@ -271,6 +273,8 @@ static inline int efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, + const struct ibv_data_buf *inline_data_list, + bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, @@ -285,6 +289,7 @@ efa_qp_post_write(struct efa_qp *qp, #if HAVE_EFA_DATA_PATH_DIRECT if (qp->data_path_direct_enabled) return efa_data_path_direct_post_write(qp, sge_list, sge_count, + inline_data_list, use_inline, remote_key, remote_addr, wr_id, data, flags, ah, qpn, qkey); #endif return efa_ibv_post_write(qp, sge_list, sge_count, diff --git a/prov/efa/src/efa_device.c b/prov/efa/src/efa_device.c index fc5d16d1745..45ef95d9d9e 100644 --- a/prov/efa/src/efa_device.c +++ b/prov/efa/src/efa_device.c @@ -423,6 +423,26 @@ bool efa_device_support_rdma_write(void) } #endif +/** + * @brief check whether efa device supports wide WQE (128-byte) + * + * @return a boolean indicating wide WQE support + */ +#if HAVE_INLINE_BUF_SIZE_EX +bool efa_device_support_wide_wqe(void) +{ + assert(g_efa_selected_device_cnt > 0); + + return g_efa_selected_device_list[0].efa_attr.inline_buf_size_ex > + g_efa_selected_device_list[0].efa_attr.inline_buf_size; +} +#else +bool efa_device_support_wide_wqe(void) +{ + return false; +} +#endif + /** * @brief check whether efa device support unsolicited write recv * diff --git a/prov/efa/src/efa_device.h b/prov/efa/src/efa_device.h index 83102f87233..97f14bd81b0 100644 --- a/prov/efa/src/efa_device.h +++ b/prov/efa/src/efa_device.h @@ -54,6 +54,8 @@ bool efa_device_support_rdma_read(void); bool efa_device_support_rdma_write(void); +bool efa_device_support_wide_wqe(void); + bool efa_device_support_unsolicited_write_recv(void); bool efa_device_support_cq_with_ext_mem_dmabuf(void); 
diff --git a/prov/efa/src/efa_ep.c b/prov/efa/src/efa_ep.c index d49915b60fb..e47811069d2 100644 --- a/prov/efa/src/efa_ep.c +++ b/prov/efa/src/efa_ep.c @@ -132,10 +132,10 @@ static int efa_ep_setopt(fid_t fid, int level, int optname, const void *optval, EFA_EP_SETOPT_THRESHOLD(MAX_RMA_SIZE, ep->max_rma_size, (size_t) ep->domain->device->max_rdma_size) break; case FI_OPT_INJECT_MSG_SIZE: - EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, ep->inject_msg_size, (size_t) ep->domain->device->efa_attr.inline_buf_size) + EFA_EP_SETOPT_THRESHOLD(INJECT_MSG_SIZE, ep->inject_msg_size, (size_t) ep->info->tx_attr->inject_size) break; case FI_OPT_INJECT_RMA_SIZE: - EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, ep->inject_rma_size, (size_t) 0) + EFA_EP_SETOPT_THRESHOLD(INJECT_RMA_SIZE, ep->inject_rma_size, ep->inject_rma_size) break; /* no op as efa direct ep will not use cuda api and shm in data transfer */ case FI_OPT_CUDA_API_PERMITTED: /* fall through */ diff --git a/prov/efa/src/efa_io_defs.h b/prov/efa/src/efa_io_defs.h index efbc0578d4a..1798dd116d6 100644 --- a/prov/efa/src/efa_io_defs.h +++ b/prov/efa/src/efa_io_defs.h @@ -121,19 +121,24 @@ struct efa_io_remote_mem_addr { uint32_t buf_addr_hi; }; -struct efa_io_rdma_req { +struct efa_io_rdma_req_128 { /* Remote memory address */ struct efa_io_remote_mem_addr remote_mem; - /* Local memory address */ - struct efa_io_tx_buf_desc local_mem[1]; + union { + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; + + /* inline data for RDMA */ + uint8_t inline_data[80]; + }; }; /* - * Tx WQE, composed of tx meta descriptors followed by either tx buffer - * descriptors or inline data + * 128-byte Tx WQE, composed of tx meta descriptors followed by either tx + * buffer descriptors or inline data */ -struct efa_io_tx_wqe { +struct efa_io_tx_wqe_128 { /* TX meta */ struct efa_io_tx_meta_desc meta; @@ -141,10 +146,10 @@ struct efa_io_tx_wqe { /* Send buffer descriptors */ struct efa_io_tx_buf_desc sgl[2]; - uint8_t inline_data[32]; 
+ uint8_t inline_data[80]; /* RDMA local and remote memory addresses */ - struct efa_io_rdma_req rdma_req; + struct efa_io_rdma_req_128 rdma_req; } data; }; diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index a555729d53c..27dd9b5b189 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -239,7 +239,11 @@ void efa_prov_info_set_tx_rx_attr(struct fi_info *prov_info, prov_info->tx_attr->mode |= FI_CONTEXT2; prov_info->rx_attr->mode |= FI_CONTEXT2; +#if HAVE_INLINE_BUF_SIZE_EX + prov_info->tx_attr->inject_size = device->efa_attr.inline_buf_size_ex; +#else prov_info->tx_attr->inject_size = device->efa_attr.inline_buf_size; +#endif prov_info->tx_attr->iov_limit = device->efa_attr.max_sq_sge; prov_info->tx_attr->size = rounddown_power_of_two(device->efa_attr.max_sq_wr); prov_info->rx_attr->iov_limit = device->efa_attr.max_rq_sge; diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index cf136e623b5..4f99d719cbc 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -200,18 +200,14 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, struct efa_conn *conn; size_t iov_count = msg->iov_count; struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */ + struct ibv_data_buf inline_data_list[2]; uintptr_t wr_id; + bool use_inline; int i, err = 0; size_t total_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); struct efa_context *efa_ctx; struct efa_direct_ope *direct_ope; - if (flags & FI_INJECT && total_len > base_ep->inject_rma_size) { - EFA_WARN(FI_LOG_EP_DATA, - "Message size of %zu exceeds efa-directs inject_rma_size size of %zu.\n", total_len, base_ep->inject_rma_size); - return -FI_ENOSYS; - } - efa_tracepoint(write_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); EFA_DBG(FI_LOG_EP_DATA, "total len: %zu, addr: %lu, context: %lx, flags: %lx\n", @@ -236,12 +232,28 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, wr_id = (uintptr_t) 
efa_ctx; } + if ((flags & FI_INJECT) && total_len > base_ep->inject_rma_size) { + EFA_WARN(FI_LOG_EP_DATA, + "inject size %zu exceeds inject_rma_size %zu\n", + total_len, base_ep->inject_rma_size); + err = -FI_EINVAL; + goto out_err; + } + + use_inline = (total_len <= base_ep->inject_rma_size && + (!msg->desc || !efa_mr_is_hmem(msg->desc[0]))); + /* Handle 0-byte write with bounce buffer */ if (total_len == 0) { sge_list[0].addr = (uint64_t)domain->zero_byte_bounce_buf; sge_list[0].length = 0; sge_list[0].lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; iov_count = 1; + } else if (use_inline) { + for (i = 0; i < msg->iov_count; i++) { + inline_data_list[i].addr = msg->msg_iov[i].iov_base; + inline_data_list[i].length = msg->msg_iov[i].iov_len; + } } else { /* Prepare SGE list */ for (i = 0; i < msg->iov_count; ++i) { @@ -263,6 +275,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, /* Use consolidated RDMA write function */ err = efa_qp_post_write(base_ep->qp, sge_list, iov_count, + inline_data_list, use_inline, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, msg->data, flags, conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); @@ -360,86 +373,50 @@ ssize_t efa_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, } ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t addr, uint64_t key) + fi_addr_t dest_addr, uint64_t addr, uint64_t key) { + struct fi_msg_rma msg; + struct iovec iov; + struct fi_rma_iov rma_iov; struct efa_base_ep *base_ep; - struct efa_domain *domain; - struct ibv_sge sge; - struct efa_conn *conn; - uintptr_t wr_id; int err; base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - domain = base_ep->domain; err = efa_rma_check_cap(base_ep); if (err) return err; - - /* Only support 0-byte inject for efa-direct */ if (len > base_ep->inject_rma_size) return -FI_ENOSYS; - ofi_genlock_lock(&base_ep->util_ep.lock); - - wr_id = (uintptr_t) 
efa_fill_context(NULL, dest_addr, FI_INJECT, FI_RMA | FI_WRITE); - - sge.addr = (uint64_t)domain->zero_byte_bounce_buf; - sge.length = 0; - sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); - - err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, 0, 0, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); - if (OFI_UNLIKELY(err)) - err = (err == ENOMEM) ? -FI_EAGAIN : -err; + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, 0); - ofi_genlock_unlock(&base_ep->util_ep.lock); - return err; + return efa_rma_post_write(base_ep, &msg, FI_INJECT); } -static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, - uint64_t addr, uint64_t key) +ssize_t efa_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, + size_t len, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key) { + struct fi_msg_rma msg; + struct iovec iov; + struct fi_rma_iov rma_iov; struct efa_base_ep *base_ep; - struct efa_domain *domain; - struct efa_conn *conn; - struct ibv_sge sge; - uintptr_t wr_id; int err; - base_ep = container_of(ep, struct efa_base_ep, util_ep.ep_fid); - domain = base_ep->domain; + base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); err = efa_rma_check_cap(base_ep); if (err) return err; - - /* Only support 0-byte inject for efa-direct */ if (len > base_ep->inject_rma_size) return -FI_ENOSYS; - ofi_genlock_lock(&base_ep->util_ep.lock); - - wr_id = (uintptr_t) efa_fill_context(NULL, dest_addr, FI_INJECT | FI_REMOTE_CQ_DATA, FI_RMA | FI_WRITE); - - sge.addr = (uint64_t)domain->zero_byte_bounce_buf; - sge.length = 0; - sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); - - err = 
efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, data, IBV_SEND_INLINE, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); - if (OFI_UNLIKELY(err)) - err = (err == ENOMEM) ? -FI_EAGAIN : -err; + EFA_SETUP_IOV(iov, buf, len); + EFA_SETUP_RMA_IOV(rma_iov, addr, len, key); + EFA_SETUP_MSG_RMA(msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, data); - ofi_genlock_unlock(&base_ep->util_ep.lock); - return err; + return efa_rma_post_write(base_ep, &msg, FI_INJECT | FI_REMOTE_CQ_DATA); } struct fi_ops_rma efa_dgram_ep_rma_ops = { diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 2e88f16a9d9..1ced1958a75 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -503,6 +503,51 @@ int efa_user_info_alter_direct(int version, struct fi_info *info, const struct f info->tx_attr->caps &= ~OFI_TX_RMA_CAPS; info->rx_attr->caps &= ~OFI_RX_RMA_CAPS; } + /* + * Handle inject_size for wide WQE support. + * + * prov_info advertises inline_buf_size_ex as inject_size so that + * ofi_check_info allows larger hints. Here we adjust the actual + * inject_size and tx queue depth based on what the user requested. 
+ */ + struct efa_device *device = &g_efa_selected_device_list[0]; + uint16_t inline_buf_size = device->efa_attr.inline_buf_size; + + if (!hints || !hints->tx_attr || !hints->tx_attr->inject_size) { + /* No hint: default to inline_buf_size */ + info->tx_attr->inject_size = inline_buf_size; + } else if (hints->tx_attr->inject_size > inline_buf_size) { + /* Wide WQE: query actual tx depth */ +#if HAVE_INLINE_BUF_SIZE_EX + struct efadv_sq_depth_attr sq_attr = {0}; + int max_sq_depth; + + sq_attr.max_inline_data = hints->tx_attr->inject_size; + sq_attr.flags = EFADV_SQ_DEPTH_ATTR_INLINE_WRITE; + max_sq_depth = efadv_get_max_sq_depth(device->ibv_ctx, + &sq_attr, + sizeof(sq_attr)); + if (max_sq_depth < 0) { + EFA_INFO(FI_LOG_CORE, + "efadv_get_max_sq_depth failed: %d\n", + max_sq_depth); + return -FI_ENODATA; + } + if (hints->tx_attr->size > max_sq_depth) { + EFA_INFO(FI_LOG_CORE, + "Requested TX SQ depth (%zu) exceeds maximum depth (%d)" + " for inline size %zu\n", + hints->tx_attr->size, max_sq_depth, + hints->tx_attr->inject_size); + return -FI_ENODATA; + } + info->tx_attr->size = max_sq_depth; +#else + return -FI_ENODATA; +#endif + } + /* inject_size <= inline_buf_size: no adjustment needed, + * ofi_alter_info will set inject_size from hints */ /* * Handle user-provided hints and adapt the info object passed back up * based on EFA-specific constraints. 
diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 704af2c7613..de1d09ff659 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -710,7 +710,7 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) sge.length = len; sge.lkey = ((struct efa_mr *)desc)->ibv_mr->lkey; - err = efa_qp_post_write(qp, &sge, 1, remote_key, remote_buf, wr_id, + err = efa_qp_post_write(qp, &sge, 1, NULL, false, remote_key, remote_buf, wr_id, cq_data, txe->fi_flags, ah, qpn, qkey); #if ENABLE_DEBUG diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 563ac92e925..b441cc3eadf 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -298,10 +298,12 @@ void efa_unit_test_resource_destruct(struct efa_resource *resource) if (resource->info) { fi_freeinfo(resource->info); + resource->info = NULL; } if (resource->hints) { fi_freeinfo(resource->hints); + resource->hints = NULL; } } diff --git a/prov/efa/test/efa_unit_test_data_path_direct.c b/prov/efa/test/efa_unit_test_data_path_direct.c index b74406dc119..64651418274 100644 --- a/prov/efa/test/efa_unit_test_data_path_direct.c +++ b/prov/efa/test/efa_unit_test_data_path_direct.c @@ -45,7 +45,7 @@ static void test_efa_data_path_direct_multiple_sge_fail_impl(struct efa_resource if (fi_opcode == FI_READ) { ret = efa_data_path_direct_post_read(qp, sge_list, 2, 123456, 0x87654321, 0, 0, NULL, 0, 0); } else { - ret = efa_data_path_direct_post_write(qp, sge_list, 2, 123456, 0x87654321, 0, 0, 0, NULL, 0, 0); + ret = efa_data_path_direct_post_write(qp, sge_list, 2, NULL, false, 123456, 0x87654321, 0, 0, 0, NULL, 0, 0); } assert_int_equal(ret, EINVAL); diff --git a/prov/efa/test/efa_unit_test_data_path_ops.c b/prov/efa/test/efa_unit_test_data_path_ops.c index e7493ce2cbb..e0dbb6b6a8e 100644 --- a/prov/efa/test/efa_unit_test_data_path_ops.c +++ b/prov/efa/test/efa_unit_test_data_path_ops.c @@ -37,6 +37,7 @@ int 
efa_qp_post_read(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t s } int efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, + const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey) { diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index 72649fa71e0..d6bb4eff191 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -1190,3 +1190,147 @@ void test_info_direct_msg_rma_too_large_max_msg_size_fail() fi_freeinfo(hints); } +/* + * @brief Test inject_size hint with no hint (default behavior) + * Should return default 32-byte inject size + */ +void test_info_direct_inject_size_no_hint(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_device *device = &g_efa_selected_device_list[0]; + struct fi_info **info = &resource->info; + int err; + + err = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0ULL, NULL, info); + assert_int_equal(err, 0); + assert_non_null(*info); + assert_int_equal((*info)->tx_attr->inject_size, 32); + assert_int_equal((*info)->tx_attr->inject_size, device->efa_attr.inline_buf_size); +} + +/** + * @brief Test inject_size hint with small size (<=32 bytes) + * Should return requested inject size with normal TX queue depth + */ +void test_info_direct_inject_size_small(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_device *device = &g_efa_selected_device_list[0]; + struct fi_info **info = &resource->info; + struct fi_info *hints; + int err; + + resource->hints = hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + hints->tx_attr->inject_size = 16; + + err = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0ULL, hints, info); + assert_int_equal(err, 0); + assert_non_null(*info); + 
assert_int_equal((*info)->tx_attr->inject_size, 16); + assert_int_equal(device->efa_attr.inline_buf_size, 32); +} + +/** + * @brief Test inject_size hint with wide WQE size (>32 bytes, <=max_inline_buf_size) + * Should enable wide WQE and reduce TX queue depth by half + */ +void test_info_direct_inject_size_wide_wqe(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fi_info **info = &resource->info; + struct fi_info *hints; + int err; + + resource->hints = hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + hints->tx_attr->inject_size = 64; + + err = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0ULL, hints, info); + if (!efa_device_support_wide_wqe()) { + assert_int_equal(err, -FI_ENODATA); + return; + } + assert_int_equal(err, 0); + assert_int_equal((*info)->tx_attr->inject_size, 64); +} + +/** + * @brief Test inject_size hint exceeding device capability + * Should fail with -FI_ENODATA + */ +void test_info_direct_inject_size_exceeds_max(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fi_info **info = &resource->info; + struct fi_info *hints; + int err; + + resource->hints = hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + hints->tx_attr->inject_size = INT_MAX; + + err = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0ULL, hints, info); + assert_int_equal(err, -FI_ENODATA); +} + +/** + * @brief Test fi_getopt returns correct inject sizes for regular WQE + * Should return MSG inject size only, RMA inject disabled + */ +void test_ep_getopt_inject_size_regular_wqe(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fi_info *hints; + size_t inject_msg_size, inject_rma_size, sz; + + resource->hints = hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + hints->tx_attr->inject_size = 32; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 18), + hints, true, true); + sz = sizeof 
inject_msg_size; + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, + &inject_msg_size, &sz), 0); + sz = sizeof inject_rma_size; + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, + &inject_rma_size, &sz), 0); + + assert_int_equal(inject_msg_size, 32); + assert_int_equal(inject_rma_size, 0); +} + +/** + * @brief Test fi_getopt returns correct inject sizes for wide WQE + * Should return both MSG and RMA inject sizes; when the device lacks wide WQE support, fi_getinfo is expected to fail with -FI_ENODATA and the endpoint checks are not run + */ +void test_ep_getopt_inject_size_wide_wqe(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fi_info **info = &resource->info; + struct fi_info *hints; + size_t inject_msg_size, inject_rma_size, sz; + int err; + + resource->hints = hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + hints->tx_attr->inject_size = 33; + + err = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0ULL, hints, info); + if (!efa_device_support_wide_wqe()) { + assert_int_equal(err, -FI_ENODATA); + return; + } + assert_int_equal(err, 0); + err = fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL); + assert_int_equal(err, 0); + err = fi_domain(resource->fabric, resource->info, &resource->domain, NULL); + assert_int_equal(err, 0); + err = fi_endpoint(resource->domain, resource->info, &resource->ep, NULL); + assert_int_equal(err, 0); + + sz = sizeof inject_msg_size; + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE, + &inject_msg_size, &sz), 0); + sz = sizeof inject_rma_size; + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE, + &inject_rma_size, &sz), 0); + + assert_int_equal(inject_msg_size, 33); + assert_int_equal(inject_rma_size, 33); +} diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index dcbb8ba9ee0..47f0ddcbe72 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -305,7 +305,7 @@ int 
efa_mock_efa_qp_post_read_return_mock(struct efa_qp *qp, const struct ibv_sg return mock_int(); } -int efa_mock_efa_qp_post_write_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey) +int efa_mock_efa_qp_post_write_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey) { efa_mock_efa_qp_post_save_wr_id(wr_id); return mock_int(); @@ -412,9 +412,9 @@ int __wrap_efa_qp_post_read(struct efa_qp *qp, const struct ibv_sge *sge_list, s return g_efa_unit_test_mocks.efa_qp_post_read(qp, sge_list, sge_count, remote_key, remote_addr, wr_id, flags, ah, qpn, qkey); } -int __wrap_efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey) +int __wrap_efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey) { - return g_efa_unit_test_mocks.efa_qp_post_write(qp, sge_list, sge_count, remote_key, remote_addr, wr_id, data, flags, ah, qpn, qkey); + return g_efa_unit_test_mocks.efa_qp_post_write(qp, sge_list, sge_count, inline_data_list, use_inline, remote_key, remote_addr, wr_id, data, flags, ah, qpn, qkey); } int __wrap_efa_ibv_cq_start_poll(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr) diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index 
28cf6a6c6ec..d52f70148f8 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -101,7 +101,7 @@ ssize_t efa_mock_efa_rdm_ope_post_send_return_mock(struct efa_rdm_ope *ope, int int __real_efa_qp_post_recv(struct efa_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad); int __real_efa_qp_post_send(struct efa_qp *qp, const struct ibv_sge *sge_list, const struct ibv_data_buf *inline_data_list, size_t iov_count, bool use_inline, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int __real_efa_qp_post_read(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); -int __real_efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); +int __real_efa_qp_post_write(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int __real_efa_ibv_cq_start_poll(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr); int __real_efa_ibv_cq_next_poll(struct efa_ibv_cq *ibv_cq); enum ibv_wc_opcode __real_efa_ibv_cq_wc_read_opcode(struct efa_ibv_cq *ibv_cq); @@ -128,7 +128,7 @@ int efa_mock_efa_qp_post_recv_return_mock(struct efa_qp *qp, struct ibv_recv_wr int efa_mock_efa_qp_post_send_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, const struct ibv_data_buf *inline_data_list, size_t iov_count, bool use_inline, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int 
efa_mock_efa_qp_post_send_verify_handshake_pkt_local_host_id_and_save_wr(struct efa_qp *qp, const struct ibv_sge *sge_list, const struct ibv_data_buf *inline_data_list, size_t iov_count, bool use_inline, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int efa_mock_efa_qp_post_read_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); -int efa_mock_efa_qp_post_write_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); +int efa_mock_efa_qp_post_write_return_mock(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int efa_mock_efa_ibv_cq_start_poll_return_mock(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr); int efa_mock_efa_ibv_cq_next_poll_access_cur_wq(struct efa_ibv_cq *ibv_cq); int efa_mock_efa_ibv_cq_next_poll_return_mock(struct efa_ibv_cq *ibv_cq); @@ -205,7 +205,7 @@ struct efa_unit_test_mocks int (*efa_qp_post_recv)(struct efa_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad); int (*efa_qp_post_send)(struct efa_qp *qp, const struct ibv_sge *sge_list, const struct ibv_data_buf *inline_data_list, size_t iov_count, bool use_inline, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int (*efa_qp_post_read)(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); - int 
(*efa_qp_post_write)(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); + int (*efa_qp_post_write)(struct efa_qp *qp, const struct ibv_sge *sge_list, size_t sge_count, const struct ibv_data_buf *inline_data_list, bool use_inline, uint32_t remote_key, uint64_t remote_addr, uintptr_t wr_id, uint64_t data, uint64_t flags, struct efa_ah *ah, uint32_t qpn, uint32_t qkey); int (*efa_ibv_cq_start_poll)(struct efa_ibv_cq *ibv_cq, struct ibv_poll_cq_attr *attr); int (*efa_ibv_cq_next_poll)(struct efa_ibv_cq *ibv_cq); enum ibv_wc_opcode (*efa_ibv_cq_wc_read_opcode)(struct efa_ibv_cq *ibv_cq); diff --git a/prov/efa/test/efa_unit_test_rma.c b/prov/efa/test/efa_unit_test_rma.c index 80778c4165d..8cfae393022 100644 --- a/prov/efa/test/efa_unit_test_rma.c +++ b/prov/efa/test/efa_unit_test_rma.c @@ -14,7 +14,9 @@ extern struct fi_ops_rma efa_rma_ops; * fails with FI_RMA hints, then constructs the resource without FI_RMA * so callers can test the -FI_EOPNOTSUPP error path. 
*/ -static bool test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) +static bool test_efa_rma_prep_with_inject_size(struct efa_resource *resource, + fi_addr_t *addr, + size_t inject_size) { struct efa_ep_addr raw_addr; size_t raw_addr_len = sizeof(raw_addr); @@ -24,6 +26,9 @@ static bool test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); assert_non_null(resource->hints); + if (inject_size && efa_device_support_wide_wqe()) + resource->hints->tx_attr->inject_size = inject_size; + fi_rma_supported = efa_device_support_rdma_read() && efa_device_support_rdma_write(); if (fi_rma_supported) { @@ -65,6 +70,11 @@ static bool test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) return fi_rma_supported; } +static bool test_efa_rma_prep(struct efa_resource *resource, fi_addr_t *addr) +{ + return test_efa_rma_prep_with_inject_size(resource, addr, 0); +} + void test_efa_rma_read(struct efa_resource **state) { struct efa_resource *resource = *state; @@ -273,6 +283,39 @@ void test_efa_rma_writemsg(struct efa_resource **state) efa_unit_test_buff_destruct(&local_buff); } +void test_efa_rma_writemsg_with_wide_wqe_inject(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff local_buff; + struct iovec iov; + struct fi_msg_rma msg = {0}; + struct fi_rma_iov rma_iov; + fi_addr_t dest_addr; + int ret; + + bool fi_rma_supported = test_efa_rma_prep_with_inject_size(resource, &dest_addr, 42); + + if (!fi_rma_supported || !efa_device_support_wide_wqe()) + skip(); + + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, NULL, 1, dest_addr, &rma_iov, + 1, NULL, 0); + + 
assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); + ret = fi_writemsg(resource->ep, &msg, FI_INJECT); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); + + efa_unit_test_buff_destruct(&local_buff); +} + void test_efa_rma_writedata(struct efa_resource **state) { struct efa_resource *resource = *state; @@ -314,7 +357,7 @@ void test_efa_rma_inject_write(struct efa_resource **state) uint64_t remote_addr = 0x87654321; uint64_t remote_key = 123456; - bool fi_rma_supported = test_efa_rma_prep(resource, &dest_addr); + bool fi_rma_supported = test_efa_rma_prep_with_inject_size(resource, &dest_addr, 42); if (!fi_rma_supported) { ret = fi_inject_write(resource->ep, NULL, 0, dest_addr, remote_addr, remote_key); @@ -322,11 +365,26 @@ void test_efa_rma_inject_write(struct efa_resource **state) return; } + if (!efa_device_support_wide_wqe()) { + size_t inject_rma_size, sz = sizeof(inject_rma_size); + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_INJECT_RMA_SIZE, &inject_rma_size, &sz), 0); + assert_int_equal(inject_rma_size, 0); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + ret = fi_inject_write(resource->ep, local_buff.buff, local_buff.size, + dest_addr, remote_addr, remote_key); + assert_int_equal(ret, -FI_ENOSYS); + efa_unit_test_buff_destruct(&local_buff); + return; + } + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_inject_write(resource->ep, local_buff.buff, local_buff.size, dest_addr, remote_addr, remote_key); - assert_int_equal(ret, -FI_ENOSYS); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); efa_unit_test_buff_destruct(&local_buff); } @@ -340,7 +398,7 @@ void test_efa_rma_inject_writedata(struct efa_resource **state) uint64_t remote_addr = 0x87654321; uint64_t remote_key = 123456; - bool fi_rma_supported = test_efa_rma_prep(resource, &dest_addr); + bool fi_rma_supported = 
test_efa_rma_prep_with_inject_size(resource, &dest_addr, 42); if (!fi_rma_supported) { ret = fi_inject_writedata(resource->ep, NULL, 0, 0, dest_addr, remote_addr, remote_key); @@ -348,12 +406,28 @@ void test_efa_rma_inject_writedata(struct efa_resource **state) return; } + if (!efa_device_support_wide_wqe()) { + size_t inject_rma_size, sz = sizeof(inject_rma_size); + assert_int_equal(fi_getopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_INJECT_RMA_SIZE, &inject_rma_size, &sz), 0); + assert_int_equal(inject_rma_size, 0); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + ret = fi_inject_writedata(resource->ep, local_buff.buff, + local_buff.size, 0, dest_addr, + remote_addr, remote_key); + assert_int_equal(ret, -FI_ENOSYS); + efa_unit_test_buff_destruct(&local_buff); + return; + } + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_inject_writedata(resource->ep, local_buff.buff, local_buff.size, 0, dest_addr, remote_addr, remote_key); - assert_int_equal(ret, -FI_ENOSYS); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); efa_unit_test_buff_destruct(&local_buff); } @@ -366,10 +440,9 @@ void test_efa_rma_writemsg_with_inject(struct efa_resource **state) struct fi_msg_rma msg = {0}; struct fi_rma_iov rma_iov; fi_addr_t dest_addr; - void *desc; int ret; - bool fi_rma_supported = test_efa_rma_prep(resource, &dest_addr); + bool fi_rma_supported = test_efa_rma_prep_with_inject_size(resource, &dest_addr, 42); if (!fi_rma_supported) { efa_unit_test_construct_msg_rma(&msg, &iov, NULL, 0, dest_addr, &rma_iov, 1, NULL, 0); @@ -378,19 +451,39 @@ void test_efa_rma_writemsg_with_inject(struct efa_resource **state) return; } - efa_unit_test_buff_construct(&local_buff, resource, 4096 /* buff_size */); + if (!efa_device_support_wide_wqe()) { + size_t inject_rma_size, sz = sizeof(inject_rma_size); + assert_int_equal(fi_getopt(&resource->ep->fid, 
FI_OPT_ENDPOINT, + FI_OPT_INJECT_RMA_SIZE, &inject_rma_size, &sz), 0); + assert_int_equal(inject_rma_size, 0); + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); + iov.iov_base = local_buff.buff; + iov.iov_len = local_buff.size; + rma_iov.len = local_buff.size; + rma_iov.addr = 0x87654321; + rma_iov.key = 123456; + efa_unit_test_construct_msg_rma(&msg, &iov, NULL, 1, dest_addr, &rma_iov, + 1, NULL, 0); + ret = fi_writemsg(resource->ep, &msg, FI_INJECT); + assert_int_equal(ret, -FI_EINVAL); + efa_unit_test_buff_destruct(&local_buff); + return; + } + + efa_unit_test_buff_construct(&local_buff, resource, 32 /* buff_size */); iov.iov_base = local_buff.buff; iov.iov_len = local_buff.size; - desc = fi_mr_desc(local_buff.mr); rma_iov.len = local_buff.size; rma_iov.addr = 0x87654321; rma_iov.key = 123456; - efa_unit_test_construct_msg_rma(&msg, &iov, &desc, 1, dest_addr, &rma_iov, + efa_unit_test_construct_msg_rma(&msg, &iov, NULL, 1, dest_addr, &rma_iov, 1, NULL, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); ret = fi_writemsg(resource->ep, &msg, FI_INJECT); - assert_int_equal(ret, -FI_ENOSYS); + assert_int_equal(ret, 0); + assert_int_equal(g_ibv_submitted_wr_id_cnt, 1); efa_unit_test_buff_destruct(&local_buff); } diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 925deb888f4..616be0ecadb 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -303,6 +303,12 @@ int main(void) cmocka_unit_test_setup_teardown(test_info_direct_msg_only_large_max_msg_size_fail, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_direct_msg_rma_large_max_msg_size_success, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_direct_msg_rma_too_large_max_msg_size_fail, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_direct_inject_size_no_hint, 
efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_direct_inject_size_small, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_direct_inject_size_wide_wqe, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_direct_inject_size_exceeds_max, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ep_getopt_inject_size_regular_wqe, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_ep_getopt_inject_size_wide_wqe, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), /* end efa_unit_test_info.c */ cmocka_unit_test_setup_teardown(test_efa_hmem_info_p2p_dmabuf_assumed_neuron, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -445,6 +451,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rma_inject_write, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_inject_writedata, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_writemsg_with_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rma_writemsg_with_wide_wqe_inject, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_read_0_byte, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_readv_0_byte, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rma_readmsg_0_byte, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 845564515a8..0b802cda0d7 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -258,6 +258,12 @@ void 
test_info_direct_msg_only_small_max_msg_size_success(); void test_info_direct_msg_only_large_max_msg_size_fail(); void test_info_direct_msg_rma_large_max_msg_size_success(); void test_info_direct_msg_rma_too_large_max_msg_size_fail(); +void test_info_direct_inject_size_no_hint(); +void test_info_direct_inject_size_small(); +void test_info_direct_inject_size_wide_wqe(); +void test_info_direct_inject_size_exceeds_max(); +void test_ep_getopt_inject_size_regular_wqe(); +void test_ep_getopt_inject_size_wide_wqe(); /* end efa_unit_test_info.c */ void test_efa_srx_min_multi_recv_size(); @@ -398,6 +404,7 @@ void test_efa_rma_writedata(); void test_efa_rma_inject_write(); void test_efa_rma_inject_writedata(); void test_efa_rma_writemsg_with_inject(); +void test_efa_rma_writemsg_with_wide_wqe_inject(); void test_efa_rma_read_0_byte(); void test_efa_rma_readv_0_byte(); void test_efa_rma_readmsg_0_byte();