-
Notifications
You must be signed in to change notification settings - Fork 499
prov/efa: Wide WQE support #11944
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
prov/efa: Wide WQE support #11944
Changes from all commits
95b993e
6ed9dc2
8fefff5
ae5655b
1cec156
4618f26
8c839ff
03b0559
b7f6a51
82c6eb5
9d74d10
6ef2b35
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -284,7 +284,8 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep, | |
| * @return int 0 on success, negative integer on failure | ||
| */ | ||
| int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, | ||
| uint32_t tclass, bool use_unsolicited_write_recv) | ||
| uint32_t tclass, bool use_unsolicited_write_recv, | ||
| bool use_inline_write) | ||
| { | ||
| struct efadv_qp_init_attr efa_attr = { 0 }; | ||
|
|
||
|
|
@@ -310,6 +311,10 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex, | |
| #if HAVE_CAPS_UNSOLICITED_WRITE_RECV | ||
| if (use_unsolicited_write_recv) | ||
| efa_attr.flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; | ||
| #endif | ||
| #if HAVE_INLINE_BUF_SIZE_EX | ||
| if (use_inline_write) | ||
| efa_attr.flags |= EFADV_QP_FLAGS_INLINE_WRITE; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to confirm, this is a new flag beyond the existing IBV_QP_*INLINE_WRITE because we already use it in released libfabric and it cannot be reused for the real inline write support. Right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YonatanNachum is adding this flag in RDMA-Core PR: linux-rdma/rdma-core@88917d9 |
||
| #endif | ||
| efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; | ||
| #if HAVE_EFADV_SL | ||
|
|
@@ -382,7 +387,9 @@ void efa_base_ep_construct_ibv_qp_init_attr_ex(struct efa_base_ep *ep, | |
| attr_ex->cap.max_send_sge = device_info->tx_attr->iov_limit; | ||
| attr_ex->cap.max_recv_wr = efa_base_ep_get_rx_pool_size(ep); | ||
| attr_ex->cap.max_recv_sge = device_info->rx_attr->iov_limit; | ||
| attr_ex->cap.max_inline_data = ep->domain->device->efa_attr.inline_buf_size; | ||
| attr_ex->cap.max_inline_data = EFA_INFO_TYPE_IS_DIRECT(ep->info) ? | ||
| ep->info->tx_attr->inject_size : | ||
| ep->domain->device->efa_attr.inline_buf_size; | ||
|
|
||
| EFA_INFO(FI_LOG_EP_CTRL, | ||
| "QP cap max_send_wr=%u max_recv_wr=%u max_send_sge=%u " | ||
|
|
@@ -416,6 +423,14 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep, | |
| int ret; | ||
| struct ibv_qp_init_attr_ex attr_ex = { 0 }; | ||
| bool use_unsolicited_write_recv = true; | ||
| /* | ||
| * Inline RDMA write is only supported with 128-byte wide WQE, | ||
| * which is enabled when the requested inject size exceeds | ||
| * inline_buf_size. | ||
| */ | ||
| bool use_inline_write = EFA_INFO_TYPE_IS_DIRECT(base_ep->info) && | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? Why not enable inline RDMA write for smaller inject sizes? Is it because inline RDMA write is only possible with wide WQE? In that case, need a comment e.g.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it was a design choice. Will add a comment. |
||
| base_ep->info->tx_attr->inject_size > | ||
| base_ep->domain->device->efa_attr.inline_buf_size; | ||
|
|
||
| efa_base_ep_construct_ibv_qp_init_attr_ex(base_ep, &attr_ex, tx_cq->ibv_cq_ex, rx_cq->ibv_cq_ex); | ||
|
|
||
|
|
@@ -434,12 +449,12 @@ static int efa_base_ep_create_qp(struct efa_base_ep *base_ep, | |
| } | ||
| EFA_INFO(FI_LOG_EP_CTRL, "creating QP with unsolicited write recv status: %d\n", use_unsolicited_write_recv); | ||
| ret = efa_qp_create(&base_ep->qp, &attr_ex, base_ep->info->tx_attr->tclass, | ||
| use_unsolicited_write_recv); | ||
| use_unsolicited_write_recv, use_inline_write); | ||
| if (ret) | ||
| return ret; | ||
|
|
||
| if (create_user_recv_qp) { | ||
| ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled); | ||
| ret = efa_qp_create(&base_ep->user_recv_qp, &attr_ex, base_ep->info->tx_attr->tclass, tx_cq->unsolicited_write_recv_enabled, use_inline_write); | ||
| if (ret) { | ||
| efa_base_ep_destruct_qp_unsafe(base_ep); | ||
| return ret; | ||
|
|
@@ -603,10 +618,11 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, | |
| /* Use device's native limit as the default value of base ep*/ | ||
| base_ep->max_msg_size = (size_t) base_ep->domain->device->ibv_port_attr.max_msg_sz; | ||
| base_ep->max_rma_size = (size_t) base_ep->domain->device->max_rdma_size; | ||
| base_ep->inject_msg_size = (size_t) base_ep->domain->device->efa_attr.inline_buf_size; | ||
| /* TODO: update inject_rma_size to inline size after firmware | ||
| * supports inline rdma write */ | ||
| base_ep->inject_rma_size = 0; | ||
| base_ep->inject_msg_size = info->tx_attr->inject_size; | ||
| if (info->tx_attr->inject_size > base_ep->domain->device->efa_attr.inline_buf_size) | ||
| base_ep->inject_rma_size = info->tx_attr->inject_size; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should you combine this commit with the last commit? I see it is calling fi_getopt which should also be impacted by this change? |
||
| else | ||
| base_ep->inject_rma_size = 0; | ||
| base_ep->use_unsolicited_write_recv = true; | ||
| return 0; | ||
| } | ||
|
|
@@ -801,7 +817,7 @@ int efa_base_ep_check_qp_in_order_aligned_128_bytes(struct efa_base_ep *ep, | |
| /* Create a dummy qp for query only */ | ||
| efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq.ibv_cq_ex, ibv_cq.ibv_cq_ex); | ||
|
|
||
| ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled); | ||
| ret = efa_qp_create(&qp, &attr_ex, FI_TC_UNSPEC, ibv_cq.unsolicited_write_recv_enabled, false); | ||
| if (ret) | ||
| goto out; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -417,7 +417,7 @@ static inline int efa_data_path_direct_post_send( | |
| uint32_t qkey) | ||
| { | ||
| struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; | ||
| struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ | ||
| struct efa_io_tx_wqe_128 local_wqe = {0}; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you mean to remove the comment
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I did. I think this comment is misleading (not to mention it's a classic example of code smell). |
||
| struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; | ||
| int err = 0; | ||
|
|
||
|
|
@@ -498,7 +498,7 @@ static inline int efa_data_path_direct_post_read( | |
| uint32_t qkey) | ||
| { | ||
| struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; | ||
| struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ | ||
| struct efa_io_tx_wqe_128 local_wqe = {0}; | ||
| struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; | ||
| struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem; | ||
| int err; | ||
|
|
@@ -578,6 +578,8 @@ efa_data_path_direct_post_write( | |
| struct efa_qp *qp, | ||
| const struct ibv_sge *sge_list, | ||
| size_t sge_count, | ||
| const struct ibv_data_buf *inline_data_list, | ||
| bool use_inline, | ||
| uint32_t remote_key, | ||
| uint64_t remote_addr, | ||
| uintptr_t wr_id, | ||
|
|
@@ -588,7 +590,7 @@ efa_data_path_direct_post_write( | |
| uint32_t qkey) | ||
| { | ||
| struct efa_data_path_direct_sq *sq = &qp->data_path_direct_qp.sq; | ||
| struct efa_io_tx_wqe local_wqe = {0}; /* Stack variable - can be in registers */ | ||
| struct efa_io_tx_wqe_128 local_wqe = {0}; | ||
| struct efa_io_tx_meta_desc *meta_desc = &local_wqe.meta; | ||
| struct efa_io_remote_mem_addr *remote_mem = &local_wqe.data.rdma_req.remote_mem; | ||
| int err; | ||
|
|
@@ -644,10 +646,18 @@ efa_data_path_direct_post_write( | |
|
|
||
| /* Set remote memory information */ | ||
| efa_send_wr_set_rdma_addr(remote_mem, remote_key, remote_addr); | ||
| remote_mem->length = efa_sge_total_bytes(sge_list, sge_count); | ||
|
|
||
| /* Set local SGE list - caller has prepared sge_list */ | ||
| efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count); | ||
| if (use_inline) { | ||
| assert(sge_count == 1); | ||
| memcpy(local_wqe.data.rdma_req.inline_data, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe don't really need memcpy here |
||
| inline_data_list[0].addr, inline_data_list[0].length); | ||
| remote_mem->length = inline_data_list[0].length; | ||
| EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); | ||
| meta_desc->length = inline_data_list[0].length; | ||
| } else { | ||
| remote_mem->length = efa_sge_total_bytes(sge_list, sge_count); | ||
| efa_data_path_direct_set_sgl(local_wqe.data.rdma_req.local_mem, meta_desc, sge_list, sge_count); | ||
| } | ||
|
|
||
| efa_data_path_direct_send_wr_post(qp, sq, &local_wqe); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -594,19 +594,23 @@ EFA_ALWAYS_INLINE void | |
| efa_data_path_direct_send_wr_post( | ||
| struct efa_qp *qp, | ||
| struct efa_data_path_direct_sq *sq, | ||
| struct efa_io_tx_wqe *wqe) | ||
| struct efa_io_tx_wqe_128 *wqe) | ||
| { | ||
| uint32_t sq_desc_idx; | ||
| uint64_t *src, *dst; | ||
|
|
||
| /* Calculate target address in write-combined memory */ | ||
| /* Calculate target address in write-combined memory. | ||
| * Use byte-level arithmetic since wqe_size may be 64 or 128 bytes. */ | ||
| sq_desc_idx = sq->wq.pc & sq->wq.desc_mask; | ||
| src = (uint64_t *)wqe; | ||
| dst = (uint64_t *)((struct efa_io_tx_wqe *)sq->desc + sq_desc_idx); | ||
| dst = (uint64_t *)((uint8_t *)sq->desc + sq_desc_idx * sq->wq.wqe_size); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. better to have a comment here to explain the math... I understand you changed the offset unit from 64 byte to 1 byte, so we need a comment |
||
|
|
||
| /* Copy 64-byte WQE using 8 uint64_t stores */ | ||
| for (int i = 0; i < 8; i++) | ||
| dst[i] = src[i]; | ||
| /* | ||
| * Use mmio_memcpy_x64 to copy the WQE to write-combined memory | ||
| * with proper 8-byte atomic stores. The wqe_size is either 64 or | ||
| * 128 bytes depending on whether wide WQE is enabled. | ||
| */ | ||
| mmio_memcpy_x64(dst, src, sq->wq.wqe_size); | ||
|
|
||
| #if HAVE_LTTNG | ||
| efa_data_path_direct_tracepoint_post_send(qp, sq, &wqe->meta); | ||
|
|
@@ -644,7 +648,7 @@ EFA_ALWAYS_INLINE void efa_data_path_direct_set_ud_addr(struct efa_io_tx_meta_de | |
| * @param num_buf Number of data buffers | ||
| * @param buf_list Array of data buffers | ||
| */ | ||
| EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe *wqe, | ||
| EFA_ALWAYS_INLINE void efa_data_path_direct_set_inline_data(struct efa_io_tx_wqe_128 *wqe, | ||
| size_t num_buf, | ||
| const struct ibv_data_buf *buf_list) | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this the planned behavior?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, fi_info returns ENODATA for
hints->tx_attr->inject_size > 32 if rdma-core and/or FW are old. Therefore the test will report XFAIL until everything is deployed. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Then if firmware is deployed and we are not changing it, the runfabtests.py will fail?