diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 408aa2c1638..7d9d5d1778a 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -78,7 +78,13 @@ _efa_files = \ prov/efa/src/rdm/efa_rdm_tracepoint_def.c \ prov/efa/src/rdm/efa_rdm_srx.c \ prov/efa/src/rdm/efa_rdm_util.c \ - prov/efa/src/rdm/efa_rdm_mr.c + prov/efa/src/rdm/efa_rdm_mr.c \ + prov/efa/src/rdm/efa_rdm_proto.c \ + prov/efa/src/rdm/efa_rdm_proto_eager.c \ + prov/efa/src/rdm/efa_rdm_proto_medium.c \ + prov/efa/src/rdm/efa_rdm_proto_longcts.c \ + prov/efa/src/rdm/efa_rdm_proto_longread.c \ + prov/efa/src/rdm/efa_rdm_proto_runtread.c if ENABLE_EFA_UNIT_TEST _efa_files += prov/efa/test/efa_unit_test_data_path_ops.c @@ -140,7 +146,13 @@ _efa_headers = \ prov/efa/src/rdm/efa_rdm_tracepoint.h \ prov/efa/src/rdm/efa_rdm_srx.h \ prov/efa/src/rdm/efa_rdm_util.h \ - prov/efa/src/rdm/efa_rdm_mr.h + prov/efa/src/rdm/efa_rdm_mr.h \ + prov/efa/src/rdm/efa_rdm_proto.h \ + prov/efa/src/rdm/efa_rdm_proto_eager.h \ + prov/efa/src/rdm/efa_rdm_proto_medium.h \ + prov/efa/src/rdm/efa_rdm_proto_longcts.h \ + prov/efa/src/rdm/efa_rdm_proto_longread.h \ + prov/efa/src/rdm/efa_rdm_proto_runtread.h if HAVE_LTTNG efa_LDFLAGS += -llttng-ust @@ -174,7 +186,8 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_msg.c \ prov/efa/test/efa_unit_test_rma.c \ prov/efa/test/efa_unit_test_rdm_rma.c \ - prov/efa/test/efa_unit_test_data_path_direct.c + prov/efa/test/efa_unit_test_data_path_direct.c \ + prov/efa/test/efa_unit_test_proto.c efa_CPPFLAGS += -I$(top_srcdir)/include -I$(top_srcdir)/prov/efa/test $(cmocka_CPPFLAGS) diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index dc8ebfe9f0a..7e85962cbee 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -93,21 +93,6 @@ #define EFA_DEFAULT_INTER_MIN_READ_WRITE_SIZE (65536) #define EFA_DEFAULT_INTRA_MAX_GDRCOPY_FROM_DEV_SIZE (3072) -/* - * The default memory alignment - */ -#define EFA_RDM_DEFAULT_MEMORY_ALIGNMENT (8) 
- -/* - * The CUDA memory alignment - */ -#define EFA_RDM_CUDA_MEMORY_ALIGNMENT (64) - -/* - * The alignment to support in-order aligned ops. - */ -#define EFA_RDM_IN_ORDER_ALIGNMENT (128) - /* * Set alignment to x86 cache line size. */ diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index 1850f665899..ac28e983b84 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -58,10 +58,6 @@ efa_rdm_atomic_alloc_txe(struct efa_rdm_ep *efa_rdm_ep, return NULL; } - efa_domain_ope_list_lock(efa_rdm_ep_domain(efa_rdm_ep)); - dlist_insert_tail(&txe->ep_entry, &efa_rdm_ep->txe_list); - efa_domain_ope_list_unlock(efa_rdm_ep_domain(efa_rdm_ep)); - ofi_ioc_to_iov(msg_atomic->msg_iov, iov, msg_atomic->iov_count, datatype_size); msg.addr = msg_atomic->addr; msg.msg_iov = iov; diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 7e28916af7e..c63c79d84c6 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -842,8 +842,36 @@ enum ibv_wc_status efa_rdm_cq_process_wc(struct efa_ibv_cq *cq, struct efa_rdm_e #if ENABLE_DEBUG ep->send_comps++; #endif - efa_rdm_pke_handle_send_completion(pkt_entry); - efa_rdm_cq_increment_pkt_entry_gen(pkt_entry); + if (pkt_entry->callback) { + efa_rdm_ep_record_tx_op_completed(pkt_entry->ep, + pkt_entry); + /* + * For a send completion, pkt_entry->peer can be + * NULL in 3 situations: + * 1. the pkt_entry is used for a local read + * operation + * 2. a new peer with same gid+qpn was inserted + * to av, thus the peer was removed from AV. + * 3. application removed the peer's address + * from av. In 1, we should proceed. For 2 and + * 3, the send completion should be ignored. 
+ */ + if (!pkt_entry->peer && + !(pkt_entry->flags & + EFA_RDM_PKE_LOCAL_READ)) { + EFA_WARN( + FI_LOG_CQ, + "ignoring send completion of a " + "packet to a removed peer.\n"); + efa_rdm_pke_release_tx(pkt_entry); + } else { + pkt_entry->callback(pkt_entry); + } + efa_rdm_cq_increment_pkt_entry_gen(pkt_entry); + } else { + efa_rdm_pke_handle_send_completion(pkt_entry); + efa_rdm_cq_increment_pkt_entry_gen(pkt_entry); + } break; case IBV_WC_RECV: /* efa_rdm_cq_handle_recv_completion does additional work to determine the source diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 6008c59a389..8a9f5a50507 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -29,6 +29,21 @@ struct efa_rdm_ep_queued_copy { #define EFA_RDM_MAX_QUEUED_COPY (8) +/* + * The default memory alignment + */ +#define EFA_RDM_DEFAULT_MEMORY_ALIGNMENT (8) + +/* + * The CUDA memory alignment + */ +#define EFA_RDM_CUDA_MEMORY_ALIGNMENT (64) + +/* + * The alignment to support in-order aligned ops. + */ +#define EFA_RDM_IN_ORDER_ALIGNMENT (128) + /** * Max number of opes queued before handshake is made * with their peers. This cnt is per EP. 
@@ -195,7 +210,8 @@ struct efa_rdm_ep { struct efa_rdm_pke **pke_vec; /* Work arrays for efa_rdm_ope_post_send to avoid stack allocation */ struct efa_rdm_pke **send_pkt_entry_vec; - int *send_pkt_entry_size_vec; + size_t *send_pkt_entry_data_sizes; + size_t send_pkt_entry_vec_size; struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ size_t ope_queued_before_handshake_cnt; @@ -220,13 +236,6 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr); struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr_t addr); -struct efa_rdm_ope *efa_rdm_ep_alloc_txe(struct efa_rdm_ep *efa_rdm_ep, - struct efa_rdm_peer *peer, - const struct fi_msg *msg, - uint32_t op, - uint64_t tag, - uint64_t flags); - struct efa_rdm_ope *efa_rdm_ep_alloc_rxe(struct efa_rdm_ep *ep, struct efa_rdm_peer *peer, uint32_t op); @@ -253,7 +262,26 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep, struct dlist_entry *pkts); -size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface); +/** + * @brief Get memory alignment for given ep and hmem iface + * + * @param ep efa rdm ep + * @param iface hmem iface + * @return size_t the memory alignment + */ +static inline size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, + enum fi_hmem_iface iface) +{ + size_t memory_alignment = EFA_RDM_DEFAULT_MEMORY_ALIGNMENT; + + if (ep->sendrecv_in_order_aligned_128_bytes) { + memory_alignment = EFA_RDM_IN_ORDER_ALIGNMENT; + } else if (iface == FI_HMEM_CUDA) { + memory_alignment = EFA_RDM_CUDA_MEMORY_ALIGNMENT; + } + + return memory_alignment; +} static inline struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 
e3c003002a1..56bb2e08994 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -680,9 +680,13 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, goto err_free_pke_vec; } - efa_rdm_ep->send_pkt_entry_size_vec = calloc(sizeof(int), efa_base_ep_get_tx_pool_size(&efa_rdm_ep->base_ep)); - if (!efa_rdm_ep->send_pkt_entry_size_vec) { - EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for efa_rdm_ep->send_pkt_entry_size_vec!\n"); + efa_rdm_ep->send_pkt_entry_data_sizes = + calloc(efa_base_ep_get_tx_pool_size(&efa_rdm_ep->base_ep), + sizeof(*efa_rdm_ep->send_pkt_entry_data_sizes)); + if (!efa_rdm_ep->send_pkt_entry_data_sizes) { + EFA_WARN(FI_LOG_EP_CTRL, + "cannot alloc memory for " + "efa_rdm_ep->send_pkt_entry_data_sizes!\n"); ret = -FI_ENOMEM; goto err_free_send_pkt_entry_vec; } @@ -1190,8 +1194,8 @@ static int efa_rdm_ep_close(struct fid *fid) free(efa_rdm_ep->pke_vec); if (efa_rdm_ep->send_pkt_entry_vec) free(efa_rdm_ep->send_pkt_entry_vec); - if (efa_rdm_ep->send_pkt_entry_size_vec) - free(efa_rdm_ep->send_pkt_entry_size_vec); + if (efa_rdm_ep->send_pkt_entry_data_sizes) + free(efa_rdm_ep->send_pkt_entry_data_sizes); ofi_genlock_unlock(&domain->srx_lock); diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 9c2132710db..cb754815551 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -317,36 +317,6 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe return err; } - - -/* create a new txe */ -struct efa_rdm_ope *efa_rdm_ep_alloc_txe(struct efa_rdm_ep *efa_rdm_ep, - struct efa_rdm_peer *peer, - const struct fi_msg *msg, - uint32_t op, - uint64_t tag, - uint64_t flags) -{ - struct efa_rdm_ope *txe; - - txe = ofi_buf_alloc(efa_rdm_ep->ope_pool); - if (OFI_UNLIKELY(!txe)) { - EFA_DBG(FI_LOG_EP_CTRL, "TX entries exhausted.\n"); - return NULL; - } - - efa_rdm_txe_construct(txe, efa_rdm_ep, peer, msg, op, flags); - if (op == 
ofi_op_tagged) { - txe->cq_entry.tag = tag; - txe->tag = tag; - } - - efa_domain_ope_list_lock(efa_rdm_ep_domain(efa_rdm_ep)); - dlist_insert_tail(&txe->ep_entry, &efa_rdm_ep->txe_list); - efa_domain_ope_list_unlock(efa_rdm_ep_domain(efa_rdm_ep)); - return txe; -} - /** * @brief record the event that a TX op has been submitted * @@ -674,15 +644,18 @@ static ssize_t efa_rdm_ep_handshake_common(struct efa_rdm_ep *ep, struct efa_rdm msg.addr = peer->conn->fi_addr; - txe = efa_rdm_ep_alloc_txe(ep, peer, &msg, ofi_op_write, 0, 0); - + txe = ofi_buf_alloc(ep->ope_pool); if (OFI_UNLIKELY(!txe)) { EFA_WARN(FI_LOG_EP_CTRL, "TX entries exhausted.\n"); return -FI_EAGAIN; } - /* efa_rdm_ep_alloc_txe() joins ep->base_ep.util_ep.tx_op_flags and passed in flags, - * reset to desired flags (remove things like FI_DELIVERY_COMPLETE, and FI_COMPLETION) + efa_rdm_txe_construct(txe, ep, peer, &msg, ofi_op_write, 0); + + /* + * efa_rdm_txe_construct() joins ep->base_ep.util_ep.tx_op_flags and + * passed in flags, reset to desired flags (remove things like + * FI_DELIVERY_COMPLETE, and FI_COMPLETION) */ txe->fi_flags = EFA_RDM_TXE_NO_COMPLETION | EFA_RDM_TXE_NO_COUNTER; txe->internal_flags |= EFA_RDM_OPE_INTERNAL; @@ -1068,26 +1041,6 @@ void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep) efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_INTERNAL_RX_BUF_POST); } -/** - * @brief Get memory alignment for given ep and hmem iface - * - * @param ep efa rdm ep - * @param iface hmem iface - * @return size_t the memory alignment - */ -size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface) -{ - size_t memory_alignment = EFA_RDM_DEFAULT_MEMORY_ALIGNMENT; - - if (ep->sendrecv_in_order_aligned_128_bytes) { - memory_alignment = EFA_RDM_IN_ORDER_ALIGNMENT; - } else if (iface == FI_HMEM_CUDA) { - memory_alignment = EFA_RDM_CUDA_MEMORY_ALIGNMENT; - } - - return memory_alignment; -} - /** * @brief Enforce a handshake to made for given txe. 
* It will trigger a handshake with peer and choose to diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 13844764156..da7d9195531 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -18,6 +18,9 @@ #include "efa_rdm_pke_utils.h" #include "efa_rdm_pke_req.h" +#include "efa_mr.h" +#include "efa_rdm_proto.h" +#include "efa_rdm_proto_eager.h" #include "efa_rdm_tracepoint.h" /** @@ -45,60 +48,11 @@ */ int efa_rdm_msg_select_rtm(struct efa_rdm_ep *efa_rdm_ep, struct efa_rdm_ope *txe, int use_p2p) { - /* - * For performance consideration, this function assume the tagged rtm packet type id is - * always the correspondent message rtm packet type id + 1, thus the assertion here. - */ - assert(EFA_RDM_EAGER_MSGRTM_PKT + 1 == EFA_RDM_EAGER_TAGRTM_PKT); - assert(EFA_RDM_MEDIUM_MSGRTM_PKT + 1 == EFA_RDM_MEDIUM_TAGRTM_PKT); - assert(EFA_RDM_LONGCTS_MSGRTM_PKT + 1 == EFA_RDM_LONGCTS_TAGRTM_PKT); - assert(EFA_RDM_LONGREAD_MSGRTM_PKT + 1 == EFA_RDM_LONGREAD_TAGRTM_PKT); - assert(EFA_RDM_DC_EAGER_MSGRTM_PKT + 1 == EFA_RDM_DC_EAGER_TAGRTM_PKT); - assert(EFA_RDM_DC_MEDIUM_MSGRTM_PKT + 1 == EFA_RDM_DC_MEDIUM_TAGRTM_PKT); - assert(EFA_RDM_DC_LONGCTS_MSGRTM_PKT + 1 == EFA_RDM_DC_LONGCTS_TAGRTM_PKT); - - int tagged; - int eager_rtm, medium_rtm, longcts_rtm, readbase_rtm, iface; - size_t eager_rtm_max_data_size; - bool delivery_complete_requested; - - assert(txe->op == ofi_op_msg || txe->op == ofi_op_tagged); - tagged = (txe->op == ofi_op_tagged); - assert(tagged == 0 || tagged == 1); - - iface = txe->desc[0] ? ((struct efa_mr*) txe->desc[0])->iface : FI_HMEM_SYSTEM; - - if (txe->fi_flags & FI_INJECT || efa_both_support_zero_hdr_data_transfer(efa_rdm_ep, txe->peer)) - delivery_complete_requested = false; - else - delivery_complete_requested = txe->fi_flags & FI_DELIVERY_COMPLETE; - - eager_rtm = (delivery_complete_requested) ? 
EFA_RDM_DC_EAGER_MSGRTM_PKT + tagged - : EFA_RDM_EAGER_MSGRTM_PKT + tagged; - - medium_rtm = (delivery_complete_requested) ? EFA_RDM_DC_MEDIUM_MSGRTM_PKT + tagged - : EFA_RDM_MEDIUM_MSGRTM_PKT + tagged; - - longcts_rtm = (delivery_complete_requested) ? EFA_RDM_DC_LONGCTS_MSGRTM_PKT + tagged - : EFA_RDM_LONGCTS_MSGRTM_PKT + tagged; - - eager_rtm_max_data_size = efa_rdm_txe_max_req_data_capacity(efa_rdm_ep, txe, eager_rtm); + /* Only the zero copy path should arrive here; it sends a non-DC eager RTM */ + assert(efa_rdm_ep->extra_info[0] & + EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP); - readbase_rtm = efa_rdm_peer_select_readbase_rtm(txe->peer, efa_rdm_ep, txe); - - if (use_p2p && - txe->total_len >= g_efa_hmem_info[iface].min_read_msg_size && - efa_rdm_interop_rdma_read(efa_rdm_ep, txe->peer) && - (txe->desc[0] || efa_is_cache_available(efa_rdm_ep_domain(efa_rdm_ep)))) - return readbase_rtm; - - if (txe->total_len <= eager_rtm_max_data_size) - return eager_rtm; - - if (txe->total_len <= g_efa_hmem_info[iface].max_medium_msg_size) - return medium_rtm; - - return longcts_rtm; + return EFA_RDM_EAGER_MSGRTM_PKT; } /** @@ -134,24 +88,10 @@ ssize_t efa_rdm_msg_post_rtm(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) use_p2p = err; rtm_type = efa_rdm_msg_select_rtm(ep, txe, use_p2p); - assert(rtm_type >= EFA_RDM_REQ_PKT_BEGIN); - - if (rtm_type < EFA_RDM_EXTRA_REQ_PKT_BEGIN) { - /* rtm requires only baseline feature, which peer should always support. */ - return efa_rdm_ope_post_send(txe, rtm_type); - } + assert(rtm_type == EFA_RDM_EAGER_MSGRTM_PKT); - /* - * rtm_type requires an extra feature, which peer might not support. - * - * Check handshake packet from peer to verify support status. 
+ /* rtm requires only baseline feature, which peer should always support */ - if (!ep->homogeneous_peers && !(txe->peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)) - return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); - - if (!ep->homogeneous_peers && !efa_rdm_pkt_type_is_supported_by_peer(rtm_type, txe->peer)) - return -FI_EOPNOTSUPP; - return efa_rdm_ope_post_send(txe, rtm_type); } @@ -162,6 +102,9 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, const struct fi_msg *msg ssize_t err; struct efa_rdm_ope *txe; struct efa_rdm_peer *peer; + int available_tx_pkts; + struct efa_rdm_proto *proto; + uint64_t pke_send_flags = 0; efa_rdm_tracepoint(send_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); @@ -177,12 +120,85 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, const struct fi_msg *msg goto out; } - txe = efa_rdm_ep_alloc_txe(ep, peer, msg, op, tag, flags); + /* No TX packet budget left (<= 0 also covers an over-subscribed queue) */ + available_tx_pkts = ep->efa_max_outstanding_tx_ops - + ep->efa_outstanding_tx_ops - + ep->efa_rnr_queued_pkt_cnt; + if (OFI_UNLIKELY(available_tx_pkts <= 0)) { + err = -FI_EAGAIN; + goto out; + } + + txe = ofi_buf_alloc(ep->ope_pool); if (OFI_UNLIKELY(!txe)) { err = -FI_EAGAIN; goto out; } + /* First try to use the refactored code path */ + err = efa_rdm_proto_select_send_protocol(ep, peer, msg, op, flags, txe, + &proto); + if (err) + goto out; + + /* The refactored code path does not support the zero copy path */ + if (ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP) + proto = NULL; + + /* If a protocol is found, use it. 
Otherwise, fall back to the old code + * path */ + if (proto) { + err = proto->construct_tx_pkes(ep, peer, msg, op, tag, flags, + txe); + if (err) + goto out; + + assert(txe->op == ofi_op_msg || txe->op == ofi_op_tagged); + assert(ep->send_pkt_entry_vec_size <= + efa_base_ep_get_tx_pool_size(&ep->base_ep)); + + /** + * We currently respect FI_MORE only for eager pkt type because + * For non-eager REQ packets, we already send multiple pkts that + * contain data and make the firmware saturated, there is no + * meaning to queue pkts in this case. + */ + if (flags & FI_MORE && proto == &efa_rdm_proto_eager) { + pke_send_flags |= FI_MORE; + } + + EFA_DBG(FI_LOG_EP_DATA, + "peer: %" PRIu64 + ": size %lu tag: %lx op: %x flags: %lx msg_id: %" PRIu32 + "\n", + peer->conn->fi_addr, txe->total_len, tag, op, flags, + txe->msg_id); + + efa_rdm_tracepoint(send_begin, txe->msg_id, + (size_t) txe->cq_entry.op_context, + txe->total_len); + + err = efa_rdm_pke_sendv(ep->send_pkt_entry_vec, + ep->send_pkt_entry_vec_size, + pke_send_flags); + if (err) + goto out; + + peer->flags |= EFA_RDM_PEER_REQ_SENT; + + proto->handle_tx_pkes_posted(ep, txe); + goto out; + } + + /* Fallback to the old code path for zero copy receive */ + assert(ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP); + + efa_rdm_txe_construct(txe, ep, peer, msg, op, flags); + if (op == ofi_op_tagged) { + txe->cq_entry.tag = tag; + txe->tag = tag; + } + assert(txe->op == ofi_op_msg || txe->op == ofi_op_tagged); txe->msg_id = peer->next_msg_id++; diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 98406f70222..3c925d7eb2e 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -94,6 +94,10 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe, EFA_WARN(FI_LOG_CQ, "invalid operation type\n"); assert(0); } + + efa_domain_ope_list_lock(efa_rdm_ep_domain(ep)); + dlist_insert_tail(&txe->ep_entry, &ep->txe_list); + 
efa_domain_ope_list_unlock(efa_rdm_ep_domain(ep)); } void efa_rdm_txe_release(struct efa_rdm_ope *txe) @@ -458,10 +462,9 @@ size_t efa_rdm_txe_max_req_data_capacity(struct efa_rdm_ep *ep, struct efa_rdm_o * On success, return 0 * If there is not enough available packet entry in TX packet pool, return -FI_EAGAIN */ -ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope, - int pkt_type, - int *pkt_entry_cnt, - int *pkt_entry_data_size_vec) +ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope, int pkt_type, + size_t *pkt_entry_cnt, + size_t *pkt_entry_data_size_vec) { struct efa_rdm_ep *ep; size_t total_pkt_entry_data_size; /* total number of bytes send via packet entry's payload */ @@ -1785,20 +1788,23 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) struct efa_rdm_ep *ep; ssize_t err; int64_t segment_offset; - int pkt_entry_cnt, pkt_entry_cnt_allocated = 0; + int pkt_entry_cnt_allocated = 0; int i; uint64_t flags = 0; ep = ope->ep; assert(ep); - err = efa_rdm_ope_prepare_to_post_send(ope, pkt_type, &pkt_entry_cnt, ep->send_pkt_entry_size_vec); + err = efa_rdm_ope_prepare_to_post_send(ope, pkt_type, + &ep->send_pkt_entry_vec_size, + ep->send_pkt_entry_data_sizes); if (err) return err; - assert(pkt_entry_cnt <= efa_base_ep_get_tx_pool_size(&ep->base_ep)); + assert(ep->send_pkt_entry_vec_size <= + efa_base_ep_get_tx_pool_size(&ep->base_ep)); segment_offset = efa_rdm_pkt_type_contains_data(pkt_type) ? 
(int64_t) ope->bytes_sent : -1; - for (i = 0; i < pkt_entry_cnt; ++i) { + for (i = 0; i < ep->send_pkt_entry_vec_size; ++i) { ep->send_pkt_entry_vec[i] = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL); if (OFI_UNLIKELY(!ep->send_pkt_entry_vec[i])) { @@ -1808,21 +1814,19 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) pkt_entry_cnt_allocated++; - err = efa_rdm_pke_fill_data(ep->send_pkt_entry_vec[i], - pkt_type, - ope, - segment_offset, - ep->send_pkt_entry_size_vec[i]); + err = efa_rdm_pke_fill_data(ep->send_pkt_entry_vec[i], pkt_type, + ope, segment_offset, + ep->send_pkt_entry_data_sizes[i]); if (err) goto handle_err; - if (segment_offset != -1 && pkt_entry_cnt > 1) { - assert(ep->send_pkt_entry_size_vec[i] > 0); - segment_offset += ep->send_pkt_entry_size_vec[i]; + if (segment_offset != -1 && ep->send_pkt_entry_vec_size > 1) { + assert(ep->send_pkt_entry_data_sizes[i] > 0); + segment_offset += ep->send_pkt_entry_data_sizes[i]; } } - assert(pkt_entry_cnt == pkt_entry_cnt_allocated); + assert(ep->send_pkt_entry_vec_size == pkt_entry_cnt_allocated); /** * We currently respect FI_MORE only for eager pkt type because @@ -1838,12 +1842,13 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) if (ope->fi_flags & FI_MORE && efa_rdm_pkt_type_is_eager(pkt_type)) flags |= FI_MORE; - err = efa_rdm_pke_sendv(ep->send_pkt_entry_vec, pkt_entry_cnt, flags); + err = efa_rdm_pke_sendv(ep->send_pkt_entry_vec, + ep->send_pkt_entry_vec_size, flags); if (err) goto handle_err; ope->peer->flags |= EFA_RDM_PEER_REQ_SENT; - for (i = 0; i < pkt_entry_cnt; ++i) + for (i = 0; i < ep->send_pkt_entry_vec_size; ++i) efa_rdm_pke_handle_sent(ep->send_pkt_entry_vec[i], pkt_type, ope->peer); return FI_SUCCESS; @@ -1852,53 +1857,6 @@ ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type) for (i = 0; i < pkt_entry_cnt_allocated; ++i) efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[i]); - return 
efa_rdm_ope_post_send_fallback(ope, pkt_type, err); -} - -/** - * @brief Fallback to a different message type if a packet send fails. - * - * Currently, this function is only used in the read nack protocol. If a long read or - * runting read RTM packet fails to send because of a memory registration failure, it - * will send a long CTS RTM packet. - * - * @param[in] ope pointer to efa_rdm_ope. (either a txe or an rxe) - * @param[in] pkt_type packet type that failed to send - * @param[in] err error code of the original failure - * @return On success return 0, otherwise return a negative libfabric error code. Possible error codes include: - * -FI_EAGAIN temporarily out of resource - */ -ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, - int pkt_type, ssize_t err) -{ - bool delivery_complete_requested = ope->fi_flags & FI_DELIVERY_COMPLETE; - - if (err == -FI_ENOMR) { - /* Long read and runting read protocols could fail because of a - * lack of memory registrations. In that case, we retry with - * long CTS protocol - */ - switch (pkt_type) { - case EFA_RDM_LONGREAD_MSGRTM_PKT: - case EFA_RDM_RUNTREAD_MSGRTM_PKT: - EFA_INFO(FI_LOG_EP_CTRL, - "Sender fallback to long CTS untagged " - "protocol because memory registration limit " - "was reached on the sender\n"); - return efa_rdm_ope_post_send_or_queue( - ope, delivery_complete_requested ? EFA_RDM_DC_LONGCTS_MSGRTM_PKT : EFA_RDM_LONGCTS_MSGRTM_PKT); - case EFA_RDM_LONGREAD_TAGRTM_PKT: - case EFA_RDM_RUNTREAD_TAGRTM_PKT: - EFA_INFO(FI_LOG_EP_CTRL, - "Sender fallback to long CTS tagged protocol " - "because memory registration limit was " - "reached on the sender\n"); - return efa_rdm_ope_post_send_or_queue( - ope, delivery_complete_requested ? 
EFA_RDM_DC_LONGCTS_TAGRTM_PKT : EFA_RDM_LONGCTS_TAGRTM_PKT); - default: - return err; - } - } return err; } diff --git a/prov/efa/src/rdm/efa_rdm_ope.h b/prov/efa/src/rdm/efa_rdm_ope.h index 61c2c74883d..52e3a452dea 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.h +++ b/prov/efa/src/rdm/efa_rdm_ope.h @@ -352,16 +352,12 @@ int efa_rdm_rxe_post_local_read_or_queue(struct efa_rdm_ope *rxe, struct efa_rdm_pke *pkt_entry, char *pkt_data, size_t data_size); -ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope, - int pkt_type, - int *pkt_entry_cnt, - int *pkt_entry_data_size_vec); +ssize_t efa_rdm_ope_prepare_to_post_send(struct efa_rdm_ope *ope, int pkt_type, + size_t *pkt_entry_cnt, + size_t *pkt_entry_data_size_vec); ssize_t efa_rdm_ope_post_send(struct efa_rdm_ope *ope, int pkt_type); -ssize_t efa_rdm_ope_post_send_fallback(struct efa_rdm_ope *ope, - int pkt_type, ssize_t err); - ssize_t efa_rdm_ope_post_send_or_queue(struct efa_rdm_ope *ope, int pkt_type); ssize_t efa_rdm_ope_repost_ope_queued_before_handshake(struct efa_rdm_ope *ope); diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 704af2c7613..669fe808b5d 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -109,6 +109,7 @@ struct efa_rdm_pke *efa_rdm_pke_alloc(struct efa_rdm_ep *ep, pkt_entry->payload_size = 0; pkt_entry->payload_mr = NULL; pkt_entry->peer = NULL; + pkt_entry->callback = NULL; switch (alloc_type) { case EFA_RDM_PKE_FROM_USER_RX_POOL: diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 2f21116640a..db30748dd84 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -246,6 +246,9 @@ struct efa_rdm_pke { /**@brief Generation counter. 
It is incremented every time the packet is posted to rdma-core */ uint8_t gen; + /**@brief Callback function called in TX and RX paths */ + void (*callback)(struct efa_rdm_pke *pkt_entry); + #if ENABLE_DEBUG struct efa_rdm_pke_debug_info_buffer *debug_info; /**< Pointer to debug info buffer */ #endif diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index 0f350526be0..437be344308 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -93,54 +93,24 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, ret = efa_rdm_pke_init_receipt(pkt_entry, ope); break; case EFA_RDM_EAGER_MSGRTM_PKT: - assert(data_offset == 0 && data_size == -1); - ret = efa_rdm_pke_init_eager_msgrtm(pkt_entry, ope); - break; case EFA_RDM_EAGER_TAGRTM_PKT: - assert(data_offset == 0 && data_size == -1); - ret = efa_rdm_pke_init_eager_tagrtm(pkt_entry, ope); + assert(0 && "Eager protocol moved to refactored code path"); break; case EFA_RDM_MEDIUM_MSGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_medium_msgrtm(pkt_entry, ope, data_offset, data_size); - break; case EFA_RDM_MEDIUM_TAGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_medium_tagrtm(pkt_entry, ope, data_offset, data_size); + assert(0 && "Medium protocol moved to refactored code path"); break; case EFA_RDM_LONGCTS_MSGRTM_PKT: - /* The data_offset will be non-zero when the long CTS RTM packet - * is sent to continue a runting read transfer after the - * receiver has run out of memory registrations */ - assert(data_offset == 0 || - ope->internal_flags & EFA_RDM_OPE_READ_NACK); - assert(data_size == -1); - ret = efa_rdm_pke_init_longcts_msgrtm(pkt_entry, ope); - break; case EFA_RDM_LONGCTS_TAGRTM_PKT: - /* The data_offset will be non-zero when the long CTS RTM packet - * is sent to continue a runting read transfer after the - * receiver has run out of memory registrations */ - assert(data_offset == 0 || - 
ope->internal_flags & EFA_RDM_OPE_READ_NACK); - assert(data_size == -1); - ret = efa_rdm_pke_init_longcts_tagrtm(pkt_entry, ope); + assert(0 && "Long CTS protocol moved to refactored code path"); break; case EFA_RDM_LONGREAD_MSGRTM_PKT: - assert(data_offset == -1 && data_size == -1); - ret = efa_rdm_pke_init_longread_msgrtm(pkt_entry, ope); - break; case EFA_RDM_LONGREAD_TAGRTM_PKT: - assert(data_offset == -1 && data_size == -1); - ret = efa_rdm_pke_init_longread_tagrtm(pkt_entry, ope); + assert(0 && "Long read protocol moved to refactored code path"); break; case EFA_RDM_RUNTREAD_MSGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_runtread_msgrtm(pkt_entry, ope, data_offset, data_size); - break; case EFA_RDM_RUNTREAD_TAGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_runtread_tagrtm(pkt_entry, ope, data_offset, data_size); + assert(0 && "Runt read protocol moved to refactored code path"); break; case EFA_RDM_EAGER_RTW_PKT: assert(data_offset == 0 && data_size == -1); @@ -175,38 +144,16 @@ int efa_rdm_pke_fill_data(struct efa_rdm_pke *pkt_entry, ret = efa_rdm_pke_init_compare_rta(pkt_entry, ope); break; case EFA_RDM_DC_EAGER_MSGRTM_PKT: - assert(data_offset == 0 && data_size == -1); - ret = efa_rdm_pke_init_dc_eager_msgrtm(pkt_entry, ope); - break; case EFA_RDM_DC_EAGER_TAGRTM_PKT: - assert(data_offset == 0 && data_size == -1); - ret = efa_rdm_pke_init_dc_eager_tagrtm(pkt_entry, ope); + assert(0 && "Eager protocol moved to refactored code path"); break; case EFA_RDM_DC_MEDIUM_MSGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_dc_medium_msgrtm(pkt_entry, ope, data_offset, data_size); - break; case EFA_RDM_DC_MEDIUM_TAGRTM_PKT: - assert(data_offset >= 0 && data_size > 0); - ret = efa_rdm_pke_init_dc_medium_tagrtm(pkt_entry, ope, data_offset, data_size); + assert(0 && "Medium protocol moved to refactored code path"); break; case EFA_RDM_DC_LONGCTS_MSGRTM_PKT: - /* The 
data_offset will be non-zero when the DC long CTS RTM packet - * is sent to continue a runting read transfer after the - * receiver has run out of memory registrations */ - assert(data_offset == 0 || - ope->internal_flags & EFA_RDM_OPE_READ_NACK); - assert(data_size == -1); - ret = efa_rdm_pke_init_dc_longcts_msgrtm(pkt_entry, ope); - break; case EFA_RDM_DC_LONGCTS_TAGRTM_PKT: - /* The data_offset will be non-zero when the DC long CTS tagged RTM packet - * is sent to continue a runting read transfer after the - * receiver has run out of memory registrations */ - assert(data_offset == 0 || - ope->internal_flags & EFA_RDM_OPE_READ_NACK); - assert(data_size == -1); - ret = efa_rdm_pke_init_dc_longcts_tagrtm(pkt_entry, ope); + assert(0 && "Long CTS protocol moved to refactored code path"); break; case EFA_RDM_DC_EAGER_RTW_PKT: assert(data_offset == 0 && data_size == -1); @@ -268,27 +215,27 @@ void efa_rdm_pke_handle_sent(struct efa_rdm_pke *pkt_entry, int pkt_type, struct break; case EFA_RDM_EAGER_MSGRTM_PKT: case EFA_RDM_EAGER_TAGRTM_PKT: - /* nothing to do */ + assert(0 && "Eager protocol moved to refactored code path"); break; case EFA_RDM_MEDIUM_MSGRTM_PKT: case EFA_RDM_MEDIUM_TAGRTM_PKT: case EFA_RDM_DC_MEDIUM_MSGRTM_PKT: case EFA_RDM_DC_MEDIUM_TAGRTM_PKT: - efa_rdm_pke_handle_medium_rtm_sent(pkt_entry); + assert(0 && "Medium protocol moved to refactored code path"); break; case EFA_RDM_LONGCTS_MSGRTM_PKT: case EFA_RDM_DC_LONGCTS_MSGRTM_PKT: case EFA_RDM_LONGCTS_TAGRTM_PKT: case EFA_RDM_DC_LONGCTS_TAGRTM_PKT: - efa_rdm_pke_handle_longcts_rtm_sent(pkt_entry); + assert(0 && "Long CTS protocol moved to refactored code path"); break; case EFA_RDM_LONGREAD_MSGRTM_PKT: case EFA_RDM_LONGREAD_TAGRTM_PKT: - efa_rdm_pke_handle_longread_rtm_sent(pkt_entry); + assert(0 && "Long read protocol moved to refactored code path"); break; case EFA_RDM_RUNTREAD_MSGRTM_PKT: case EFA_RDM_RUNTREAD_TAGRTM_PKT: - efa_rdm_pke_handle_runtread_rtm_sent(pkt_entry, peer); + assert(0 && "Runt 
read protocol moved to refactored code path"); break; case EFA_RDM_EAGER_RTW_PKT: /* nothing to do when EAGER RTW is sent */ @@ -312,8 +259,10 @@ void efa_rdm_pke_handle_sent(struct efa_rdm_pke *pkt_entry, int pkt_type, struct break; case EFA_RDM_DC_EAGER_MSGRTM_PKT: case EFA_RDM_DC_EAGER_TAGRTM_PKT: + assert(0 && "Eager protocol moved to refactored code path"); + break; case EFA_RDM_DC_EAGER_RTW_PKT: - /* nothing to do for DC EAGER RTM/RTW */ + /* nothing to do for DC EAGER RTW */ break; case EFA_RDM_CTSDATA_PKT: efa_rdm_pke_handle_ctsdata_sent(pkt_entry); @@ -555,13 +504,6 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) return; } - /* These pkts are eager pkts withour hdrs */ - if (pkt_entry->flags & EFA_RDM_PKE_SEND_TO_USER_RECV_QP) { - efa_rdm_pke_handle_eager_rtm_send_completion(pkt_entry); - efa_rdm_pke_release_tx(pkt_entry); - return; - } - /* Start handling pkts with hdrs */ switch (efa_rdm_pkt_type_of(pkt_entry)) { case EFA_RDM_HANDSHAKE_PKT: @@ -594,23 +536,23 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) break; case EFA_RDM_EAGER_MSGRTM_PKT: case EFA_RDM_EAGER_TAGRTM_PKT: - efa_rdm_pke_handle_eager_rtm_send_completion(pkt_entry); + assert(0 && "Eager protocol moved to refactored code path"); break; case EFA_RDM_MEDIUM_MSGRTM_PKT: case EFA_RDM_MEDIUM_TAGRTM_PKT: - efa_rdm_pke_handle_medium_rtm_send_completion(pkt_entry); + assert(0 && "Medium protocol moved to refactored code path"); break; case EFA_RDM_LONGCTS_MSGRTM_PKT: case EFA_RDM_LONGCTS_TAGRTM_PKT: - efa_rdm_pke_handle_longcts_rtm_send_completion(pkt_entry); + assert(0 && "Long CTS protocol moved to refactored code path"); break; case EFA_RDM_LONGREAD_MSGRTM_PKT: case EFA_RDM_LONGREAD_TAGRTM_PKT: - /* nothing to do */ + assert(0 && "Long read protocol moved to refactored code path"); break; case EFA_RDM_RUNTREAD_MSGRTM_PKT: case EFA_RDM_RUNTREAD_TAGRTM_PKT: - efa_rdm_pke_handle_runtread_rtm_send_completion(pkt_entry); + assert(0 && "Runt read 
protocol moved to refactored code path"); break; case EFA_RDM_EAGER_RTW_PKT: efa_rdm_pke_handle_eager_rtw_send_completion(pkt_entry); @@ -641,10 +583,12 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) case EFA_RDM_DC_EAGER_TAGRTM_PKT: case EFA_RDM_DC_MEDIUM_MSGRTM_PKT: case EFA_RDM_DC_MEDIUM_TAGRTM_PKT: - case EFA_RDM_DC_EAGER_RTW_PKT: - case EFA_RDM_DC_WRITE_RTA_PKT: case EFA_RDM_DC_LONGCTS_MSGRTM_PKT: case EFA_RDM_DC_LONGCTS_TAGRTM_PKT: + assert(0 && "Protocols moved to refactored code path"); + break; + case EFA_RDM_DC_EAGER_RTW_PKT: + case EFA_RDM_DC_WRITE_RTA_PKT: case EFA_RDM_DC_LONGCTS_RTW_PKT: /* For DC packets, use efa_outstanding_tx_ops to track TX completions * instead of bytes_acked to avoid issues with unset payload_size. diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index 96fa541dd32..c071820e740 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -68,70 +68,6 @@ size_t efa_rdm_pke_get_rtm_msg_length(struct efa_rdm_pke *pkt_entry) return 0; } -/** - * @brief init a RTM packet entry that will carry payload (user data) - * - * @details - * As the name indicate, this function is applied to RTM packets - * that will carry payload (user data), which include EAGER, MEDIUM, - * LONGCTS and RUNTREAD RTM (both DC and non-DC, tag and non-tag). - * It is not applied to LONGREAD RTM. - * - * @param[in,out] pkt_entry RTM packet entry - * @param[in] pkt_type RTM packet type - * @param[in] txe TX entry that has user buffer information - * @param[in] segmment_offset data offset in respect of user buffer - * @param[in] data_size user data size. If it is -1, the function - * will select data size based on maximum - * data capacity of packet entry. - * @returns - * 0 on success - * negative libfabric error code for failure. 
- */ -static inline -ssize_t efa_rdm_pke_init_rtm_with_payload(struct efa_rdm_pke *pkt_entry, - int pkt_type, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - struct efa_rdm_rtm_base_hdr *rtm_hdr; - - efa_rdm_pke_init_req_hdr_common(pkt_entry, pkt_type, txe); - - rtm_hdr = (struct efa_rdm_rtm_base_hdr *)pkt_entry->wiredata; - rtm_hdr->flags |= EFA_RDM_REQ_MSG; - rtm_hdr->msg_id = txe->msg_id; - - if (txe->internal_flags & EFA_RDM_OPE_READ_NACK) - rtm_hdr->flags |= EFA_RDM_REQ_READ_NACK; - - /* If this RTM packet is sent after the runting read protocol has failed - because of a MR registration limit on the receiver, we don't want to - send any data with the RTM packet. This is because the runting read RTM - packets have already delivered some of the data and the long CTS RTM - packet does not have a seg_offset field */ - if (txe->internal_flags & EFA_RDM_OPE_READ_NACK) { - data_size = 0; - } else if (data_size == -1) { - data_size = MIN(txe->total_len - segment_offset, - txe->ep->mtu_size - efa_rdm_pke_get_req_hdr_size(pkt_entry)); - - if (data_size + segment_offset < txe->total_len) { - if (efa_mr_is_cuda(txe->desc[0])) { - if (txe->ep->sendrecv_in_order_aligned_128_bytes) - data_size &= ~(EFA_RDM_IN_ORDER_ALIGNMENT - 1); - else - data_size &= ~(EFA_RDM_CUDA_MEMORY_ALIGNMENT -1); - } - } - } - - return efa_rdm_pke_init_payload_from_ope(pkt_entry, txe, - efa_rdm_pke_get_req_hdr_size(pkt_entry), - segment_offset, data_size); -} - /** * @brief Update RX entry with the information in RTM packet entry. 
* @@ -534,134 +470,6 @@ void efa_rdm_pke_handle_rtm_rta_recv(struct efa_rdm_pke *pkt_entry) efa_rdm_peer_proc_pending_items_in_robuf(peer, ep); } -/** - * @brief construct a eager msgrtm pkt without hdr - * - * @param[in,out] pkt_entry pkt to be initialized - * @param[in] txe TX entry - */ -static inline -ssize_t efa_rdm_pke_init_eager_msgrtm_zero_hdr(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - pkt_entry->ope = txe; - pkt_entry->peer = txe->peer; - - return efa_rdm_pke_init_payload_from_ope(pkt_entry, txe, - 0, 0, txe->total_len); -} - -/** - * @brief initialzie a EFA_RDM_EAGER_MSGRTM pacekt entry - * - * @param[in,out] pkt_entry EFA_RDM_EAGER_MSGRTM to be initialized - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_eager_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - int ret; - - if (pkt_entry->flags & EFA_RDM_PKE_HAS_NO_BASE_HDR) - ret = efa_rdm_pke_init_eager_msgrtm_zero_hdr(pkt_entry, txe); - else - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, - EFA_RDM_EAGER_MSGRTM_PKT, - txe, 0, -1); - if (ret) - return ret; - - assert(txe->total_len == pkt_entry->payload_size); - return 0; -} - -/** - * @brief initialize a EFA_RDM_EAGER_TAGRTM packet entry - * @param[in,out] pkt_entry EFA_RDM_EAGER_TAGRTM to be initialized - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_eager_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_base_hdr *base_hdr; - int ret; - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_EAGER_TAGRTM_PKT, txe, 0, -1); - if (ret) - return ret; - assert(txe->total_len == pkt_entry->payload_size); - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief initialzie a EFA_RDM_DC_EAGER_MSGRTM pacekt entry - * - * @param[in,out] pkt_entry EFA_RDM_DC_EAGER_MSGRTM to be initialized - * @param[in] txe TX entry - */ -ssize_t 
efa_rdm_pke_init_dc_eager_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) - -{ - struct efa_rdm_dc_eager_msgrtm_hdr *dc_eager_msgrtm_hdr; - int ret; - - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_DC_EAGER_MSGRTM_PKT, txe, 0, -1); - if (ret) - return ret; - dc_eager_msgrtm_hdr = efa_rdm_pke_get_dc_eager_msgrtm_hdr(pkt_entry); - dc_eager_msgrtm_hdr->hdr.send_id = txe->tx_id; - return 0; -} - -/** - * @brief initialize a EFA_RDM_DC_EAGER_TAGRTM pacekt entry - * - * @param[in,out] pkt_entry EFA_RDM_DC_EAGER_TAGRTM to be initialized - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_dc_eager_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_base_hdr *base_hdr; - struct efa_rdm_dc_eager_tagrtm_hdr *dc_eager_tagrtm_hdr; - int ret; - - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_DC_EAGER_TAGRTM_PKT, txe, 0, -1); - if (ret) - return ret; - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - - dc_eager_tagrtm_hdr = efa_rdm_pke_get_dc_eager_tagrtm_hdr(pkt_entry); - dc_eager_tagrtm_hdr->hdr.send_id = txe->tx_id; - return 0; -} - -/** - * @brief handle the event that an EAGER RTM has send completed - * - * @details - * This function applies to EAGER_MSGRTM and EAGER_TAGRTM, it - * does not apply to DC_EAGER_MSGRTM and DC_EAGER_TAGRTM - * - * @param[in,out] pkt_entry EAGER_MSGRTM or EAGER_TAGRTM packet entry - */ -void efa_rdm_pke_handle_eager_rtm_send_completion(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - - txe = pkt_entry->ope; - assert(txe->total_len == pkt_entry->payload_size); - efa_rdm_ope_handle_send_completed(txe); -} - /** * @brief process a matched eager rtm packet entry * @@ -690,171 +498,6 @@ ssize_t efa_rdm_pke_proc_matched_eager_rtm(struct 
efa_rdm_pke *pkt_entry) return err; } - -/** - * @brief initialize a EFA_RDM_MEDIUM_MSGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_MEDIUM_MSGRTM packet entry - * @param[in] txe TX entry - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_medium_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) - -{ - struct efa_rdm_medium_rtm_base_hdr *rtm_hdr; - int ret; - - efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND); - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_MEDIUM_MSGRTM_PKT, - txe, segment_offset, data_size); - if (ret) - return ret; - - rtm_hdr = efa_rdm_pke_get_medium_rtm_base_hdr(pkt_entry); - rtm_hdr->msg_length = txe->total_len; - rtm_hdr->seg_offset = segment_offset; - return 0; -} - -/** - * @brief initialize a EFA_RDM_MEDIUM_TAGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_MEDIUM_TAGRTM packet entry - * @param[in] txe TX entry - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_medium_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - struct efa_rdm_medium_rtm_base_hdr *rtm_hdr; - int ret; - - efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND); - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_MEDIUM_TAGRTM_PKT, - txe, segment_offset, data_size); - if (ret) - return ret; - - rtm_hdr = efa_rdm_pke_get_medium_rtm_base_hdr(pkt_entry); - rtm_hdr->msg_length = txe->total_len; - rtm_hdr->seg_offset = segment_offset; - rtm_hdr->hdr.flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief initialize a EFA_RDM_DC_MEDIUM_MSGRTM packet - * - * @details - * DC means delivery complete - * - * @param[in,out] pkt_entry EFA_RDM_DC_MEDIUM_MSGRTM packet entry - * @param[in] txe TX 
entry - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_dc_medium_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - struct efa_rdm_dc_medium_msgrtm_hdr *dc_medium_msgrtm_hdr; - int ret; - - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - - efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND); - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_DC_MEDIUM_MSGRTM_PKT, - txe, segment_offset, data_size); - if (ret) - return ret; - - dc_medium_msgrtm_hdr = efa_rdm_pke_get_dc_medium_msgrtm_hdr(pkt_entry); - dc_medium_msgrtm_hdr->hdr.msg_length = txe->total_len; - dc_medium_msgrtm_hdr->hdr.seg_offset = segment_offset; - dc_medium_msgrtm_hdr->hdr.send_id = txe->tx_id; - return 0; -} - -/** - * @brief initialize a EFA_RDM_DC_MEDIUM_TAGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_DC_MEDIUM_TAGRTM packet entry - * @param[in] txe TX entry - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_dc_medium_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - struct efa_rdm_dc_medium_tagrtm_hdr *dc_medium_tagrtm_hdr; - int ret; - - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - - efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND); - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, EFA_RDM_DC_MEDIUM_TAGRTM_PKT, - txe, segment_offset, data_size); - if (ret) - return ret; - - dc_medium_tagrtm_hdr = efa_rdm_pke_get_dc_medium_tagrtm_hdr(pkt_entry); - dc_medium_tagrtm_hdr->hdr.msg_length = txe->total_len; - dc_medium_tagrtm_hdr->hdr.seg_offset = segment_offset; - dc_medium_tagrtm_hdr->hdr.hdr.flags |= EFA_RDM_REQ_TAGGED; - dc_medium_tagrtm_hdr->hdr.send_id = txe->tx_id; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; 
-} - -/** - * @brief handle the event that a MEDIUM RTM has been sent - * - * @details - * this function applies to all 4 types of MEDIUM RTM - * - * @param[in,out] pkt_entry MEDIUM RTM packet entry - */ -void efa_rdm_pke_handle_medium_rtm_sent(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - - txe = pkt_entry->ope; - txe->bytes_sent += pkt_entry->payload_size; -} - -/** - * @brief handle the event that a MEDIUM RTM has sent completed - * - * @details - * this function applies to non DC version of MEDIUM RTM - * - * @param[in,out] pkt_entry MEDIUM RTM packet entry - */ -void efa_rdm_pke_handle_medium_rtm_send_completion(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - - txe = pkt_entry->ope; - txe->bytes_acked += pkt_entry->payload_size; - if (txe->total_len == txe->bytes_acked) - efa_rdm_ope_handle_send_completed(txe); -} - /** * @brief process a matched MEDIUM or RUNTREAD RTM * @@ -951,245 +594,6 @@ ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry) return ret; } -/** - * @brief initialize a LONGCTS RTM packet - * - * @details - * This function is used by all 4 types of LONGCTS RTM - * - * @param[in,out] pkt_entry LONGCTS RTM packet entry - * @param[in] pkt_type packe type, must be one of: - * EFA_RDM_LONGCTS_MSGRTM_PKT, - * EFA_RDM_LONGCTS_TAGGRTM_PKT, - * EFA_RDM_DC_LONGCTS_MSGRTM_PKT, - * EFA_RDM_DC_LONGCTS_TAGRTM_PKT, - * @param[in] txe TX entry - */ -int efa_rdm_pke_init_longcts_rtm_common(struct efa_rdm_pke *pkt_entry, - int pkt_type, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_longcts_rtm_base_hdr *rtm_hdr; - int ret; - - ret = efa_rdm_pke_init_rtm_with_payload(pkt_entry, pkt_type, txe, 0, -1); - if (ret) - return ret; - - rtm_hdr = efa_rdm_pke_get_longcts_rtm_base_hdr(pkt_entry); - rtm_hdr->msg_length = txe->total_len; - rtm_hdr->send_id = txe->tx_id; - rtm_hdr->credit_request = efa_env.tx_min_credits; - return 0; -} - -/** - * @brief initialize a EFA_RDM_LONGCTS_MSGRTM packet - * - * @param[in,out] 
pkt_entry EFA_RDM_LONGCTS_MSGRTM packet entry - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_longcts_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - return efa_rdm_pke_init_longcts_rtm_common(pkt_entry, - EFA_RDM_LONGCTS_MSGRTM_PKT, - txe); -} - -/** - * @brief initialize a EFA_RDM_LONGCTS_TAGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_LONGCTS_TAGRTM packet entry - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_longcts_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_base_hdr *base_hdr; - int ret; - - ret = efa_rdm_pke_init_longcts_rtm_common(pkt_entry, - EFA_RDM_LONGCTS_TAGRTM_PKT, - txe); - if (ret) - return ret; - - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief initialize a EFA_RDM_DC_LONGCTS_MSGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_DC_LONGCTS_TAGRTM packet entry - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_dc_longcts_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - return efa_rdm_pke_init_longcts_rtm_common(pkt_entry, - EFA_RDM_DC_LONGCTS_MSGRTM_PKT, - txe); -} - -/** - * @brief initialize a EFA_RDM_DC_LONGCTS_TAGRTM packet - * - * @param[in,out] pkt_entry EFA_RDM_DC_MEDIUM_TAGRTM packet entry - * @param[in] txe TX entry - */ -ssize_t efa_rdm_pke_init_dc_longcts_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_base_hdr *base_hdr; - int ret; - - txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED; - ret = efa_rdm_pke_init_longcts_rtm_common(pkt_entry, - EFA_RDM_DC_LONGCTS_TAGRTM_PKT, - txe); - if (ret) - return ret; - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief handle the 
event that a LONGCTS RTM has been sent - * - * this function applies to all 4 types of LONGCTS RTM - * - * @param[in,out] pkt_entry LONGCTS RTM packet entry - */ -void efa_rdm_pke_handle_longcts_rtm_sent(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - - txe = pkt_entry->ope; - txe->bytes_sent += pkt_entry->payload_size; - assert(txe->bytes_sent < txe->total_len); - - if (efa_is_cache_available(efa_rdm_ep_domain(pkt_entry->ep))) - efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND); -} - -/** - * @brief handle the event that a LONGCTS RTM has been sent - * - * this function only applies to non DC version of LONGCTS RTM - * - * @param[in,out] pkt_entry LONGCTS RTM packet entry - */ -void efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - - /** - * A zero-payload longcts rtm pkt currently should only happen when it's - * used for the READ NACK protocol. In this case, this pkt doesn't - * contribute to the send completion, and the associated tx entry - * may be released earlier as the CTSDATA pkts have already kicked off - * and finished the send. - */ - if (pkt_entry->payload_size == 0) { - assert(efa_rdm_pke_get_rtm_base_hdr(pkt_entry)->flags & EFA_RDM_REQ_READ_NACK); - return; - } - - txe = pkt_entry->ope; - txe->bytes_acked += pkt_entry->payload_size; - if (txe->total_len == txe->bytes_acked) - efa_rdm_ope_handle_send_completed(txe); -} - -/** - * @brief initialize a longread RTM packet - * - * @details - * This function applies to both tagged and non-tagged version - * of LONGREAD RTM. 
Note that there is no DC longread RTM, because - * LONGREAD protocol ensures DC by nature - */ -ssize_t efa_rdm_pke_init_longread_rtm(struct efa_rdm_pke *pkt_entry, - int pkt_type, - struct efa_rdm_ope *txe) -{ - struct efa_rdm_longread_rtm_base_hdr *rtm_hdr; - struct fi_rma_iov *read_iov; - size_t hdr_size; - int err; - - efa_rdm_pke_init_req_hdr_common(pkt_entry, pkt_type, txe); - - rtm_hdr = efa_rdm_pke_get_longread_rtm_base_hdr(pkt_entry); - rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG; - rtm_hdr->hdr.msg_id = txe->msg_id; - rtm_hdr->msg_length = txe->total_len; - rtm_hdr->send_id = txe->tx_id; - rtm_hdr->read_iov_count = txe->iov_count; - - hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry); - read_iov = (struct fi_rma_iov *)(pkt_entry->wiredata + hdr_size); - err = efa_rdm_txe_prepare_to_be_read(txe, read_iov); - if (OFI_UNLIKELY(err)) - return err; - - pkt_entry->pkt_size = hdr_size + txe->iov_count * sizeof(struct fi_rma_iov); - pkt_entry->ope = txe; - pkt_entry->peer = txe->peer; - return 0; -} - -/** - * @brief initialize a EFA_RDM_LONGREAD_RTA_MSGRTM - * - */ -ssize_t efa_rdm_pke_init_longread_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - return efa_rdm_pke_init_longread_rtm(pkt_entry, EFA_RDM_LONGREAD_MSGRTM_PKT, txe); -} - -/** - * @brief initialize a EFA_RDM_LONGREAD_RTA_TAGRTM - * - */ -ssize_t efa_rdm_pke_init_longread_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe) -{ - ssize_t err; - struct efa_rdm_base_hdr *base_hdr; - - err = efa_rdm_pke_init_longread_rtm(pkt_entry, EFA_RDM_LONGREAD_TAGRTM_PKT, txe); - if (err) - return err; - - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief handle the event that a longread RTM has been sent - * - * @details - * this function applies to both tagged and non-tagged - * longread RTM - * - * @param[in,out] pkt_entry packet entry - */ -void 
efa_rdm_pke_handle_longread_rtm_sent(struct efa_rdm_pke *pkt_entry) -{ - efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight += 1; -} - /** * @brief process a matched longread RTM * @@ -1227,151 +631,3 @@ ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry) } return err; } - -/** - * @brief fill in the efa_rdm_runtread_rtm_base_hdr and data of a RUNTREAD packet - * - * only thing left that need to be set is tag - * - * @param[out] pkt_entry pkt_entry to be initialzied - * @param[in] pkt_type EFA_RDM_RUNREAD_MSGRTM or EFA_RDM_RUNTREAD_TAGRTM - * @param[in] txe contains information of the send operation - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -static -ssize_t efa_rdm_pke_init_runtread_rtm(struct efa_rdm_pke *pkt_entry, - int pkt_type, - struct efa_rdm_ope *txe, - int64_t segment_offset, - int64_t data_size) -{ - struct efa_rdm_runtread_rtm_base_hdr *rtm_hdr; - struct fi_rma_iov *read_iov; - size_t hdr_size, payload_offset; - int err; - - assert(txe->bytes_runt); - - efa_rdm_pke_init_req_hdr_common(pkt_entry, pkt_type, txe); - - rtm_hdr = efa_rdm_pke_get_runtread_rtm_base_hdr(pkt_entry); - rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG; - rtm_hdr->hdr.msg_id = txe->msg_id; - rtm_hdr->msg_length = txe->total_len; - rtm_hdr->send_id = txe->tx_id; - rtm_hdr->seg_offset = segment_offset; - rtm_hdr->runt_length = txe->bytes_runt; - rtm_hdr->read_iov_count = txe->iov_count; - - hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry); - read_iov = (struct fi_rma_iov *)(pkt_entry->wiredata + hdr_size); - err = efa_rdm_txe_prepare_to_be_read(txe, read_iov); - if (OFI_UNLIKELY(err)) - return err; - - payload_offset = hdr_size + txe->iov_count * sizeof(struct fi_rma_iov); - return efa_rdm_pke_init_payload_from_ope(pkt_entry, txe, - payload_offset, - segment_offset, - data_size); -} - -/** - * @brief initialize a EFA_RDM_RUNTREAD_MSGRTM packet - * - * @param[out] pkt_entry 
pkt_entry to be initialzied - * @param[in] txe contains information of the send operation - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_runtread_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - return efa_rdm_pke_init_runtread_rtm(pkt_entry, - EFA_RDM_RUNTREAD_MSGRTM_PKT, - txe, - segment_offset, - data_size); -} - -/** - * @brief initialize a EFA_RDM_RUNTREAD_TAGRTM packet - * - * @param[out] pkt_entry pkt_entry to be initialzied - * @param[in] txe contains information of the send operation - * @param[in] segment_offset data offset in repect of user buffer - * @param[in] data_size data size in the unit of bytes - */ -ssize_t efa_rdm_pke_init_runtread_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t segment_offset, - int data_size) -{ - ssize_t err; - struct efa_rdm_base_hdr *base_hdr; - - err = efa_rdm_pke_init_runtread_rtm(pkt_entry, - EFA_RDM_RUNTREAD_TAGRTM_PKT, - txe, - segment_offset, - data_size); - if (err) - return err; - - base_hdr = efa_rdm_pke_get_base_hdr(pkt_entry); - base_hdr->flags |= EFA_RDM_REQ_TAGGED; - efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag); - return 0; -} - -/** - * @brief handle the event that a runtread RTM has been sent - * - * This function applies to both RUNTREAD_MSGRTM and RUNTREAD_TAGRTM. 
- * - * @param[in,out] pkt_entry packet entry - */ -void efa_rdm_pke_handle_runtread_rtm_sent(struct efa_rdm_pke *pkt_entry, struct efa_rdm_peer *peer) -{ - struct efa_rdm_ope *txe; - size_t pkt_data_size = pkt_entry->payload_size; - - assert(peer); - - txe = pkt_entry->ope; - txe->bytes_sent += pkt_data_size; - peer->num_runt_bytes_in_flight += pkt_data_size; - - if (efa_rdm_pke_get_runtread_rtm_base_hdr(pkt_entry)->seg_offset == 0 && - txe->total_len > txe->bytes_runt) - efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight += 1; -} - -/** - * @brief handle the event that the send of a runtread RTM has been completed - * - * This function applies to both RUNTREAD_MSGRTM and RUNTREAD_TAGRTM - * There is no DC version of RUNTREAD. If user requested DC, - * LONGREAD RTM will be used. - * @param[in,out] pkt_entry packet entry - */ -void efa_rdm_pke_handle_runtread_rtm_send_completion(struct efa_rdm_pke *pkt_entry) -{ - struct efa_rdm_ope *txe; - struct efa_rdm_peer *peer; - size_t pkt_data_size; - - txe = pkt_entry->ope; - pkt_data_size = pkt_entry->payload_size; - txe->bytes_acked += pkt_data_size; - - peer = txe->peer; - assert(peer); - assert(peer->num_runt_bytes_in_flight >= pkt_data_size); - peer->num_runt_bytes_in_flight -= pkt_data_size; - if (txe->total_len == txe->bytes_acked) - efa_rdm_ope_handle_send_completed(txe); -} diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.h b/prov/efa/src/rdm/efa_rdm_pke_rtm.h index ced0ee4bce1..bed2e8cb4e5 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.h +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.h @@ -104,18 +104,6 @@ struct efa_rdm_dc_eager_rtm_base_hdr *efa_rdm_pke_get_dc_eager_rtm_base_hdr(stru return (struct efa_rdm_dc_eager_rtm_base_hdr *)pke->wiredata; } -static inline -struct efa_rdm_dc_eager_msgrtm_hdr *efa_rdm_pke_get_dc_eager_msgrtm_hdr(struct efa_rdm_pke *pke) -{ - return (struct efa_rdm_dc_eager_msgrtm_hdr *)pke->wiredata; -} - -static inline -struct efa_rdm_dc_eager_tagrtm_hdr 
*efa_rdm_pke_get_dc_eager_tagrtm_hdr(struct efa_rdm_pke *pke) -{ - return (struct efa_rdm_dc_eager_tagrtm_hdr *)pke->wiredata; -} - static inline struct efa_rdm_medium_rtm_base_hdr *efa_rdm_pke_get_medium_rtm_base_hdr(struct efa_rdm_pke *pke) { @@ -158,86 +146,14 @@ struct efa_rdm_runtread_rtm_base_hdr *efa_rdm_pke_get_runtread_rtm_base_hdr(stru return (struct efa_rdm_runtread_rtm_base_hdr *)pke->wiredata; } -ssize_t efa_rdm_pke_init_eager_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_eager_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_dc_eager_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_dc_eager_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -void efa_rdm_pke_handle_eager_rtm_send_completion(struct efa_rdm_pke *pkt_entry); - ssize_t efa_rdm_pke_proc_matched_eager_rtm(struct efa_rdm_pke *pkt_entry); -ssize_t efa_rdm_pke_init_medium_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -ssize_t efa_rdm_pke_init_medium_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -ssize_t efa_rdm_pke_init_dc_medium_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -ssize_t efa_rdm_pke_init_dc_medium_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -void efa_rdm_pke_handle_medium_rtm_sent(struct efa_rdm_pke *pkt_entry); - -void efa_rdm_pke_handle_medium_rtm_send_completion(struct efa_rdm_pke *pkt_entry); - ssize_t efa_rdm_pke_proc_matched_mulreq_rtm(struct efa_rdm_pke *pkt_entry); -ssize_t efa_rdm_pke_init_longcts_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_longcts_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t 
efa_rdm_pke_init_dc_longcts_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_dc_longcts_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -void efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_rdm_pke *pkt_entry); - -void efa_rdm_pke_handle_longcts_rtm_sent(struct efa_rdm_pke *pkt_entry); - -ssize_t efa_rdm_pke_init_longread_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - -ssize_t efa_rdm_pke_init_longread_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe); - ssize_t efa_rdm_pke_proc_matched_longread_rtm(struct efa_rdm_pke *pkt_entry); -void efa_rdm_pke_handle_longread_rtm_sent(struct efa_rdm_pke *pkt_entry); - -ssize_t efa_rdm_pke_init_runtread_msgrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -ssize_t efa_rdm_pke_init_runtread_tagrtm(struct efa_rdm_pke *pkt_entry, - struct efa_rdm_ope *txe, - size_t data_offset, - int data_size); - -void efa_rdm_pke_handle_runtread_rtm_sent(struct efa_rdm_pke *pkt_entry, struct efa_rdm_peer *peer); - -void efa_rdm_pke_handle_runtread_rtm_send_completion(struct efa_rdm_pke *pkt_entry); +ssize_t efa_rdm_pke_init_rtm_with_payload(struct efa_rdm_pke *pkt_entry, + int pkt_type, struct efa_rdm_ope *txe, + size_t segment_offset, int data_size); #endif \ No newline at end of file diff --git a/prov/efa/src/rdm/efa_rdm_proto.c b/prov/efa/src/rdm/efa_rdm_proto.c new file mode 100644 index 00000000000..38bea8fb9a6 --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto.c @@ -0,0 +1,227 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#include "efa_rdm_proto.h" +#include "efa.h" +#include "efa_rdm_proto_eager.h" +#include "efa_rdm_proto_longcts.h" +#include "efa_rdm_proto_longread.h" +#include "efa_rdm_proto_medium.h" +#include "efa_rdm_proto_runtread.h" + +/* We have a total of 5 protocols in the EFA provider. Use a slightly larger + * number to accommodate the NULL sentinel and future protocols. */ +#define EFA_RDM_MAX_PROTO 8 + +/* List of supported protocols. + * The protocols listed here will be tried in the order they're listed. + * The first protocol that can be used for the TX operation will be used. + */ +struct efa_rdm_proto *efa_rdm_protocols[EFA_RDM_MAX_PROTO] = { + &efa_rdm_proto_eager, + &efa_rdm_proto_medium, + &efa_rdm_proto_runtread, + &efa_rdm_proto_longread, + /* Long CTS should be the last protocol because it can always be used */ + &efa_rdm_proto_longcts, + NULL, /* Sentinel used to stop iteration */ +}; + +int efa_rdm_proto_select_send_protocol(struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, + const struct fi_msg *msg, uint32_t op, + uint64_t flags, struct efa_rdm_ope *txe, + struct efa_rdm_proto **proto) +{ + struct efa_rdm_proto *selected_proto; + int req_pkt_type, iface, err, use_p2p; + uint16_t header_flags = 0; + bool tagged, delivery_complete_requested, mr_attempted = false; + uint64_t mr_access_flags; + + if (flags & FI_INJECT || + efa_both_support_zero_hdr_data_transfer(ep, peer)) + delivery_complete_requested = false; + else + delivery_complete_requested = flags & FI_DELIVERY_COMPLETE; + + tagged = (op == ofi_op_tagged); + assert(tagged == 0 || tagged == 1); + + txe->total_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + + /* TODO: These fields are copied to the txe because the current + * implementation of efa_rdm_ope_try_fill_desc relies on them. Eliminate + * unnecessary copies wherever possible.
*/ + txe->ep = ep; + txe->iov_count = msg->iov_count; + memcpy(txe->iov, msg->msg_iov, sizeof(struct iovec) * msg->iov_count); + memset(txe->mr, 0, sizeof(*txe->mr) * msg->iov_count); + if (msg->desc) + memcpy(txe->desc, msg->desc, + sizeof(*msg->desc) * msg->iov_count); + else + memset(txe->desc, 0, sizeof(*txe->desc) * msg->iov_count); + + iface = (msg->desc && msg->desc[0]) ? + ((struct efa_mr *) msg->desc[0])->iface : + FI_HMEM_SYSTEM; + + err = efa_rdm_ep_use_p2p( + ep, (msg->desc && msg->desc[0]) ? msg->desc[0] : NULL); + if (err < 0) + return err; + use_p2p = err; + + /* Logic copied from efa_rdm_txe_max_req_data_capacity */ + if (efa_rdm_peer_need_raw_addr_hdr(peer)) + header_flags |= EFA_RDM_REQ_OPT_RAW_ADDR_HDR; + else if (efa_rdm_peer_need_connid(peer)) + header_flags |= EFA_RDM_PKT_CONNID_HDR; + + if (flags & FI_REMOTE_CQ_DATA) + header_flags |= EFA_RDM_REQ_OPT_CQ_DATA_HDR; + + for (int i = 0; i < EFA_RDM_MAX_PROTO; ++i) { + selected_proto = efa_rdm_protocols[i]; + + if (!selected_proto) + break; + + /* + * For performance reasons, this function assumes the + * tagged rtm packet type id is always the corresponding message + * rtm packet type id + 1, thus the assertions here. + */ + assert(selected_proto->req_pkt_type_tagged == + selected_proto->req_pkt_type + 1); + assert(selected_proto->req_pkt_type_tagged_dc == + selected_proto->req_pkt_type_dc + 1); + + /* TODO: The req_pkt_type is again needed in each protocol when + * allocating pkes. Option 1: Make pkt headers independent of tag + * and DC to avoid these checks. Option 2: Store the req_pkt_type + * in the txe. + */ + req_pkt_type = + delivery_complete_requested ? + selected_proto->req_pkt_type_dc + tagged : + selected_proto->req_pkt_type + tagged; + + /* All protocols other than the eager protocol can benefit from + * registering the application buffers.
+ * TODO: Move function efa_rdm_ope_try_fill_desc to + * efa_rdm_proto.c + */ + if (!mr_attempted && selected_proto != &efa_rdm_proto_eager) { + // Try to register buffer if MR cache is available + mr_access_flags = FI_SEND; + if (use_p2p) + mr_access_flags |= FI_REMOTE_READ; + + if (efa_is_cache_available(efa_rdm_ep_domain(ep))) { + efa_rdm_ope_try_fill_desc(txe, 0, + mr_access_flags); + } + mr_attempted = true; + } + + if (selected_proto->can_use_protocol_for_send( + txe, peer, req_pkt_type, header_flags, iface, + use_p2p)) { + *proto = selected_proto; + EFA_INFO(FI_LOG_EP_DATA, "Selected protocol: %s\n", + selected_proto->name); + return FI_SUCCESS; + } + } + + *proto = NULL; + return FI_SUCCESS; +} + +/* Utility functions */ + +void efa_rdm_proto_txe_fill(struct efa_rdm_ope *txe, struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, const struct fi_msg *msg, + uint32_t op, uint64_t tag, uint64_t flags) +{ + /* Logic copied from efa_rdm_txe_construct */ + uint64_t tx_op_flags; + + /* txe->mr, txe->desc, txe->total_len are not set here; they are + * set in efa_rdm_proto_select_send_protocol + */ + + txe->ep = ep; + txe->type = EFA_RDM_TXE; + txe->op = op; + txe->tx_id = ofi_buf_index(txe); + txe->state = EFA_RDM_TXE_REQ; + txe->peer = peer; + + /* peer would be NULL for local read operation */ + if (txe->peer) { + dlist_insert_tail(&txe->peer_entry, &txe->peer->txe_list); + } + + txe->internal_flags = 0; + txe->bytes_received = 0; + txe->bytes_copied = 0; + txe->bytes_acked = 0; + txe->bytes_sent = 0; + txe->window = 0; + txe->iov_count = msg->iov_count; + txe->rma_iov_count = 0; + txe->msg_id = 0; + txe->efa_outstanding_tx_ops = 0; + dlist_init(&txe->queued_pkts); + + memcpy(txe->iov, msg->msg_iov, sizeof(struct iovec) * msg->iov_count); + + /* cq_entry on completion */ + txe->cq_entry.op_context = msg->context; + txe->cq_entry.data = msg->data; + txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count); + txe->cq_entry.buf = +
OFI_LIKELY(txe->cq_entry.len > 0) ? txe->iov[0].iov_base : NULL; + + /* set flags */ + assert(ep->base_ep.util_ep.tx_msg_flags == 0 || + ep->base_ep.util_ep.tx_msg_flags == FI_COMPLETION); + tx_op_flags = ep->base_ep.util_ep.tx_op_flags; + if (ep->base_ep.util_ep.tx_msg_flags == 0) + tx_op_flags &= ~FI_COMPLETION; + txe->fi_flags = flags | tx_op_flags; + txe->bytes_runt = 0; + dlist_init(&txe->entry); + + switch (op) { + case ofi_op_tagged: + txe->cq_entry.flags = FI_TRANSMIT | FI_MSG | FI_TAGGED; + txe->cq_entry.tag = tag; + txe->tag = tag; + break; + case ofi_op_write: + txe->cq_entry.flags = FI_RMA | FI_WRITE; + break; + case ofi_op_read_req: + txe->cq_entry.flags = FI_RMA | FI_READ; + break; + case ofi_op_msg: + txe->cq_entry.flags = FI_TRANSMIT | FI_MSG; + break; + case ofi_op_atomic: + txe->cq_entry.flags = (FI_WRITE | FI_ATOMIC); + break; + case ofi_op_atomic_fetch: + case ofi_op_atomic_compare: + txe->cq_entry.flags = (FI_READ | FI_ATOMIC); + break; + default: + EFA_WARN(FI_LOG_CQ, "invalid operation type\n"); + assert(0); + } + + dlist_insert_tail(&txe->ep_entry, &ep->txe_list); +} diff --git a/prov/efa/src/rdm/efa_rdm_proto.h b/prov/efa/src/rdm/efa_rdm_proto.h new file mode 100644 index 00000000000..b467c997d7e --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto.h @@ -0,0 +1,125 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#ifndef _EFA_RDM_PROTO_H +#define _EFA_RDM_PROTO_H + +#include "efa.h" +#include "efa_rdm_pkt_type.h" + +/** + * @brief Interface for EFA RDM protocols. + * + * Each protocol (eager, medium, long CTS, long read, runt read) + * implements this interface to define how it handles TX and RX operations. + * + * The TX send path works as follows: + * + * 1. efa_rdm_proto_select_send_protocol() iterates through the protocol + * registry (efa_rdm_protocols[]) in priority order, calling + * can_use_protocol_for_send() on each. 
The first match is selected.
+ *
+ * 2. The selected protocol's construct_tx_pkes() builds the packet
+ *    entries and stores them in ep->send_pkt_entry_vec.
+ *
+ * 3. efa_rdm_msg_generic_send posts all packet entries via efa_rdm_pke_sendv().
+ *
+ * 4. handle_tx_pkes_posted() is called for post-send bookkeeping.
+ *
+ * 5. When the device completes a send, the CQ handler invokes the
+ *    per-packet callback (set in step 2) which handles completion
+ *    logic, CQ reporting, and TXE/PKE release.
+ *
+ * Protocol priority is determined by position in efa_rdm_protocols[].
+ * Protocols are ordered from most restrictive (eager) to least
+ * restrictive (long CTS) so the most efficient protocol is always
+ * selected.
+ */
+struct efa_rdm_proto {
+	/* Protocol name, printed when the protocol is selected */
+	char name[32];
+
+	/* TX path handlers */
+
+	/* This function determines whether the protocol can be used for a given
+	 * send operation.
+	 */
+	bool (*can_use_protocol_for_send)(struct efa_rdm_ope *txe,
+					  struct efa_rdm_peer *peer,
+					  int req_pkt_type,
+					  uint16_t header_flags, int iface,
+					  bool use_p2p);
+
+	/* This function will allocate the pkes that need to be sent for a given
+	 * TX operation. At the end of this function, ep->send_pkt_entry_vec
+	 * will be correctly populated with all of the pkes that need to be
+	 * sent, including copying the application data into the pke buffer if
+	 * necessary. Each pke will have an appropriate callback function set to
+	 * handle the TX completion of that pke. This function also fills in
+	 * the caller-provided txe. Returns 0 on success, negative errno on
+	 * failure.
+	 */
+	int (*construct_tx_pkes)(struct efa_rdm_ep *ep,
+				 struct efa_rdm_peer *peer,
+				 const struct fi_msg *msg, uint32_t op,
+				 uint64_t tag, uint64_t flags,
+				 struct efa_rdm_ope *txe);
+
+	/* This function is called after all pkes are posted to the EFA device.
+	 * It is useful for some protocols: e.g. to register the buffer after
+	 * posting a Long CTS RTM pke or to update the number of in flight reads
+	 * and read bytes
+	 */
+	void (*handle_tx_pkes_posted)(struct efa_rdm_ep *ep,
+				      struct efa_rdm_ope *txe);
+
+	/* TX utilities
+	 *
+	 * Request packet type ids for the four tagged / delivery-complete
+	 * combinations. efa_rdm_proto_select_send_protocol asserts that each
+	 * tagged id equals the matching untagged id + 1.
+	 */
+	int req_pkt_type;
+	int req_pkt_type_tagged;
+	int req_pkt_type_dc;
+	int req_pkt_type_tagged_dc;
+};
+
+/**
+ * @brief Select the appropriate send protocol for a TX operation.
+ *
+ * Iterates through registered protocols in priority order and selects
+ * the first one whose can_use_protocol_for_send() returns true.
+ *
+ * It will also handle memory registration of user buffers. If read based
+ * protocols are appropriate but MR fails, it will automatically switch to a
+ * different protocol.
+ *
+ * @param[in]	ep	Endpoint
+ * @param[in]	peer	Peer to send to
+ * @param[in]	msg	Message descriptor from application
+ * @param[in]	op	Operation type (ofi_op_msg or ofi_op_tagged)
+ * @param[in]	flags	Operation flags (FI_INJECT, FI_DELIVERY_COMPLETE, etc.)
+ * @param[out]	txe	Pre-allocated TXE, partially initialized on return
+ * @param[out]	proto	Selected protocol, or NULL if none matched
+ * @return 0 on success, negative errno on failure
+ */
+int efa_rdm_proto_select_send_protocol(struct efa_rdm_ep *ep,
+				       struct efa_rdm_peer *peer,
+				       const struct fi_msg *msg, uint32_t op,
+				       uint64_t flags, struct efa_rdm_ope *txe,
+				       struct efa_rdm_proto **proto);
+
+/* Utility functions */
+
+/**
+ * @brief handle_tx_pkes_posted implementation for protocols that need no
+ * post-send bookkeeping. Does nothing.
+ */
+static inline void
+efa_rdm_proto_handle_tx_pkes_posted_no_op(struct efa_rdm_ep *ep,
+					  struct efa_rdm_ope *txe)
+{
+}
+
+/**
+ * @brief Initialize a TXE for use by the new protocol interface.
+ *
+ * Similar to efa_rdm_txe_construct but does not set mr, desc, or
+ * total_len since those are already populated by
+ * efa_rdm_proto_select_send_protocol.
+ */ +void efa_rdm_proto_txe_fill(struct efa_rdm_ope *txe, struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, const struct fi_msg *msg, + uint32_t op, uint64_t tag, uint64_t flags); + +#endif /* _EFA_RDM_PROTO_H */ diff --git a/prov/efa/src/rdm/efa_rdm_proto_eager.c b/prov/efa/src/rdm/efa_rdm_proto_eager.c new file mode 100644 index 00000000000..ae877ed2fba --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto_eager.c @@ -0,0 +1,216 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#include "efa_rdm_proto_eager.h" +#include "efa.h" +#include "efa_rdm_pke_req.h" +#include "efa_rdm_pke_utils.h" +#include "efa_rdm_pkt_type.h" + +/* + * List of packet types used by this protocol + * + * For send/recv operations + * EFA_RDM_EAGER_MSGRTM_PKT + * EFA_RDM_EAGER_TAGRTM_PKT + * EFA_RDM_DC_EAGER_MSGRTM_PKT + * EFA_RDM_DC_EAGER_TAGRTM_PKT + * + * For FI_DELIVERY_COMPLETE - shared with other protocols + * EFA_RDM_RECEIPT_PKT + */ + +/* + * Description of the protocol + * https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#eager-message-featuresubprotocol + */ + +/** + * @brief Check if the eager protocol can handle this send operation. + * + * Returns true if the message fits in a single MTU-sized packet after + * accounting for the request header size. 
+ */
+static bool efa_rdm_proto_eager_can_use_for_send(struct efa_rdm_ope *txe,
+						 struct efa_rdm_peer *peer,
+						 int req_pkt_type,
+						 uint16_t header_flags,
+						 int iface, bool use_p2p)
+{
+	/* TODO: For emulated read and atomics, need to consider RMA
+	 * IOVs in the header
+	 * https://github.com/ofiwg/libfabric/blob/cff899c9ef6dd823a1e3b35d3205622013c6eb6c/prov/efa/src/rdm/efa_rdm_pkt_type.c#L101-L103
+	 */
+	const size_t hdr_size = efa_rdm_pkt_type_get_req_hdr_size(
+		req_pkt_type, header_flags, 0);
+	/* Bytes left in one MTU once the request header is in place */
+	const size_t payload_room = txe->ep->mtu_size - hdr_size;
+
+	return payload_room >= txe->total_len;
+}
+
+/* Eager protocol descriptor; no post-send bookkeeping is needed */
+struct efa_rdm_proto efa_rdm_proto_eager = {
+	.name = "EAGER",
+	.can_use_protocol_for_send = &efa_rdm_proto_eager_can_use_for_send,
+	.construct_tx_pkes = &efa_rdm_proto_eager_construct_tx_pkes,
+	.handle_tx_pkes_posted = &efa_rdm_proto_handle_tx_pkes_posted_no_op,
+	.req_pkt_type = EFA_RDM_EAGER_MSGRTM_PKT,
+	.req_pkt_type_tagged = EFA_RDM_EAGER_TAGRTM_PKT,
+	.req_pkt_type_dc = EFA_RDM_DC_EAGER_MSGRTM_PKT,
+	.req_pkt_type_tagged_dc = EFA_RDM_DC_EAGER_TAGRTM_PKT,
+};
+
+/* TX path callbacks - one per packet type used by this protocol */
+
+/**
+ * @brief Send-completion callback for a non-DC eager RTM packet.
+ *
+ * The single packet carries the whole message, so its completion is
+ * handed to efa_rdm_ope_handle_send_completed() and the TX packet entry
+ * is then released.
+ */
+void efa_rdm_proto_eager_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe = pkt_entry->ope;
+
+	assert(txe);
+	assert(txe->total_len == pkt_entry->payload_size);
+
+	efa_rdm_ope_handle_send_completed(txe);
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Handle send completion for a DC eager RTM packet.
+ *
+ * Only releases the TXE when both all send completions have arrived
+ * (efa_outstanding_tx_ops == 0) and the receipt packet has been received.
+ */
+void efa_rdm_proto_eager_handle_rtm_dc_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe;
+
+	txe = pkt_entry->ope;
+	assert(txe);
+	assert(txe->total_len == pkt_entry->payload_size);
+
+	/* The TXE is released only once the DC readiness check passes; the
+	 * packet entry itself is always released
+	 */
+	if (efa_rdm_txe_dc_ready_for_release(txe))
+		efa_rdm_txe_release(txe);
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Construct TX packet entries for the eager protocol.
+ *
+ * Allocates a single TX packet entry, initializes the RTM header with
+ * the message payload, and sets the per-packet send completion callback.
+ * Supports both regular and delivery-complete (DC) eager packets.
+ *
+ * On success, ep->send_pkt_entry_vec[0] contains the packet entry and
+ * ep->send_pkt_entry_vec_size is set to 1.
+ *
+ * On failure the message id taken from the peer is given back, the TXE
+ * is released and any allocated packet entry is released.
+ *
+ * @param[in]     ep    Endpoint
+ * @param[in]     peer  Peer to send to
+ * @param[in]     msg   Message descriptor from application
+ * @param[in]     op    Operation type (ofi_op_msg or ofi_op_tagged)
+ * @param[in]     tag   Tag, used when op is ofi_op_tagged
+ * @param[in]     flags Operation flags
+ * @param[in,out] txe   Pre-allocated TXE, filled by this function
+ * @return 0 on success, -FI_EAGAIN if no TX packet entry is available,
+ *         other negative errno on failure
+ */
+int efa_rdm_proto_eager_construct_tx_pkes(struct efa_rdm_ep *ep,
+					  struct efa_rdm_peer *peer,
+					  const struct fi_msg *msg, uint32_t op,
+					  uint64_t tag, uint64_t flags,
+					  struct efa_rdm_ope *txe)
+{
+	int ret, req_pkt_type, pkt_entry_cnt;
+	bool tagged, delivery_complete_requested;
+	struct efa_rdm_pke *pkt_entry = NULL;
+	struct efa_rdm_rtm_base_hdr *rtm_hdr;
+	struct efa_rdm_dc_eager_rtm_base_hdr *dc_base_hdr;
+
+	efa_rdm_proto_txe_fill(txe, ep, peer, msg, op, tag, flags);
+
+	txe->msg_id = peer->next_msg_id++;
+
+	// Eager protocol sends 1 packet by definition
+	pkt_entry_cnt = 1;
+
+	// Verify that the send queue is not full
+	assert(ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops -
+		       ep->efa_rnr_queued_pkt_cnt >
+	       0);
+
+	tagged = (op == ofi_op_tagged);
+	assert(tagged == 0 || tagged == 1);
+
+	if (flags & FI_INJECT ||
+	    efa_both_support_zero_hdr_data_transfer(ep, peer))
+		delivery_complete_requested = false;
+	else
+		delivery_complete_requested = flags & FI_DELIVERY_COMPLETE;
+
+	/* The tagged packet type id is the untagged id + 1 */
+	req_pkt_type = delivery_complete_requested ?
+			       efa_rdm_proto_eager.req_pkt_type_dc + tagged :
+			       efa_rdm_proto_eager.req_pkt_type + tagged;
+
+	pkt_entry = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool,
+				      EFA_RDM_PKE_FROM_EFA_TX_POOL);
+	/* The pool can be exhausted; do not dereference a NULL entry */
+	if (!pkt_entry) {
+		ret = -FI_EAGAIN;
+		goto out;
+	}
+
+	pkt_entry->ope = txe;
+	pkt_entry->peer = peer;
+	pkt_entry->callback = &efa_rdm_proto_eager_handle_rtm_send_completion;
+
+	// Refactored code path does not support zero copy
+	assert(!efa_both_support_zero_hdr_data_transfer(ep, peer));
+
+	efa_rdm_pke_init_req_hdr_common(pkt_entry, req_pkt_type, txe);
+
+	rtm_hdr = (struct efa_rdm_rtm_base_hdr *) pkt_entry->wiredata;
+	rtm_hdr->flags |= EFA_RDM_REQ_MSG;
+	rtm_hdr->msg_id = txe->msg_id;
+
+	if (tagged) {
+		rtm_hdr->flags |= EFA_RDM_REQ_TAGGED;
+		efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag);
+	}
+
+	EFA_DBG(FI_LOG_EP_DATA,
+		"eager protocol: dc_requested=%d tagged=%d req_pkt_type=%d\n",
+		delivery_complete_requested, tagged, req_pkt_type);
+
+	if (delivery_complete_requested) {
+		txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED;
+		dc_base_hdr = (struct efa_rdm_dc_eager_rtm_base_hdr *)
+				      pkt_entry->wiredata;
+		dc_base_hdr->send_id = txe->tx_id;
+		pkt_entry->callback =
+			&efa_rdm_proto_eager_handle_rtm_dc_send_completion;
+	}
+
+	ret = efa_rdm_pke_init_payload_from_ope(
+		pkt_entry, txe, efa_rdm_pke_get_req_hdr_size(pkt_entry), 0,
+		txe->total_len);
+
+	if (ret)
+		goto out;
+
+	// Verify that all of the data has been copied to the pke buffer
+	assert(txe->total_len == pkt_entry->payload_size);
+
+	ep->send_pkt_entry_vec[0] = pkt_entry;
+	ep->send_pkt_entry_vec_size = pkt_entry_cnt;
+	EFA_INFO(FI_LOG_EP_DATA,
+		 "eager protocol: posting 1 pke, size %lu, msg_id %" PRIu32 "\n",
+		 txe->total_len, txe->msg_id);
+
+	return FI_SUCCESS;
+
+out:
+	/* Undo the msg_id reservation and drop everything built so far */
+	if (txe) {
+		peer->next_msg_id--;
+		efa_rdm_txe_release(txe);
+	}
+	if (pkt_entry)
+		efa_rdm_pke_release_tx(pkt_entry);
+	return ret;
+}
diff --git a/prov/efa/src/rdm/efa_rdm_proto_eager.h b/prov/efa/src/rdm/efa_rdm_proto_eager.h
new file mode 100644
index 00000000000..dc3fab0ea4c
--- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto_eager.h @@ -0,0 +1,23 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#ifndef _EFA_RDM_PROTO_EAGER_H +#define _EFA_RDM_PROTO_EAGER_H + +#include "efa_rdm_proto.h" + +extern struct efa_rdm_proto efa_rdm_proto_eager; + +int efa_rdm_proto_eager_construct_tx_pkes(struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, + const struct fi_msg *msg, uint32_t op, + uint64_t tag, uint64_t flags, + struct efa_rdm_ope *txe); + +void efa_rdm_proto_eager_handle_rtm_send_completion( + struct efa_rdm_pke *pkt_entry); + +void efa_rdm_proto_eager_handle_rtm_dc_send_completion( + struct efa_rdm_pke *pkt_entry); + +#endif /* _EFA_RDM_PROTO_EAGER_H */ diff --git a/prov/efa/src/rdm/efa_rdm_proto_longcts.c b/prov/efa/src/rdm/efa_rdm_proto_longcts.c new file mode 100644 index 00000000000..0cb85521e17 --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto_longcts.c @@ -0,0 +1,271 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#include "efa_rdm_proto_longcts.h" +#include "efa.h" +#include "efa_rdm_pke_req.h" +#include "efa_rdm_pke_utils.h" +#include "efa_rdm_pkt_type.h" + +/* + * List of packet types used by this protocol + * + * For send/recv operations + * EFA_RDM_LONGCTS_MSGRTM_PKT + * EFA_RDM_LONGCTS_TAGRTM_PKT + * EFA_RDM_DC_LONGCTS_MSGRTM_PKT + * EFA_RDM_DC_LONGCTS_TAGRTM_PKT + * EFA_RDM_CTS_PKT + * EFA_RDM_CTSDATA_PKT + * + * For FI_DELIVERY_COMPLETE - shared with other protocols + * EFA_RDM_RECEIPT_PKT + */ + +/* + * Description of the protocol + * https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#long-cts-message-featuresubprotocol + */ + +/** + * @brief Check if the long CTS protocol can handle this send operation. + * + * Long CTS is the fallback protocol that can always be used. Returns + * true unconditionally. 
+ */
+static bool efa_rdm_proto_longcts_can_use_for_send(struct efa_rdm_ope *txe,
+						   struct efa_rdm_peer *peer,
+						   int req_pkt_type,
+						   uint16_t header_flags,
+						   int iface, bool use_p2p)
+{
+	// Long CTS is always usable
+	return true;
+}
+
+struct efa_rdm_proto efa_rdm_proto_longcts = {
+	.name = "LONGCTS",
+	.can_use_protocol_for_send = &efa_rdm_proto_longcts_can_use_for_send,
+	.construct_tx_pkes = &efa_rdm_proto_longcts_construct_tx_pkes,
+	.req_pkt_type = EFA_RDM_LONGCTS_MSGRTM_PKT,
+	.req_pkt_type_dc = EFA_RDM_DC_LONGCTS_MSGRTM_PKT,
+	.req_pkt_type_tagged = EFA_RDM_LONGCTS_TAGRTM_PKT,
+	.req_pkt_type_tagged_dc = EFA_RDM_DC_LONGCTS_TAGRTM_PKT,
+	.handle_tx_pkes_posted = &efa_rdm_proto_longcts_handle_tx_pkes_posted,
+};
+
+/**
+ * @brief Post-send bookkeeping for the long CTS protocol.
+ *
+ * Adds the payload of every packet in ep->send_pkt_entry_vec to
+ * txe->bytes_sent, then retries registration of the application buffer
+ * when the MR cache is available.
+ */
+void efa_rdm_proto_longcts_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						 struct efa_rdm_ope *txe)
+{
+	for (int i = 0; i < ep->send_pkt_entry_vec_size; ++i) {
+		txe->bytes_sent += ep->send_pkt_entry_vec[i]->payload_size;
+		/* The RTM never carries the whole message */
+		assert(txe->bytes_sent < txe->total_len);
+	}
+
+	/* Try to register application buffer again
+	 * We first try to register application's buffer in
+	 * efa_rdm_proto_select_send_protocol. We try here again in case the
+	 * first attempt failed because of e.g. number of MRs reaching device
+	 * limits */
+	if (efa_is_cache_available(efa_rdm_ep_domain(ep)))
+		efa_rdm_ope_try_fill_desc(txe, 0, FI_SEND);
+}
+
+/* TX path callbacks - one callback for each packet type that this protocol uses
+ */
+/**
+ * @brief Handle send completion for a long CTS RTM packet.
+ *
+ * Adds the packet payload to txe->bytes_acked. The RTM alone never
+ * completes the message (asserted), so no completion is reported here.
+ * The TX packet entry is released on every path.
+ */
+void efa_rdm_proto_longcts_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe;
+
+	/**
+	 * A zero-payload longcts rtm pkt currently should only happen when it's
+	 * used for the READ NACK protocol. In this case, this pkt doesn't
+	 * contribute to the send completion, and the associated tx entry
+	 * may be released earlier as the CTSDATA pkts have already kicked off
+	 * and finished the send.
+	 */
+	if (pkt_entry->payload_size == 0) {
+		assert(efa_rdm_pke_get_rtm_base_hdr(pkt_entry)->flags &
+		       EFA_RDM_REQ_READ_NACK);
+		/* The txe is not touched, but the packet entry is still
+		 * owned by this callback and must not be leaked
+		 */
+		efa_rdm_pke_release_tx(pkt_entry);
+		return;
+	}
+
+	txe = pkt_entry->ope;
+
+	txe->bytes_acked += pkt_entry->payload_size;
+
+	/* Long CTS protocol should not be used when the total buffer size can
+	 * fit in one packet */
+	assert(txe->total_len != txe->bytes_acked);
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Handle send completion for a DC long CTS RTM packet.
+ *
+ * Only releases the TXE when both all send completions and the receipt
+ * have been received.
+ */
+void efa_rdm_proto_longcts_handle_rtm_dc_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe;
+
+	txe = pkt_entry->ope;
+	assert(txe);
+
+	if (efa_rdm_txe_dc_ready_for_release(txe))
+		efa_rdm_txe_release(txe);
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/* TODO: Implement TX completions for pke when implementing the RX path */
+void efa_rdm_proto_longcts_handle_cts_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	return;
+}
+
+/* TODO: Implement TX completions for pke when implementing the RX path */
+void efa_rdm_proto_longcts_handle_ctsdata_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	return;
+}
+
+/**
+ * @brief Construct TX packet entries for the long CTS protocol.
+ *
+ * Sends a single RTM packet containing the first chunk of data. The
+ * receiver will respond with a CTS packet to request the remaining data.
+ *
+ * On failure the message id taken from the peer is given back, the TXE
+ * is released and any allocated packet entry is released.
+ *
+ * @return 0 on success, -FI_EAGAIN if no TX packet entry is available,
+ *         other negative errno on failure
+ */
+int efa_rdm_proto_longcts_construct_tx_pkes(struct efa_rdm_ep *ep,
+					    struct efa_rdm_peer *peer,
+					    const struct fi_msg *msg,
+					    uint32_t op, uint64_t tag,
+					    uint64_t flags,
+					    struct efa_rdm_ope *txe)
+{
+	int ret, req_pkt_type, iface, pkt_entry_cnt;
+	size_t hdr_size, rtm_payload_size, memory_alignment;
+	bool tagged, delivery_complete_requested;
+	struct efa_rdm_pke *pkt_entry = NULL;
+	struct efa_rdm_longcts_rtm_base_hdr *rtm_hdr;
+
+	efa_rdm_proto_txe_fill(txe, ep, peer, msg, op, tag, flags);
+
+	txe->msg_id = peer->next_msg_id++;
+
+	// Verify that the send queue is not full
+	assert(ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops -
+		       ep->efa_rnr_queued_pkt_cnt >
+	       0);
+
+	// Long CTS protocol sends 1 req packet by definition
+	pkt_entry_cnt = 1;
+
+	tagged = (op == ofi_op_tagged);
+	assert(tagged == 0 || tagged == 1);
+
+	// Refactored code path does not support zero copy
+	assert(!efa_both_support_zero_hdr_data_transfer(ep, peer));
+
+	// Inject should use eager protocol
+	assert(!(flags & FI_INJECT));
+
+	delivery_complete_requested = flags & FI_DELIVERY_COMPLETE;
+
+	EFA_DBG(FI_LOG_EP_DATA,
+		"longcts protocol: dc_requested=%d tagged=%d\n",
+		delivery_complete_requested, tagged);
+
+	/* The tagged packet type id is the untagged id + 1 */
+	req_pkt_type = delivery_complete_requested ?
+			       efa_rdm_proto_longcts.req_pkt_type_dc + tagged :
+			       efa_rdm_proto_longcts.req_pkt_type + tagged;
+
+	pkt_entry = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool,
+				      EFA_RDM_PKE_FROM_EFA_TX_POOL);
+	/* The pool can be exhausted; do not dereference a NULL entry */
+	if (!pkt_entry) {
+		ret = -FI_EAGAIN;
+		goto out;
+	}
+
+	pkt_entry->ope = txe;
+	pkt_entry->peer = peer;
+	pkt_entry->callback = &efa_rdm_proto_longcts_handle_rtm_send_completion;
+
+	efa_rdm_pke_init_req_hdr_common(pkt_entry, req_pkt_type, txe);
+
+	rtm_hdr = (struct efa_rdm_longcts_rtm_base_hdr *) pkt_entry->wiredata;
+	rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG;
+	rtm_hdr->hdr.msg_id = txe->msg_id;
+
+	rtm_hdr->msg_length = txe->total_len;
+	/* send_id is set for both the DC and the non-DC packet types */
+	rtm_hdr->send_id = txe->tx_id;
+	rtm_hdr->credit_request = efa_env.tx_min_credits;
+
+	if (tagged) {
+		rtm_hdr->hdr.flags |= EFA_RDM_REQ_TAGGED;
+		efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag);
+	}
+
+	if (delivery_complete_requested) {
+		txe->internal_flags |= EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED;
+		pkt_entry->callback =
+			&efa_rdm_proto_longcts_handle_rtm_dc_send_completion;
+	}
+
+	// Calculate hdr_size after initializing the flags in
+	// efa_rdm_pke_init_req_hdr_common
+	hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry);
+	rtm_payload_size = txe->ep->mtu_size - hdr_size;
+
+	iface = (msg->desc && msg->desc[0]) ?
+			((struct efa_mr *) msg->desc[0])->iface :
+			FI_HMEM_SYSTEM;
+	memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface);
+
+	/* Round the first chunk down to a multiple of the alignment */
+	rtm_payload_size &= ~(memory_alignment - 1);
+
+	ret = efa_rdm_pke_init_payload_from_ope(pkt_entry, txe, hdr_size, 0,
+						rtm_payload_size);
+
+	if (ret)
+		goto out;
+
+	// Set ep->send_pkt_entry_vec and related fields
+	ep->send_pkt_entry_vec[0] = pkt_entry;
+	ep->send_pkt_entry_vec_size = pkt_entry_cnt;
+	EFA_INFO(FI_LOG_EP_DATA,
+		 "longcts protocol: posting %d pke(s), total_len %lu, msg_id %" PRIu32 "\n",
+		 pkt_entry_cnt, txe->total_len, txe->msg_id);
+
+	return FI_SUCCESS;
+
+out:
+	/* Undo the msg_id reservation and drop everything built so far */
+	if (txe) {
+		peer->next_msg_id--;
+		efa_rdm_txe_release(txe);
+	}
+
+	if (pkt_entry)
+		efa_rdm_pke_release_tx(pkt_entry);
+
+	return ret;
+}
diff --git a/prov/efa/src/rdm/efa_rdm_proto_longcts.h b/prov/efa/src/rdm/efa_rdm_proto_longcts.h
new file mode 100644
index 00000000000..a1317e70d2f
--- /dev/null
+++ b/prov/efa/src/rdm/efa_rdm_proto_longcts.h
@@ -0,0 +1,33 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
*/ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#ifndef _EFA_RDM_PROTO_LONGCTS_H +#define _EFA_RDM_PROTO_LONGCTS_H + +#include "efa_rdm_proto.h" + +extern struct efa_rdm_proto efa_rdm_proto_longcts; + +int efa_rdm_proto_longcts_construct_tx_pkes(struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, + const struct fi_msg *msg, + uint32_t op, uint64_t tag, + uint64_t flags, + struct efa_rdm_ope *txe); + +void efa_rdm_proto_longcts_handle_tx_pkes_posted(struct efa_rdm_ep *ep, + struct efa_rdm_ope *txe); + +void efa_rdm_proto_longcts_handle_rtm_send_completion( + struct efa_rdm_pke *pkt_entry); + +void efa_rdm_proto_longcts_handle_rtm_dc_send_completion( + struct efa_rdm_pke *pkt_entry); + +void efa_rdm_proto_longcts_handle_cts_send_completion( + struct efa_rdm_pke *pkt_entry); + +void efa_rdm_proto_longcts_handle_ctsdata_send_completion( + struct efa_rdm_pke *pkt_entry); + +#endif /* _EFA_RDM_PROTO_LONGCTS_H */ diff --git a/prov/efa/src/rdm/efa_rdm_proto_longread.c b/prov/efa/src/rdm/efa_rdm_proto_longread.c new file mode 100644 index 00000000000..0c34eb42e67 --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto_longread.c @@ -0,0 +1,200 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#include "efa_rdm_proto_longread.h" +#include "efa.h" +#include "efa_rdm_pke_req.h" +#include "efa_rdm_pke_utils.h" +#include "efa_rdm_pkt_type.h" + +/* + * List of packet types used by this protocol + * + * For send/recv operations + * EFA_RDM_LONGREAD_MSGRTM_PKT + * EFA_RDM_LONGREAD_TAGRTM_PKT + * + * EFA_RDM_EOR_PKT + */ + +/* + * Description of the protocol + * https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#long-read-message-featuresubprotocol + */ + +/** + * @brief Check if the long read protocol can handle this send operation. 
+ *
+ * Requires p2p availability, registered memory descriptors, the message
+ * meeting the minimum read size threshold, and peer RDMA read support.
+ */
+static bool efa_rdm_proto_longread_can_use_for_send(struct efa_rdm_ope *txe,
+						    struct efa_rdm_peer *peer,
+						    int req_pkt_type,
+						    uint16_t header_flags,
+						    int iface, bool use_p2p)
+{
+	/* All three checks are evaluated up front so the debug line below
+	 * always shows every input to the decision
+	 */
+	const bool has_mr = (txe->desc[0] != NULL);
+	const bool big_enough =
+		txe->total_len >= g_efa_hmem_info[iface].min_read_msg_size;
+	const bool peer_reads = efa_rdm_interop_rdma_read(txe->ep, peer);
+
+	EFA_DBG(FI_LOG_EP_DATA,
+		"longread eligibility: use_p2p=%d mr_avail=%d size=%d read_interop=%d\n",
+		use_p2p, has_mr, big_enough, peer_reads);
+
+	return use_p2p && has_mr && big_enough && peer_reads;
+}
+
+/* Long read protocol descriptor. The DC packet types are the same as
+ * the regular ones because the protocol is always delivery complete.
+ */
+struct efa_rdm_proto efa_rdm_proto_longread = {
+	.name = "LONGREAD",
+	.can_use_protocol_for_send = &efa_rdm_proto_longread_can_use_for_send,
+	.construct_tx_pkes = &efa_rdm_proto_longread_construct_tx_pkes,
+	.handle_tx_pkes_posted = &efa_rdm_proto_longread_handle_tx_pkes_posted,
+	.req_pkt_type = EFA_RDM_LONGREAD_MSGRTM_PKT,
+	.req_pkt_type_tagged = EFA_RDM_LONGREAD_TAGRTM_PKT,
+	.req_pkt_type_dc = EFA_RDM_LONGREAD_MSGRTM_PKT,
+	.req_pkt_type_tagged_dc = EFA_RDM_LONGREAD_TAGRTM_PKT,
+};
+
+/* Post-send hook: count one more read message in flight on the domain */
+void efa_rdm_proto_longread_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						  struct efa_rdm_ope *txe)
+{
+	struct efa_domain *domain = efa_rdm_ep_domain(ep);
+
+	domain->num_read_msg_in_flight += 1;
+}
+
+/* TX path callbacks - one per packet type used by this protocol */
+
+/**
+ * @brief Handle send completion for a long read RTM packet.
+ *
+ * Simply releases the TX packet entry since the actual data transfer
+ * is driven by the receiver via RDMA reads.
+ */
+void efa_rdm_proto_longread_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	// Nothing to do except to release the pkt_entry
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Construct TX packet entries for the long read protocol.
+ *
+ * Sends a single RTM packet containing the memory keys and addresses
+ * of the sender's registered buffers. The receiver performs RDMA reads
+ * to fetch the data.
+ *
+ * Every IOV of the TXE must have a memory descriptor, because its key is
+ * written into the packet. On failure the message id taken from the peer
+ * is given back, the TXE is released and any allocated packet entry is
+ * released.
+ *
+ * @return 0 on success, -FI_EAGAIN if no TX packet entry is available,
+ *         -FI_ENOMEM if an IOV has no memory descriptor
+ */
+int efa_rdm_proto_longread_construct_tx_pkes(struct efa_rdm_ep *ep,
+					     struct efa_rdm_peer *peer,
+					     const struct fi_msg *msg,
+					     uint32_t op, uint64_t tag,
+					     uint64_t flags,
+					     struct efa_rdm_ope *txe)
+{
+	int i, ret, req_pkt_type, pkt_entry_cnt;
+	bool tagged;
+	struct efa_rdm_pke *pkt_entry = NULL;
+	struct efa_rdm_longread_rtm_base_hdr *rtm_hdr;
+	struct fi_rma_iov *read_iov;
+	size_t hdr_size;
+
+	efa_rdm_proto_txe_fill(txe, ep, peer, msg, op, tag, flags);
+
+	txe->msg_id = peer->next_msg_id++;
+
+	/* Read based protocols shouldn't be chosen if the local buffer cannot
+	 * be registered */
+	assert(txe->desc[0]);
+
+	// Verify that the send queue is not full
+	assert(ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops -
+		       ep->efa_rnr_queued_pkt_cnt >
+	       0);
+
+	// Refactored code path does not support zero copy
+	assert(!efa_both_support_zero_hdr_data_transfer(ep, peer));
+
+	// Inject should use eager protocol
+	assert(!(flags & FI_INJECT));
+
+	// Long read protocol sends 1 req packet by definition
+	pkt_entry_cnt = 1;
+
+	tagged = (op == ofi_op_tagged);
+	assert(tagged == 0 || tagged == 1);
+
+	// Long read protocol is always delivery complete
+	assert(efa_rdm_proto_longread.req_pkt_type ==
+	       efa_rdm_proto_longread.req_pkt_type_dc);
+	assert(efa_rdm_proto_longread.req_pkt_type_tagged ==
+	       efa_rdm_proto_longread.req_pkt_type_tagged_dc);
+
+	req_pkt_type = efa_rdm_proto_longread.req_pkt_type + tagged;
+
+	pkt_entry = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool,
+				      EFA_RDM_PKE_FROM_EFA_TX_POOL);
+	if (!pkt_entry) {
+		ret = -FI_EAGAIN;
+		goto out;
+	}
+
+	pkt_entry->ope = txe;
+	pkt_entry->peer = peer;
+	pkt_entry->callback =
+		&efa_rdm_proto_longread_handle_rtm_send_completion;
+
+	efa_rdm_pke_init_req_hdr_common(pkt_entry, req_pkt_type, txe);
+
+	rtm_hdr = (struct efa_rdm_longread_rtm_base_hdr *) pkt_entry->wiredata;
+	rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG;
+	rtm_hdr->hdr.msg_id = txe->msg_id;
+	rtm_hdr->msg_length = txe->total_len;
+	rtm_hdr->send_id = txe->tx_id;
+	rtm_hdr->read_iov_count = txe->iov_count;
+
+	/* The read IOV array is placed right after the request header */
+	hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry);
+	read_iov = (struct fi_rma_iov *) (pkt_entry->wiredata + hdr_size);
+
+	pkt_entry->pkt_size =
+		hdr_size + txe->iov_count * sizeof(struct fi_rma_iov);
+
+	// Logic copied from efa_rdm_txe_prepare_to_be_read
+	for (i = 0; i < txe->iov_count; ++i) {
+		/* Only desc[0] was validated by protocol selection; an
+		 * unregistered later IOV must not reach fi_mr_key()
+		 */
+		if (!txe->desc[i]) {
+			ret = -FI_ENOMEM;
+			goto out;
+		}
+
+		read_iov[i].addr = (uint64_t) txe->iov[i].iov_base;
+		read_iov[i].len = txe->iov[i].iov_len;
+		read_iov[i].key = fi_mr_key(txe->desc[i]);
+	}
+
+	if (tagged) {
+		rtm_hdr->hdr.flags |= EFA_RDM_REQ_TAGGED;
+		efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag);
+	}
+
+	// Set ep->send_pkt_entry_vec and related fields
+	ep->send_pkt_entry_vec[0] = pkt_entry;
+	ep->send_pkt_entry_vec_size = pkt_entry_cnt;
+	EFA_INFO(FI_LOG_EP_DATA,
+		 "longread protocol: posting 1 pke, pkt_size %lu, total_len %lu\n",
+		 pkt_entry->pkt_size, txe->total_len);
+
+	return FI_SUCCESS;
+
+out:
+	/* Undo the msg_id reservation and drop everything built so far */
+	if (txe) {
+		peer->next_msg_id--;
+		efa_rdm_txe_release(txe);
+	}
+
+	if (pkt_entry)
+		efa_rdm_pke_release_tx(pkt_entry);
+
+	return ret;
+}
diff --git a/prov/efa/src/rdm/efa_rdm_proto_longread.h b/prov/efa/src/rdm/efa_rdm_proto_longread.h
new file mode 100644
index 00000000000..b8b27be3b23
--- /dev/null
+++ b/prov/efa/src/rdm/efa_rdm_proto_longread.h
@@ -0,0 +1,27 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
*/ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#ifndef _EFA_RDM_PROTO_LONGREAD_H +#define _EFA_RDM_PROTO_LONGREAD_H + +#include "efa_rdm_proto.h" + +extern struct efa_rdm_proto efa_rdm_proto_longread; + +int efa_rdm_proto_longread_construct_tx_pkes(struct efa_rdm_ep *ep, + struct efa_rdm_peer *peer, + const struct fi_msg *msg, + uint32_t op, uint64_t tag, + uint64_t flags, + struct efa_rdm_ope *txe); + +void efa_rdm_proto_longread_handle_tx_pkes_posted(struct efa_rdm_ep *ep, + struct efa_rdm_ope *txe); + +void efa_rdm_proto_longread_handle_rtm_send_completion( + struct efa_rdm_pke *pkt_entry); + +void efa_rdm_proto_longread_handle_rtm_dc_send_completion( + struct efa_rdm_pke *pkt_entry); + +#endif /* _EFA_RDM_PROTO_LONGREAD_H */ diff --git a/prov/efa/src/rdm/efa_rdm_proto_medium.c b/prov/efa/src/rdm/efa_rdm_proto_medium.c new file mode 100644 index 00000000000..729bf0374d6 --- /dev/null +++ b/prov/efa/src/rdm/efa_rdm_proto_medium.c @@ -0,0 +1,295 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ + +#include "efa_rdm_proto_medium.h" +#include "efa.h" +#include "efa_rdm_pke_req.h" +#include "efa_rdm_pke_utils.h" +#include "efa_rdm_pkt_type.h" + +/* + * List of packet types used by this protocol + * + * For send/recv operations + * EFA_RDM_MEDIUM_MSGRTM_PKT + * EFA_RDM_MEDIUM_TAGRTM_PKT + * EFA_RDM_DC_MEDIUM_MSGRTM_PKT + * EFA_RDM_DC_MEDIUM_TAGRTM_PKT + * + * For FI_DELIVERY_COMPLETE - shared with other protocols + * EFA_RDM_RECEIPT_PKT + */ + +/* + * Description of the protocol + * https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#medium-message-featuresubprotocol + */ + +/** + * @brief Check if the medium protocol can handle this send operation. + * + * Returns true if the message size is within the medium threshold + * (default 64KB for system memory). 
+ */
+static bool efa_rdm_proto_medium_can_use_for_send(struct efa_rdm_ope *txe,
+						  struct efa_rdm_peer *peer,
+						  int req_pkt_type,
+						  uint16_t header_flags,
+						  int iface, bool use_p2p)
+{
+	/* The size limit is looked up per memory interface */
+	return g_efa_hmem_info[iface].max_medium_msg_size >= txe->total_len;
+}
+
+/* Medium protocol descriptor */
+struct efa_rdm_proto efa_rdm_proto_medium = {
+	.name = "MEDIUM",
+	.can_use_protocol_for_send = &efa_rdm_proto_medium_can_use_for_send,
+	.construct_tx_pkes = &efa_rdm_proto_medium_construct_tx_pkes,
+	.handle_tx_pkes_posted = &efa_rdm_proto_medium_handle_tx_pkes_posted,
+	.req_pkt_type = EFA_RDM_MEDIUM_MSGRTM_PKT,
+	.req_pkt_type_tagged = EFA_RDM_MEDIUM_TAGRTM_PKT,
+	.req_pkt_type_dc = EFA_RDM_DC_MEDIUM_MSGRTM_PKT,
+	.req_pkt_type_tagged_dc = EFA_RDM_DC_MEDIUM_TAGRTM_PKT,
+};
+
+/* Post-send hook: add the payload of every posted packet to bytes_sent */
+void efa_rdm_proto_medium_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						struct efa_rdm_ope *txe)
+{
+	int idx = 0;
+
+	while (idx < ep->send_pkt_entry_vec_size) {
+		txe->bytes_sent += ep->send_pkt_entry_vec[idx]->payload_size;
+		++idx;
+	}
+
+	// For medium protocol, all of the data is posted at once.
+	assert(txe->bytes_sent == txe->total_len);
+}
+
+/* TX path callbacks - one per packet type used by this protocol */
+
+/**
+ * @brief Send-completion callback for a non-DC medium RTM packet.
+ *
+ * Adds the packet payload to txe->bytes_acked and hands the TXE to
+ * efa_rdm_ope_handle_send_completed() once bytes_acked has reached
+ * total_len. The TX packet entry is always released.
+ */
+void efa_rdm_proto_medium_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe = pkt_entry->ope;
+
+	assert(txe);
+
+	txe->bytes_acked += pkt_entry->payload_size;
+	if (!(txe->bytes_acked < txe->total_len))
+		efa_rdm_ope_handle_send_completed(txe);
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Handle send completion for a DC medium RTM packet.
+ *
+ * Only releases the TXE when both all send completions have arrived
+ * and the receipt packet has been received.
+ */
+void efa_rdm_proto_medium_handle_rtm_dc_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe;
+
+	txe = pkt_entry->ope;
+	assert(txe);
+
+	/* The TXE is released here only if it is already ready for release;
+	 * otherwise only the packet entry is returned to its pool.
+	 */
+	if (efa_rdm_txe_dc_ready_for_release(txe))
+		efa_rdm_txe_release(txe);
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief Construct TX packet entries for the medium protocol.
+ *
+ * Splits the message data across multiple packet entries, balancing
+ * data sizes for performance and respecting memory alignment. Each
+ * packet carries a segment offset and total message length in its header.
+ * A message that fits in a single packet is sent as-is, without alignment
+ * rounding.
+ *
+ * On success, ep->send_pkt_entry_vec contains the packet entries and
+ * ep->send_pkt_entry_vec_size is set to the number of packets.
+ *
+ * On any failure the msg_id taken from the peer is given back, the TXE is
+ * released and every packet entry allocated so far is released, so the
+ * caller must not use txe after a non-zero return.
+ *
+ * @param[in]     ep     endpoint
+ * @param[in]     peer   destination peer
+ * @param[in]     msg    message to send
+ * @param[in]     op     ofi_op_msg or ofi_op_tagged
+ * @param[in]     tag    tag, used when op is ofi_op_tagged
+ * @param[in]     flags  operation flags; FI_INJECT is not allowed here
+ * @param[in,out] txe    TX operation entry to fill
+ *
+ * @return FI_SUCCESS on success, -FI_EAGAIN when the TX queue or the TX
+ *         packet pool cannot hold all packets, otherwise the error returned
+ *         by efa_rdm_pke_init_payload_from_ope()
+ */
+int efa_rdm_proto_medium_construct_tx_pkes(struct efa_rdm_ep *ep,
+					   struct efa_rdm_peer *peer,
+					   const struct fi_msg *msg,
+					   uint32_t op, uint64_t tag,
+					   uint64_t flags,
+					   struct efa_rdm_ope *txe)
+{
+	int pkt_entry_cnt, pkt_entry_cnt_allocated = 0,
+	    single_pkt_entry_max_data_size, memory_alignment;
+	int i, ret, req_pkt_type, iface, available_tx_pkts,
+	    single_pkt_entry_data_size, remainder;
+	size_t segment_offset, hdr_size;
+	size_t *pkt_entry_data_size_vec = ep->send_pkt_entry_data_sizes;
+	bool tagged, delivery_complete_requested;
+	struct efa_rdm_pke *pkt_entry = NULL;
+	struct efa_rdm_medium_rtm_base_hdr *rtm_hdr;
+	struct efa_rdm_dc_medium_rtm_base_hdr *dc_medium_rtm_base_hdr;
+
+	efa_rdm_proto_txe_fill(txe, ep, peer, msg, op, tag, flags);
+
+	txe->msg_id = peer->next_msg_id++;
+
+	// Verify that the send queue is not full
+	assert(ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops -
+		       ep->efa_rnr_queued_pkt_cnt >
+	       0);
+
+	// Refactored code path does not support zero copy
+	assert(!efa_both_support_zero_hdr_data_transfer(ep, peer));
+
+	// Inject should use eager protocol
+	assert(!(flags & FI_INJECT));
+
+	// Every packet must carry data, so an empty message cannot be split
+	assert(txe->total_len > 0);
+
+	/* Select req_pkt_type based on whether the operation is tagged and
+	 * whether delivery complete is requested. The protocol descriptor
+	 * names each variant explicitly, so no assumption is made about the
+	 * numeric layout of the packet type enum.
+	 */
+	tagged = (op == ofi_op_tagged);
+	delivery_complete_requested = flags & FI_DELIVERY_COMPLETE;
+
+	if (delivery_complete_requested)
+		req_pkt_type = tagged ?
+			efa_rdm_proto_medium.req_pkt_type_tagged_dc :
+			efa_rdm_proto_medium.req_pkt_type_dc;
+	else
+		req_pkt_type = tagged ?
+			efa_rdm_proto_medium.req_pkt_type_tagged :
+			efa_rdm_proto_medium.req_pkt_type;
+
+	single_pkt_entry_max_data_size =
+		efa_rdm_txe_max_req_data_capacity(ep, txe, req_pkt_type);
+	assert(single_pkt_entry_max_data_size > 0);
+
+	EFA_DBG(FI_LOG_EP_DATA,
+		"medium protocol: total_len %lu, max_data_per_pkt %d\n",
+		txe->total_len, single_pkt_entry_max_data_size);
+
+	iface = (msg->desc && msg->desc[0]) ?
+			((struct efa_mr *) msg->desc[0])->iface :
+			FI_HMEM_SYSTEM;
+	memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface);
+
+	pkt_entry_cnt =
+		(txe->total_len - 1) / single_pkt_entry_max_data_size + 1;
+
+	if (pkt_entry_cnt == 1) {
+		/* Single packet: nothing to balance. Skipping the alignment
+		 * rounding also keeps a message shorter than the alignment
+		 * from producing a zero per-packet size (and a division by
+		 * zero below).
+		 */
+		pkt_entry_data_size_vec[0] = txe->total_len;
+	} else {
+		/* When sending multiple packets, it is more performant if the
+		 * data sizes of the packets are close to each other. To
+		 * achieve that, compute a common per-packet size first.
+		 */
+		single_pkt_entry_data_size =
+			(txe->total_len - 1) / pkt_entry_cnt + 1;
+
+		/* each packet must be aligned */
+		single_pkt_entry_data_size =
+			single_pkt_entry_data_size & ~(memory_alignment - 1);
+
+		assert(single_pkt_entry_data_size);
+
+		pkt_entry_cnt = txe->total_len / single_pkt_entry_data_size;
+		for (i = 0; i < pkt_entry_cnt; ++i)
+			pkt_entry_data_size_vec[i] = single_pkt_entry_data_size;
+
+		/* Fold the leftover bytes into the last packet when they fit,
+		 * otherwise send them in one extra packet.
+		 */
+		remainder = txe->total_len -
+			    pkt_entry_cnt * single_pkt_entry_data_size;
+		if (single_pkt_entry_data_size + remainder <=
+		    single_pkt_entry_max_data_size) {
+			pkt_entry_data_size_vec[pkt_entry_cnt - 1] += remainder;
+		} else {
+			pkt_entry_data_size_vec[pkt_entry_cnt] = remainder;
+			pkt_entry_cnt += 1;
+		}
+	}
+
+	available_tx_pkts = ep->efa_max_outstanding_tx_ops -
+			    ep->efa_outstanding_tx_ops -
+			    ep->efa_rnr_queued_pkt_cnt;
+
+	if (pkt_entry_cnt > available_tx_pkts) {
+		/* Go through the common cleanup so that the msg_id taken
+		 * above is returned and the txe is released, exactly as on
+		 * the other failure paths.
+		 */
+		ret = -FI_EAGAIN;
+		goto out;
+	}
+
+	assert(pkt_entry_cnt <= efa_base_ep_get_tx_pool_size(&ep->base_ep));
+
+	segment_offset = 0;
+	for (i = 0; i < pkt_entry_cnt; ++i) {
+		pkt_entry = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool,
+					      EFA_RDM_PKE_FROM_EFA_TX_POOL);
+
+		if (OFI_UNLIKELY(!pkt_entry)) {
+			ret = -FI_EAGAIN;
+			goto out;
+		}
+
+		pkt_entry_cnt_allocated++;
+		ep->send_pkt_entry_vec[i] = pkt_entry;
+
+		pkt_entry->ope = txe;
+		pkt_entry->peer = peer;
+
+		assert(pkt_entry_data_size_vec[i] > 0);
+
+		efa_rdm_pke_init_req_hdr_common(pkt_entry, req_pkt_type, txe);
+
+		rtm_hdr = efa_rdm_pke_get_medium_rtm_base_hdr(pkt_entry);
+		rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG;
+		rtm_hdr->hdr.msg_id = txe->msg_id;
+
+		if (tagged) {
+			rtm_hdr->hdr.flags |= EFA_RDM_REQ_TAGGED;
+			efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag);
+		}
+
+		if (delivery_complete_requested) {
+			txe->internal_flags |=
+				EFA_RDM_TXE_DELIVERY_COMPLETE_REQUESTED;
+			dc_medium_rtm_base_hdr =
+				(struct efa_rdm_dc_medium_rtm_base_hdr *)
+					pkt_entry->wiredata;
+			dc_medium_rtm_base_hdr->send_id = txe->tx_id;
+			dc_medium_rtm_base_hdr->msg_length = txe->total_len;
+			dc_medium_rtm_base_hdr->seg_offset = segment_offset;
+			pkt_entry->callback =
+				&efa_rdm_proto_medium_handle_rtm_dc_send_completion;
+		} else {
+			rtm_hdr->msg_length = txe->total_len;
+			rtm_hdr->seg_offset = segment_offset;
+			pkt_entry->callback =
+				&efa_rdm_proto_medium_handle_rtm_send_completion;
+		}
+
+		assert(pkt_entry->callback);
+
+		hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry);
+
+		ret = efa_rdm_pke_init_payload_from_ope(
+			pkt_entry, txe, hdr_size, segment_offset,
+			pkt_entry_data_size_vec[i]);
+
+		if (ret)
+			goto out;
+
+		segment_offset += pkt_entry_data_size_vec[i];
+	}
+
+	assert(pkt_entry_cnt == pkt_entry_cnt_allocated);
+
+	ep->send_pkt_entry_vec_size = pkt_entry_cnt;
+	EFA_INFO(FI_LOG_EP_DATA,
+		 "medium protocol: posting %d pkes, total_len %lu, msg_id %" PRIu32 "\n",
+		 pkt_entry_cnt, txe->total_len, txe->msg_id);
+
+	return FI_SUCCESS;
+
+out:
+	/* txe has been dereferenced unconditionally above, so it is known to
+	 * be non-NULL here: give the msg_id back and drop the txe.
+	 */
+	peer->next_msg_id--;
+	efa_rdm_txe_release(txe);
+
+	for (i = 0; i < pkt_entry_cnt_allocated; ++i)
+		efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[i]);
+	return ret;
+}
+
+
diff --git a/prov/efa/src/rdm/efa_rdm_proto_medium.h b/prov/efa/src/rdm/efa_rdm_proto_medium.h
new file mode 100644
index 00000000000..a58b6df874d
--- /dev/null
+++ b/prov/efa/src/rdm/efa_rdm_proto_medium.h
@@ -0,0 +1,27 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+
+#ifndef _EFA_RDM_PROTO_MEDIUM_H
+#define _EFA_RDM_PROTO_MEDIUM_H
+
+#include "efa_rdm_proto.h"
+
+extern struct efa_rdm_proto efa_rdm_proto_medium;
+
+int efa_rdm_proto_medium_construct_tx_pkes(struct efa_rdm_ep *ep,
+					   struct efa_rdm_peer *peer,
+					   const struct fi_msg *msg,
+					   uint32_t op, uint64_t tag,
+					   uint64_t flags,
+					   struct efa_rdm_ope *txe);
+
+void efa_rdm_proto_medium_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry);
+
+void efa_rdm_proto_medium_handle_rtm_dc_send_completion(
+	struct efa_rdm_pke *pkt_entry);
+
+void efa_rdm_proto_medium_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						struct efa_rdm_ope *txe);
+
+#endif /* _EFA_RDM_PROTO_MEDIUM_H */
diff --git a/prov/efa/src/rdm/efa_rdm_proto_runtread.c b/prov/efa/src/rdm/efa_rdm_proto_runtread.c
new file mode 100644
index 00000000000..1469bde2295
--- /dev/null
+++ b/prov/efa/src/rdm/efa_rdm_proto_runtread.c
@@ -0,0 +1,333 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
*/
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+
+#include "efa_rdm_proto_runtread.h"
+#include "efa.h"
+#include "efa_rdm_pke_req.h"
+#include "efa_rdm_pke_utils.h"
+#include "efa_rdm_pkt_type.h"
+
+/*
+ * List of packet types used by this protocol
+ *
+ * For send/recv operations
+ *     EFA_RDM_RUNTREAD_MSGRTM_PKT
+ *     EFA_RDM_RUNTREAD_TAGRTM_PKT
+ *
+ *     EFA_RDM_EOR_PKT
+ */
+
+/*
+ * Description of the protocol
+ * https://github.com/ofiwg/libfabric/blob/main/prov/efa/docs/efa_rdm_protocol_v4.md#45-runting-read-message-subprotocol
+ */
+
+/**
+ * @brief Check if the runt read protocol can handle this send operation.
+ *
+ * Requires p2p availability, a registered memory descriptor, the message
+ * meeting the minimum read size, peer RDMA read support, no read message
+ * currently in flight on the domain, and remaining runt budget for the
+ * peer (the interface's runt size must exceed the runt bytes the peer
+ * already has in flight).
+ *
+ * @return true if every condition above holds, false otherwise
+ */
+static bool efa_rdm_proto_runtread_can_use_for_send(struct efa_rdm_ope *txe,
+						    struct efa_rdm_peer *peer,
+						    int req_pkt_type,
+						    uint16_t header_flags,
+						    int iface, bool use_p2p)
+{
+	bool size, read_interop, mr_avail, no_read_in_progress, runt_allowed;
+
+	mr_avail = (txe->desc[0] != NULL);
+	size = txe->total_len >= g_efa_hmem_info[iface].min_read_msg_size;
+	read_interop = efa_rdm_interop_rdma_read(txe->ep, peer);
+	no_read_in_progress =
+		efa_rdm_ep_domain(txe->ep)->num_read_msg_in_flight == 0;
+	/* Runting is only possible while budget is left, i.e. while the
+	 * configured runt size is larger than what is already in flight.
+	 * (The opposite comparison could never become true for a peer that
+	 * has not sent any runt packets yet.)
+	 */
+	runt_allowed = g_efa_hmem_info[iface].runt_size >
+		       peer->num_runt_bytes_in_flight;
+
+	if (use_p2p && mr_avail && size && read_interop &&
+	    no_read_in_progress && runt_allowed)
+		return true;
+
+	return false;
+}
+
+struct efa_rdm_proto efa_rdm_proto_runtread = {
+	.name = "RUNTREAD",
+	.can_use_protocol_for_send = &efa_rdm_proto_runtread_can_use_for_send,
+	.construct_tx_pkes = &efa_rdm_proto_runtread_construct_tx_pkes,
+	// Runting read protocol is always delivery complete
+	.req_pkt_type = EFA_RDM_RUNTREAD_MSGRTM_PKT,
+	.req_pkt_type_dc = EFA_RDM_RUNTREAD_MSGRTM_PKT,
+	.req_pkt_type_tagged = EFA_RDM_RUNTREAD_TAGRTM_PKT,
+	.req_pkt_type_tagged_dc = EFA_RDM_RUNTREAD_TAGRTM_PKT,
+	.handle_tx_pkes_posted = &efa_rdm_proto_runtread_handle_tx_pkes_posted,
+};
+
+/**
+ * @brief Account for the runt read RTM packets that were just posted.
+ *
+ * Adds each posted payload to txe->bytes_sent and to the peer's
+ * num_runt_bytes_in_flight, then counts one more read message in flight
+ * on the domain.
+ */
+void efa_rdm_proto_runtread_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						  struct efa_rdm_ope *txe)
+{
+	size_t pkt_data_size;
+
+	for (int i = 0; i < ep->send_pkt_entry_vec_size; ++i) {
+		pkt_data_size = ep->send_pkt_entry_vec[i]->payload_size;
+		txe->bytes_sent += pkt_data_size;
+		txe->peer->num_runt_bytes_in_flight += pkt_data_size;
+	}
+
+	efa_rdm_ep_domain(txe->ep)->num_read_msg_in_flight++;
+}
+
+/* TX path callbacks - one callback for each packet type that this protocol uses
+ */
+/**
+ * @brief Handle send completion for a runt read RTM packet.
+ *
+ * Credits the payload to txe->bytes_acked, gives the same number of bytes
+ * back to the peer's runt budget (num_runt_bytes_in_flight) and releases
+ * the TX packet entry. The TXE itself is not completed here.
+ */
+void efa_rdm_proto_runtread_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry)
+{
+	struct efa_rdm_ope *txe;
+	struct efa_rdm_peer *peer;
+	size_t pkt_data_size;
+
+	txe = pkt_entry->ope;
+	assert(txe);
+
+	pkt_data_size = pkt_entry->payload_size;
+	txe->bytes_acked += pkt_data_size;
+
+	/* If the entire buffer could be sent in RTM packets, we should have
+	 * used the medium protocol instead of the runting read
+	 */
+	assert(txe->bytes_acked < txe->total_len);
+
+	peer = txe->peer;
+	assert(peer);
+	assert(peer->num_runt_bytes_in_flight >= pkt_data_size);
+	peer->num_runt_bytes_in_flight -= pkt_data_size;
+
+	efa_rdm_pke_release_tx(pkt_entry);
+}
+
+/**
+ * @brief calculate and set the bytes_runt field of a txe
+ *
+ * bytes_runt is number of bytes for a message to be sent by runting.
+ * A txe whose bytes_runt is already non-zero is left unchanged.
+ *
+ * @param[in]		ep			endpoint
+ * @param[in,out]	txe			txe to be set
+ */
+static inline void efa_rdm_proto_runtread_set_runt_size(struct efa_rdm_ep *ep,
+							struct efa_rdm_ope *txe)
+{
+	assert(txe->type == EFA_RDM_TXE);
+
+	if (txe->bytes_runt > 0)
+		return;
+
+	assert(txe->peer);
+	txe->bytes_runt = efa_rdm_peer_get_runt_size(txe->peer, ep, txe);
+
+	EFA_DBG(FI_LOG_EP_DATA,
+		"runtread protocol: runt_size %lu, total_len %lu\n",
+		txe->bytes_runt, txe->total_len);
+
+	assert(txe->bytes_runt);
+}
+
+/**
+ * @brief Construct TX packet entries for the runt read protocol.
+ *
+ * Splits the runt portion (txe->bytes_runt, not the whole message) across
+ * multiple packets (similar to medium) while including RDMA read IOVs in
+ * each packet header so the receiver can fetch the remaining data.
+ *
+ * On success, ep->send_pkt_entry_vec contains the packet entries and
+ * ep->send_pkt_entry_vec_size is set to the number of packets.
+ *
+ * On any failure the msg_id taken from the peer is given back, the TXE is
+ * released and every packet entry allocated so far is released, so the
+ * caller must not use txe after a non-zero return.
+ *
+ * @return FI_SUCCESS on success, -FI_EAGAIN when the TX queue or the TX
+ *         packet pool cannot hold all packets, otherwise the error returned
+ *         by efa_rdm_pke_init_payload_from_ope()
+ */
+int efa_rdm_proto_runtread_construct_tx_pkes(struct efa_rdm_ep *ep,
+					     struct efa_rdm_peer *peer,
+					     const struct fi_msg *msg,
+					     uint32_t op, uint64_t tag,
+					     uint64_t flags,
+					     struct efa_rdm_ope *txe)
+{
+	int pkt_entry_cnt, pkt_entry_cnt_allocated = 0,
+	    single_pkt_entry_max_data_size, memory_alignment;
+	int i, j, ret, req_pkt_type, iface, available_tx_pkts,
+	    single_pkt_entry_data_size, remainder;
+	size_t segment_offset, hdr_size, payload_offset;
+	struct fi_rma_iov *read_iov;
+	size_t *pkt_entry_data_size_vec = ep->send_pkt_entry_data_sizes;
+	bool tagged;
+	struct efa_rdm_pke *pkt_entry = NULL;
+	struct efa_rdm_runtread_rtm_base_hdr *rtm_hdr;
+
+	efa_rdm_proto_txe_fill(txe, ep, peer, msg, op, tag, flags);
+	efa_rdm_proto_runtread_set_runt_size(ep, txe);
+
+	// Should use medium protocol if the entire buffer can fit in the runt
+	// size
+	assert(txe->bytes_runt < txe->total_len);
+
+	/* Read based protocols shouldn't be chosen if the local buffer cannot
+	 * be registered */
+	assert(txe->desc[0]);
+
+	// Verify that the send queue is not full
+	assert(ep->efa_max_outstanding_tx_ops - ep->efa_outstanding_tx_ops -
+		       ep->efa_rnr_queued_pkt_cnt >
+	       0);
+
+	// Refactored code path does not support zero copy
+	assert(!efa_both_support_zero_hdr_data_transfer(ep, peer));
+
+	// Inject should use eager protocol
+	assert(!(flags & FI_INJECT));
+
+	txe->msg_id = peer->next_msg_id++;
+
+	tagged = (op == ofi_op_tagged);
+
+	// Runting read is always delivery complete
+	assert(efa_rdm_proto_runtread.req_pkt_type ==
+	       efa_rdm_proto_runtread.req_pkt_type_dc);
+	assert(efa_rdm_proto_runtread.req_pkt_type_tagged ==
+	       efa_rdm_proto_runtread.req_pkt_type_tagged_dc);
+
+	/* Take the packet type from the descriptor's explicit tagged and
+	 * untagged entries instead of relying on the enum values being
+	 * adjacent.
+	 */
+	req_pkt_type = tagged ? efa_rdm_proto_runtread.req_pkt_type_tagged :
+				efa_rdm_proto_runtread.req_pkt_type;
+
+	single_pkt_entry_max_data_size =
+		efa_rdm_txe_max_req_data_capacity(ep, txe, req_pkt_type);
+	assert(single_pkt_entry_max_data_size > 0);
+
+	iface = (msg->desc && msg->desc[0]) ?
+			((struct efa_mr *) msg->desc[0])->iface :
+			FI_HMEM_SYSTEM;
+	memory_alignment = efa_rdm_ep_get_memory_alignment(ep, iface);
+
+	/* Only the runt portion travels in RTM packets; the rest of the
+	 * message is fetched by the receiver, so every size below is derived
+	 * from bytes_runt.
+	 */
+	pkt_entry_cnt =
+		(txe->bytes_runt - 1) / single_pkt_entry_max_data_size + 1;
+
+	if (pkt_entry_cnt == 1) {
+		/* Single packet: nothing to balance, and no alignment
+		 * rounding that could turn the per-packet size into zero.
+		 */
+		pkt_entry_data_size_vec[0] = txe->bytes_runt;
+	} else {
+		/* When sending multiple packets, it is more performant if the
+		 * data sizes of the packets are close to each other. To
+		 * achieve that, compute a common per-packet size first.
+		 */
+		single_pkt_entry_data_size =
+			(txe->bytes_runt - 1) / pkt_entry_cnt + 1;
+
+		/* each packet must be aligned */
+		single_pkt_entry_data_size =
+			single_pkt_entry_data_size & ~(memory_alignment - 1);
+
+		assert(single_pkt_entry_data_size);
+
+		pkt_entry_cnt = txe->bytes_runt / single_pkt_entry_data_size;
+		for (i = 0; i < pkt_entry_cnt; ++i)
+			pkt_entry_data_size_vec[i] = single_pkt_entry_data_size;
+
+		/* Fold the leftover bytes into the last packet when they fit,
+		 * otherwise send them in one extra packet.
+		 */
+		remainder = txe->bytes_runt -
+			    pkt_entry_cnt * single_pkt_entry_data_size;
+		if (single_pkt_entry_data_size + remainder <=
+		    single_pkt_entry_max_data_size) {
+			pkt_entry_data_size_vec[pkt_entry_cnt - 1] += remainder;
+		} else {
+			pkt_entry_data_size_vec[pkt_entry_cnt] = remainder;
+			pkt_entry_cnt += 1;
+		}
+	}
+
+	available_tx_pkts = ep->efa_max_outstanding_tx_ops -
+			    ep->efa_outstanding_tx_ops -
+			    ep->efa_rnr_queued_pkt_cnt;
+
+	if (pkt_entry_cnt > available_tx_pkts) {
+		/* Go through the common cleanup so that the msg_id taken
+		 * above is returned and the txe is released, exactly as on
+		 * the other failure paths.
+		 */
+		ret = -FI_EAGAIN;
+		goto out;
+	}
+
+	assert(pkt_entry_cnt <= efa_base_ep_get_tx_pool_size(&ep->base_ep));
+
+	segment_offset = 0;
+	for (i = 0; i < pkt_entry_cnt; ++i) {
+		pkt_entry = efa_rdm_pke_alloc(ep, ep->efa_tx_pkt_pool,
+					      EFA_RDM_PKE_FROM_EFA_TX_POOL);
+
+		if (OFI_UNLIKELY(!pkt_entry)) {
+			ret = -FI_EAGAIN;
+			goto out;
+		}
+
+		pkt_entry_cnt_allocated++;
+		ep->send_pkt_entry_vec[i] = pkt_entry;
+
+		pkt_entry->ope = txe;
+		pkt_entry->peer = peer;
+		pkt_entry->callback =
+			&efa_rdm_proto_runtread_handle_rtm_send_completion;
+
+		assert(pkt_entry_data_size_vec[i] > 0);
+
+		efa_rdm_pke_init_req_hdr_common(pkt_entry, req_pkt_type, txe);
+
+		rtm_hdr = (struct efa_rdm_runtread_rtm_base_hdr *)
+				  pkt_entry->wiredata;
+		rtm_hdr->hdr.flags |= EFA_RDM_REQ_MSG;
+		rtm_hdr->hdr.msg_id = txe->msg_id;
+		rtm_hdr->msg_length = txe->total_len;
+		rtm_hdr->send_id = txe->tx_id;
+		rtm_hdr->seg_offset = segment_offset;
+		rtm_hdr->runt_length = txe->bytes_runt;
+		rtm_hdr->read_iov_count = txe->iov_count;
+
+		if (tagged) {
+			rtm_hdr->hdr.flags |= EFA_RDM_REQ_TAGGED;
+			efa_rdm_pke_set_rtm_tag(pkt_entry, txe->tag);
+		}
+
+		/* The read IOV array sits right after the request header;
+		 * the runt payload follows the IOV array.
+		 */
+		hdr_size = efa_rdm_pke_get_req_hdr_size(pkt_entry);
+		read_iov =
+			(struct fi_rma_iov *) (pkt_entry->wiredata + hdr_size);
+
+		// Logic copied from efa_rdm_txe_prepare_to_be_read
+		for (j = 0; j < txe->iov_count; ++j) {
+			read_iov[j].addr = (uint64_t) txe->iov[j].iov_base;
+			read_iov[j].len = txe->iov[j].iov_len;
+			read_iov[j].key = fi_mr_key(txe->desc[j]);
+		}
+		payload_offset =
+			hdr_size + sizeof(struct fi_rma_iov) * txe->iov_count;
+
+		assert(pkt_entry->callback);
+
+		ret = efa_rdm_pke_init_payload_from_ope(
+			pkt_entry, txe, payload_offset, segment_offset,
+			pkt_entry_data_size_vec[i]);
+
+		if (ret)
+			goto out;
+
+		segment_offset += pkt_entry_data_size_vec[i];
+	}
+
+	assert(pkt_entry_cnt == pkt_entry_cnt_allocated);
+
+	ep->send_pkt_entry_vec_size = pkt_entry_cnt;
+	EFA_INFO(FI_LOG_EP_DATA,
+		 "runtread protocol: posting %d pkes, runt_size %lu, total_len %lu\n",
+		 pkt_entry_cnt, txe->bytes_runt, txe->total_len);
+
+	return FI_SUCCESS;
+
+out:
+	/* txe has been dereferenced unconditionally above, so it is known to
+	 * be non-NULL here: give the msg_id back and drop the txe.
+	 */
+	peer->next_msg_id--;
+	efa_rdm_txe_release(txe);
+
+	for (i = 0; i < pkt_entry_cnt_allocated; ++i)
+		efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[i]);
+
+	return ret;
+}
diff --git a/prov/efa/src/rdm/efa_rdm_proto_runtread.h b/prov/efa/src/rdm/efa_rdm_proto_runtread.h
new file mode 100644
index 00000000000..dff85fd7f13
--- /dev/null
+++ b/prov/efa/src/rdm/efa_rdm_proto_runtread.h
@@ -0,0 +1,27 @@
+/* Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+
+#ifndef _EFA_RDM_PROTO_RUNTREAD_H
+#define _EFA_RDM_PROTO_RUNTREAD_H
+
+#include "efa_rdm_proto.h"
+
+extern struct efa_rdm_proto efa_rdm_proto_runtread;
+
+int efa_rdm_proto_runtread_construct_tx_pkes(struct efa_rdm_ep *ep,
+					     struct efa_rdm_peer *peer,
+					     const struct fi_msg *msg,
+					     uint32_t op, uint64_t tag,
+					     uint64_t flags,
+					     struct efa_rdm_ope *txe);
+
+void efa_rdm_proto_runtread_handle_rtm_send_completion(
+	struct efa_rdm_pke *pkt_entry);
+
+/* NOTE(review): efa_rdm_proto_runtread.c does not define this function;
+ * confirm whether the prototype is still needed.
+ */
+void efa_rdm_proto_runtread_handle_rtm_dc_send_completion(
+	struct efa_rdm_pke *pkt_entry);
+
+void efa_rdm_proto_runtread_handle_tx_pkes_posted(struct efa_rdm_ep *ep,
+						  struct efa_rdm_ope *txe);
+
+#endif /* _EFA_RDM_PROTO_RUNTREAD_H */
diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 9d9434014a2..d361dd564b5 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -74,9 +74,6 @@ efa_rdm_rma_alloc_txe(struct efa_rdm_ep *efa_rdm_ep, memcpy(txe->rma_iov, msg_rma->rma_iov, sizeof(struct fi_rma_iov) * msg_rma->rma_iov_count); - efa_domain_ope_list_lock(efa_rdm_ep_domain(efa_rdm_ep)); - dlist_insert_tail(&txe->ep_entry, &efa_rdm_ep->txe_list); - efa_domain_ope_list_unlock(efa_rdm_ep_domain(efa_rdm_ep)); return txe; } diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index 563ac92e925..2f2c131b0a5 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -403,6 +403,7 @@ struct efa_rdm_ope *efa_unit_test_alloc_txe(struct efa_resource *resource, uint3 struct
efa_rdm_peer *peer; struct fi_msg msg = {0}; struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_ope *txe; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -415,7 +416,12 @@ struct efa_rdm_ope *efa_unit_test_alloc_txe(struct efa_resource *resource, uint3 peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - return efa_rdm_ep_alloc_txe(efa_rdm_ep, peer, &msg, op, 0, 0); + txe = ofi_buf_alloc(efa_rdm_ep->ope_pool); + if (!txe) + return NULL; + + efa_rdm_txe_construct(txe, efa_rdm_ep, peer, &msg, op, 0); + return txe; } struct efa_rdm_ope *efa_unit_test_alloc_rxe(struct efa_resource *resource, uint32_t op) diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 6033778591b..da4a34226f6 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -478,128 +478,6 @@ void test_efa_rdm_ep_dc_atomic_queue_before_handshake(struct efa_resource **stat assert_true(txe->internal_flags & EFA_RDM_OPE_QUEUED_BEFORE_HANDSHAKE); } -/** - * @brief when delivery complete send was used and handshake packet has not been received - * verify the txe is queued - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_efa_rdm_ep_dc_send_queue_before_handshake(struct efa_resource **state) -{ - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_msg msg = {0}; - struct iovec iov; - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int err, numaddr; - struct efa_rdm_ope *txe; - - /* disable shm to force using efa device to send */ - efa_unit_test_resource_construct_rdm_shm_disabled(resource); - - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - 
assert_int_equal(numaddr, 1); - - msg.addr = peer_addr; - msg.iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 0; - msg.msg_iov = &iov; - msg.desc = NULL; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think - * a REQ packet has been sent to the peer (so no need to send again) - * handshake has not been received, so we do not know whether the peer support DC - */ - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - peer->flags = EFA_RDM_PEER_REQ_SENT; - peer->is_local = false; - - assert_false(efa_rdm_ep->homogeneous_peers); - assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - /* DC has been reuquested, but ep do not know whether peer supports it, therefore - * the ope has been queued to domain->ope_queued_list - */ - assert_int_equal(err, 0); - assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1); - assert_int_equal(efa_unit_test_get_dlist_length(&(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list)), 1); - txe = container_of(efa_rdm_ep_domain(efa_rdm_ep)->ope_queued_list.next, struct efa_rdm_ope, queued_entry); - assert_true((txe->op == ofi_op_msg)); - assert_true(txe->internal_flags & EFA_RDM_OPE_QUEUED_BEFORE_HANDSHAKE); -} - -/** - * @brief when delivery complete send was used and handshake packet has not been received - * verify the txes are queued before the number of requests reach EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE. 
- * After reaching the limit, fi_send should return -FI_EAGAIN - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(struct efa_resource **state) -{ - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_msg msg = {0}; - struct iovec iov; - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int err, numaddr; - int i; - - /* disable shm to force using efa device to send */ - efa_unit_test_resource_construct_rdm_shm_disabled(resource); - - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - assert_int_equal(numaddr, 1); - - msg.addr = peer_addr; - msg.iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 0; - msg.msg_iov = &iov; - msg.desc = NULL; - - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* set peer->flag to EFA_RDM_PEER_REQ_SENT will make efa_rdm_atomic() think - * a REQ packet has been sent to the peer (so no need to send again) - * handshake has not been received, so we do not know whether the peer support DC - */ - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - peer->flags = EFA_RDM_PEER_REQ_SENT; - peer->is_local = false; - - assert_false(efa_rdm_ep->homogeneous_peers); - assert_true(dlist_empty(&efa_rdm_ep->txe_list)); - - for (i = 0; i < EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE; i++) { - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - assert_int_equal(err, 0); - } - - assert_true(efa_rdm_ep->ope_queued_before_handshake_cnt == EFA_RDM_MAX_QUEUED_OPE_BEFORE_HANDSHAKE); - err = fi_sendmsg(resource->ep, &msg, FI_DELIVERY_COMPLETE); - assert_int_equal(err, -FI_EAGAIN); -} - /** * @brief verify tx entry 
is queued for rma (read or write) request before handshake is made. * diff --git a/prov/efa/test/efa_unit_test_ope.c b/prov/efa/test/efa_unit_test_ope.c index 6bca9813103..14f45f0610b 100644 --- a/prov/efa/test/efa_unit_test_ope.c +++ b/prov/efa/test/efa_unit_test_ope.c @@ -4,6 +4,7 @@ #include "efa_unit_tests.h" #include "rdm/efa_rdm_pke_cmd.h" #include "rdm/efa_rdm_pke_nonreq.h" +#include "rdm/efa_rdm_proto_eager.h" typedef void (*efa_rdm_ope_handle_error_func_t)(struct efa_rdm_ope *ope, int err, int prov_errno); @@ -12,7 +13,7 @@ void test_efa_rdm_ope_prepare_to_post_send_impl(struct efa_resource *resource, size_t total_len, int expected_ret, int expected_pkt_entry_cnt, - int *expected_pkt_entry_data_size_vec) + size_t *expected_pkt_entry_data_size_vec) { struct efa_ep_addr raw_addr; struct efa_rdm_mr mock_mr; @@ -20,7 +21,8 @@ void test_efa_rdm_ope_prepare_to_post_send_impl(struct efa_resource *resource, struct efa_rdm_peer mock_peer; size_t raw_addr_len = sizeof(raw_addr); fi_addr_t addr; - int pkt_entry_cnt, pkt_entry_data_size_vec[1024]; + size_t pkt_entry_cnt; + size_t pkt_entry_data_size_vec[1024]; int i, err, ret; ret = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -93,7 +95,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory(struct efa_resource **sta struct efa_resource *resource = *state; size_t msg_length; int expected_pkt_entry_cnt; - int expected_pkt_entry_data_size_vec[1024]; + size_t expected_pkt_entry_data_size_vec[1024]; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); @@ -142,7 +144,7 @@ void test_efa_rdm_ope_prepare_to_post_send_host_memory_align128(struct efa_resou struct efa_rdm_ep *efa_rdm_ep; size_t msg_length; int expected_pkt_entry_cnt; - int expected_pkt_entry_data_size_vec[1024]; + size_t expected_pkt_entry_data_size_vec[1024]; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -191,7 
+193,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory(struct efa_resource **sta struct efa_resource *resource = *state; size_t msg_length; int expected_pkt_entry_cnt; - int expected_pkt_entry_data_size_vec[1024]; + size_t expected_pkt_entry_data_size_vec[1024]; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); @@ -216,7 +218,7 @@ void test_efa_rdm_ope_prepare_to_post_send_cuda_memory_align128(struct efa_resou struct efa_rdm_ep *efa_rdm_ep; size_t msg_length; int expected_pkt_entry_cnt; - int expected_pkt_entry_data_size_vec[1024]; + size_t expected_pkt_entry_data_size_vec[1024]; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -599,8 +601,9 @@ void test_efa_rdm_txe_prepare_local_read_pkt_entry(struct efa_resource **state) assert_int_equal(fi_endpoint(resource->domain, resource->info, &ep, NULL), 0); efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - txe = efa_rdm_ep_alloc_txe(efa_rdm_ep, NULL, &msg, ofi_op_msg, 0, 0); + txe = ofi_buf_alloc(efa_rdm_ep->ope_pool); assert_non_null(txe); + efa_rdm_txe_construct(txe, efa_rdm_ep, NULL, &msg, ofi_op_msg, 0); /* Use ooo rx pkt because it doesn't have mr so a read_copy pkt clone is enforced. 
*/ pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->rx_ooo_pkt_pool, EFA_RDM_PKE_FROM_OOO_POOL); @@ -1235,6 +1238,8 @@ static void test_efa_rdm_txe_dc_release_common(struct efa_resource *resource, bo /* Set DC packet type in wiredata */ struct efa_rdm_base_hdr *base_hdr = (struct efa_rdm_base_hdr *)dc_pkt_entry->wiredata; base_hdr->type = EFA_RDM_DC_EAGER_MSGRTM_PKT; + dc_pkt_entry->callback = + &efa_rdm_proto_eager_handle_rtm_dc_send_completion; /* Create fake receipt packet entry */ receipt_pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); @@ -1251,7 +1256,8 @@ static void test_efa_rdm_txe_dc_release_common(struct efa_resource *resource, bo if (send_first) { /* Send completion first - should not release TXE yet */ - efa_rdm_pke_handle_send_completion(dc_pkt_entry); + efa_rdm_ep_record_tx_op_completed(efa_rdm_ep, dc_pkt_entry); + efa_unit_test_pke_handle_send_completion(dc_pkt_entry); assert_int_equal(efa_unit_test_get_dlist_length(&efa_rdm_ep->txe_list), 1); assert_false(efa_rdm_txe_dc_ready_for_release(txe)); if (txe_in_send_state) { @@ -1282,7 +1288,8 @@ static void test_efa_rdm_txe_dc_release_common(struct efa_resource *resource, bo } /* Send completion - should now release TXE */ - efa_rdm_pke_handle_send_completion(dc_pkt_entry); + efa_rdm_ep_record_tx_op_completed(efa_rdm_ep, dc_pkt_entry); + efa_unit_test_pke_handle_send_completion(dc_pkt_entry); } /* Verify TXE is released */ diff --git a/prov/efa/test/efa_unit_test_pke.c b/prov/efa/test/efa_unit_test_pke.c index e5eb24dfa3a..e1512da7d10 100644 --- a/prov/efa/test/efa_unit_test_pke.c +++ b/prov/efa/test/efa_unit_test_pke.c @@ -3,74 +3,52 @@ #include "rdm/efa_rdm_pke_rta.h" #include "rdm/efa_rdm_pke_rtw.h" #include "rdm/efa_rdm_pke_utils.h" +#include "rdm/efa_rdm_proto_longcts.h" /** - * @brief When handling a long cts rtm as read nack fallback, - * efa_rdm_pke_handle_longcts_rtm_send_completion shouldn't touch - * txe and write send completion. 
+ * @brief When handling a long CTS RTM send completion for a READ NACK + * fallback packet (zero payload), the callback should return early + * without touching the TXE or writing a CQ completion. + * + * This tests the READ NACK handling in + * efa_rdm_proto_longcts_handle_rtm_send_completion. */ -void test_efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_resource **state) +void test_efa_rdm_pke_handle_longcts_rtm_send_completion( + struct efa_resource **state) { - struct efa_resource *resource = *state; - struct efa_rdm_pke *pkt_entry; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - struct fi_msg msg = {0}; - char buf[16]; - struct iovec iov = { - .iov_base = buf, - .iov_len = sizeof buf - }; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int err, numaddr; - struct efa_rdm_ope *txe; + struct efa_resource *resource = *state; + struct efa_rdm_pke *pkt_entry; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_rtm_base_hdr *rtm_hdr; - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + /* Allocate a TX packet entry and set up as a zero-payload + * long CTS RTM with READ_NACK flag */ + pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, + EFA_RDM_PKE_FROM_EFA_TX_POOL); + assert_non_null(pkt_entry); + + pkt_entry->payload_size = 0; + pkt_entry->ep = efa_rdm_ep; + pkt_entry->ope = NULL; /* TXE already released by CTSDATA completions */ - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - 
assert_int_equal(numaddr, 1); - peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); - assert_non_null(peer); - - /* Construct a txe with read nack flag added */ - msg.addr = peer_addr; - msg.iov_count = 1; - msg.msg_iov = &iov; - msg.desc = NULL; - txe = efa_rdm_ep_alloc_txe(efa_rdm_ep, peer, &msg, ofi_op_msg, 0, 0); - assert_non_null(txe); - txe->internal_flags |= EFA_RDM_OPE_READ_NACK; - - /* construct a fallback long cts rtm pkt */ - pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL); - assert_non_null(pkt_entry); - - err = efa_rdm_pke_init_longcts_msgrtm(pkt_entry, txe); - assert_int_equal(err, 0); - - assert_int_equal(pkt_entry->payload_size, 0); - - /* Mimic the case when CTSDATA pkts have completed all data and released the txe */ - txe->bytes_acked = txe->total_len; - txe->bytes_sent = txe->total_len; - efa_rdm_txe_release(txe); - - efa_rdm_pke_handle_longcts_rtm_send_completion(pkt_entry); - - /* CQ should be empty as send completion shouldn't be written */ - assert_int_equal(fi_cq_read(resource->cq, NULL, 1), -FI_EAGAIN); - - efa_rdm_pke_release_tx(pkt_entry); + rtm_hdr = efa_rdm_pke_get_rtm_base_hdr(pkt_entry); + rtm_hdr->type = EFA_RDM_LONGCTS_MSGRTM_PKT; + rtm_hdr->flags = EFA_RDM_REQ_READ_NACK; + + /* Call the new send completion callback. + * It should return early without dereferencing the NULL ope. 
+ */ + efa_rdm_proto_longcts_handle_rtm_send_completion(pkt_entry); + + /* CQ should be empty - no send completion written */ + assert_int_equal(fi_cq_read(resource->cq, NULL, 1), -FI_EAGAIN); + + efa_rdm_pke_release_tx(pkt_entry); } /** @@ -322,8 +300,9 @@ void test_efa_rdm_pke_flag_tracking(struct efa_resource **state) msg.iov_count = 1; msg.msg_iov = &iov; msg.desc = NULL; - txe = efa_rdm_ep_alloc_txe(efa_rdm_ep, peer, &msg, ofi_op_msg, 0, 0); + txe = ofi_buf_alloc(efa_rdm_ep->ope_pool); assert_non_null(txe); + efa_rdm_txe_construct(txe, efa_rdm_ep, peer, &msg, ofi_op_msg, 0); /* Allocate a packet entry */ pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_tx_pkt_pool, EFA_RDM_PKE_FROM_EFA_TX_POOL); diff --git a/prov/efa/test/efa_unit_test_proto.c b/prov/efa/test/efa_unit_test_proto.c new file mode 100644 index 00000000000..18d3db02303 --- /dev/null +++ b/prov/efa/test/efa_unit_test_proto.c @@ -0,0 +1,912 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All + * rights reserved. */ + +#include "efa_unit_tests.h" +#include "rdm/efa_rdm_proto.h" +#include "rdm/efa_rdm_proto_eager.h" +#include "rdm/efa_rdm_proto_longcts.h" +#include "rdm/efa_rdm_proto_longread.h" +#include "rdm/efa_rdm_proto_medium.h" +#include "rdm/efa_rdm_proto_runtread.h" + +/* Tests from efa_unit_test_proto_select.c */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All + * rights reserved. */ + + +/** + * @brief Helper to set up an endpoint and insert a peer for protocol selection + * tests. + * + * Returns the efa_rdm_ep and stores the inserted peer's address in + * peer_addr; callers allocate their own TXE.
+ */ +static struct efa_rdm_ep *setup_proto_select_test(struct efa_resource *resource, + fi_addr_t *peer_addr) +{ + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct efa_rdm_ep *ep; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, peer_addr, 0, NULL), + 1); + + return ep; +} + +/** + * @brief Test that eager protocol is selected for small messages. + */ +void test_proto_select_eager_for_small_msg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + fi_addr_t peer_addr; + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + ep = setup_proto_select_test(resource, &peer_addr); + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + iov.iov_base = NULL; + iov.iov_len = 64; /* Small message, fits in eager */ + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, NULL); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, 0, + txe, &proto); + assert_int_equal(err, 0); + assert_non_null(proto); + assert_ptr_equal(proto, &efa_rdm_proto_eager); + + ofi_buf_free(txe); +} + +/** + * @brief Test that medium protocol is selected for messages between eager + * capacity and 64KB.
+ */ +void test_proto_select_medium_for_mid_msg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + fi_addr_t peer_addr; + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + ep = setup_proto_select_test(resource, &peer_addr); + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + /* 16KB - too large for eager, fits in medium */ + iov.iov_base = NULL; + iov.iov_len = 16384; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, NULL); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, 0, + txe, &proto); + assert_int_equal(err, 0); + assert_non_null(proto); + assert_ptr_equal(proto, &efa_rdm_proto_medium); + + ofi_buf_free(txe); +} + +/** + * @brief Test that long CTS is selected for large messages when no p2p + * or no registered memory is available. 
+ */ +void test_proto_select_longcts_for_large_msg_no_p2p(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + fi_addr_t peer_addr; + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + ep = setup_proto_select_test(resource, &peer_addr); + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + /* 128KB - too large for medium, no desc so no read-based protocols */ + iov.iov_base = NULL; + iov.iov_len = 131072; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, NULL); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, 0, + txe, &proto); + assert_int_equal(err, 0); + assert_non_null(proto); + assert_ptr_equal(proto, &efa_rdm_proto_longcts); + + ofi_buf_free(txe); +} + +/** + * @brief Test that eager is selected before medium for messages that fit + * in eager (protocol priority ordering). 
+ */ +void test_proto_select_eager_has_priority_over_medium( + struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + fi_addr_t peer_addr; + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + ep = setup_proto_select_test(resource, &peer_addr); + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + /* 1 byte - fits in both eager and medium, eager should win */ + iov.iov_base = NULL; + iov.iov_len = 1; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, NULL); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, 0, + txe, &proto); + assert_int_equal(err, 0); + assert_ptr_equal(proto, &efa_rdm_proto_eager); + + ofi_buf_free(txe); +} + +/** + * @brief Test that zero-length messages select eager protocol. + */ +void test_proto_select_eager_for_zero_len_msg(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + fi_addr_t peer_addr; + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + ep = setup_proto_select_test(resource, &peer_addr); + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + iov.iov_base = NULL; + iov.iov_len = 0; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, NULL); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, 0, + txe, &proto); + assert_int_equal(err, 0); + assert_ptr_equal(proto, &efa_rdm_proto_eager); + + ofi_buf_free(txe); +} + +/** + * @brief Test that long read protocol is selected over long CTS when + * p2p is available, memory is registered, and peer supports RDMA 
read. + */ +void test_proto_select_longread_over_longcts_with_p2p( + struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + struct efa_unit_test_buff send_buff; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + void *desc; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + /* 2MB - above min_read_msg_size (1MB default) */ + efa_unit_test_buff_construct(&send_buff, resource, 2 * 1024 * 1024); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, + &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal(fi_av_insert(resource->av, &raw_addr, 1, + &peer_addr, 0, NULL), 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + /* Enable RDMA read support on peer */ + peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_READ; + peer->device_version = + efa_rdm_ep_domain(ep)->device->ibv_attr.vendor_part_id; + + /* Enable RDMA read on the endpoint and device */ + ep->use_device_rdma = true; + efa_rdm_ep_domain(ep)->device->device_caps |= + EFADV_DEVICE_ATTR_CAPS_RDMA_READ; + + desc = fi_mr_desc(send_buff.mr); + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, &desc); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, + 0, txe, &proto); + assert_int_equal(err, 0); + assert_non_null(proto); + assert_ptr_equal(proto, &efa_rdm_proto_longread); + + /* Clean up MRs that select_send_protocol may have registered */ + for (int i = 0; i < txe->iov_count; i++) { + if (txe->mr[i]) + 
fi_close(&txe->mr[i]->fid); + } + ofi_buf_free(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that runt read protocol is selected over long read and + * long CTS when conditions are met: p2p available, memory registered, + * peer supports RDMA read, no reads in flight, and runt is allowed. + * + * Runt read has higher priority than long read in the protocol list. + */ +void test_proto_select_runtread_over_longread(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_proto *proto = NULL; + struct efa_unit_test_buff send_buff; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + void *desc; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + /* 2MB - above min_read_msg_size */ + efa_unit_test_buff_construct(&send_buff, resource, 2 * 1024 * 1024); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, + &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal(fi_av_insert(resource->av, &raw_addr, 1, + &peer_addr, 0, NULL), 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_READ; + peer->device_version = + efa_rdm_ep_domain(ep)->device->ibv_attr.vendor_part_id; + /* Runt read requires no reads in flight and runt allowed */ + efa_rdm_ep_domain(ep)->num_read_msg_in_flight = 0; + /* NOTE(review): meant runt_size > runt bytes in flight, but 1000 < 2000 */ + g_efa_hmem_info[FI_HMEM_SYSTEM].runt_size = 1000; + peer->num_runt_bytes_in_flight = 2000; + + /* Enable RDMA read on the endpoint and device */ + ep->use_device_rdma = true; + efa_rdm_ep_domain(ep)->device->device_caps |= +
EFADV_DEVICE_ATTR_CAPS_RDMA_READ; + + desc = fi_mr_desc(send_buff.mr); + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, &desc); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + err = efa_rdm_proto_select_send_protocol(ep, peer, &msg, ofi_op_msg, + 0, txe, &proto); + assert_int_equal(err, 0); + assert_non_null(proto); + assert_ptr_equal(proto, &efa_rdm_proto_runtread); + + for (int i = 0; i < txe->iov_count; i++) { + if (txe->mr[i]) + fi_close(&txe->mr[i]->fid); + } + ofi_buf_free(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that eager construct_tx_pkes produces exactly 1 PKE with + * the correct callback set. + */ +void test_proto_eager_construct_pkes_single_pke(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 64); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, + (void **) &send_buff.mr); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + /* Initialize fields that select_send_protocol would set */ + txe->ep = ep; + txe->total_len 
= send_buff.size; + txe->iov_count = 1; + memcpy(txe->iov, &iov, sizeof(iov)); + txe->desc[0] = fi_mr_desc(send_buff.mr); + memset(txe->mr, 0, sizeof(txe->mr)); /* zero the whole mr[] array */ + + err = efa_rdm_proto_eager.construct_tx_pkes(ep, peer, &msg, ofi_op_msg, + 0, 0, txe); + assert_int_equal(err, 0); + assert_int_equal(ep->send_pkt_entry_vec_size, 1); + assert_non_null(ep->send_pkt_entry_vec[0]); + assert_non_null(ep->send_pkt_entry_vec[0]->callback); + assert_ptr_equal(ep->send_pkt_entry_vec[0]->ope, txe); + + /* Clean up */ + efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[0]); + efa_rdm_txe_release(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that eager send completion callback releases TXE and PKE + * for non-DC messages. + */ +void test_proto_eager_send_completion_releases_txe(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + struct efa_rdm_pke *pkt_entry; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 64); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + /* Mock efa_qp_post_send to succeed */ + g_efa_unit_test_mocks.efa_qp_post_send = + &efa_mock_efa_qp_post_send_return_mock; + will_return_int_maybe(efa_mock_efa_qp_post_send_return_mock, 0); + + /* Send a message via fi_send which goes through the new code path */ + err =
fi_send(resource->ep, send_buff.buff, send_buff.size, + fi_mr_desc(send_buff.mr), peer_addr, NULL); + assert_int_equal(err, 0); + assert_int_equal(efa_unit_test_get_dlist_length(&ep->txe_list), 1); + + /* Get the TXE and PKE */ + txe = container_of(ep->txe_list.next, struct efa_rdm_ope, ep_entry); + pkt_entry = ep->send_pkt_entry_vec[0]; + assert_non_null(pkt_entry->callback); + + /* Simulate send completion: record_tx_op_completed + callback */ + efa_rdm_ep_record_tx_op_completed(ep, pkt_entry); + pkt_entry->callback(pkt_entry); + + /* TXE should be released */ + assert_int_equal(efa_unit_test_get_dlist_length(&ep->txe_list), 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that eager assigns msg_id from peer->next_msg_id. + */ +void test_proto_eager_assigns_msg_id(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + uint32_t initial_msg_id; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 64); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + initial_msg_id = peer->next_msg_id; + + g_efa_unit_test_mocks.efa_qp_post_send = + &efa_mock_efa_qp_post_send_return_mock; + will_return_int_maybe(efa_mock_efa_qp_post_send_return_mock, 0); + + err = fi_send(resource->ep, send_buff.buff, send_buff.size, + fi_mr_desc(send_buff.mr), peer_addr, NULL); + assert_int_equal(err, 0); + + /* msg_id should have been 
assigned and next_msg_id incremented */ + struct efa_rdm_ope *txe = + container_of(ep->txe_list.next, struct efa_rdm_ope, ep_entry); + assert_int_equal(txe->msg_id, initial_msg_id); + assert_int_equal(peer->next_msg_id, initial_msg_id + 1); + + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that medium construct_tx_pkes produces multiple PKEs for a + * message that requires segmentation (16KB = 2 packets at ~8KB MTU). + */ +void test_proto_medium_construct_pkes_multi_pke(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err, i; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 16384); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + void *desc = fi_mr_desc(send_buff.mr); + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, + &desc); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + txe->ep = ep; + txe->total_len = send_buff.size; + txe->iov_count = 1; + memcpy(txe->iov, &iov, sizeof(iov)); + txe->desc[0] = fi_mr_desc(send_buff.mr); + memset(txe->mr, 0, sizeof(txe->mr)); /* zero the whole mr[] array */ + + err = efa_rdm_proto_medium.construct_tx_pkes(ep, peer, &msg, ofi_op_msg, + 0, 0, txe); + assert_int_equal(err, 0); + + /* 16KB should require at least 2 packets */ +
assert_true(ep->send_pkt_entry_vec_size >= 2); + + /* Each PKE should have a callback and correct TXE */ + for (i = 0; i < ep->send_pkt_entry_vec_size; i++) { + assert_non_null(ep->send_pkt_entry_vec[i]); + assert_non_null(ep->send_pkt_entry_vec[i]->callback); + assert_ptr_equal(ep->send_pkt_entry_vec[i]->ope, txe); + } + + /* Clean up */ + for (i = 0; i < ep->send_pkt_entry_vec_size; i++) + efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[i]); + efa_rdm_txe_release(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that medium send completion tracks bytes_acked and only + * releases TXE when all bytes are acked. + */ +void test_proto_medium_send_completion_tracks_bytes_acked( + struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err, i; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 16384); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + g_efa_unit_test_mocks.efa_qp_post_send = + &efa_mock_efa_qp_post_send_return_mock; + will_return_int_maybe(efa_mock_efa_qp_post_send_return_mock, 0); + + err = fi_send(resource->ep, send_buff.buff, send_buff.size, + fi_mr_desc(send_buff.mr), peer_addr, NULL); + assert_int_equal(err, 0); + assert_int_equal(efa_unit_test_get_dlist_length(&ep->txe_list), 1); + + txe = container_of(ep->txe_list.next, 
struct efa_rdm_ope, ep_entry); + assert_true(ep->send_pkt_entry_vec_size >= 2); + + /* Complete first PKE - TXE should NOT be released yet */ + struct efa_rdm_pke *first_pke = ep->send_pkt_entry_vec[0]; + efa_rdm_ep_record_tx_op_completed(ep, first_pke); + first_pke->callback(first_pke); + assert_int_equal(efa_unit_test_get_dlist_length(&ep->txe_list), 1); + + /* Complete remaining PKEs */ + for (i = 1; i < ep->send_pkt_entry_vec_size; i++) { + struct efa_rdm_pke *pke = ep->send_pkt_entry_vec[i]; + efa_rdm_ep_record_tx_op_completed(ep, pke); + pke->callback(pke); + } + + /* Now TXE should be released */ + assert_int_equal(efa_unit_test_get_dlist_length(&ep->txe_list), 0); + + efa_unit_test_buff_destruct(&send_buff); +} + +/* Long CTS protocol TX tests */ + +/** + * @brief Test that long CTS construct_tx_pkes produces exactly 1 RTM PKE + * with a callback set and the TXE attached. + */ +void test_proto_longcts_construct_pkes_single_rtm(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + /* 128KB - large enough to require long CTS */ + efa_unit_test_buff_construct(&send_buff, resource, 131072); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr,
NULL, 0, + (void **) &send_buff.mr); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + txe->ep = ep; + txe->total_len = send_buff.size; + txe->iov_count = 1; + memcpy(txe->iov, &iov, sizeof(iov)); + txe->desc[0] = fi_mr_desc(send_buff.mr); + memset(txe->mr, 0, sizeof(txe->mr)); /* zero the whole mr[] array */ + + err = efa_rdm_proto_longcts.construct_tx_pkes(ep, peer, &msg, + ofi_op_msg, 0, 0, txe); + assert_int_equal(err, 0); + + /* Long CTS sends exactly 1 RTM packet initially */ + assert_int_equal(ep->send_pkt_entry_vec_size, 1); + assert_non_null(ep->send_pkt_entry_vec[0]); + assert_non_null(ep->send_pkt_entry_vec[0]->callback); + assert_ptr_equal(ep->send_pkt_entry_vec[0]->ope, txe); + + /* Clean up */ + efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[0]); + efa_rdm_txe_release(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that long read construct_tx_pkes produces 1 PKE with + * correct pkt_size (header + read IOVs). + */ + +void test_proto_longread_construct_pkes_has_read_iovs( + struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + int err; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + efa_unit_test_buff_construct(&send_buff, resource, 131072); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg,
&iov, 1, peer_addr, NULL, 0, + (void **) &send_buff.mr); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + txe->ep = ep; + txe->total_len = send_buff.size; + txe->iov_count = 1; + memcpy(txe->iov, &iov, sizeof(iov)); + txe->desc[0] = fi_mr_desc(send_buff.mr); + memset(txe->mr, 0, sizeof(txe->mr)); /* zero the whole mr[] array */ + + err = efa_rdm_proto_longread.construct_tx_pkes(ep, peer, &msg, + ofi_op_msg, 0, 0, txe); + assert_int_equal(err, 0); + + /* Long read sends exactly 1 packet */ + assert_int_equal(ep->send_pkt_entry_vec_size, 1); + + struct efa_rdm_pke *pke = ep->send_pkt_entry_vec[0]; + assert_non_null(pke); + assert_non_null(pke->callback); + + /* pkt_size should include header + read IOVs (1 IOV for 1 iov_count) */ + assert_true(pke->pkt_size > 0); + assert_true(pke->pkt_size > sizeof(struct fi_rma_iov)); + + /* Clean up */ + efa_rdm_pke_release_tx(pke); + efa_rdm_txe_release(txe); + efa_unit_test_buff_destruct(&send_buff); +} + +/** + * @brief Test that runt read construct_tx_pkes produces multiple PKEs + * with runt data and includes RDMA read IOVs in the packet headers.
+ */ +void test_proto_runtread_construct_pkes_has_runt_and_read_iovs( + struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_unit_test_buff send_buff; + struct efa_rdm_ep *ep; + struct efa_rdm_peer *peer; + struct efa_rdm_ope *txe; + fi_addr_t peer_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(raw_addr); + struct fi_msg msg = {0}; + struct iovec iov; + void *desc; + int err, i; + + efa_unit_test_resource_construct_rdm_shm_disabled(resource); + /* 2MB message to trigger runt read */ + efa_unit_test_buff_construct(&send_buff, resource, 2 * 1024 * 1024); + + ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + + assert_int_equal( + fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + assert_int_equal( + fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL), + 1); + + peer = efa_rdm_ep_get_peer(ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + + desc = fi_mr_desc(send_buff.mr); + iov.iov_base = send_buff.buff; + iov.iov_len = send_buff.size; + efa_unit_test_construct_msg(&msg, &iov, 1, peer_addr, NULL, 0, &desc); + + txe = ofi_buf_alloc(ep->ope_pool); + assert_non_null(txe); + + txe->ep = ep; + txe->total_len = send_buff.size; + txe->iov_count = 1; + memcpy(txe->iov, &iov, sizeof(iov)); + txe->desc[0] = desc; + memset(txe->mr, 0, sizeof(txe->mr)); /* zero the whole mr[] array */ + + /* Set up runt size so the protocol can compute bytes_runt */ + g_efa_hmem_info[FI_HMEM_SYSTEM].runt_size = EFA_DEFAULT_RUNT_SIZE; + + err = efa_rdm_proto_runtread.construct_tx_pkes( + ep, peer, &msg, ofi_op_msg, 0, 0, txe); + assert_int_equal(err, 0); + + /* Runt read should produce at least one PKE for the runt portion */ + assert_true(ep->send_pkt_entry_vec_size >= 1); + + /* Each PKE should have a callback and correct TXE */ + for (i = 0; i < ep->send_pkt_entry_vec_size; i++) { + assert_non_null(ep->send_pkt_entry_vec[i]); +
assert_non_null(ep->send_pkt_entry_vec[i]->callback); + assert_ptr_equal(ep->send_pkt_entry_vec[i]->ope, txe); + } + + /* bytes_runt should be set */ + assert_true(txe->bytes_runt > 0); + assert_true(txe->bytes_runt < txe->total_len); + + /* Clean up */ + for (i = 0; i < ep->send_pkt_entry_vec_size; i++) + efa_rdm_pke_release_tx(ep->send_pkt_entry_vec[i]); + efa_rdm_txe_release(txe); + efa_unit_test_buff_destruct(&send_buff); +} diff --git a/prov/efa/test/efa_unit_test_rnr.c b/prov/efa/test/efa_unit_test_rnr.c index 8b67d9c1b97..2b50dc05e69 100644 --- a/prov/efa/test/efa_unit_test_rnr.c +++ b/prov/efa/test/efa_unit_test_rnr.c @@ -69,7 +69,7 @@ void test_efa_rnr_queue_and_resend_impl(struct efa_resource **state, uint32_t op assert_int_equal(efa_rdm_ep->efa_rnr_queued_pkt_cnt, 0); assert_int_equal(efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr)->rnr_queued_pkt_cnt, 0); - efa_rdm_pke_handle_send_completion(pkt_entry); + efa_unit_test_pke_handle_send_completion(pkt_entry); efa_unit_test_buff_destruct(&send_buff); } diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 047da6a5f3e..f47eb21d5b0 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -179,8 +179,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_page_alignment, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_atomic_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_send_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_send_queue_limit_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + 
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_read_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_write_queue_before_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_trigger_handshake, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -575,6 +574,24 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_rma_should_write_using_rdma_remote_cq_data_single_iovs_with_rdma_support, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_rma_should_write_using_rdma_unsolicited_write_recv_not_match, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), /* end efa_unit_test_rdm_rma.c */ + + /* begin efa_unit_test_proto.c */ + cmocka_unit_test_setup_teardown(test_proto_select_eager_for_small_msg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_medium_for_mid_msg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_longcts_for_large_msg_no_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_eager_has_priority_over_medium, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_eager_for_zero_len_msg, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_longread_over_longcts_with_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_select_runtread_over_longread, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_eager_construct_pkes_single_pke, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + 
cmocka_unit_test_setup_teardown(test_proto_eager_send_completion_releases_txe, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_eager_assigns_msg_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_medium_construct_pkes_multi_pke, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_medium_send_completion_tracks_bytes_acked, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_longcts_construct_pkes_single_rtm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_longread_construct_pkes_has_read_iovs, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_proto_runtread_construct_pkes_has_runt_and_read_iovs, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + /* end efa_unit_test_proto.c */ }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 81dfcb6bc93..aab18122d51 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -14,10 +14,25 @@ #include #include #include -#include "stdio.h" +#include #include "efa.h" +#include "efa_rdm_pke_cmd.h" #include "efa_unit_test_mocks.h" +/* + * TODO: Remove this utility once all protocols are migrated to the + * refactored code path with callbacks. At that point, all PKEs will + * have a callback set and we can call pkt_entry->callback directly. 
+ */ /* Test helper: deliver a send completion for pkt_entry, using the packet entry's own callback when it has one. */ +static inline void +efa_unit_test_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) +{ /* A non-NULL callback is invoked with the packet entry itself as its only argument. */ + if (pkt_entry->callback) + pkt_entry->callback(pkt_entry); /* No callback set: fall back to efa_rdm_pke_handle_send_completion(). */ + else + efa_rdm_pke_handle_send_completion(pkt_entry); +} + extern int g_ibv_ah_limit; extern int g_ibv_ah_cnt; extern int g_self_ah_cnt; @@ -142,8 +157,7 @@ void test_efa_rdm_ep_tx_pkt_pool_flags(); void test_efa_rdm_ep_rx_pkt_pool_flags(); void test_efa_rdm_ep_pkt_pool_page_alignment(); void test_efa_rdm_ep_dc_atomic_queue_before_handshake(); -void test_efa_rdm_ep_dc_send_queue_before_handshake(); -void test_efa_rdm_ep_dc_send_queue_limit_before_handshake(); + void test_efa_rdm_ep_write_queue_before_handshake(); void test_efa_rdm_ep_read_queue_before_handshake(); void test_efa_rdm_ep_trigger_handshake(); @@ -571,4 +585,21 @@ int efa_unit_test_get_dlist_length(struct dlist_entry *head) void efa_unit_test_rdm_0byte_prep(struct efa_resource *resource, fi_addr_t *addr); +/* Protocol TX path tests */ +void test_proto_select_eager_for_small_msg(); +void test_proto_select_medium_for_mid_msg(); +void test_proto_select_longcts_for_large_msg_no_p2p(); +void test_proto_select_eager_has_priority_over_medium(); +void test_proto_select_eager_for_zero_len_msg(); +void test_proto_select_longread_over_longcts_with_p2p(); +void test_proto_select_runtread_over_longread(); +void test_proto_eager_construct_pkes_single_pke(); +void test_proto_eager_send_completion_releases_txe(); +void test_proto_eager_assigns_msg_id(); +void test_proto_medium_construct_pkes_multi_pke(); +void test_proto_medium_send_completion_tracks_bytes_acked(); +void test_proto_longcts_construct_pkes_single_rtm(); +void test_proto_longread_construct_pkes_has_read_iovs(); +void test_proto_runtread_construct_pkes_has_runt_and_read_iovs(); + #endif