From 73b570d0a027720cdae42cddaaecf0692e3288ed Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 13 Apr 2026 14:22:17 -0600 Subject: [PATCH 01/16] prov/efa: add efa_av_is_valid_address helper function Add a shared helper that checks whether an efa_ep_addr has a non-zero GID. This validation is needed by both the base AV insert path (efa_av.c) and the protocol AV insert path (efa_proto_av.c) to reject all-zero addresses. Placing it in efa_av.h alongside the efa_ep_addr definition avoids duplicating the check in both files. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_av.c | 7 ------- prov/efa/src/efa_av.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 4bb8e5b680d..4215a34dfb0 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -189,13 +189,6 @@ fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, return FI_ADDR_NOTAVAIL; } -static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) -{ - struct efa_ep_addr all_zeros = { 0 }; - - return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); -} - /** * @brief Move the conn to the front of the LRU list to indicate that it is the * most recently used entry diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 6cbe7b506ea..b7a822c7861 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -35,6 +35,19 @@ struct efa_av_entry { struct efa_conn conn; }; +/** + * @brief check if an efa_ep_addr has a non-zero GID + * + * @param[in] addr address to check + * @return non-zero if valid, 0 if all-zeros + */ +static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) +{ + struct efa_ep_addr all_zeros = { 0 }; + + return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); +} + struct efa_cur_reverse_av_key { uint16_t ahn; uint16_t qpn; From 6b89ad2db01e7b4541406545cdb1ff4176326172 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 8 Apr 2026 18:15:57 
-0600 Subject: [PATCH 02/16] prov/efa: add efa_proto_av header with struct definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add rdm/efa_proto_av.h containing the struct definitions and function declarations for the protocol AV layer. The header defines efa_proto_av_entry as a flat layout AV entry with ep_addr, ah, fi_addr, implicit_fi_addr, peer map, shm_fi_addr, implicit-AV LRU linkage, AH implicit-conn linkage, and a back-pointer to the owning AV. efa_proto_av embeds struct efa_av as its first member so the enclosing efa_proto_av can be recovered from a base-layer efa_av pointer via container_of. efa_proto_av_entry_ep_peer_map_entry describes a single (endpoint, peer) mapping stored on each entry's per-entry peer hash. Header-only commit — no implementation is added here. The structs are referenced by the efa_proto_av.c implementation added in a subsequent commit, and the protocol-AV-aware peer switch several commits later changes efa_rdm_peer's backing object from struct efa_conn * to struct efa_proto_av_entry *. Introducing the types ahead of their first use lets each later commit focus on a single concern. 
Signed-off-by: Seth Zegelstein --- libfabric.vcxproj | 1 + prov/efa/Makefile.include | 1 + prov/efa/src/efa.h | 1 + prov/efa/src/rdm/efa_proto_av.h | 190 ++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 prov/efa/src/rdm/efa_proto_av.h diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 76d1f73ba48..e8909e8e236 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -1019,6 +1019,7 @@ + diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 3d7780e389f..25025e60efa 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -117,6 +117,7 @@ _efa_headers = \ prov/efa/src/efa_data_path_direct_internal.h \ prov/efa/src/efa_mmio.h \ prov/efa/src/rdm/efa_rdm_peer.h \ + prov/efa/src/rdm/efa_proto_av.h \ prov/efa/src/rdm/efa_rdm_cq.h \ prov/efa/src/rdm/efa_rdm_cntr.h \ prov/efa/src/rdm/efa_rdm_ep.h \ diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index 7e85962cbee..b59d796cbc6 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -52,6 +52,7 @@ #include "rdm/efa_rdm_ope.h" #include "rdm/efa_rdm_pke.h" #include "rdm/efa_rdm_peer.h" +#include "rdm/efa_proto_av.h" #include "rdm/efa_rdm_util.h" #include "fi_ext_efa.h" diff --git a/prov/efa/src/rdm/efa_proto_av.h b/prov/efa/src/rdm/efa_proto_av.h new file mode 100644 index 00000000000..85e1ff0df29 --- /dev/null +++ b/prov/efa/src/rdm/efa_proto_av.h @@ -0,0 +1,190 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#ifndef EFA_PROTO_AV_H +#define EFA_PROTO_AV_H + +#include "efa_av.h" + +struct efa_rdm_ep; +struct efa_rdm_peer; + +/** + * @brief Protocol AV entry — flat layout with same field prefix as efa_av_entry + * + * pahole: + * size: 112, cachelines: 2, members: 9 + * + * Cache line 0 (64 bytes): data-path hot fields + * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20) + * ah* off=32 — TX hot (EFA send path) + * fi_addr off=40 — RX hot (explicit peer lookup, CQ poll) + * implicit_fi_addr off=48 — RX hot (implicit peer lookup, CQ poll) + * ep_peer_map* off=56 — TX+RX hot (peer lookup on every op) + * + * Cache line 1 (48 bytes): SHM-only TX / control-path fields + * shm_fi_addr off=64 — SHM TX only + * implicit_av_lru_entry off=72 — implicit RX LRU bookkeeping + * ah_implicit_conn_list_entry off=88 — implicit AV insert/release + * av* off=104 — back-pointer for AH eviction + */ +struct efa_proto_av_entry { + uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */ + struct efa_ah *ah; /* 32 8 */ + fi_addr_t fi_addr; /* 40 8 */ + fi_addr_t implicit_fi_addr; /* 48 8 */ + struct efa_proto_av_entry_ep_peer_map_entry *ep_peer_map; /* 56 8 */ + /* --- cacheline 1 boundary (64 bytes) --- */ + fi_addr_t shm_fi_addr; /* 64 8 */ + struct dlist_entry implicit_av_lru_entry; /* 72 16 */ + struct dlist_entry ah_implicit_conn_list_entry; /* 88 16 */ + struct efa_proto_av *av; /* 104 8 */ +}; + +/** + * @brief Peer map entry — maps (ep_ptr) to efa_rdm_peer for a given AV entry + * + * pahole: size: 328, cachelines: 6 + */ +struct efa_proto_av_entry_ep_peer_map_entry { + struct efa_rdm_ep *ep_ptr; /* 0 8 */ + struct efa_rdm_peer peer; /* 8 264 */ + UT_hash_handle hh; /* 272 56 */ +}; + +/** + * @brief Protocol AV — embeds efa_av as first member (castable) + * + * pahole: + * size: 672, cachelines: 11, members: 10 + * + * efa_av off=0 size=320 (cachelines 0-4) + * domain* off=0 — cacheline 0 + * cur_reverse_av* off=24 — RX hot: explicit peer reverse lookup + * 
prv_reverse_av* off=32 — RX hot: QPN reuse fallback + * util_av off=40 size=280 + * --- cacheline 5 boundary (320 bytes) --- + * shm_rdm_av* off=320 — control path only + * util_av_implicit off=328 size=280 + * --- cacheline 9 boundary (576 bytes) + 32 --- + * cur_reverse_av_implicit* off=608 — RX hot (implicit peers only) + * prv_reverse_av_implicit* off=616 — RX hot (implicit peers only) + * implicit_av_lru_list off=624 — implicit RX: LRU reorder + * --- cacheline 10 boundary (640 bytes) --- + * used_implicit off=640 + * shm_used off=648 + * implicit_av_size off=656 + * evicted_peers_hashset* off=664 + * + * RX hot path (every RX completion): + * efa_av.cur_reverse_av (off=24) — HASH_FIND for explicit peer reverse lookup + * efa_av.prv_reverse_av (off=32) — HASH_FIND fallback for QPN reuse (connid mismatch) + * These are in cacheline 0 — explicit peer reverse lookup stays in one line. + * + * RX hot path for implicit (unknown) peers: + * cur_reverse_av_implicit (off=608) — HASH_FIND for implicit peer reverse lookup + * prv_reverse_av_implicit (off=616) — HASH_FIND fallback + * implicit_av_lru_list (off=624) — LRU reorder on every implicit RX + * All three are in cacheline 9 — implicit peer reverse lookup + LRU + * update stays in one cache line. 
+ * + * Control path only (AV insert/remove/close): + * shm_rdm_av, util_av_implicit, used_implicit, shm_used, + * implicit_av_size, evicted_peers_hashset + */ +struct efa_proto_av { + struct efa_av efa_av; /* 0 320 */ + /* --- cacheline 5 boundary (320 bytes) --- */ + struct fid_av *shm_rdm_av; /* 320 8 */ + /* implicit AV is used when receiving messages from peers not + * explicitly inserted by the application */ + struct util_av util_av_implicit; /* 328 280 */ + struct efa_cur_reverse_av *cur_reverse_av_implicit; /* 608 8 */ + struct efa_prv_reverse_av *prv_reverse_av_implicit; /* 616 8 */ + struct dlist_entry implicit_av_lru_list; /* 624 16 */ + /* --- cacheline 10 boundary (640 bytes) --- */ + size_t used_implicit; /* 640 8 */ + size_t shm_used; /* 648 8 */ + size_t implicit_av_size; /* 656 8 */ + struct efa_ep_addr_hashable *evicted_peers_hashset; /* 664 8 */ +}; + +/** + * @brief typed accessor for the ep_addr field of a proto AV entry + * + * @param[in] entry proto AV entry + * @return pointer to the efa_ep_addr embedded in the entry + */ +static inline struct efa_ep_addr * +efa_proto_av_entry_ep_addr(struct efa_proto_av_entry *entry) +{ + return (struct efa_ep_addr *)entry->ep_addr; +} + +/* Address lookup */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av, + fi_addr_t fi_addr); + +struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit( + struct efa_proto_av *av, fi_addr_t fi_addr); + +/* Peer map operations */ +void efa_proto_av_entry_ep_peer_map_insert( + struct efa_proto_av_entry *entry, + struct efa_proto_av_entry_ep_peer_map_entry *map_entry); + +struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep); + +void efa_proto_av_entry_ep_peer_map_remove( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep); + +/* SHM AV operations */ +int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +void 
efa_proto_av_entry_deinit(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +/* Implicit AV LRU */ +void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +/* Reverse lookup for protocol path */ +fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry); + +fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry); + +/* Entry alloc/release */ +struct efa_proto_av_entry *efa_proto_av_entry_alloc( + struct efa_proto_av *av, struct efa_ep_addr *raw_addr, + uint64_t flags, void *context, bool insert_shm_av, + bool insert_implicit_av); + +void efa_proto_av_entry_release(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av); + +void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av); + +/* Implicit to explicit migration */ +int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, + struct efa_ep_addr *raw_addr, + fi_addr_t implicit_fi_addr, + fi_addr_t *fi_addr); + +/* AV open/close/insert/remove for protocol path */ +int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, + struct fid_av **av_fid, void *context); + +int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context, + bool insert_shm_av, bool insert_implicit_av); + +#endif From 12f00c738437cad29f166d890f979f39e4df36c4 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 May 2026 20:27:45 -0600 Subject: [PATCH 03/16] prov/efa: add context_len parameter to efa_av_init_util_av efa_av_init_util_av currently hard-codes util_attr.context_len to sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN. Add a context_len parameter and update the sole caller to pass this value explicitly. 
This prepares for efa_proto_av_open (added in a later commit) to call efa_av_init_util_av with a different context_len reflecting struct efa_proto_av_entry's size. No behavior change. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_av.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 4215a34dfb0..878b64d718d 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -826,18 +826,20 @@ static struct fi_ops efa_av_fi_ops = { * @param[in] attr AV attr application passed to fi_av_open * @param[out] util_av util_av field in efa_av * @param[in] context contexted application passed to fi_av_open + * @param[in] context_len size of provider-specific context per AV entry * @return On success, return 0. * On failure, return a negative libfabric error code. */ int efa_av_init_util_av(struct efa_domain *efa_domain, struct fi_av_attr *attr, struct util_av *util_av, - void *context) + void *context, + size_t context_len) { struct util_av_attr util_attr; util_attr.addrlen = EFA_EP_ADDR_LEN; - util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN; + util_attr.context_len = context_len; util_attr.flags = 0; return ofi_av_init(&efa_domain->util_domain, attr, &util_attr, util_av, context); @@ -887,11 +889,13 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, &universe_size) == FI_SUCCESS) attr->count = MAX(attr->count, universe_size); - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context); + ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context, + sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN); if (ret) goto err; - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context); + ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context, + sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN); if (ret) goto err_close_util_av_implicit; From e66e6420ce8afcfc6f2fdbc21a2a78160dc419ae Mon Sep 17 00:00:00 2001 
From: Seth Zegelstein Date: Mon, 4 May 2026 20:28:56 -0600 Subject: [PATCH 04/16] prov/efa: add efa_av_addr_to_entry and efa_av_entry_ep_addr helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce two base-layer helpers on struct efa_av_entry so callers no longer need to reach through the embedded struct efa_conn to get at the entry's identity or its ep_addr byte array. efa_av_addr_to_entry wraps the existing efa_av_addr_to_conn with a container_of so it returns the enclosing efa_av_entry directly. efa_av_entry_ep_addr returns a typed struct efa_ep_addr * view over the ep_addr[] byte array stored at the front of the entry. efa_av_entry_ep_addr is a static inline in efa_av.h; efa_av_addr_to_entry is an ordinary function defined in efa_av.c and declared in efa_av.h. Callers in the base path can use these helpers today; more importantly, the protocol-AV variant struct efa_proto_av_entry (declared in rdm/efa_proto_av.h earlier in this series) has the same leading byte array but no embedded efa_conn, so the byte-array accessor makes it possible to share reverse-AV helpers between the two entry types without a type shim. This commit also adds a public declaration of efa_av_init_util_av to efa_av.h so the protocol AV implementation, added in a subsequent commit, can call it. Purely additive — no existing callers are migrated in this commit. Subsequent commits update base-path callers to use the new helpers. 
Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_av.c | 23 +++++++++++++++++++++++ prov/efa/src/efa_av.h | 19 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 878b64d718d..6faa58b0dea 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -57,6 +57,29 @@ struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, fi_addr_t fi_ad return efa_av_addr_to_conn_impl(&av->util_av_implicit, fi_addr); } +/** + * @brief Look up an efa_av_entry by fi_addr in the base (explicit) AV + * + * Wrapper around efa_av_addr_to_conn that returns the containing + * efa_av_entry via container_of. Exposed as the base-layer lookup + * primitive for callers that need to work with efa_av_entry rather + * than the embedded efa_conn. + * + * @param[in] av address vector + * @param[in] fi_addr libfabric address + * @return pointer to efa_av_entry, or NULL if not found + */ +struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr) +{ + struct efa_conn *conn; + + conn = efa_av_addr_to_conn(av, fi_addr); + if (!conn) + return NULL; + + return container_of(conn, struct efa_av_entry, conn); +} + /** * @brief find fi_addr for efa endpoint * diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index b7a822c7861..2a9289c87e5 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -35,6 +35,17 @@ struct efa_av_entry { struct efa_conn conn; }; +/** + * @brief typed accessor for the ep_addr field of an AV entry + * + * @param[in] entry AV entry + * @return pointer to the efa_ep_addr embedded in the entry + */ +static inline struct efa_ep_addr *efa_av_entry_ep_addr(struct efa_av_entry *entry) +{ + return (struct efa_ep_addr *)entry->ep_addr; +} + /** * @brief check if an efa_ep_addr has a non-zero GID * @@ -130,4 +141,12 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, void efa_av_implicit_av_lru_conn_move(struct efa_av *av, struct efa_conn *conn); 
+struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr); + +int efa_av_init_util_av(struct efa_domain *efa_domain, + struct fi_av_attr *attr, + struct util_av *util_av, + void *context, + size_t context_len); + #endif \ No newline at end of file From bd5b66b87052b840f1f42c2a202c4d1c37a89ff5 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 May 2026 20:31:37 -0600 Subject: [PATCH 05/16] prov/efa: migrate base path callers to efa_av_addr_to_entry Update efa_msg.c, efa_rma.c, efa_base_ep.c, and efa_domain.c to use the new efa_av_addr_to_entry() / efa_av_entry_ep_addr() helpers instead of efa_av_addr_to_conn() / conn->ep_addr. Reads are still routed through the embedded conn for ah (av_entry->conn.ah) because struct efa_conn is still embedded in efa_av_entry. A later commit strips the conn embedding and collapses these accesses to av_entry->ah directly. Mechanical rename; no behavior change. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_base_ep.c | 6 +++--- prov/efa/src/efa_domain.c | 10 +++++----- prov/efa/src/efa_msg.c | 8 ++++---- prov/efa/src/efa_rma.c | 36 ++++++++++++++++++------------------ 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index a429be8e26c..1b18879c72b 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -693,11 +693,11 @@ const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, siz struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, fi_addr_t addr) { struct efa_av *efa_av; - struct efa_conn *efa_conn; + struct efa_av_entry *av_entry; efa_av = base_ep->av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? efa_conn->ep_addr : NULL; + av_entry = efa_av_addr_to_entry(efa_av, addr); + return av_entry ? 
efa_av_entry_ep_addr(av_entry) : NULL; } /** diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 3eab28231d4..978fd86710b 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -496,14 +496,14 @@ static int efa_domain_query_addr(struct fid_ep *ep_fid, fi_addr_t addr, uint32_t *remote_qkey) { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - struct efa_conn *conn = efa_av_addr_to_conn(base_ep->av, addr); - if (!conn || !conn->ah || !conn->ep_addr) { + struct efa_av_entry *av_entry = efa_av_addr_to_entry(base_ep->av, addr); + if (!av_entry || !av_entry->conn.ah || !efa_av_entry_ep_addr(av_entry)) { EFA_WARN(FI_LOG_EP_CTRL, "Failed to find connection for addr %lu\n", addr); return -FI_EINVAL; } - *ahn = conn->ah->ahn; - *remote_qpn = conn->ep_addr->qpn; - *remote_qkey = conn->ep_addr->qkey; + *ahn = av_entry->conn.ah->ahn; + *remote_qpn = efa_av_entry_ep_addr(av_entry)->qpn; + *remote_qkey = efa_av_entry_ep_addr(av_entry)->qkey; return FI_SUCCESS; } diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index b6a7e83b864..96850325031 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -206,7 +206,7 @@ static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) { struct efa_qp *qp = base_ep->qp; - struct efa_conn *conn; + struct efa_av_entry *av_entry; struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */ struct ibv_data_buf inline_data_list[2]; struct efa_context *efa_ctx; @@ -227,8 +227,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi dump_msg(msg, "send"); - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); assert(msg->iov_count <= 
base_ep->info->tx_attr->iov_limit); @@ -330,7 +330,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi /* Use consolidated send function */ ret = efa_qp_post_send(qp, sg_list, inline_data_list, iov_count, use_inline, wr_id, msg->data, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(ret)) ret = (ret == ENOMEM) ? -FI_EAGAIN : -ret; diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index cf136e623b5..151b0353553 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -38,7 +38,7 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, { struct efa_domain *domain = base_ep->domain; struct efa_mr *efa_mr; - struct efa_conn *conn; + struct efa_av_entry *av_entry; size_t iov_count = msg->iov_count; struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */ uintptr_t wr_id; @@ -102,15 +102,15 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, } } - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); /* Use consolidated RDMA read function */ /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ err = efa_qp_post_read(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; @@ -197,7 +197,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, uint64_t flags) { struct efa_domain *domain = base_ep->domain; - struct efa_conn *conn; + struct efa_av_entry *av_entry; size_t iov_count = msg->iov_count; struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */ uintptr_t wr_id; @@ -258,14 +258,14 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, } } - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); /* Use consolidated RDMA write function */ err = efa_qp_post_write(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, msg->data, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; @@ -365,7 +365,7 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_base_ep *base_ep; struct efa_domain *domain; struct ibv_sge sge; - struct efa_conn *conn; + struct efa_av_entry *av_entry; uintptr_t wr_id; int err; @@ -387,12 +387,12 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, sge.length = 0; sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, 0, 0, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); + wr_id, 0, 0, av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, + efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; @@ -406,7 +406,7 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size { struct efa_base_ep *base_ep; struct efa_domain *domain; - struct efa_conn *conn; + struct efa_av_entry *av_entry; struct ibv_sge sge; uintptr_t wr_id; int err; @@ -429,12 +429,12 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size sge.length = 0; sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, data, IBV_SEND_INLINE, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); + wr_id, data, IBV_SEND_INLINE, av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, + efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; From 0eb01d7412f12b3afdbc2f51f521fe421266e9ee Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 May 2026 20:34:40 -0600 Subject: [PATCH 06/16] prov/efa: change efa_av_reverse_av_add/_remove signature to take efa_av_entry * MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reverse-AV helpers previously took struct efa_conn *. Change them to take struct efa_av_entry * so they can be shared across the base (efa-direct) and protocol (RDM, added in a later commit) layers without a type shim. Update internal field reads to go through the byte-array accessor efa_av_entry_ep_addr() rather than conn->ep_addr pointer deref. The ep_addr pointer and the byte array reference the same storage, so behavior is unchanged — but efa_av_entry_ep_addr() also works for struct efa_proto_av_entry (the protocol's flat layout variant added later), which lacks an ep_addr pointer field. 
Store struct efa_av_entry * in the efa_cur_reverse_av / efa_prv_reverse_av structs instead of struct efa_conn *. Update callers in efa_av.c and efa_conn.c using container_of() to pass the enclosing efa_av_entry when they only have a conn pointer. No behavior change. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_av.c | 61 ++++++++++++++++++++--------------------- prov/efa/src/efa_av.h | 10 +++---- prov/efa/src/efa_conn.c | 9 ++++-- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 6faa58b0dea..dd85cbe147a 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -99,7 +99,7 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) cur_key.qpn = qpn; HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); - return (OFI_LIKELY(!!cur_entry)) ? cur_entry->conn->fi_addr : FI_ADDR_NOTAVAIL; + return (OFI_LIKELY(!!cur_entry)) ? cur_entry->av_entry->conn.fi_addr : FI_ADDR_NOTAVAIL; } static inline struct efa_conn * @@ -129,7 +129,7 @@ efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, * the pkt_entry is allocated from a buffer user posted that * doesn't expect any pkt hdr. 
*/ - return cur_entry->conn; + return &cur_entry->av_entry->conn; } connid = efa_rdm_pke_connid_ptr(pkt_entry); @@ -142,11 +142,11 @@ efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, "The communication can continue but it is " "encouraged to use\n" "a newer version of libfabric\n"); - return cur_entry->conn; + return &cur_entry->av_entry->conn; } - if (OFI_LIKELY(*connid == cur_entry->conn->ep_addr->qkey)) - return cur_entry->conn; + if (OFI_LIKELY(*connid == efa_av_entry_ep_addr(cur_entry->av_entry)->qkey)) + return &cur_entry->av_entry->conn; /* the packet is from a previous peer, look for its address from the * prv_reverse_av */ @@ -155,7 +155,7 @@ efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, prv_key.connid = *connid; HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry); - return OFI_LIKELY(!!prv_entry) ? prv_entry->conn : NULL; + return OFI_LIKELY(!!prv_entry) ? &prv_entry->av_entry->conn : NULL; }; /** @@ -240,22 +240,22 @@ void efa_av_implicit_av_lru_conn_move(struct efa_av *av, * @param[in] av EFA AV object * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] conn efa_conn object + * @param[in] av_entry efa_av_entry object * @return On success, return 0. 
* Otherwise, return a negative libfabric error code */ int efa_av_reverse_av_add(struct efa_av *av, struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn) + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_entry; struct efa_prv_reverse_av *prv_entry; struct efa_cur_reverse_av_key cur_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = conn->ah->ahn; - cur_key.qpn = conn->ep_addr->qpn; + cur_key.ahn = av_entry->conn.ah->ahn; + cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; cur_entry = NULL; HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); @@ -268,7 +268,7 @@ int efa_av_reverse_av_add(struct efa_av *av, cur_entry->key.ahn = cur_key.ahn; cur_entry->key.qpn = cur_key.qpn; - cur_entry->conn = conn; + cur_entry->av_entry = av_entry; HASH_ADD(hh, *cur_reverse_av, key, sizeof(cur_key), cur_entry); return 0; @@ -286,11 +286,11 @@ int efa_av_reverse_av_add(struct efa_av *av, prv_entry->key.ahn = cur_key.ahn; prv_entry->key.qpn = cur_key.qpn; - prv_entry->key.connid = cur_entry->conn->ep_addr->qkey; - prv_entry->conn = cur_entry->conn; + prv_entry->key.connid = efa_av_entry_ep_addr(cur_entry->av_entry)->qkey; + prv_entry->av_entry = cur_entry->av_entry; HASH_ADD(hh, *prv_reverse_av, key, sizeof(prv_entry->key), prv_entry); - cur_entry->conn = conn; + cur_entry->av_entry = av_entry; return 0; } @@ -301,16 +301,13 @@ int efa_av_reverse_av_add(struct efa_av *av, * cur_reverse_av. Keeping the address in prv_reverse_av helps avoid QPN * collisions. * - * @param[in] av EFA AV object * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] conn efa_conn object - * @return On success, return 0. 
- * Otherwise, return a negative libfabric error code + * @param[in] av_entry efa_av_entry object */ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn) + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_reverse_av_entry; struct efa_prv_reverse_av *prv_reverse_av_entry; @@ -318,22 +315,22 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av_key prv_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = conn->ah->ahn; - cur_key.qpn = conn->ep_addr->qpn; + cur_key.ahn = av_entry->conn.ah->ahn; + cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_reverse_av_entry); - if (cur_reverse_av_entry && cur_reverse_av_entry->conn == conn) { + if (cur_reverse_av_entry && cur_reverse_av_entry->av_entry == av_entry) { HASH_DEL(*cur_reverse_av, cur_reverse_av_entry); free(cur_reverse_av_entry); } else { memset(&prv_key, 0, sizeof(prv_key)); - prv_key.ahn = conn->ah->ahn; - prv_key.qpn = conn->ep_addr->qpn; - prv_key.connid = conn->ep_addr->qkey; + prv_key.ahn = av_entry->conn.ah->ahn; + prv_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; + prv_key.connid = efa_av_entry_ep_addr(av_entry)->qkey; HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_reverse_av_entry); assert(prv_reverse_av_entry && - prv_reverse_av_entry->conn == conn); + prv_reverse_av_entry->av_entry == av_entry); HASH_DEL(*prv_reverse_av, prv_reverse_av_entry); free(prv_reverse_av_entry); } @@ -423,7 +420,7 @@ static int efa_conn_implicit_to_explicit(struct efa_av *av, /* Handle reverse AV and AV ref counts */ efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, implicit_conn); + &av->prv_reverse_av_implicit, implicit_av_entry); dlist_remove(&implicit_av_entry->conn.implicit_av_lru_entry); @@ -439,7 +436,7 @@ static int efa_conn_implicit_to_explicit(struct 
efa_av *av, av->used_implicit--; err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av, - explicit_conn); + explicit_av_entry); if (err) return err; @@ -767,11 +764,11 @@ static void efa_av_close_reverse_av(struct efa_av *av) ofi_genlock_lock(&av->util_av.lock); HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) { - efa_conn_release(av, cur_entry->conn, false); + efa_conn_release(av, &cur_entry->av_entry->conn, false); } HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) { - efa_conn_release(av, prv_entry->conn, false); + efa_conn_release(av, &prv_entry->av_entry->conn, false); } ofi_genlock_unlock(&av->util_av.lock); @@ -779,11 +776,11 @@ static void efa_av_close_reverse_av(struct efa_av *av) ofi_genlock_lock(&av->util_av_implicit.lock); HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) { - efa_conn_release(av, cur_entry->conn, true); + efa_conn_release(av, &cur_entry->av_entry->conn, true); } HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) { - efa_conn_release(av, prv_entry->conn, true); + efa_conn_release(av, &prv_entry->av_entry->conn, true); } ofi_genlock_unlock(&av->util_av_implicit.lock); diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 2a9289c87e5..35650d5c725 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -66,7 +66,7 @@ struct efa_cur_reverse_av_key { struct efa_cur_reverse_av { struct efa_cur_reverse_av_key key; - struct efa_conn *conn; + struct efa_av_entry *av_entry; UT_hash_handle hh; }; @@ -78,7 +78,7 @@ struct efa_prv_reverse_av_key { struct efa_prv_reverse_av { struct efa_prv_reverse_av_key key; - struct efa_conn *conn; + struct efa_av_entry *av_entry; UT_hash_handle hh; }; @@ -132,11 +132,11 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn); int efa_av_reverse_av_add(struct efa_av *av, struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn); + struct efa_av_entry *av_entry); 
void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn); + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry); void efa_av_implicit_av_lru_conn_move(struct efa_av *av, struct efa_conn *conn); diff --git a/prov/efa/src/efa_conn.c b/prov/efa/src/efa_conn.c index a58f1f6e333..6b191ac22f3 100644 --- a/prov/efa/src/efa_conn.c +++ b/prov/efa/src/efa_conn.c @@ -300,7 +300,8 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, } } - err = efa_av_reverse_av_add(av, cur_reverse_av, prv_reverse_av, conn); + err = efa_av_reverse_av_add(av, cur_reverse_av, prv_reverse_av, + container_of(conn, struct efa_av_entry, conn)); if (err) { if (av->domain->info_type == EFA_INFO_RDM) { /* insert_implicit_av is only true for the CQ read path @@ -334,14 +335,16 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn, bool release_from_implicit_av) { + struct efa_av_entry *av_entry = container_of(conn, struct efa_av_entry, conn); + if (release_from_implicit_av) { assert(ofi_genlock_held(&av->util_av_implicit.lock)); efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, conn); + &av->prv_reverse_av_implicit, av_entry); } else { assert(ofi_genlock_held(&av->util_av.lock)); efa_av_reverse_av_remove(&av->cur_reverse_av, - &av->prv_reverse_av, conn); + &av->prv_reverse_av, av_entry); } } From c4743cf0d1007da2fb6ce9d83333ee4969cacfab Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 May 2026 20:38:04 -0600 Subject: [PATCH 07/16] prov/efa: add efa_proto_av implementation Add the full efa_proto_av.c implementation as a single unit. 
Protocol AV concentrates all RDM-specific AV concerns in one file: implicit AV insert and release, implicit-to-explicit migration, per-endpoint peer map, implicit-AV LRU tracking, SHM AV insertion, connid-aware reverse lookup, and peer allocation tied to entry lifetime. Moving these responsibilities out of efa_av.c / efa_conn.c is what lets the base AV shrink to an efa-direct-only layer later in this series. This commit drops the file in but deliberately leaves it unwired. fi_ops_domain_rdm.av_open still points at efa_av_open, so nothing in the RDM domain path reaches any code in efa_proto_av.c yet. Functions are given external linkage so the library still links. A later commit registers efa_proto_av_open as the RDM domain's av_open, switches struct efa_rdm_peer's backing object from struct efa_conn to struct efa_proto_av_entry, and updates all RDM callers in one atomic step. One interim-state detail: the newly added pke peer-fi_addr callback reads peer->conn->fi_addr because struct efa_rdm_peer still carries a struct efa_conn * at this point. When the peer switch lands, that single expression becomes peer->av_entry->fi_addr along with the rest of the peer-type change. 
Signed-off-by: Seth Zegelstein --- libfabric.vcxproj | 1 + prov/efa/Makefile.include | 1 + prov/efa/src/rdm/efa_proto_av.c | 1373 +++++++++++++++++++++++++++++++ 3 files changed, 1375 insertions(+) create mode 100644 prov/efa/src/rdm/efa_proto_av.c diff --git a/libfabric.vcxproj b/libfabric.vcxproj index e8909e8e236..88106138abc 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -874,6 +874,7 @@ + diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 25025e60efa..b8faaedfd40 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -37,6 +37,7 @@ _efa_files = \ prov/efa/src/efa_av.c \ prov/efa/src/efa_ah.c \ prov/efa/src/efa_conn.c \ + prov/efa/src/rdm/efa_proto_av.c \ prov/efa/src/efa_domain.c \ prov/efa/src/efa_fabric.c \ prov/efa/src/efa_mr.c \ diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c new file mode 100644 index 00000000000..bb26d745e74 --- /dev/null +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -0,0 +1,1373 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + +#include +#include + +#include "efa.h" +#include "efa_av.h" +#include "rdm/efa_proto_av.h" +#include "rdm/efa_rdm_pke_utils.h" + +/* + * During the coexistence period between efa_conn and efa_proto_av_entry, + * efa_av_reverse_av_add / _remove read av_entry->conn.ah->ahn on a + * pointer that may actually be an efa_proto_av_entry *. This relies on + * a layout coincidence: the 'ah' field sits at offset 32 in both + * struct efa_av_entry (inside the embedded efa_conn, which itself starts + * with struct efa_ah *ah) and struct efa_proto_av_entry (directly). + * If either struct is ever reordered, these static asserts break loudly + * instead of silently reading the wrong field. 
+ */ +_Static_assert(offsetof(struct efa_proto_av_entry, ep_addr) == + offsetof(struct efa_av_entry, ep_addr), + "efa_av_entry and efa_proto_av_entry must share ep_addr offset"); +_Static_assert(offsetof(struct efa_proto_av_entry, ah) == + offsetof(struct efa_av_entry, conn) + offsetof(struct efa_conn, ah), + "efa_av_entry->conn.ah and efa_proto_av_entry->ah must be at the same offset"); + +/** + * @brief Local/remote peer detection by comparing peer GID with stored local GIDs + * + * @param[in] av efa AV + * @param[in] addr peer address to check + * @return true if local, false otherwise + */ +static bool efa_is_local_peer(struct efa_av *av, const void *addr) +{ + int i; + uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw; + +#if ENABLE_DEBUG + char raw_gid_str[INET6_ADDRSTRLEN] = { 0 }; + + if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) { + EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno); + return 0; + } + EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str); +#endif + for (i = 0; i < g_efa_ibv_gid_cnt; ++i) { + if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) { + EFA_INFO(FI_LOG_AV, "The peer is local.\n"); + return 1; + } + } + + return 0; +} + +/* ---- Address lookup ---- */ + +/** + * @brief find proto AV entry using fi_addr in the given util_av + * + * @param[in] util_av util AV to search + * @param[in] fi_addr fabric address to look up + * @return pointer to entry if valid, NULL otherwise + */ +static inline struct efa_proto_av_entry * +efa_proto_av_addr_to_entry_impl(struct util_av *util_av, fi_addr_t fi_addr) +{ + struct util_av_entry *util_av_entry; + struct efa_proto_av_entry *entry; + + if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) + return NULL; + + if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr))) + util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); + else + return NULL; + + entry = (struct 
efa_proto_av_entry *)util_av_entry->data; + return entry->ah ? entry : NULL; +} + +/** + * @brief find proto AV entry using fi_addr in the explicit AV + * + * @param[in] av protocol AV + * @param[in] fi_addr fabric address + * @return pointer to entry if valid, NULL otherwise + */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av, + fi_addr_t fi_addr) +{ + return efa_proto_av_addr_to_entry_impl(&av->efa_av.util_av, fi_addr); +} + +/** + * @brief find proto AV entry using fi_addr in the implicit AV + * + * @param[in] av protocol AV + * @param[in] fi_addr fabric address + * @return pointer to entry if valid, NULL otherwise + */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit( + struct efa_proto_av *av, fi_addr_t fi_addr) +{ + return efa_proto_av_addr_to_entry_impl(&av->util_av_implicit, fi_addr); +} + +/* ---- Peer map operations ---- */ + +/** + * @brief insert an entry into the peer map for a given AV entry + * + * @param[in] entry proto AV entry + * @param[in] map_entry peer map entry to insert + */ +void efa_proto_av_entry_ep_peer_map_insert( + struct efa_proto_av_entry *entry, + struct efa_proto_av_entry_ep_peer_map_entry *map_entry) +{ + HASH_ADD_PTR(entry->ep_peer_map, ep_ptr, map_entry); +} + +/** + * @brief look up a peer in the peer map for a given AV entry and endpoint + * + * @param[in] entry proto AV entry + * @param[in] ep RDM endpoint + * @return pointer to peer if found, NULL otherwise + */ +struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep) +{ + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; + + HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry); + return map_entry ? 
&map_entry->peer : NULL; +} + +/** + * @brief remove an endpoint's peer from the peer map for a given AV entry + * + * @param[in] entry proto AV entry + * @param[in] ep RDM endpoint whose peer to remove + */ +void efa_proto_av_entry_ep_peer_map_remove( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep) +{ + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; + + HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry); + assert(map_entry); + HASH_DELETE(hh, entry->ep_peer_map, map_entry); + ofi_buf_free(map_entry); +} + +/* ---- SHM AV operations ---- */ + +/** + * @brief Insert the address into SHM provider's AV + * + * If shm transfer is enabled and the addr comes from local peer, + * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later. + * 2. insert gid_qpn into shm's av + * 3. store the fi_addr returned by shm in entry->shm_fi_addr + * + * Otherwise nothing is inserted and 0 is returned. + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry + * @return On success return 0, otherwise return a negative error code: + * -FI_ENOMEM when the shm AV is full, the error from + * efa_shm_ep_name_construct, or the error from fi_av_insert + * (-FI_EADDRNOTAVAIL when shm inserted nothing without reporting + * an error code). + */ +int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + int err, ret; + char smr_name[EFA_SHM_NAME_MAX]; + size_t smr_name_len; + struct efa_ep_addr *ep_addr = efa_proto_av_entry_ep_addr(entry); + + assert(ep_addr); + + if (efa_is_local_peer(&av->efa_av, ep_addr) && av->shm_rdm_av) { + if (av->shm_used >= efa_env.shm_av_size) { + EFA_WARN(FI_LOG_AV, + "Max number of shm AV entry (%d) has been reached.\n", + efa_env.shm_av_size); + return -FI_ENOMEM; + } + + smr_name_len = EFA_SHM_NAME_MAX; + err = efa_shm_ep_name_construct(smr_name, &smr_name_len, ep_addr); + if (err != FI_SUCCESS) { + EFA_WARN(FI_LOG_AV, + "efa_shm_ep_name_construct() failed! err=%d\n", err); + return err; + } + + /* + * The shm provider supports FI_AV_USER_ID flag. This flag + * associates a user-assigned identifier with each av entry that + * is returned with any completion entry in place of the AV's + * address.
Below, &entry->shm_fi_addr is both input and output. + * It is passed in with value entry->fi_addr (the efa provider's + * fi_addr). shm records this as user id for cq write, then + * overwrites shm_fi_addr with the actual fi_addr in shm's av. + * The efa provider uses shm_fi_addr for transmissions through + * the shm ep. + */ + entry->shm_fi_addr = entry->fi_addr; + ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &entry->shm_fi_addr, FI_AV_USER_ID, NULL); + if (OFI_UNLIKELY(ret != 1)) { + /* + * fi_av_insert returns the number of addresses inserted, so + * a failed single insert may yield 0 instead of a negative + * errno. Map that to a real error code so the caller never + * sees success while shm_fi_addr is FI_ADDR_NOTAVAIL. + */ + err = ret < 0 ? ret : -FI_EADDRNOTAVAIL; + EFA_WARN(FI_LOG_AV, + "Failed to insert address to shm provider's av: %s\n", + fi_strerror(-err)); + entry->shm_fi_addr = FI_ADDR_NOTAVAIL; + return err; + } + + /* fi_addr_t is a 64-bit unsigned value; print it with PRIu64. */ + EFA_INFO(FI_LOG_AV, + "Successfully inserted %s to shm provider's av. efa_fiaddr: %" PRIu64 " shm_fiaddr = %" PRIu64 "\n", + smr_name, entry->fi_addr, entry->shm_fi_addr); + + assert(entry->shm_fi_addr < efa_env.shm_av_size); + av->shm_used++; + } + + return 0; +} + +/** + * @brief Release the protocol-specific resources of an AV entry. + * + * Releases the shm av entry and destroys rdm peers. Caller must hold + * the SRX lock because this function modifies the peer map and destroys + * peers which are accessed and modified in the CQ read path. + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry + */ +void efa_proto_av_entry_deinit(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + int err; + struct efa_proto_av_entry_ep_peer_map_entry *peer_map_entry, *tmp; + + assert((entry->fi_addr != FI_ADDR_NOTAVAIL && + entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) || + (entry->implicit_fi_addr != FI_ADDR_NOTAVAIL && + entry->fi_addr == FI_ADDR_NOTAVAIL)); + + if (entry->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) { + err = fi_av_remove(av->shm_rdm_av, &entry->shm_fi_addr, 1, 0); + if (err) { + EFA_WARN(FI_LOG_AV, + "remove address from shm av failed!
err=%d\n", + err); + } else { + av->shm_used--; + assert(entry->shm_fi_addr < efa_env.shm_av_size); + } + } + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + HASH_ITER(hh, entry->ep_peer_map, peer_map_entry, tmp) { + dlist_remove(&peer_map_entry->peer.ep_peer_list_entry); + efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr); + HASH_DEL(entry->ep_peer_map, peer_map_entry); + ofi_buf_free(peer_map_entry); + } + assert(HASH_CNT(hh, entry->ep_peer_map) == 0); +} + +/* ---- Implicit AV LRU ---- */ + +/** + * @brief Add entry to the LRU list. If the list is full, evict the least + * recently used entry at the front of the LRU list and add the latest one. + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to be added to the LRU list + */ +static inline int efa_proto_av_implicit_av_lru_insert(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + size_t cur_size; + struct efa_ep_addr_hashable *ep_addr_hashable; + struct efa_proto_av_entry *entry_to_release; + + if (av->implicit_av_size == 0) + goto out; + + cur_size = HASH_CNT(hh, av->util_av_implicit.hash); + if (cur_size <= av->implicit_av_size) + goto out; + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + + dlist_pop_front(&av->implicit_av_lru_list, struct efa_proto_av_entry, + entry_to_release, implicit_av_lru_entry); + /* + * dlist_pop_front leaves entry_to_release's dlist_entry pointing at its + * old neighbors. Re-init so that efa_proto_av_entry_release's call to + * dlist_remove is a no-op on the already-popped node and does not + * disturb the surrounding list. 
+ */ + dlist_init(&entry_to_release->implicit_av_lru_entry); + + EFA_INFO(FI_LOG_AV, + "Evicting AV entry for peer implicit fi_addr %" PRIu64 + " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from " + "implicit AV\n", + entry_to_release->implicit_fi_addr, entry_to_release->ah->ahn, + efa_proto_av_entry_ep_addr(entry_to_release)->qpn, + efa_proto_av_entry_ep_addr(entry_to_release)->qkey); + + ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable)); + if (!ep_addr_hashable) { + EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n"); + /* Re-insert the victim at the head so it remains tracked in the LRU. */ + dlist_insert_head(&entry_to_release->implicit_av_lru_entry, + &av->implicit_av_lru_list); + return -FI_ENOMEM; + } + memcpy(ep_addr_hashable, entry_to_release->ep_addr, sizeof(struct efa_ep_addr)); + HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable); + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + efa_proto_av_entry_release(av, entry_to_release, true); + + assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size); + +out: + dlist_insert_tail(&entry->implicit_av_lru_entry, + &av->implicit_av_lru_list); + return FI_SUCCESS; +} + +/** + * @brief Move entry to the end of the LRU list (most recently used) + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to move + */ +void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + assert(av->implicit_av_size == 0 || + HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size); + assert(dlist_entry_in_list(&av->implicit_av_lru_list, + &entry->implicit_av_lru_entry)); + + dlist_remove(&entry->implicit_av_lru_entry); + dlist_insert_tail(&entry->implicit_av_lru_entry, + &av->implicit_av_lru_list); + + efa_ah_implicit_av_lru_ah_move(av->efa_av.domain, entry->ah); +} + +/* ---- 
Reverse lookup (protocol, connid-aware) ---- */ + +/** + * @brief reverse lookup a proto AV entry by AHN, QPN, and optional connid + * + * Looks up (ahn, qpn) in the current table first and returns NULL right + * away on a miss, without consulting the previous table. The current + * entry is returned as-is when pkt_entry is NULL, when the packet carries + * no connid, or when the connid equals the entry's qkey. Otherwise + * (ahn, qpn, connid) is looked up in the previous table. + * + * @param[in] cur_reverse_av current reverse AV hash table + * @param[in] prv_reverse_av previous reverse AV hash table + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or packet entry to extract connid from + * @return pointer to entry if found, NULL otherwise + */ +static inline struct efa_proto_av_entry * +efa_proto_av_reverse_lookup_entry(struct efa_cur_reverse_av **cur_reverse_av, + struct efa_prv_reverse_av **prv_reverse_av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + uint32_t *connid; + struct efa_cur_reverse_av *cur_entry; + struct efa_prv_reverse_av *prv_entry; + struct efa_cur_reverse_av_key cur_key; + struct efa_prv_reverse_av_key prv_key; + + /* + * NOTE(review): unlike efa_av_reverse_av_remove, the keys are not + * memset before use here. HASH_FIND is given sizeof(key) bytes, so + * this presumably relies on the key structs having no padding - + * confirm against their definitions in efa_av.h. + */ + cur_key.ahn = ahn; + cur_key.qpn = qpn; + + HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); + + if (OFI_UNLIKELY(!cur_entry)) + return NULL; + + /* + * Cast is safe: on the protocol path, av_entry is really a pointer to + * an efa_proto_av_entry, whose ep_addr field sits at the same offset + * as in efa_av_entry (checked by the _Static_asserts at the top of + * this file). + */ + if (!pkt_entry) { + return (struct efa_proto_av_entry *)cur_entry->av_entry; + } + + connid = efa_rdm_pke_connid_ptr(pkt_entry); + if (!connid) { + EFA_WARN_ONCE(FI_LOG_EP_CTRL, + "An incoming packet does NOT have connection ID " + "in its header.\n" + "This means the peer is using an older version " + "of libfabric.\n" + "The communication can continue but it is " + "encouraged to use\n" + "a newer version of libfabric\n"); + return (struct efa_proto_av_entry *)cur_entry->av_entry; + } + + /* The qkey stored in the peer's address is compared against the connid. */ + if (OFI_LIKELY(*connid == efa_av_entry_ep_addr(cur_entry->av_entry)->qkey)) + return (struct efa_proto_av_entry *)cur_entry->av_entry; + + /* connid mismatch: fall back to the previous table, keyed by connid too. */ + prv_key.ahn = ahn; + prv_key.qpn = qpn; + prv_key.connid = *connid; + HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry); + + return OFI_LIKELY(!!prv_entry) ?
(struct efa_proto_av_entry *)prv_entry->av_entry : NULL; +} + +/** + * @brief find fi_addr for RDM endpoint in the explicit AV (connid-aware) + * + * @param[in] av protocol AV + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid + * @return fi_addr on success, FI_ADDR_NOTAVAIL if not found + */ +fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + struct efa_proto_av_entry *entry; + + entry = efa_proto_av_reverse_lookup_entry( + &av->efa_av.cur_reverse_av, &av->efa_av.prv_reverse_av, + ahn, qpn, pkt_entry); + + if (OFI_LIKELY(!!entry)) + return entry->fi_addr; + + return FI_ADDR_NOTAVAIL; +} + +/** + * @brief find fi_addr for RDM endpoint in the implicit AV (connid-aware) + * + * Caller must hold srx_lock. Updates LRU list on hit. + * + * @param[in] av protocol AV + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid + * @return implicit fi_addr on success, FI_ADDR_NOTAVAIL if not found + */ +fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + struct efa_proto_av_entry *entry; + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + + entry = efa_proto_av_reverse_lookup_entry( + &av->cur_reverse_av_implicit, &av->prv_reverse_av_implicit, + ahn, qpn, pkt_entry); + + if (OFI_LIKELY(!!entry)) { + efa_proto_av_implicit_av_lru_entry_move(av, entry); + return entry->implicit_fi_addr; + } + + return FI_ADDR_NOTAVAIL; +} + +/* ---- Entry release helpers ---- */ + +/** + * @brief remove entry from the appropriate reverse AV hash tables + * + * @param[in] av protocol AV + * @param[in] entry entry to remove + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +static void 
efa_proto_av_entry_release_reverse_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + if (release_from_implicit_av) { + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, + &av->prv_reverse_av_implicit, + (struct efa_av_entry *)entry); + } else { + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + efa_av_reverse_av_remove(&av->efa_av.cur_reverse_av, + &av->efa_av.prv_reverse_av, + (struct efa_av_entry *)entry); + } +} + +/** + * @brief remove entry from the appropriate util_av and clear its fields + * + * @param[in] av protocol AV + * @param[in] entry entry to remove + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +static void efa_proto_av_entry_release_util_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + struct util_av *util_av; + char gidstr[INET6_ADDRSTRLEN]; + fi_addr_t fi_addr; + int err; + + if (release_from_implicit_av) { + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + util_av = &av->util_av_implicit; + fi_addr = entry->implicit_fi_addr; + } else { + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + util_av = &av->efa_av.util_av; + fi_addr = entry->fi_addr; + } + + err = ofi_av_remove_addr(util_av, fi_addr); + if (err) + EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); + + inet_ntop(AF_INET6, efa_proto_av_entry_ep_addr(entry)->raw, gidstr, INET6_ADDRSTRLEN); + EFA_INFO(FI_LOG_AV, "efa_proto_av_entry released! entry[%p] GID[%s] QP[%u]\n", + entry, gidstr, efa_proto_av_entry_ep_addr(entry)->qpn); + + entry->ah = NULL; + memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN); +} + +/** + * @brief Release a proto AV entry. + * + * Caller must hold srx_lock. Acquires util_domain.lock internally + * via efa_ah_release. Called from the AV removal path. 
+ * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to release + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +void efa_proto_av_entry_release(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + + efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av); + efa_proto_av_entry_deinit(av, entry); + + if (release_from_implicit_av) { + dlist_remove(&entry->ah_implicit_conn_list_entry); + dlist_remove(&entry->implicit_av_lru_entry); + } + + efa_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av); + efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); + + release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; +} + +/** + * @brief Release a proto AV entry without acquiring util_domain.lock. + * + * Caller must hold srx_lock AND util_domain.lock. Called from the AH + * eviction path in the CQ read path which already holds both locks. + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to release + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + assert(ofi_genlock_held(&av->efa_av.domain->util_domain.lock)); + + efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av); + efa_proto_av_entry_deinit(av, entry); + + if (release_from_implicit_av) { + dlist_remove(&entry->ah_implicit_conn_list_entry); + dlist_remove(&entry->implicit_av_lru_entry); + } + + /* Decrement refcnts before release_util_av which NULLs entry->ah */ + release_from_implicit_av ? 
entry->ah->implicit_refcnt-- : + entry->ah->explicit_refcnt--; + + efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); + + release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; +} + +/* ---- Entry alloc ---- */ + +/** + * @brief Allocate and initialize a proto AV entry. + * + * Caller must hold util_av.lock (explicit) or util_av_implicit.lock (implicit), + * and must hold srx_lock. srx_lock is required because this function calls + * efa_proto_av_entry_deinit on the error path, which walks the per-entry + * ep_peer_map and destructs peers under srx_lock. + * + * @param[in] av protocol address vector + * @param[in] raw_addr raw efa address + * @param[in] flags flags application passed to fi_av_insert + * @param[in] context context application passed to fi_av_insert + * @param[in] insert_shm_av whether to insert address into shm av + * @param[in] insert_implicit_av whether to insert into implicit AV + * @return on success, return a pointer to the entry; otherwise NULL + */ +struct efa_proto_av_entry *efa_proto_av_entry_alloc( + struct efa_proto_av *av, struct efa_ep_addr *raw_addr, + uint64_t flags, void *context, bool insert_shm_av, + bool insert_implicit_av) +{ + struct util_av *util_av; + struct efa_cur_reverse_av **cur_reverse_av; + struct efa_prv_reverse_av **prv_reverse_av; + struct util_av_entry *util_av_entry = NULL; + struct efa_proto_av_entry *entry; + fi_addr_t fi_addr; + int err; + bool on_lru_list = false; + + if (flags & FI_SYNC_ERR) + memset(context, 0, sizeof(int)); + + if (insert_implicit_av) { + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + util_av = &av->util_av_implicit; + cur_reverse_av = &av->cur_reverse_av_implicit; + prv_reverse_av = &av->prv_reverse_av_implicit; + } else { + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + util_av = &av->efa_av.util_av; + cur_reverse_av = &av->efa_av.cur_reverse_av; + prv_reverse_av = &av->efa_av.prv_reverse_av; + } + + err = 
ofi_av_insert_addr(util_av, raw_addr, &fi_addr); + if (err) { + EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n", + fi_strerror(err)); + return NULL; + } + + util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); + entry = (struct efa_proto_av_entry *)util_av_entry->data; + assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)entry->ep_addr)); + + memset((char *)entry + EFA_EP_ADDR_LEN, 0, + sizeof(*entry) - EFA_EP_ADDR_LEN); + assert(av->efa_av.type == FI_AV_TABLE); + + entry->av = av; + + if (insert_implicit_av) { + entry->fi_addr = FI_ADDR_NOTAVAIL; + entry->implicit_fi_addr = fi_addr; + err = efa_proto_av_implicit_av_lru_insert(av, entry); + if (err) + goto err_release; + on_lru_list = true; + } else { + entry->fi_addr = fi_addr; + entry->implicit_fi_addr = FI_ADDR_NOTAVAIL; + } + + entry->ah = efa_ah_alloc(av->efa_av.domain, raw_addr->raw, insert_implicit_av); + if (!entry->ah) + goto err_release; + + if (insert_implicit_av) + dlist_insert_tail(&entry->ah_implicit_conn_list_entry, + &entry->ah->implicit_conn_list); + + entry->shm_fi_addr = FI_ADDR_NOTAVAIL; + + /* + * This function is called in two situations: + * 1. application calls fi_av_insert API + * 2. efa progress engine gets a message from unknown peer through + * efa device, meaning peer is not local or shm is disabled. + * For situation 1, shm av insertion should happen when peer is local + * (insert_shm_av=1). For situation 2, it shouldn't (insert_shm_av=0). + */ + if (insert_shm_av) { + err = efa_proto_av_entry_insert_shm_av(av, entry); + if (err) { + errno = -err; + goto err_release; + } + } + + err = efa_av_reverse_av_add(&av->efa_av, cur_reverse_av, prv_reverse_av, + (struct efa_av_entry *)entry); + if (err) { + efa_proto_av_entry_deinit(av, entry); + goto err_release; + } + + insert_implicit_av ? 
av->used_implicit++ : av->efa_av.used_explicit++; + + return entry; + +err_release: + /* Undo, in reverse order, whatever was set up before the failure. */ + if (insert_implicit_av && on_lru_list) + dlist_remove(&entry->implicit_av_lru_entry); + + if (entry->ah) { + /* + * On the implicit path a non-NULL ah means the entry was already + * linked into ah->implicit_conn_list right after efa_ah_alloc. + * Unlink it before dropping the AH reference, as + * efa_proto_av_entry_release does, so the AH never keeps a + * pointer to an AV entry that is about to be recycled. + */ + if (insert_implicit_av) + dlist_remove(&entry->ah_implicit_conn_list_entry); + efa_ah_release(av->efa_av.domain, entry->ah, insert_implicit_av); + } + + /* A NULL ah marks the slot as invalid for efa_proto_av_addr_to_entry*. */ + entry->ah = NULL; + memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN); + err = ofi_av_remove_addr(util_av, fi_addr); + if (err) + EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", + err); + + return NULL; +} + +/* ---- Implicit to explicit migration ---- */ + +/** + * @brief get the fi_addr from a peer rx entry's packet context + * + * Used as a callback for foreach_unspec_addr during implicit-to-explicit + * migration. + * + * @param[in] rx_entry peer rx entry + * @return fi_addr of the peer + */ +static fi_addr_t +efa_proto_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry) +{ + struct efa_rdm_pke *pke; + + pke = (struct efa_rdm_pke *) rx_entry->peer_context; + + return pke->peer->conn->fi_addr; +} + +/** + * @brief migrate an implicit AV entry to the explicit AV + * + * Moves the entry, its peer map, AH, and SHM fi_addr from the implicit + * AV to the explicit AV. Updates reverse AVs and notifies the SRX to + * move unexpected messages from the unspecified queue. + * + * Caller must hold util_av.lock and util_av_implicit.lock.
+ * + * @param[in] av protocol AV + * @param[in] raw_addr raw efa address + * @param[in] implicit_fi_addr fi_addr in the implicit AV + * @param[out] fi_addr fi_addr assigned in the explicit AV + * @return 0 on success, negative error code on failure + */ +int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, + struct efa_ep_addr *raw_addr, + fi_addr_t implicit_fi_addr, + fi_addr_t *fi_addr) +{ + int err; + struct efa_ah *ah; + struct efa_proto_av_entry *implicit_entry, *explicit_entry; + struct efa_rdm_ep *ep; + struct dlist_entry *list_entry; + struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry; + struct efa_proto_av_entry_ep_peer_map_entry *map_entry, *tmp; + struct fid_peer_srx *peer_srx; + + EFA_INFO(FI_LOG_AV, + "Moving peer with implicit fi_addr %" PRIu64 + " to explicit AV\n", + implicit_fi_addr); + + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + + implicit_util_av_entry = + ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr); + implicit_entry = (struct efa_proto_av_entry *) implicit_util_av_entry->data; + + assert(implicit_entry); + assert(efa_is_same_addr( + raw_addr, (struct efa_ep_addr *) implicit_entry->ep_addr)); + assert(implicit_entry->fi_addr == FI_ADDR_NOTAVAIL && + implicit_entry->implicit_fi_addr == implicit_fi_addr); + + ah = implicit_entry->ah; + + /* Create explicit util AV entry */ + err = ofi_av_insert_addr(&av->efa_av.util_av, raw_addr, fi_addr); + if (err) { + EFA_WARN(FI_LOG_AV, + "ofi_av_insert_addr into explicit AV failed! 
Error " + "message: %s\n", + fi_strerror(err)); + return err; + } + + explicit_util_av_entry = + ofi_bufpool_get_ibuf(av->efa_av.util_av.av_entry_pool, *fi_addr); + explicit_entry = (struct efa_proto_av_entry *) explicit_util_av_entry->data; + assert(efa_is_same_addr( + raw_addr, (struct efa_ep_addr *) explicit_entry->ep_addr)); + + /* Copy information from implicit to explicit */ + memset((char *)explicit_entry + EFA_EP_ADDR_LEN, 0, + sizeof(*explicit_entry) - EFA_EP_ADDR_LEN); + assert(av->efa_av.type == FI_AV_TABLE); + explicit_entry->av = av; + explicit_entry->ah = implicit_entry->ah; + explicit_entry->fi_addr = *fi_addr; + explicit_entry->shm_fi_addr = implicit_entry->shm_fi_addr; + explicit_entry->implicit_fi_addr = FI_ADDR_NOTAVAIL; + HASH_ITER(hh, implicit_entry->ep_peer_map, map_entry, tmp) { + HASH_DELETE(hh, implicit_entry->ep_peer_map, map_entry); + HASH_ADD_PTR(explicit_entry->ep_peer_map, ep_ptr, map_entry); + map_entry->peer.conn = (struct efa_conn *)explicit_entry; + } + assert(HASH_CNT(hh, implicit_entry->ep_peer_map) == 0); + + /* Handle reverse AV and AV ref counts */ + efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, + &av->prv_reverse_av_implicit, + (struct efa_av_entry *)implicit_entry); + + dlist_remove(&implicit_entry->implicit_av_lru_entry); + + err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr); + if (err) { + EFA_WARN(FI_LOG_AV, + "ofi_av_remove_addr from implicit AV failed! 
Error " + "message: %s\n", + fi_strerror(err)); + return err; + } + + av->used_implicit--; + + err = efa_av_reverse_av_add(&av->efa_av, &av->efa_av.cur_reverse_av, + &av->efa_av.prv_reverse_av, + (struct efa_av_entry *)explicit_entry); + if (err) + return err; + + av->efa_av.used_explicit++; + + /* Handle AH LRU list and refcnt */ + assert(!dlist_empty(&ah->implicit_conn_list)); + dlist_remove(&implicit_entry->ah_implicit_conn_list_entry); + efa_ah_implicit_av_lru_ah_move(av->efa_av.domain, ah); + ah->implicit_refcnt--; + ah->explicit_refcnt++; + + EFA_INFO(FI_LOG_AV, + "Peer with implicit fi_addr %" PRIu64 + " moved to explicit AV. Explicit fi_addr: %" PRIu64 "\n", + implicit_fi_addr, *fi_addr); + + ofi_genlock_lock(&av->efa_av.util_av.ep_list_lock); + dlist_foreach(&av->efa_av.util_av.ep_list, list_entry) { + ep = container_of(list_entry, struct efa_rdm_ep, base_ep.util_ep.av_entry); + peer_srx = util_get_peer_srx(ep->peer_srx_ep); + peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_proto_av_get_addr_from_peer_rx_entry); + } + ofi_genlock_unlock(&av->efa_av.util_av.ep_list_lock); + + return FI_SUCCESS; +} + +/* ---- Protocol AV insert_one ---- */ + +/** + * @brief insert one address into the protocol AV + * + * Checks explicit and implicit AVs for duplicates. Handles + * implicit-to-explicit migration when an implicit entry exists. + * + * Caller must hold srx_lock. 
+ * + * @param[in] av protocol AV + * @param[in] addr raw address (gid:qpn:qkey) + * @param[out] fi_addr output fi_addr + * @param[in] flags flags from fi_av_insert + * @param[in] context context from fi_av_insert + * @param[in] insert_shm_av whether to insert into SHM AV + * @param[in] insert_implicit_av whether to insert into implicit AV + * @return 0 on success, negative error code on failure + */ +int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context, + bool insert_shm_av, bool insert_implicit_av) +{ + struct efa_proto_av_entry *entry; + char raw_gid_str[INET6_ADDRSTRLEN]; + fi_addr_t efa_fiaddr; + fi_addr_t implicit_fi_addr; + int ret = 0; + + if (!efa_av_is_valid_address(addr)) { + EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n"); + *fi_addr = FI_ADDR_NOTAVAIL; + return -FI_EADDRNOTAVAIL; + } + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + ofi_genlock_lock(&av->util_av_implicit.lock); + ofi_genlock_lock(&av->efa_av.util_av.lock); + + memset(raw_gid_str, 0, sizeof(raw_gid_str)); + if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) { + EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno); + ret = -FI_EINVAL; + *fi_addr = FI_ADDR_NOTAVAIL; + goto out; + } + + EFA_INFO(FI_LOG_AV, + "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n", + raw_gid_str, addr->qpn, addr->qkey, + insert_implicit_av ? "implicit" : "explicit"); + + /* Check explicit AV */ + efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->efa_av.util_av, addr); + if (efa_fiaddr != FI_ADDR_NOTAVAIL) { + assert(!insert_implicit_av); + EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! 
fi_addr: %ld\n", efa_fiaddr); + *fi_addr = efa_fiaddr; + ret = 0; + goto out; + } + + /* Check implicit AV */ + implicit_fi_addr = + ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr); + if (implicit_fi_addr != FI_ADDR_NOTAVAIL) { + EFA_INFO(FI_LOG_AV, + "Found implicit AV entry id %ld for the same address\n", + implicit_fi_addr); + + if (insert_implicit_av) { + entry = efa_proto_av_addr_to_entry_implicit(av, implicit_fi_addr); + efa_proto_av_implicit_av_lru_entry_move(av, entry); + *fi_addr = implicit_fi_addr; + goto out; + } + + ret = efa_proto_av_entry_implicit_to_explicit(av, addr, implicit_fi_addr, fi_addr); + if (ret) + *fi_addr = FI_ADDR_NOTAVAIL; + goto out; + } + + entry = efa_proto_av_entry_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av); + if (!entry) { + *fi_addr = FI_ADDR_NOTAVAIL; + ret = -FI_EADDRNOTAVAIL; + goto out; + } + + if (insert_implicit_av) { + *fi_addr = entry->implicit_fi_addr; + EFA_INFO(FI_LOG_AV, + "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to implicit AV. fi_addr: %ld\n", + raw_gid_str, addr->qpn, addr->qkey, *fi_addr); + } else { + *fi_addr = entry->fi_addr; + EFA_INFO(FI_LOG_AV, + "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. 
fi_addr: %ld\n", + raw_gid_str, addr->qpn, addr->qkey, *fi_addr); + } + ret = 0; + +out: + ofi_genlock_unlock(&av->efa_av.util_av.lock); + ofi_genlock_unlock(&av->util_av_implicit.lock); + return ret; +} + +/* ---- Protocol AV fi_ops ---- */ + +/** + * @brief insert addresses into protocol AV (fi_av_insert implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr buffer containing addresses to insert + * @param[in] count number of addresses + * @param[out] fi_addr array for returned fabric addresses + * @param[in] flags operation flags + * @param[in] context user context + * @return number of addresses successfully inserted + */ +static int efa_proto_av_insert(struct fid_av *av_fid, const void *addr, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid); + struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av); + int ret = 0, success_cnt = 0; + size_t i = 0; + struct efa_ep_addr *addr_i; + fi_addr_t fi_addr_res; + + if (av->efa_av.util_av.flags & FI_EVENT) + return -FI_ENOEQ; + + if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT))) + return -FI_EINVAL; + + flags &= ~FI_MORE; + if (flags) + return -FI_ENOSYS; + + ofi_genlock_lock(&av->efa_av.domain->srx_lock); + + for (i = 0; i < count; i++) { + addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN); + + ret = efa_proto_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false); + if (ret) { + EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! 
ret=%d\n", ret); + break; + } + + if (fi_addr) + fi_addr[i] = fi_addr_res; + success_cnt++; + } + + ofi_genlock_unlock(&av->efa_av.domain->srx_lock); + + for (; i < count ; i++) { + if (fi_addr) + fi_addr[i] = FI_ADDR_NOTAVAIL; + } + + return success_cnt; +} + +/** + * @brief retrieve an address from the protocol AV (fi_av_lookup implementation) + * + * @param[in] av_fid fid of AV + * @param[in] fi_addr fabric address to look up + * @param[out] addr buffer to store the returned address + * @param[in,out] addrlen on input, size of addr buffer; on output, bytes written + * @return 0 on success, negative error code on failure + */ +static int efa_proto_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, + void *addr, size_t *addrlen) +{ + struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid); + struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av); + struct efa_proto_av_entry *entry = NULL; + + if (av->efa_av.type != FI_AV_TABLE) + return -FI_EINVAL; + + if (fi_addr == FI_ADDR_NOTAVAIL) + return -FI_EINVAL; + + ofi_genlock_lock(&av->efa_av.util_av.lock); + entry = efa_proto_av_addr_to_entry(av, fi_addr); + if (!entry) { + ofi_genlock_unlock(&av->efa_av.util_av.lock); + return -FI_EINVAL; + } + + memcpy(addr, (void *)entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen)); + ofi_genlock_unlock(&av->efa_av.util_av.lock); + if (*addrlen > EFA_EP_ADDR_LEN) + *addrlen = EFA_EP_ADDR_LEN; + return 0; +} + +/** + * @brief remove addresses from the protocol AV (fi_av_remove implementation) + * + * @param[in] av_fid fid of AV + * @param[in] fi_addr array of fabric addresses to remove + * @param[in] count number of addresses + * @param[in] flags operation flags + * @return 0 on success, negative error code on failure + */ +static int efa_proto_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) +{ + int err = 0; + size_t i; + struct efa_av *base_av; + struct efa_proto_av *av; + struct efa_proto_av_entry 
*entry; + + if (!fi_addr) + return -FI_EINVAL; + + base_av = container_of(av_fid, struct efa_av, util_av.av_fid); + av = container_of(base_av, struct efa_proto_av, efa_av); + if (av->efa_av.type != FI_AV_TABLE) + return -FI_EINVAL; + + ofi_genlock_lock(&av->efa_av.domain->srx_lock); + ofi_genlock_lock(&av->efa_av.util_av.lock); + for (i = 0; i < count; i++) { + entry = efa_proto_av_addr_to_entry(av, fi_addr[i]); + if (!entry) { + err = -FI_EINVAL; + break; + } + + efa_proto_av_entry_release(av, entry, false); + } + + if (i < count) + assert(err); + + ofi_genlock_unlock(&av->efa_av.util_av.lock); + ofi_genlock_unlock(&av->efa_av.domain->srx_lock); + return err; +} + +/** + * @brief convert an address to a printable string (fi_av_straddr implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr address to convert + * @param[out] buf buffer to store the string + * @param[in,out] len on input, size of buf; on output, bytes written + * @return pointer to buf + */ +static const char *efa_proto_av_straddr(struct fid_av *av_fid, const void *addr, + char *buf, size_t *len) +{ + return ofi_straddr(buf, len, FI_ADDR_EFA, addr); +} + +static struct fi_ops_av efa_proto_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = efa_proto_av_insert, + .insertsvc = fi_no_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .remove = efa_proto_av_remove, + .lookup = efa_proto_av_lookup, + .straddr = efa_proto_av_straddr +}; + +/** + * @brief release all entries in the explicit and implicit reverse AVs + * + * @param[in] av protocol AV + */ +static void efa_proto_av_close_reverse_av(struct efa_proto_av *av) +{ + struct efa_cur_reverse_av *cur_entry, *curtmp; + struct efa_prv_reverse_av *prv_entry, *prvtmp; + + ofi_genlock_lock(&av->efa_av.domain->srx_lock); + + ofi_genlock_lock(&av->efa_av.util_av.lock); + + HASH_ITER(hh, av->efa_av.cur_reverse_av, cur_entry, curtmp) { + efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, false); + } + + 
HASH_ITER(hh, av->efa_av.prv_reverse_av, prv_entry, prvtmp) { + efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, false); + } + + ofi_genlock_unlock(&av->efa_av.util_av.lock); + + ofi_genlock_lock(&av->util_av_implicit.lock); + + HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) { + efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, true); + } + + HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) { + efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, true); + } + + ofi_genlock_unlock(&av->util_av_implicit.lock); + + ofi_genlock_unlock(&av->efa_av.domain->srx_lock); +} + +/** + * @brief close the protocol AV and release all resources (fi_close implementation) + * + * @param[in] fid fid of AV + * @return 0 on success, negative error code on failure + */ +static int efa_proto_av_close(struct fid *fid) +{ + struct efa_av *base_av; + struct efa_proto_av *av; + int err = 0; + struct efa_ep_addr_hashable *ep_addr_hashable, *tmp; + + base_av = container_of(fid, struct efa_av, util_av.av_fid.fid); + av = container_of(base_av, struct efa_proto_av, efa_av); + + efa_proto_av_close_reverse_av(av); + + err = ofi_av_close(&av->efa_av.util_av); + if (OFI_UNLIKELY(err)) + EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n", + fi_strerror(err)); + + err = ofi_av_close(&av->util_av_implicit); + if (OFI_UNLIKELY(err)) + EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n", + fi_strerror(err)); + + if (av->shm_rdm_av) { + err = fi_close(&av->shm_rdm_av->fid); + if (OFI_UNLIKELY(err)) + EFA_WARN(FI_LOG_AV, + "Failed to close shm av: %s\n", + fi_strerror(err)); + } + + HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) { + HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable); + free(ep_addr_hashable); + } + + free(av); + return err; +} + +static struct fi_ops efa_proto_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = efa_proto_av_close, + .bind 
= fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +/** + * @brief open a protocol AV (fi_av_open implementation for RDM) + * + * @param[in] domain_fid fid of domain + * @param[in] attr AV attributes + * @param[out] av_fid pointer to store the opened AV fid + * @param[in] context user context + * @return 0 on success, negative error code on failure + */ +int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, + struct fid_av **av_fid, void *context) +{ + struct efa_domain *efa_domain; + struct efa_proto_av *av; + struct fi_av_attr av_attr = { 0 }; + size_t context_len; + size_t universe_size; + int ret, retv; + + if (!attr) + return -FI_EINVAL; + + if (attr->name) + return -FI_ENOSYS; + + if (attr->flags) + return -FI_ENOSYS; + + if (!attr->count) + attr->count = EFA_MIN_AV_SIZE; + else + attr->count = MAX(attr->count, EFA_MIN_AV_SIZE); + + av = calloc(1, sizeof(*av)); + if (!av) + return -FI_ENOMEM; + + if (attr->type == FI_AV_MAP) { + EFA_INFO(FI_LOG_AV, "FI_AV_MAP is deprecated in Libfabric 2.x. Please use FI_AV_TABLE. 
" + "EFA provider will now switch to using FI_AV_TABLE.\n"); + } + attr->type = FI_AV_TABLE; + + efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid); + + if (fi_param_get_size_t(NULL, "universe_size", + &universe_size) == FI_SUCCESS) + attr->count = MAX(attr->count, universe_size); + + context_len = sizeof(struct efa_proto_av_entry) - EFA_EP_ADDR_LEN; + + ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context, + context_len); + if (ret) + goto err; + + ret = efa_av_init_util_av(efa_domain, attr, &av->efa_av.util_av, context, + context_len); + if (ret) + goto err_close_util_av_implicit; + + if (efa_domain->fabric && efa_domain->fabric->shm_fabric) { + av_attr = *attr; + if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) { + ret = -FI_ENOSYS; + EFA_WARN(FI_LOG_AV, + "The requested av size is beyond" + " shm supported maximum av size: %s\n", + fi_strerror(-ret)); + goto err_close_util_av; + } + av_attr.count = efa_env.shm_av_size; + assert(av_attr.type == FI_AV_TABLE); + ret = fi_av_open(efa_domain->shm_domain, &av_attr, + &av->shm_rdm_av, context); + if (ret) + goto err_close_util_av; + } + + EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags); + + av->efa_av.domain = efa_domain; + av->efa_av.type = attr->type; + av->efa_av.used_explicit = 0; + av->implicit_av_size = efa_env.implicit_av_size; + av->used_implicit = 0; + av->shm_used = 0; + + *av_fid = &av->efa_av.util_av.av_fid; + (*av_fid)->fid.fclass = FI_CLASS_AV; + (*av_fid)->fid.context = context; + (*av_fid)->fid.ops = &efa_proto_av_fi_ops; + (*av_fid)->ops = &efa_proto_av_ops; + + dlist_init(&av->implicit_av_lru_list); + + return 0; + +err_close_util_av: + retv = ofi_av_close(&av->efa_av.util_av); + if (retv) + EFA_WARN(FI_LOG_AV, + "Unable to close util_av: %s\n", fi_strerror(-retv)); + +err_close_util_av_implicit: + retv = ofi_av_close(&av->util_av_implicit); + if (retv) + EFA_WARN(FI_LOG_AV, + "Unable to close util_av_implicit: %s\n", 
fi_strerror(-retv)); + +err: + free(av); + return ret; +} From 5059ae0ce3d942ad67115a587837ab2e12fcfb88 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 4 May 2026 23:17:28 -0600 Subject: [PATCH 08/16] prov/efa: use FI_ADDR_NOTAVAIL for empty fi_addr slot check efa_rdm_ep_destroy_buffer_pools tests whether a peer's explicit fi_addr is populated; if not, falls through to the implicit AV. The test was written with FI_ADDR_UNSPEC, which is semantically "unspecified peer address" (used by e.g. fi_recvfrom for any source) rather than "this slot has no valid fi_addr". FI_ADDR_UNSPEC and FI_ADDR_NOTAVAIL are both defined as (uint64_t)-1 in fabric.h, so the runtime behavior is identical. This is a naming correction, not a behavior change. Every other site in the provider that tests the same peer->conn->fi_addr / ->implicit_fi_addr condition uses FI_ADDR_NOTAVAIL (e.g. efa_rdm_cq.c, efa_proto_av.c, efa_av.c). Bring this site in line for consistency. No behavior change. Signed-off-by: Seth Zegelstein --- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 5020c487bc6..7d37ceab868 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -735,12 +735,12 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) struct efa_rdm_peer, peer, ep_peer_list_entry, tmp) { - if (peer->conn->fi_addr != FI_ADDR_UNSPEC) { + if (peer->conn->fi_addr != FI_ADDR_NOTAVAIL) { util_av_entry = ofi_bufpool_get_ibuf( efa_rdm_ep->base_ep.av->util_av.av_entry_pool, peer->conn->fi_addr); } else { - assert(peer->conn->implicit_fi_addr != FI_ADDR_UNSPEC); + assert(peer->conn->implicit_fi_addr != FI_ADDR_NOTAVAIL); util_av_entry = ofi_bufpool_get_ibuf( efa_rdm_ep->base_ep.av->util_av_implicit.av_entry_pool, From 9c35b0ca9dd2f793720f5d737b300b0ff290e5f9 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Mon, 
4 May 2026 23:18:12 -0600 Subject: [PATCH 09/16] prov/efa: wire efa_proto_av into RDM, switch peer backing to efa_proto_av_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the atomic switch that moves the RDM protocol layer off of struct efa_conn and onto struct efa_proto_av_entry. Because the peer's backing object changes type and every caller that reaches through a peer pointer must be updated in lockstep, splitting this work across multiple commits would leave the tree uncompilable between them. At the domain level, fi_ops_domain_rdm.av_open is set to efa_proto_av_open so RDM domains instantiate an efa_proto_av instead of the base efa_av. A proto_av field is added to struct efa_rdm_ep and populated in efa_rdm_ep_bind via container_of(av, struct efa_proto_av, efa_av); caching it on the endpoint avoids repeating the container_of on the hot path. struct efa_rdm_peer's backing object changes from struct efa_conn * to struct efa_proto_av_entry *. The field is renamed from conn to av_entry to reflect the new type, and efa_rdm_peer_construct's signature is updated to take the new pointer. All RDM sources that dereferenced peer->conn are updated to peer->av_entry: efa_rdm_cq.c, efa_rdm_msg.c, efa_rdm_ope.c, efa_rdm_pke*.c, efa_rdm_util.c, efa_domain.c, and efa_rdm_ep_utils.c. The efa_rdm_ep_get_peer_* helpers in efa_rdm_ep_utils.c are migrated from efa_av_addr_to_conn / conn-based peer-map lookup to efa_proto_av_addr_to_entry and efa_proto_av_entry_ep_peer_map_lookup. efa_rdm_ep_get_explicit_shm_fi_addr moves out of efa_rdm_ep.h into efa_rdm_ep_utils.c. Keeping the accessor inline in the header would force efa_rdm_ep.h to include efa_proto_av.h, and efa_proto_av.h depends transitively on efa_rdm_peer.h; an out-of-line definition keeps the include graph acyclic. 
Two expressions in efa_proto_av.c had to wait for the peer-type switch to land and are flipped as part of this commit: the pke callback efa_proto_av_get_addr_from_peer_rx_entry, which still read peer->conn->fi_addr, now reads peer->av_entry->fi_addr, and efa_proto_av_entry_implicit_to_explicit now assigns map_entry->peer.av_entry = explicit_entry instead of casting the entry to struct efa_conn * for peer.conn. efa_rdm_ep_destroy_buffer_pools similarly migrates to the efa_proto_av_entry peer-map entry type. efa_ah_implicit_av_evict_ah in efa_ah.c is updated to iterate efa_proto_av_entry on the AH LRU list and call efa_proto_av_entry_release_ah_unsafe. This function is relocated to efa_proto_av.c entirely in the subsequent AH-layering commit; the update here keeps the intermediate state functional. Test mocks and the efa AV test file are updated for the new peer field name, the new efa_rdm_peer_construct signature, and the new peer-map entry type. test_av_verify_av_hash_cnt gains a struct efa_proto_av * parameter so it can reach both the base efa_av hash counters and the protocol-only implicit-AV fields through a single call. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_ah.c | 10 +- prov/efa/src/efa_av.c | 85 +---------- prov/efa/src/efa_domain.c | 7 +- prov/efa/src/rdm/efa_proto_av.c | 4 +- prov/efa/src/rdm/efa_rdm_cq.c | 32 ++--- prov/efa/src/rdm/efa_rdm_ep.h | 11 +- prov/efa/src/rdm/efa_rdm_ep_fiops.c | 48 +++---- prov/efa/src/rdm/efa_rdm_ep_utils.c | 58 ++++---- prov/efa/src/rdm/efa_rdm_msg.c | 10 +- prov/efa/src/rdm/efa_rdm_ope.c | 14 +- prov/efa/src/rdm/efa_rdm_peer.c | 9 +- prov/efa/src/rdm/efa_rdm_peer.h | 10 +- prov/efa/src/rdm/efa_rdm_pke.c | 41 +++--- prov/efa/src/rdm/efa_rdm_pke_nonreq.c | 4 +- prov/efa/src/rdm/efa_rdm_pke_print.c | 4 +- prov/efa/src/rdm/efa_rdm_util.c | 2 +- prov/efa/test/efa_unit_test_av.c | 197 +++++++++++++++----------- prov/efa/test/efa_unit_test_cq.c | 2 +- prov/efa/test/efa_unit_test_ep.c | 2 +- prov/efa/test/efa_unit_test_srx.c | 6 +- 20 files changed, 256 insertions(+), 300 deletions(-) diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index 53bf736f1fd..ff25d13af3b 100644 --- 
a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -35,7 +35,7 @@ void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, } static inline int efa_ah_implicit_av_evict_ah(struct efa_domain *domain) { - struct efa_conn *conn_to_release; + struct efa_proto_av_entry *entry_to_release; struct efa_ah *ah_tmp, *ah_to_release = NULL; struct dlist_entry *tmp; @@ -57,13 +57,13 @@ static inline int efa_ah_implicit_av_evict_ah(struct efa_domain *domain) { assert(ah_to_release->implicit_refcnt > 0); dlist_foreach_container_safe(&ah_to_release->implicit_conn_list, - struct efa_conn, conn_to_release, + struct efa_proto_av_entry, entry_to_release, ah_implicit_conn_list_entry, tmp) { - assert(conn_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL && - conn_to_release->fi_addr == FI_ADDR_NOTAVAIL); + assert(entry_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL && + entry_to_release->fi_addr == FI_ADDR_NOTAVAIL); - efa_conn_release_ah_unsafe(conn_to_release->av, conn_to_release, true); + efa_proto_av_entry_release_ah_unsafe(entry_to_release->av, entry_to_release, true); } if (ah_to_release->implicit_refcnt == 0 && diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index dd85cbe147a..4cc34c81db0 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -102,62 +102,6 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) return (OFI_LIKELY(!!cur_entry)) ? 
cur_entry->av_entry->conn.fi_addr : FI_ADDR_NOTAVAIL; } -static inline struct efa_conn * -efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - uint16_t ahn, uint16_t qpn, - struct efa_rdm_pke *pkt_entry) -{ - uint32_t *connid; - struct efa_cur_reverse_av *cur_entry; - struct efa_prv_reverse_av *prv_entry; - struct efa_cur_reverse_av_key cur_key; - struct efa_prv_reverse_av_key prv_key; - - cur_key.ahn = ahn; - cur_key.qpn = qpn; - - HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); - - if (OFI_UNLIKELY(!cur_entry)) - return NULL; - - if (!pkt_entry) { - /** - * There is no packet entry to extract connid from when we get - * an IBV_WC_RECV_RDMA_WITH_IMM completion from rdma-core. Or - * the pkt_entry is allocated from a buffer user posted that - * doesn't expect any pkt hdr. - */ - return &cur_entry->av_entry->conn; - } - - connid = efa_rdm_pke_connid_ptr(pkt_entry); - if (!connid) { - EFA_WARN_ONCE(FI_LOG_EP_CTRL, - "An incoming packet does NOT have connection ID " - "in its header.\n" - "This means the peer is using an older version " - "of libfabric.\n" - "The communication can continue but it is " - "encouraged to use\n" - "a newer version of libfabric\n"); - return &cur_entry->av_entry->conn; - } - - if (OFI_LIKELY(*connid == efa_av_entry_ep_addr(cur_entry->av_entry)->qkey)) - return &cur_entry->av_entry->conn; - - /* the packet is from a previous peer, look for its address from the - * prv_reverse_av */ - prv_key.ahn = ahn; - prv_key.qpn = qpn; - prv_key.connid = *connid; - HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry); - - return OFI_LIKELY(!!prv_entry) ? 
&prv_entry->av_entry->conn : NULL; -}; - /** * @brief find fi_addr for rdm endpoint in the explicit AV * @@ -171,15 +115,9 @@ efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct efa_rdm_pke *pkt_entry) { - struct efa_conn *conn; + struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); - conn = efa_av_reverse_lookup_rdm_conn( - &av->cur_reverse_av, &av->prv_reverse_av, ahn, qpn, pkt_entry); - - if (OFI_LIKELY(!!conn)) - return conn->fi_addr; - - return FI_ADDR_NOTAVAIL; + return efa_proto_av_reverse_lookup(proto_av, ahn, qpn, pkt_entry); } /** @@ -196,20 +134,9 @@ fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, uint16_t qpn, struct efa_rdm_pke *pkt_entry) { - struct efa_conn *conn; - - assert(ofi_genlock_held(&av->domain->srx_lock)); - - conn = efa_av_reverse_lookup_rdm_conn(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, ahn, - qpn, pkt_entry); - - if (OFI_LIKELY(!!conn)) { - efa_av_implicit_av_lru_conn_move(av, conn); - return conn->implicit_fi_addr; - } + struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); - return FI_ADDR_NOTAVAIL; + return efa_proto_av_reverse_lookup_implicit(proto_av, ahn, qpn, pkt_entry); } /** @@ -344,7 +271,7 @@ efa_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry) pke = (struct efa_rdm_pke *) rx_entry->peer_context; - return pke->peer->conn->fi_addr; + return pke->peer->av_entry->fi_addr; } static int efa_conn_implicit_to_explicit(struct efa_av *av, @@ -414,7 +341,7 @@ static int efa_conn_implicit_to_explicit(struct efa_av *av, HASH_ITER(hh, implicit_conn->ep_peer_map, map_entry, tmp) { HASH_DELETE(hh, implicit_conn->ep_peer_map, map_entry); HASH_ADD_PTR(explicit_conn->ep_peer_map, ep_ptr, map_entry); - map_entry->peer.conn = explicit_conn; + map_entry->peer.av_entry = (struct efa_proto_av_entry *)explicit_conn; } 
assert(HASH_CNT(hh, implicit_conn->ep_peer_map) == 0); diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 978fd86710b..7f093cb2ed0 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -8,6 +8,7 @@ #include "config.h" #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_cntr.h" #include "rdm/efa_rdm_cntr.h" #include "rdm/efa_rdm_cq.h" @@ -46,7 +47,7 @@ static struct fi_ops_domain efa_domain_ops = { static struct fi_ops_domain efa_domain_ops_rdm = { .size = sizeof(struct fi_ops_domain), - .av_open = efa_av_open, + .av_open = efa_proto_av_open, .cq_open = efa_rdm_cq_open, .endpoint = efa_rdm_ep_open, .scalable_ep = fi_no_scalable_ep, @@ -824,8 +825,8 @@ void efa_domain_progress_rdm_peers_and_queues(struct efa_domain *domain) EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer fi_addr: " "%ld implicit fi_addr: %ld. %s\n", - peer->conn->fi_addr, - peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, + peer->av_entry->implicit_fi_addr, fi_strerror(-ret)); efa_base_ep_write_eq_error(&peer->ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE); continue; diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c index bb26d745e74..031d62c8063 100644 --- a/prov/efa/src/rdm/efa_proto_av.c +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -756,7 +756,7 @@ efa_proto_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry) pke = (struct efa_rdm_pke *) rx_entry->peer_context; - return pke->peer->conn->fi_addr; + return pke->peer->av_entry->fi_addr; } /** @@ -836,7 +836,7 @@ int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, HASH_ITER(hh, implicit_entry->ep_peer_map, map_entry, tmp) { HASH_DELETE(hh, implicit_entry->ep_peer_map, map_entry); HASH_ADD_PTR(explicit_entry->ep_peer_map, ep_ptr, map_entry); - map_entry->peer.conn = (struct efa_conn *)explicit_entry; + map_entry->peer.av_entry = explicit_entry; } assert(HASH_CNT(hh, implicit_entry->ep_peer_map) == 0); diff 
--git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 1999e540520..1170861498e 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -6,6 +6,7 @@ #include "efa_data_path_ops.h" #include "ofi_util.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_cntr.h" #include "efa_rdm_pke_cmd.h" #include "efa_rdm_pke_utils.h" @@ -190,17 +191,15 @@ static void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( struct util_cq *target_cq; int ret; fi_addr_t src_addr; - struct efa_av *efa_av; uint32_t imm_data = efa_ibv_cq_wc_read_imm_data(ibv_cq); uint32_t len = efa_ibv_cq_wc_read_byte_len(ibv_cq); target_cq = ep->base_ep.util_ep.rx_cq; - efa_av = ep->base_ep.av; if (ep->base_ep.util_ep.caps & FI_SOURCE) { /* Only check the explicit AV when writing completions */ - src_addr = efa_av_reverse_lookup_rdm(efa_av, + src_addr = efa_proto_av_reverse_lookup(ep->proto_av, efa_ibv_cq_wc_read_slid(ibv_cq), efa_ibv_cq_wc_read_src_qp(ibv_cq), NULL); @@ -361,7 +360,7 @@ efa_rdm_cq_lookup_raw_addr(struct efa_rdm_pke *pke, } /* Next check implicit AV */ - addr = ofi_av_lookup_fi_addr(&ep->base_ep.av->util_av_implicit, + addr = ofi_av_lookup_fi_addr(&ep->proto_av->util_av_implicit, (void *) efa_ep_addr); if (addr != FI_ADDR_NOTAVAIL) { implicit = true; @@ -401,7 +400,6 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, struct efa_ibv_cq *efa_ibv_cq, struct efa_rdm_pke *pkt_entry) { - struct efa_av *efa_av = ep->base_ep.av; fi_addr_t explicit_fi_addr, implicit_fi_addr; struct efa_ep_addr efa_ep_addr = {0}; struct efa_ep_addr_hashable *efa_ep_addr_hashable = NULL; @@ -433,7 +431,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * behavior is fixed */ explicit_fi_addr = - efa_av_reverse_lookup_rdm(efa_av, gid, qpn, pkt_entry); + efa_proto_av_reverse_lookup(ep->proto_av, gid, qpn, pkt_entry); if (explicit_fi_addr != FI_ADDR_NOTAVAIL) { EFA_DBG(FI_LOG_CQ, @@ -445,7 +443,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct 
efa_rdm_ep *ep, } implicit_fi_addr = - efa_av_reverse_lookup_rdm_implicit(efa_av, gid, qpn, pkt_entry); + efa_proto_av_reverse_lookup_implicit(ep->proto_av, gid, qpn, pkt_entry); if (implicit_fi_addr != FI_ADDR_NOTAVAIL) { EFA_DBG(FI_LOG_CQ, @@ -473,7 +471,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * TODO: continue communication with peer by saving the previous state * and restoring it */ - HASH_FIND(hh, ep->base_ep.av->evicted_peers_hashset, &efa_ep_addr, + HASH_FIND(hh, ep->proto_av->evicted_peers_hashset, &efa_ep_addr, sizeof(struct efa_ep_addr), efa_ep_addr_hashable); if (OFI_UNLIKELY(!!efa_ep_addr_hashable)) { EFA_WARN(FI_LOG_CQ, "Received packet from peer already evicted " @@ -494,7 +492,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * not local or shm is disabled for transmission. We shouldn't insert * in to shm av in this case. */ - ret = efa_av_insert_one(ep->base_ep.av, &efa_ep_addr, &implicit_fi_addr, + ret = efa_proto_av_insert_one(ep->proto_av, &efa_ep_addr, &implicit_fi_addr, 0, NULL, false, true); if (OFI_UNLIKELY(ret != 0)) { efa_base_ep_write_eq_error(&ep->base_ep, ret, @@ -506,10 +504,10 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, out: assert(peer); - assert((peer->conn->fi_addr != FI_ADDR_NOTAVAIL && - peer->conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) || - (peer->conn->implicit_fi_addr != FI_ADDR_NOTAVAIL && - peer->conn->fi_addr == FI_ADDR_NOTAVAIL)); + assert((peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL && + peer->av_entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) || + (peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL && + peer->av_entry->fi_addr == FI_ADDR_NOTAVAIL)); return peer; } @@ -584,8 +582,8 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct EFA_WARN(FI_LOG_CQ, "Peer fi_addr: %ld implicit fi_addr %ld is requesting " "feature %d, which this EP does not support.\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr, + 
pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr, base_hdr->type); assert(0 && "invalid REQ packet type"); @@ -690,7 +688,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc_closing_ep(struct efa_ibv_cq *cq, struc efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, (size_t) pkt_entry->ope->cq_entry.op_context, pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, - pkt_entry->ope->peer ? pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL, + pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL, efa_rdm_pkt_type_of_pke(pkt_entry)); #endif @@ -756,7 +754,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc(struct efa_ibv_cq *cq, struct efa_rdm_e efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, (size_t) pkt_entry->ope->cq_entry.op_context, pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, - pkt_entry->ope->peer ? pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL, + pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL, efa_rdm_pkt_type_of_pke(pkt_entry)); #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 8684bdf7305..2e05fcad221 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -54,6 +54,7 @@ struct efa_rdm_ep_queued_copy { struct efa_rdm_ep { struct efa_base_ep base_ep; + struct efa_proto_av *proto_av; /* set during fi_ep_bind, avoids container_of on hot path */ /* self_ah necessary for local reads when application does not insert * its own address into the AV */ @@ -561,15 +562,7 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep); char ep_addr_str[OFI_ADDRSTRLEN] = {0}; \ efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &(size_t){sizeof ep_addr_str}); -static inline -fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr) -{ - struct efa_conn *conn; - - assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn(ep->base_ep.av, 
addr); - return conn ? conn->shm_fi_addr : FI_ADDR_NOTAVAIL; -} +fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr); static inline size_t efa_rdm_ep_get_available_tx_pkts(struct efa_rdm_ep *ep) { diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 7d37ceab868..75a0e922426 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -3,6 +3,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_ep.h" #include "efa_rdm_cq.h" #include "efa_rdm_srx.h" @@ -259,7 +260,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) goto err_free; ret = ofi_bufpool_create(&ep->peer_map_entry_pool, - sizeof(struct efa_conn_ep_peer_map_entry), + sizeof(struct efa_proto_av_entry_ep_peer_map_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit to max_cnt */ EFA_RDM_EP_MIN_PEER_POOL_SIZE, @@ -660,10 +661,12 @@ static int efa_rdm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) if (ret) return ret; + efa_rdm_ep->proto_av = container_of(av, struct efa_proto_av, efa_av); + /* Bind shm provider endpoint & shm av */ if (efa_rdm_ep->shm_ep) { - assert(av->shm_rdm_av); - ret = fi_ep_bind(efa_rdm_ep->shm_ep, &av->shm_rdm_av->fid, flags); + assert(efa_rdm_ep->proto_av->shm_rdm_av); + ret = fi_ep_bind(efa_rdm_ep->shm_ep, &efa_rdm_ep->proto_av->shm_rdm_av->fid, flags); if (ret) return ret; } @@ -722,9 +725,9 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) struct efa_rdm_ope *rxe; struct efa_rdm_ope *txe; struct efa_rdm_peer *peer; - struct util_av_entry *util_av_entry; - struct efa_av_entry *av_entry; - struct efa_conn_ep_peer_map_entry *peer_map_entry; + struct efa_proto_av_entry *proto_entry; + struct efa_proto_av_entry_ep_peer_map_entry *pm_entry; + /* * Destruct peers first so overflow packets are properly @@ -735,28 +738,24 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) 
struct efa_rdm_peer, peer, ep_peer_list_entry, tmp) { - if (peer->conn->fi_addr != FI_ADDR_NOTAVAIL) { - util_av_entry = ofi_bufpool_get_ibuf( - efa_rdm_ep->base_ep.av->util_av.av_entry_pool, - peer->conn->fi_addr); + if (peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL) { + proto_entry = efa_proto_av_addr_to_entry( + efa_rdm_ep->proto_av, peer->av_entry->fi_addr); } else { - assert(peer->conn->implicit_fi_addr != FI_ADDR_NOTAVAIL); + assert(peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL); - util_av_entry = ofi_bufpool_get_ibuf( - efa_rdm_ep->base_ep.av->util_av_implicit.av_entry_pool, - peer->conn->implicit_fi_addr); + proto_entry = efa_proto_av_addr_to_entry_implicit( + efa_rdm_ep->proto_av, peer->av_entry->implicit_fi_addr); } dlist_remove(&peer->ep_peer_list_entry); efa_rdm_peer_destruct(peer, efa_rdm_ep); - peer_map_entry = container_of( - peer, struct efa_conn_ep_peer_map_entry, peer); - - av_entry = (struct efa_av_entry *) util_av_entry->data; - HASH_DEL(av_entry->conn.ep_peer_map, peer_map_entry); - ofi_buf_free(peer_map_entry); + pm_entry = container_of( + peer, struct efa_proto_av_entry_ep_peer_map_entry, peer); + HASH_DEL(proto_entry->ep_peer_map, pm_entry); + ofi_buf_free(pm_entry); } #if ENABLE_DEBUG @@ -803,7 +802,6 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) efa_rdm_txe_release(txe); } - if (efa_rdm_ep->ope_pool) ofi_bufpool_destroy(efa_rdm_ep->ope_pool); @@ -1184,7 +1182,6 @@ int efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) { int ret, retv = 0; struct efa_domain *efa_domain; - struct efa_av *efa_av; struct efa_rdm_cq *efa_rdm_cq; @@ -1194,14 +1191,13 @@ int efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) retv = ret; } - efa_av = efa_rdm_ep->base_ep.av; - if (efa_av->shm_rdm_av) { - ret = fi_close(&efa_av->shm_rdm_av->fid); + if (efa_rdm_ep->proto_av->shm_rdm_av) { + ret = fi_close(&efa_rdm_ep->proto_av->shm_rdm_av->fid); if (ret) { EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm av: 
%s\n", fi_strerror(-ret)); retv = ret; } - efa_av->shm_rdm_av = NULL; + efa_rdm_ep->proto_av->shm_rdm_av = NULL; } efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq); diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 834519802bd..5e6e76da1c0 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -9,6 +9,7 @@ #include #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_msg.h" #include "efa_rdm_rma.h" #include "efa_rdm_atomic.h" @@ -37,12 +38,10 @@ struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep) */ int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_av *efa_av; - struct efa_conn *efa_conn; + struct efa_proto_av_entry *entry; - efa_av = ep->base_ep.av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? efa_conn->ah->ahn : -1; + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + return entry ? 
entry->ah->ahn : -1; } @@ -74,18 +73,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr) */ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_conn *conn; - struct efa_conn_ep_peer_map_entry *map_entry; + struct efa_proto_av_entry *entry; + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; struct efa_rdm_peer *peer; assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn(ep->base_ep.av, addr); - - if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + if (!entry) return NULL; - peer = efa_conn_ep_peer_map_lookup(conn, ep); + peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep); if (peer) return peer; @@ -100,9 +98,9 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr memset(map_entry, 0, sizeof(*map_entry)); map_entry->ep_ptr = ep; - efa_rdm_peer_construct(&map_entry->peer, ep, conn); + efa_rdm_peer_construct(&map_entry->peer, ep, entry); - efa_conn_ep_peer_map_insert(conn, map_entry); + efa_proto_av_entry_ep_peer_map_insert(entry, map_entry); dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list); @@ -119,18 +117,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr */ struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_conn *conn; + struct efa_proto_av_entry *entry; struct efa_rdm_peer *peer; - struct efa_conn_ep_peer_map_entry *map_entry; + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn_implicit(ep->base_ep.av, addr); - - if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) + entry = efa_proto_av_addr_to_entry_implicit(ep->proto_av, addr); + if (!entry) return NULL; - peer = efa_conn_ep_peer_map_lookup(conn, ep); + peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep); 
if (peer) goto out; @@ -145,17 +142,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr memset(map_entry, 0, sizeof(*map_entry)); map_entry->ep_ptr = ep; - efa_rdm_peer_construct(&map_entry->peer, ep, conn); + efa_rdm_peer_construct(&map_entry->peer, ep, entry); peer = &map_entry->peer; - efa_conn_ep_peer_map_insert(conn, map_entry); + efa_proto_av_entry_ep_peer_map_insert(entry, map_entry); dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list); out: assert(peer); /* Move to the front of the LRU list */ - efa_av_implicit_av_lru_conn_move(ep->base_ep.av, peer->conn); + efa_proto_av_implicit_av_lru_entry_move(ep->proto_av, peer->av_entry); return peer; } @@ -532,7 +529,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent "initializing backoff timeout for peer fi_addr: " "%" PRIu64 " implicit fi_addr: %" PRIu64 " timeout: %ld rnr_queued_pkts: %d\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt); } else { peer->rnr_backoff_wait_time = MIN(peer->rnr_backoff_wait_time * 2, @@ -541,7 +538,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent "increasing backoff timeout for peer fi_addr: %" PRIu64 " implicit fi_addr %" PRIu64 " to %ld rnr_queued_pkts: %d\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt); } } @@ -575,7 +572,7 @@ static ssize_t efa_rdm_ep_handshake_common(struct efa_rdm_ep *ep, struct efa_rdm (peer->flags & EFA_RDM_PEER_REQ_SENT))) return 0; - msg.addr = peer->conn->fi_addr; + msg.addr = peer->av_entry->fi_addr; txe = ofi_buf_alloc(ep->ope_pool); if (OFI_UNLIKELY(!txe)) { @@ -712,7 +709,7 @@ void efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe if (OFI_UNLIKELY(err)) { 
EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer fi_addr: %ld implicit fi_addr %ld. %s\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, fi_strerror(-err)); + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, fi_strerror(-err)); efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE); return; } @@ -1008,3 +1005,12 @@ int efa_rdm_ep_enforce_handshake_for_txe(struct efa_rdm_ep *ep, struct efa_rdm_o } return FI_SUCCESS; } + +fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr) +{ + struct efa_proto_av_entry *entry; + + assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + return entry ? entry->shm_fi_addr : FI_ADDR_NOTAVAIL; +} diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index ab5e0fb8f63..25aa40efd2b 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -209,7 +209,7 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, const struct fi_msg *msg EFA_DBG(FI_LOG_EP_DATA, "peer: %" PRIu64 ": size %lu tag: %lx op: %x flags: %lx msg_id: %" PRIu32 "\n", - peer->conn->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id); + peer->av_entry->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id); efa_rdm_tracepoint(send_begin, txe->msg_id, (size_t) txe->cq_entry.op_context, txe->total_len); @@ -794,7 +794,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, peer_srx = util_get_peer_srx(ep->peer_srx_ep); peer = (*pkt_entry_ptr)->peer; - attr.addr = peer->conn->fi_addr; + attr.addr = peer->av_entry->fi_addr; attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr); attr.tag = 0; ret = peer_srx->owner_ops->get_msg(peer_srx, &attr, &peer_rxe); @@ -832,7 +832,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, efa_rdm_tracepoint(msg_recv_unexpected_nontagged, (uint64_t) orig_pke_ptr, (*pkt_entry_ptr)->pkt_size, rxe->msg_id, (size_t) 
rxe->cq_entry.op_context, - rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr); #endif } else { /* Unexpected errors */ @@ -882,7 +882,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, peer = (*pkt_entry_ptr)->peer; peer_srx = util_get_peer_srx(ep->peer_srx_ep); - attr.addr = peer->conn->fi_addr; + attr.addr = peer->av_entry->fi_addr; attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr); attr.tag = efa_rdm_pke_get_rtm_tag(*pkt_entry_ptr); @@ -927,7 +927,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, efa_rdm_tracepoint(msg_recv_unexpected_tagged, (uint64_t) orig_pke_ptr, (*pkt_entry_ptr)->pkt_size, rxe->msg_id, (size_t) rxe->cq_entry.op_context, - rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr); #endif } else { /* Unexpected errors */ diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index feed792c600..a378446c1b2 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -876,7 +876,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) " implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %" PRIu64 " incoming message size: %" PRIu64 " receiving buffer size: %zu\n", - rxe->peer->conn->fi_addr, rxe->peer->conn->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag, + rxe->peer->av_entry->fi_addr, rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag, rxe->total_len, rxe->cq_entry.len); ret = ofi_cq_write_error_trunc(ep->base_ep.util_ep.rx_cq, @@ -909,13 +909,13 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) " implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx total_len: %" PRIu64 "\n", - rxe->peer->conn->fi_addr, - rxe->peer->conn->implicit_fi_addr, rxe->rx_id, + rxe->peer->av_entry->fi_addr, + rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id, 
rxe->msg_id, rxe->cq_entry.tag, rxe->total_len); efa_rdm_tracepoint(recv_end, rxe->msg_id, (size_t) rxe->cq_entry.op_context, - rxe->total_len, rxe->cq_entry.tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->cq_entry.tag, rxe->peer->av_entry->fi_addr); if (ep->base_ep.util_ep.caps & FI_SOURCE) @@ -926,7 +926,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) rxe->cq_entry.buf, rxe->cq_entry.data, rxe->cq_entry.tag, - rxe->peer->conn->fi_addr); + rxe->peer->av_entry->fi_addr); else ret = ofi_cq_write(rx_cq, rxe->cq_entry.op_context, @@ -1010,13 +1010,13 @@ void efa_rdm_txe_report_completion(struct efa_rdm_ope *txe) "Writing send completion for txe to peer: %" PRIu64 " tx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx len: %" PRIu64 "\n", - txe->peer->conn->fi_addr, txe->tx_id, txe->msg_id, + txe->peer->av_entry->fi_addr, txe->tx_id, txe->msg_id, txe->cq_entry.tag, txe->total_len); efa_rdm_tracepoint(send_end, txe->msg_id, (size_t) txe->cq_entry.op_context, - txe->total_len, txe->cq_entry.tag, txe->peer->conn->fi_addr); + txe->total_len, txe->cq_entry.tag, txe->peer->av_entry->fi_addr); /* TX completions should not send peer address to util_cq */ if (txe->ep->base_ep.util_ep.caps & FI_SOURCE) diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 9188f5b96ec..4809bed5f75 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -3,6 +3,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_pkt_type.h" #include "efa_rdm_pke_rtm.h" #include "efa_rdm_pke_utils.h" @@ -17,14 +18,14 @@ * @param[in] conn efa conn object * @relates efa_rdm_peer */ -void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn) +void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry) { int ret; memset(peer, 0, sizeof(struct efa_rdm_peer)); peer->ep = ep; - peer->conn = conn; - 
peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, conn->ep_addr); + peer->av_entry = av_entry; + peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, efa_proto_av_entry_ep_addr(av_entry)); peer->host_id = peer->is_self ? ep->host_id : 0; /* Peer host id is exchanged via handshake */ peer->num_runt_bytes_in_flight = 0; /* allocate the robuf circular queue from the pre-allocated buffer pool */ @@ -39,7 +40,7 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st dlist_init(&peer->rxe_list); dlist_init(&peer->overflow_pke_list); - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL) { + if (av_entry->shm_fi_addr != FI_ADDR_NOTAVAIL) { peer->is_local = 1; } diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index caf804111be..ac68d58f54d 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -9,6 +9,8 @@ #include "efa_rdm_protocol.h" #include "efa_rdm_rxe_map.h" +struct efa_proto_av_entry; + #define EFA_RDM_PEER_DEFAULT_REORDER_BUFFER_SIZE (16) #define EFA_RDM_PEER_REQ_SENT BIT_ULL(0) /**< A REQ packet has been sent to the peer (peer should send a handshake back) */ @@ -90,7 +92,7 @@ struct efa_rdm_peer { bool is_self; /**< flag indicating whether the peer is the endpoint itself */ bool is_local; /**< flag indicating wehther the peer is local (on the same instance) */ uint32_t device_version; /**< EFA device version */ - struct efa_conn *conn; /**< pointer to efa_conn struct in the av entry */ + struct efa_proto_av_entry *av_entry; /**< pointer to efa_proto_av_entry in the av entry */ uint64_t host_id; /* Optional peer host id. 
Default 0 */ /** * @brief reorder buffer @@ -239,9 +241,9 @@ bool efa_rdm_peer_need_connid(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_REQUEST_CONNID_HEADER); } -struct efa_conn; +struct efa_proto_av_entry; -void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn); +void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry); void efa_rdm_peer_destruct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep); @@ -260,6 +262,6 @@ int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_e /* Macro for getting peer address string */ #define EFA_RDM_GET_PEER_ADDR_STR(ep, peer, peer_addr_str) \ char peer_addr_str[OFI_ADDRSTRLEN] = {0}; \ - efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str}); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str}); #endif /* EFA_RDM_PEER_H */ diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index e45456e2cce..4855bcd5e63 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -10,6 +10,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_data_path_ops.h" #include "efa_tp.h" @@ -189,8 +190,8 @@ void efa_rdm_pke_release_tx(struct efa_rdm_pke *pkt_entry) EFA_DBG(FI_LOG_EP_DATA, "reset backoff timer for peer fi_addr: %" PRIu64 " implicit fi_addr: %" PRIu64 "\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr); + pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr); } efa_rdm_pke_release(pkt_entry); @@ -454,7 +455,7 @@ static inline uint64_t efa_rdm_pke_get_wr_id(struct efa_rdm_pke *pkt_entry) ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, int pkt_entry_cnt, uint64_t flags) { - struct efa_conn *conn; + struct 
efa_proto_av_entry *av_entry; struct efa_rdm_ep *ep; struct efa_rdm_pke *pkt_entry; struct efa_rdm_peer *peer; @@ -477,8 +478,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, if (peer->flags & EFA_RDM_PEER_IN_BACKOFF) return -FI_EAGAIN; - conn = pkt_entry_vec[0]->peer->conn; - assert(conn && conn->ep_addr); + av_entry = pkt_entry_vec[0]->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); for (pkt_idx = 0; pkt_idx < pkt_entry_cnt; ++pkt_idx) { pkt_entry = pkt_entry_vec[pkt_idx]; @@ -521,8 +522,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, qpn = peer->user_recv_qp.qpn; qkey = peer->user_recv_qp.qkey; } else { - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } /* This will make efa_qp_post_send not ring the doorbell until the last itertion of the loop */ @@ -533,7 +534,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, ret = efa_qp_post_send(ep->base_ep.qp, sg_list, inline_data_list, iov_cnt, use_inline, - wr_id, cq_data, flags_in_loop, conn->ah, + wr_id, cq_data, flags_in_loop, av_entry->ah, qpn, qkey); if (OFI_UNLIKELY(ret)) @@ -580,7 +581,7 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, { struct efa_rdm_ep *ep; struct efa_qp *qp; - struct efa_conn *conn; + struct efa_proto_av_entry *av_entry; struct ibv_sge sge; struct efa_rdm_ope *txe; int err = 0; @@ -599,11 +600,11 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, qpn = qp->qp_num; qkey = qp->qkey; } else { - conn = pkt_entry->peer->conn; - assert(conn && conn->ep_addr); - ah = conn->ah; - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + av_entry = pkt_entry->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); + ah = av_entry->ah; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } sge.addr = (uint64_t)local_buf; @@ -652,7 
+653,7 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_ep *ep; struct efa_qp *qp; - struct efa_conn *conn; + struct efa_proto_av_entry *av_entry; struct ibv_sge sge; struct efa_rdm_rma_context_pkt *rma_context_pkt; struct efa_rdm_ope *txe; @@ -689,11 +690,11 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) qpn = qp->qp_num; qkey = qp->qkey; } else { - conn = pkt_entry->peer->conn; - assert(conn && conn->ep_addr); - ah = conn->ah; - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + av_entry = pkt_entry->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); + ah = av_entry->ah; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } wr_id = efa_rdm_pke_get_wr_id(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index 2ed75f38a00..452d09e5c2e 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -93,8 +93,8 @@ void efa_rdm_pke_handle_handshake_recv(struct efa_rdm_pke *pkt_entry) EFA_INFO(FI_LOG_CQ, "HANDSHAKE received from peer with explicit fi_addr %" PRIu64 " implicit fi_addr %" PRIu64 "\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr); + pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr); handshake_pkt = (struct efa_rdm_handshake_hdr *)pkt_entry->wiredata; diff --git a/prov/efa/src/rdm/efa_rdm_pke_print.c b/prov/efa/src/rdm/efa_rdm_pke_print.c index 529fddfe0f3..37b80505355 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_print.c +++ b/prov/efa/src/rdm/efa_rdm_pke_print.c @@ -154,7 +154,7 @@ static void efa_rdm_pke_print_eager_tag_rtm(char *prefix, tag_rtm_hdr = (struct efa_rdm_eager_tagrtm_hdr *) pkt_entry->wiredata; if (pkt_entry->peer) - fi_addr = pkt_entry->peer->conn->fi_addr; + fi_addr = pkt_entry->peer->av_entry->fi_addr; EFA_DBG(FI_LOG_EP_DATA, "%s EFA RDM RTM packet - type: %" PRIu32 " version: 
%" PRIu8 @@ -195,7 +195,7 @@ static void efa_rdm_pke_print_longread_rtw(char *prefix, " msg_length: %" PRIu64 " send_id: %" PRIu32 " read_iov_count: %" PRIu32 "\n", prefix, base_hdr->type, base_hdr->version, base_hdr->flags, - pkt_entry->peer->conn->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count, + pkt_entry->peer->av_entry->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count, rtw_hdr->msg_length, rtw_hdr->send_id, rtw_hdr->read_iov_count); efa_rdm_pke_print_fi_rma_iov("rma_iov", rtw_hdr->rma_iov_count, diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index de0f3d4c478..894a38b745b 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -119,7 +119,7 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, len = sizeof(ep_addr_str); efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &len); len = sizeof(peer_addr_str); - efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &len); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &len); if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { strcpy(local_host_id_str, "N/A"); diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index fdbc2bc71e5..22967a28b7e 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -5,6 +5,7 @@ #include "efa_rdm_cq.h" #include "efa_rdm_pke_req.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" /** * @brief Only works on nodes with EFA devices @@ -246,7 +247,7 @@ void test_av_multiple_ep_efa_direct(struct efa_resource **state) return test_av_multiple_ep_impl(state, EFA_DIRECT_FABRIC_NAME); } -static void test_av_verify_av_hash_cnt(struct efa_av *av, +static void test_av_verify_av_hash_cnt(struct efa_av *av, struct efa_proto_av *proto_av, int explicit_cur_av_count, int explicit_prv_av_count, 
int implicit_cur_av_count, @@ -259,11 +260,11 @@ static void test_av_verify_av_hash_cnt(struct efa_av *av, assert_int_equal(HASH_CNT(hh, av->prv_reverse_av), explicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, av->util_av_implicit.hash), + assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), implicit_cur_av_count + implicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, av->cur_reverse_av_implicit), + assert_int_equal(HASH_CNT(hh, proto_av->cur_reverse_av_implicit), implicit_cur_av_count); - assert_int_equal(HASH_CNT(hh, av->prv_reverse_av_implicit), + assert_int_equal(HASH_CNT(hh, proto_av->prv_reverse_av_implicit), implicit_prv_av_count); } @@ -280,6 +281,7 @@ void test_av_reinsertion(struct efa_resource **state) size_t raw_addr_len = sizeof(struct efa_ep_addr); fi_addr_t fi_addr; struct efa_av *av; + struct efa_proto_av *proto_av; struct efa_rdm_ep *efa_rdm_ep; int err; @@ -291,41 +293,42 @@ void test_av_reinsertion(struct efa_resource **state) raw_addr.qkey = 0x1234; av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); assert_int_equal(err, 1); assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); assert_int_equal(err, 0); assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->conn->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); err = fi_av_remove(resource->av, &fi_addr, 1, 0); assert_int_equal(err, 0); - 
test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); assert_int_equal(err, 1); assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); assert_int_equal(err, 0); assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->conn->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); err = fi_av_remove(resource->av, &fi_addr, 1, 0); assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); } /** @@ -345,6 +348,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) size_t raw_addr_len = sizeof(struct efa_ep_addr); fi_addr_t fi_addr1, fi_addr2; struct efa_av *av; + struct efa_proto_av *proto_av; struct efa_rdm_ep *efa_rdm_ep; uint32_t ahn; int err; @@ -355,6 +359,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) assert_int_equal(err, 0); av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); ahn = efa_rdm_ep->self_ah->ahn; @@ -364,7 +369,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) raw_addr.qkey = 0xAAAA; err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL); assert_int_equal(err, 1); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); /* cur_reverse_av (ahn, 100) -> conn1 (fi_addr1) */ 
assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), fi_addr1); @@ -376,7 +381,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL); assert_int_equal(err, 1); assert_int_not_equal(fi_addr1, fi_addr2); - test_av_verify_av_hash_cnt(av, 1, 1, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 1, 0, 0); /* cur_reverse_av (ahn, 100) now points to conn2 (fi_addr2); peer1 is * in prv_reverse_av keyed by its own qkey. */ assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), @@ -387,7 +392,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) err = fi_av_remove(resource->av, &fi_addr1, 1, 0); assert_int_equal(err, 0); /* peer1's prv entry is gone; peer2's cur entry must still be intact. */ - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), fi_addr2); @@ -395,7 +400,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) * in efa_av_reverse_av_remove() -> SEGV / assertion failure. 
*/ err = fi_av_remove(resource->av, &fi_addr2, 1, 0); assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), FI_ADDR_NOTAVAIL); } @@ -419,10 +424,12 @@ static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resourc struct efa_rdm_peer *peer; fi_addr_t implicit_fi_addr, test_addr; struct efa_av *av; + struct efa_proto_av *proto_av; uint32_t ahn; int err; av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); @@ -436,15 +443,15 @@ static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resourc /* Manually insert into implicit AV */ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, &raw_addr, &implicit_fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, &raw_addr, &implicit_fi_addr, 0, NULL, true, true); peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); - assert_int_equal(peer->conn->implicit_fi_addr, implicit_fi_addr); - assert_int_equal(peer->conn->fi_addr, FI_ADDR_NOTAVAIL); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); + assert_int_equal(peer->av_entry->implicit_fi_addr, implicit_fi_addr); + assert_int_equal(peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); - test_addr = efa_av_reverse_lookup_rdm_implicit(av, ahn, raw_addr.qpn, NULL); + test_addr = efa_proto_av_reverse_lookup_implicit(proto_av, ahn, raw_addr.qpn, NULL); assert_int_equal(test_addr, implicit_fi_addr); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); @@ -481,11 +488,13 @@ void test_av_implicit_to_explicit(struct 
efa_resource **state) struct efa_rdm_peer *peer; fi_addr_t explicit_fi_addr, test_addr; struct efa_av *av; + struct efa_proto_av *proto_av; uint32_t ahn; int err; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */ @@ -499,22 +508,22 @@ void test_av_implicit_to_explicit(struct efa_resource **state) peer->flags |= EFA_RDM_PEER_IN_BACKOFF; /* Insert explicitly */ - raw_addr.qpn = peer->conn->ep_addr->qpn; - raw_addr.qkey = peer->conn->ep_addr->qkey; + raw_addr.qpn = efa_proto_av_entry_ep_addr(peer->av_entry)->qpn; + raw_addr.qkey = efa_proto_av_entry_ep_addr(peer->av_entry)->qkey; err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len); assert_int_equal(err, 0); assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); - assert_int_equal(peer->conn->fi_addr, explicit_fi_addr); - assert_int_equal(peer->conn->implicit_fi_addr, FI_ADDR_NOTAVAIL); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); + assert_int_equal(peer->av_entry->fi_addr, explicit_fi_addr); + assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); ahn = efa_rdm_ep->self_ah->ahn; - test_addr = efa_av_reverse_lookup_rdm(av, ahn, raw_addr.qpn, NULL); + test_addr = efa_proto_av_reverse_lookup(proto_av, ahn, raw_addr.qpn, NULL); assert_int_equal(test_addr, explicit_fi_addr); /* Verify the manually set peer 
properties above */ @@ -526,26 +535,26 @@ void test_av_implicit_to_explicit(struct efa_resource **state) err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); } static void test_av_implicit_av_verify_lru_list_first_last_elements( - struct efa_av *av, struct efa_conn *first_conn_expected, - struct efa_conn *last_conn_expected) + struct efa_proto_av *proto_av, struct efa_proto_av_entry *first_entry_expected, + struct efa_proto_av_entry *last_entry_expected) { struct dlist_entry *first_entry, *last_entry; - struct efa_conn *first_conn_actual, *last_conn_actual; + struct efa_proto_av_entry *first_entry_actual, *last_entry_actual; - first_entry = av->implicit_av_lru_list.next; - last_entry = av->implicit_av_lru_list.prev; + first_entry = proto_av->implicit_av_lru_list.next; + last_entry = proto_av->implicit_av_lru_list.prev; - first_conn_actual = container_of(first_entry, struct efa_conn, + first_entry_actual = container_of(first_entry, struct efa_proto_av_entry, implicit_av_lru_entry); - last_conn_actual = container_of(last_entry, struct efa_conn, + last_entry_actual = container_of(last_entry, struct efa_proto_av_entry, implicit_av_lru_entry); - assert_ptr_equal(first_conn_actual, first_conn_expected); - assert_ptr_equal(last_conn_actual, last_conn_expected); + assert_ptr_equal(first_entry_actual, first_entry_expected); + assert_ptr_equal(last_entry_actual, last_entry_expected); } /** @@ -560,80 +569,82 @@ void test_av_implicit_av_lru_insertion(struct efa_resource **state) struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_peer *peer0, *peer1, *peer2; struct efa_av *av; + struct efa_proto_av *proto_av; fi_addr_t implicit_fi_addr; uint32_t ahn; int err; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, 
efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* Manually insert first address into implicit AV */ peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 1, 0); /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer0->av_entry); /* Manually insert second address into implicit AV */ peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); /* Manually insert third address into implicit AV */ peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); /* Access peer0 through the CQ read path */ ahn = efa_rdm_ep->self_ah->ahn; ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer0->conn->ep_addr->qpn, NULL); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + proto_av, ahn, efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(implicit_fi_addr, 0); /* Expected LRU list: HEAD->peer1->peer2->peer0 */ - 
test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer0->av_entry); /* Access peer2 through the CQ read path */ ahn = efa_rdm_ep->self_ah->ahn; ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer2->conn->ep_addr->qpn, NULL); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + proto_av, ahn, efa_proto_av_entry_ep_addr(peer2->av_entry)->qpn, NULL); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(implicit_fi_addr, 2); /* Expected LRU list: HEAD->peer1->peer0->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer2->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer2->av_entry); /* Access peer1 through repeated AV insertion path */ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer1->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer1->av_entry), &implicit_fi_addr, 0, NULL, true, true); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(err, 0); assert_int_equal(implicit_fi_addr, 1); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); /* Expected LRU list: HEAD->peer0->peer2->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); /* Access peer2 through repeated AV insertion path */ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer2->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer2->av_entry), 
&implicit_fi_addr, 0, NULL, true, true); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(err, 0); assert_int_equal(implicit_fi_addr, 2); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); } /** @@ -647,86 +658,101 @@ void test_av_implicit_av_lru_eviction(struct efa_resource **state) struct efa_resource *resource = *state; struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3; + struct efa_ep_addr peer1_ep_addr, peer2_ep_addr; struct efa_ep_addr_hashable *efa_ep_addr_hashable; struct efa_av *av; + struct efa_proto_av *proto_av; fi_addr_t implicit_fi_addr; uint32_t ahn; int err; efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); /* Modify implicit AV size */ - av->implicit_av_size = 2; + proto_av->implicit_av_size = 2; /* Manually insert first address into implicit AV */ peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 1, 0); /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer0->av_entry); /* Manually insert second address into implicit AV */ peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); + + /* + * Snapshot peer1/peer2 ep_addr before 
they are evicted. After + * eviction the enclosing peer_map_entry is returned to the bufpool + * and peer->av_entry becomes stale memory (entry->ep_addr is zeroed + * by efa_proto_av_entry_release_util_av). + */ + memcpy(&peer1_ep_addr, efa_proto_av_entry_ep_addr(peer1->av_entry), + sizeof(struct efa_ep_addr)); /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); /* Access peer0 through the CQ read path */ ahn = efa_rdm_ep->self_ah->ahn; ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer0->conn->ep_addr->qpn, NULL); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + proto_av, ahn, efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(implicit_fi_addr, 0); /* Expected LRU list: HEAD->peer1->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer0->av_entry); /* Manually insert third address into implicit AV */ peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); + /* Snapshot peer2 ep_addr before it too gets evicted later. 
*/ + memcpy(&peer2_ep_addr, efa_proto_av_entry_ep_addr(peer2->av_entry), + sizeof(struct efa_ep_addr)); /* Expected LRU list: HEAD->peer0->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); /* Verify that peer1 is evicted and added to the evicted hashmap */ - assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 1); - HASH_FIND(hh, av->evicted_peers_hashset, peer1->conn->ep_addr, + assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 1); + HASH_FIND(hh, proto_av->evicted_peers_hashset, &peer1_ep_addr, sizeof(struct efa_ep_addr), efa_ep_addr_hashable); assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(peer1->conn->ep_addr, + assert_int_equal(efa_is_same_addr(&peer1_ep_addr, &efa_ep_addr_hashable->addr), 1); /* Access peer0 through repeated AV insertion path */ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer0->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer0->av_entry), &implicit_fi_addr, 0, NULL, true, true); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); assert_int_equal(err, 0); assert_int_equal(implicit_fi_addr, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); /* Expected LRU list: HEAD->peer2->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->conn, peer0->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer2->av_entry, peer0->av_entry); /* Manually insert fourth address into implicit AV */ peer3 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); /* Verify that peer2 is evicted and added to the evicted hashmap */ - 
assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 2); - HASH_FIND(hh, av->evicted_peers_hashset, peer2->conn->ep_addr, + assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 2); + HASH_FIND(hh, proto_av->evicted_peers_hashset, &peer2_ep_addr, sizeof(struct efa_ep_addr), efa_ep_addr_hashable); assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(peer2->conn->ep_addr, + assert_int_equal(efa_is_same_addr(&peer2_ep_addr, &efa_ep_addr_hashable->addr), 1); /* Expected LRU list: HEAD->peer0->peer3 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer3->conn); + test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer3->av_entry); } /** @@ -744,6 +770,7 @@ void test_ah_refcnt(struct efa_resource **state) struct efa_domain *efa_domain; struct efa_rdm_peer *peer; struct efa_av *av; + struct efa_proto_av *proto_av; struct efa_ah *efa_ah = NULL; int err; @@ -762,6 +789,7 @@ void test_ah_refcnt(struct efa_resource **state) efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */ assert_int_equal(g_ibv_ah_cnt, 1); @@ -772,11 +800,11 @@ void test_ah_refcnt(struct efa_resource **state) /* Manually insert into implicit AV */ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, &raw_addr, &fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, &raw_addr, &fi_addr, 0, NULL, true, true); peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr); ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - efa_ah = peer->conn->ah; + efa_ah = peer->av_entry->ah; assert_int_equal(g_ibv_ah_cnt, 2); @@ -824,6 +852,7 @@ void 
test_ah_lru_eviction_impl(bool explicit) struct efa_rdm_ep *efa_rdm_ep[2]; struct efa_rdm_peer *peer; struct efa_av *efa_av[2]; + struct efa_proto_av *proto_av; struct efa_ah *efa_ah = NULL; int err; struct fi_av_attr av_attr = {0}; @@ -891,16 +920,17 @@ void test_ah_lru_eviction_impl(bool explicit) cur = cur->next; } + proto_av = container_of(efa_av[0], struct efa_proto_av, efa_av); assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); /* Manually insert into implicit AV in first domain */ ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_av_insert_one(efa_av[0], &raw_addr[0], &fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, &raw_addr[0], &fi_addr, 0, NULL, true, true); peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - efa_ah = peer->conn->ah; + efa_ah = peer->av_entry->ah; assert_int_equal(efa_ah->implicit_refcnt, 1); assert_int_equal(efa_ah->explicit_refcnt, 0); @@ -910,14 +940,14 @@ void test_ah_lru_eviction_impl(bool explicit) peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr); } else { ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_av_insert_one(efa_av[0], &raw_addr[1], &fi_addr, 0, NULL, true, true); + err = efa_proto_av_insert_one(proto_av, &raw_addr[1], &fi_addr, 0, NULL, true, true); peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); } assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - efa_ah = peer->conn->ah; + efa_ah = peer->av_entry->ah; if (explicit) { assert_int_equal(efa_ah->implicit_refcnt, 0); assert_int_equal(efa_ah->explicit_refcnt, 1); @@ -929,6 +959,7 @@ void test_ah_lru_eviction_impl(bool explicit) if (explicit) { err = fi_av_remove(av_fid[0], &fi_addr, 1, 0); assert_int_equal(err, 0); + proto_av = container_of(efa_av[0], struct efa_proto_av, efa_av); 
assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); } diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 1972611fc01..3f4b2d3ddc8 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -1084,7 +1084,7 @@ static void test_efa_cq_read_prep(struct efa_resource *resource, will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_imm_data_return_mock, 0x1); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_qp_num_return_mock, base_ep->qp->qp_num); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_byte_len_return_mock, 4096); - will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_conn(base_ep->av, addr)->ah->ahn); + will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_entry(base_ep->av, addr)->conn.ah->ahn); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_src_qp_return_mock, raw_addr.qpn); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index e532c0813a9..584b5296241 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -480,7 +480,7 @@ void test_efa_rdm_ep_rma_queue_before_handshake(struct efa_resource **state, int peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); peer->flags = EFA_RDM_PEER_REQ_SENT; /* Do not use shm in this unit test because we are testing efa rma path */ - peer->conn->shm_fi_addr = FI_ADDR_NOTAVAIL; + peer->av_entry->shm_fi_addr = FI_ADDR_NOTAVAIL; assert_false(efa_rdm_ep->homogeneous_peers); assert_true(dlist_empty(&efa_rdm_ep->txe_list)); diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index 9a54e522bad..01239822b6b 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -84,7 +84,7 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state) struct efa_rdm_pke *pke; struct efa_ep_addr raw_addr = {0}; size_t raw_addr_len = sizeof(raw_addr); - struct efa_conn conn = {0}; + struct 
efa_proto_av_entry fake_entry = {0}; struct efa_rdm_peer peer; struct efa_unit_test_eager_rtm_pkt_attr pke_attr = {.msg_id = 0, .connid = 0x1234}; @@ -113,8 +113,8 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state) fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); raw_addr.qpn = 0; raw_addr.qkey = 0x1234; - conn.ep_addr = &raw_addr; - efa_rdm_peer_construct(&peer, efa_rdm_ep, &conn); + memcpy(fake_entry.ep_addr, &raw_addr, EFA_EP_ADDR_LEN); + efa_rdm_peer_construct(&peer, efa_rdm_ep, &fake_entry); pke->peer = &peer; efa_unit_test_eager_msgrtm_pkt_construct(pke, &pke_attr); From 027f6ca78c7222a66f5cecf8947c6ed1ff106016 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 6 May 2026 09:53:27 -0600 Subject: [PATCH 10/16] prov/efa: release util_domain.lock on malloc failure in efa_ah_alloc efa_ah_alloc acquires util_domain.lock before looking up the AH in the domain's hash map. If the subsequent malloc for a new efa_ah fails, the function sets errno to FI_ENOMEM, logs a warning, and returns NULL without releasing the lock. The caller has no way to know the lock is still held, so every subsequent operation that touches util_domain.lock (including the caller's own cleanup path and any other thread trying to allocate an AH) deadlocks or invokes undefined behavior. Release util_domain.lock on the malloc-failure path before returning NULL, matching the two other failure paths in the function that do the unlock via err_free_efa_ah. 
Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_ah.c | 1 + 1 file changed, 1 insertion(+) diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index ff25d13af3b..4363e205a14 100644 --- a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -126,6 +126,7 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, if (!efa_ah) { errno = FI_ENOMEM; EFA_WARN(FI_LOG_AV, "cannot allocate memory for efa_ah\n"); + ofi_genlock_unlock(&domain->util_domain.lock); return NULL; } From 65f70558922e22d95288811d8219862931bb1da4 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 6 May 2026 09:54:34 -0600 Subject: [PATCH 11/16] =?UTF-8?q?prov/efa:=20fix=20AH=20layering=20?= =?UTF-8?q?=E2=80=94=20move=20eviction=20to=20efa=5Fproto=5Fav.c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit efa_ah.c reached into protocol-layer types to run AH eviction on ENOMEM: it iterated an AH's implicit_conn_list as struct efa_proto_av_entry and called efa_proto_av_entry_release_ah_unsafe to drop the associated AV entries. Those types are defined in rdm/efa_proto_av.h, which efa_ah.c picks up transitively through efa.h; the base AH translation unit should not be aware of them. The circularity shows up in practice whenever the protocol layer grows a new concern — every change to efa_proto_av_entry forces a recompile of efa_ah.c and risks dragging protocol-only behavior into the efa-direct build. The fix splits AH allocation into a base path and a protocol path. efa_ah_alloc stays in efa_ah.c but returns NULL on ENOMEM without attempting eviction; its only responsibilities are now creating the ibv_ah, populating gid/ahn/refcnt, and inserting into the domain's AH hash. The pre-existing EINVAL diagnostic (pretty-printing local and remote GIDs plus the PD pointer for the common cross-AZ / invalid-GID / invalid-PD causes) is preserved in the base path. A new efa_proto_ah_alloc is added in efa_proto_av.c. 
It calls efa_ah_alloc and, on ENOMEM, invokes efa_proto_ah_evict to release the AH at the head of the LRU whose explicit_refcnt is zero, then retries. The eviction logic itself is lifted wholesale from efa_ah.c into efa_proto_av.c, where it belongs alongside the other implicit-AV bookkeeping. efa_ah_destroy_ah is promoted from a file-local function prototype to a public declaration in efa_ah.h so efa_proto_ah_evict can call it after dropping the last reference. efa_ah.c no longer references efa_proto_av_entry. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_ah.c | 118 +++++++++++--------------------- prov/efa/src/efa_ah.h | 2 + prov/efa/src/rdm/efa_proto_av.c | 102 ++++++++++++++++++++++++++- 3 files changed, 142 insertions(+), 80 deletions(-) diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index 4363e205a14..982c979fbe0 100644 --- a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -8,8 +8,6 @@ #include "efa_conn.h" #include -void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah); - /** * @brief Move the AH to the end of the LRU list to indicate that it is the * most recently used entry @@ -19,8 +17,8 @@ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah); * list to remove AH entries with only implicit AV entries, so it is OK to do * that. 
* - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list + * @param[in] domain efa domain + * @param[in] ah efa AH to move */ void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, struct efa_ah *ah) @@ -34,46 +32,16 @@ void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, &domain->ah_lru_list); } -static inline int efa_ah_implicit_av_evict_ah(struct efa_domain *domain) { - struct efa_proto_av_entry *entry_to_release; - struct efa_ah *ah_tmp, *ah_to_release = NULL; - struct dlist_entry *tmp; - - dlist_foreach_container (&domain->ah_lru_list, struct efa_ah, ah_tmp, - domain_lru_ah_list_entry) { - if (ah_tmp->explicit_refcnt == 0) { - ah_to_release = ah_tmp; - break; - } - } - - if (!ah_to_release) { - EFA_WARN(FI_LOG_AV, - "AH creation for implicit AV entry failed with ENOMEM " - "but no AH entries available to evict\n"); - return -FI_ENOMEM; - } - - assert(ah_to_release->implicit_refcnt > 0); - - dlist_foreach_container_safe(&ah_to_release->implicit_conn_list, - struct efa_proto_av_entry, entry_to_release, - ah_implicit_conn_list_entry, tmp) { - - assert(entry_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL && - entry_to_release->fi_addr == FI_ADDR_NOTAVAIL); - - efa_proto_av_entry_release_ah_unsafe(entry_to_release->av, entry_to_release, true); - } - - if (ah_to_release->implicit_refcnt == 0 && - ah_to_release->explicit_refcnt == 0) { - efa_ah_destroy_ah(domain, ah_to_release); - } - - return FI_SUCCESS; -} - +/** + * @brief Emit a detailed warning for ibv_create_ah EINVAL. + * + * The most common reasons for EINVAL are cross-AZ addressing, invalid + * remote GID, and invalid PD. Log both local and remote GIDs plus the + * PD pointer to help operators diagnose failures from logs alone. 
+ * + * @param[in] domain efa domain (for local GID and PD) + * @param[in] gid remote GID that failed + */ static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t *gid) { char remote_gid_str[INET6_ADDRSTRLEN] = {0}; @@ -96,11 +64,16 @@ static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t * /** * @brief allocate an ibv_ah object from GID. - * This function use a hash map to store GID to ibv_ah map, - * and re-use ibv_ah for same GID * - * @param[in] domain efa_domain - * @param[in] gid GID + * Uses a hash map to store GID to ibv_ah mapping and reuses ibv_ah for + * the same GID. If ibv_create_ah fails, returns NULL with errno set. + * The caller is responsible for handling ENOMEM (e.g. by evicting AH + * entries and retrying). + * + * @param[in] domain efa_domain + * @param[in] gid GID + * @param[in] insert_implicit_av whether this is for an implicit AV entry + * @return pointer to efa_ah on success, NULL on failure (errno set) */ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, bool insert_implicit_av) @@ -135,39 +108,13 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, memcpy(ibv_ah_attr.grh.dgid.raw, gid, EFA_GID_LEN); efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr); if (!efa_ah->ibv_ah) { - /* If the failure is because we have too many AH entries, try to - * evict an AH entry with no explicit AV entries and try AH - * creation again */ - if (errno == FI_ENOMEM) { - EFA_INFO( - FI_LOG_AV, - "ibv_create_ah failed with ENOMEM for implicit " - "AV insertion. Attempting to evict AH entry\n"); - - err = efa_ah_implicit_av_evict_ah(domain); - if (err) - goto err_free_efa_ah; - - efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr); - if (!efa_ah->ibv_ah) { - if (errno == EINVAL) { - efa_ah_warn_create_einval(domain, gid); - } else { - EFA_WARN(FI_LOG_AV, - "ibv_create_ah failed for implicit AV " - "insertion! 
errno: %d\n", - errno); - } - goto err_free_efa_ah; - } - } else if (errno == EINVAL) { + if (errno == EINVAL) { efa_ah_warn_create_einval(domain, gid); - goto err_free_efa_ah; } else { EFA_WARN(FI_LOG_AV, - "ibv_create_ah failed! errno: %s\n", strerror(errno)); - goto err_free_efa_ah; + "ibv_create_ah failed! errno: %d\n", errno); } + goto err_free_efa_ah; } err = efadv_query_ah(efa_ah->ibv_ah, &efa_ah_attr, sizeof(efa_ah_attr)); @@ -196,6 +143,15 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, return NULL; } +/** + * @brief destroy an efa_ah object + * + * Removes AH from hash map and LRU list, destroys ibv_ah, frees memory. + * Caller must hold util_domain.lock. + * + * @param[in] domain efa_domain + * @param[in] ah efa_ah object to destroy + */ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) { int err; @@ -216,8 +172,12 @@ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) /** * @brief release an efa_ah object after acquiring the util domain lock * - * @param[in] domain efa_domain - * @param[in] ah efa_ah object pointer + * Decrements the appropriate refcount. If both refcounts reach zero, + * destroys the AH. 
+ * + * @param[in] domain efa_domain + * @param[in] ah efa_ah object pointer + * @param[in] release_from_implicit_av whether releasing from implicit AV */ void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, bool release_from_implicit_av) diff --git a/prov/efa/src/efa_ah.h b/prov/efa/src/efa_ah.h index b04b53a0114..fe7feafe51c 100644 --- a/prov/efa/src/efa_ah.h +++ b/prov/efa/src/efa_ah.h @@ -36,4 +36,6 @@ void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, void efa_ah_release_unsafe(struct efa_domain *domain, struct efa_ah *ah, bool release_from_implicit_av); +void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah); + #endif \ No newline at end of file diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c index 031d62c8063..a78d02d8e8f 100644 --- a/prov/efa/src/rdm/efa_proto_av.c +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -608,6 +608,106 @@ void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; } +/* ---- AH alloc with eviction ---- */ + +/** + * @brief Evict the least recently used AH that has no explicit AV entries. + * + * Finds the LRU AH with only implicit references, releases all its + * implicit AV entries, and destroys the AH. Called when ibv_create_ah + * fails with ENOMEM. + * + * Caller must hold srx_lock. This function acquires util_domain.lock. 
+ * + * @param[in] domain efa domain + * @return 0 on success, -FI_ENOMEM if no AH is available to evict + */ +static int efa_proto_ah_evict(struct efa_domain *domain) +{ + struct efa_proto_av_entry *entry_to_release; + struct efa_ah *ah_tmp, *ah_to_release = NULL; + struct dlist_entry *tmp; + + assert(ofi_genlock_held(&domain->srx_lock)); + + ofi_genlock_lock(&domain->util_domain.lock); + + dlist_foreach_container(&domain->ah_lru_list, struct efa_ah, ah_tmp, + domain_lru_ah_list_entry) { + if (ah_tmp->explicit_refcnt == 0) { + ah_to_release = ah_tmp; + break; + } + } + + if (!ah_to_release) { + ofi_genlock_unlock(&domain->util_domain.lock); + EFA_WARN(FI_LOG_AV, + "AH creation for implicit AV entry failed with ENOMEM " + "but no AH entries available to evict\n"); + return -FI_ENOMEM; + } + + assert(ah_to_release->implicit_refcnt > 0); + + dlist_foreach_container_safe(&ah_to_release->implicit_conn_list, + struct efa_proto_av_entry, entry_to_release, + ah_implicit_conn_list_entry, tmp) { + + assert(entry_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL && + entry_to_release->fi_addr == FI_ADDR_NOTAVAIL); + + efa_proto_av_entry_release_ah_unsafe(entry_to_release->av, + entry_to_release, true); + } + + if (ah_to_release->implicit_refcnt == 0 && + ah_to_release->explicit_refcnt == 0) { + efa_ah_destroy_ah(domain, ah_to_release); + } + + ofi_genlock_unlock(&domain->util_domain.lock); + + return FI_SUCCESS; +} + +/** + * @brief Allocate an AH with eviction retry for protocol AV. + * + * Wraps efa_ah_alloc with ENOMEM handling: if ibv_create_ah fails due + * to too many AH entries, evicts an AH with only implicit references + * and retries. 
+ * + * @param[in] domain efa domain + * @param[in] gid GID + * @param[in] insert_implicit_av whether this is for an implicit AV entry + * @return pointer to efa_ah on success, NULL on failure + */ +static struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain, + const uint8_t *gid, + bool insert_implicit_av) +{ + struct efa_ah *ah; + int err; + + ah = efa_ah_alloc(domain, gid, insert_implicit_av); + if (ah) + return ah; + + if (errno != FI_ENOMEM) + return NULL; + + EFA_INFO(FI_LOG_AV, + "ibv_create_ah failed with ENOMEM. " + "Attempting to evict AH entry\n"); + + err = efa_proto_ah_evict(domain); + if (err) + return NULL; + + return efa_ah_alloc(domain, gid, insert_implicit_av); +} + /* ---- Entry alloc ---- */ /** @@ -684,7 +784,7 @@ struct efa_proto_av_entry *efa_proto_av_entry_alloc( entry->implicit_fi_addr = FI_ADDR_NOTAVAIL; } - entry->ah = efa_ah_alloc(av->efa_av.domain, raw_addr->raw, insert_implicit_av); + entry->ah = efa_proto_ah_alloc(av->efa_av.domain, raw_addr->raw, insert_implicit_av); if (!entry->ah) goto err_release; From 5415fed32f1720f07c9b900c34fc03853d7b15b9 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 6 May 2026 09:58:21 -0600 Subject: [PATCH 12/16] prov/efa: split efa_ah into base and protocol layers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The efa_ah struct contained protocol-specific fields (implicit_refcnt, explicit_refcnt, implicit_conn_list, domain_lru_ah_list_entry) that are only used by the RDM protocol path for implicit AV management and AH LRU eviction. The efa-direct path only needs the ibv_ah, GID, AHN, and a reference count. This commit separates them following the same pattern used for efa_av / efa_proto_av. The base efa_ah struct is stripped to: gid, ibv_ah, ahn, refcnt, and the hash handle. The split implicit/explicit refcounts are replaced with a single refcnt. 
efa_ah_alloc now takes an alloc_size parameter instead of bool insert_implicit_av, allowing the protocol layer to allocate a larger efa_proto_ah. efa_ah_release no longer takes a bool parameter — it simply decrements refcnt and destroys at zero. The base layer has no knowledge of LRU lists, implicit AV entries, or eviction. A new efa_proto_ah struct is added to efa_proto_av.h, embedding efa_ah as its first member (castable via container_of). It adds the protocol-specific fields: implicit_refcnt, explicit_refcnt, implicit_conn_list (list of implicit AV entries using this AH), and lru_list_entry (position in the domain's AH LRU list). efa_proto_ah_alloc in efa_proto_av.c allocates sizeof(efa_proto_ah) via efa_ah_alloc, initializes the protocol fields for new AHs, manages the implicit/explicit refcount split, maintains the LRU list, and handles ENOMEM with eviction retry. efa_proto_ah_release decrements the appropriate protocol refcount and removes from the LRU list when both reach zero before calling efa_ah_release to decrement the base refcount. The efa_proto_ah_lru_move function (formerly efa_ah_implicit_av_lru_ah_move in efa_ah.c) is now static in efa_proto_av.c, operating on the efa_proto_ah lru_list_entry. The efa_proto_ah_from_ah inline accessor is added to efa_proto_av.h for converting base efa_ah pointers to their protocol wrapper. Update efa_conn.c and efa_av.c to access the moved fields (implicit_refcnt, explicit_refcnt, implicit_conn_list) through efa_proto_ah_from_ah(). These files contain the old implicit-AV code paths that remain alive until the next commit strips efa_av and deletes efa_conn. Update efa_ah_alloc/release callers in these files for the new signatures. Split the efa_ah_cnt_av_impl test helper into two path-specific variants: efa_ah_cnt_av_efa_impl uses efa_proto_ah_from_ah() to check the protocol refcnts (implicit/explicit), while efa_ah_cnt_av_efa_direct_impl checks the base efa_ah refcnt directly. 
The combined helper would have dereferenced efa_proto_ah fields past the end of a base-sized efa_ah allocation on the efa-direct path; splitting the helpers removes the runtime branching and the out-of-bounds access. Test mocks are updated for the new efa_ah_alloc (size_t alloc_size) and efa_ah_release (no bool) signatures. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_ah.c | 91 ++++--------- prov/efa/src/efa_ah.h | 74 ++++++---- prov/efa/src/efa_av.c | 13 +- prov/efa/src/efa_conn.c | 12 +- prov/efa/src/rdm/efa_proto_av.c | 202 ++++++++++++++++++++++------ prov/efa/src/rdm/efa_proto_av.h | 44 ++++++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 4 +- prov/efa/test/efa_unit_test_av.c | 143 ++++++++++++++------ prov/efa/test/efa_unit_test_mocks.c | 45 ++++--- prov/efa/test/efa_unit_test_mocks.h | 17 +-- 10 files changed, 432 insertions(+), 213 deletions(-) diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index 982c979fbe0..2a4e5c61fb6 100644 --- a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -8,30 +8,6 @@ #include "efa_conn.h" #include -/** - * @brief Move the AH to the end of the LRU list to indicate that it is the - * most recently used entry - * - * This function is not called in the efa_rdm_ep_get_peer so that we don't add - * extra latency to the critical path with explicit AV insertion. We use the LRU - * list to remove AH entries with only implicit AV entries, so it is OK to do - * that. - * - * @param[in] domain efa domain - * @param[in] ah efa AH to move - */ -void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, - struct efa_ah *ah) -{ - assert(ah->implicit_refcnt > 0 || ah->explicit_refcnt > 0); - assert(dlist_entry_in_list(&domain->ah_lru_list, - &ah->domain_lru_ah_list_entry)); - - dlist_remove(&ah->domain_lru_ah_list_entry); - dlist_insert_tail(&ah->domain_lru_ah_list_entry, - &domain->ah_lru_list); -} - /** * @brief Emit a detailed warning for ibv_create_ah EINVAL. 
* @@ -63,20 +39,20 @@ static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t * } /** - * @brief allocate an ibv_ah object from GID. + * @brief allocate an ibv_ah from GID, reusing existing AH if possible * * Uses a hash map to store GID to ibv_ah mapping and reuses ibv_ah for * the same GID. If ibv_create_ah fails, returns NULL with errno set. * The caller is responsible for handling ENOMEM (e.g. by evicting AH * entries and retrying). * - * @param[in] domain efa_domain + * @param[in] domain efa domain * @param[in] gid GID - * @param[in] insert_implicit_av whether this is for an implicit AV entry + * @param[in] alloc_size size to allocate (sizeof(efa_ah) or larger for protocol wrapper) * @return pointer to efa_ah on success, NULL on failure (errno set) */ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { struct ibv_pd *ibv_pd = domain->ibv_pd; struct efa_ah *efa_ah; @@ -84,18 +60,19 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, struct efadv_ah_attr efa_ah_attr = { 0 }; int err; + assert(alloc_size >= sizeof(struct efa_ah)); + efa_ah = NULL; ofi_genlock_lock(&domain->util_domain.lock); HASH_FIND(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah); if (efa_ah) { - insert_implicit_av ? efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++; - efa_ah_implicit_av_lru_ah_move(domain, efa_ah); + efa_ah->refcnt++; ofi_genlock_unlock(&domain->util_domain.lock); return efa_ah; } - efa_ah = malloc(sizeof(struct efa_ah)); + efa_ah = calloc(1, alloc_size); if (!efa_ah) { errno = FI_ENOMEM; EFA_WARN(FI_LOG_AV, "cannot allocate memory for efa_ah\n"); @@ -114,7 +91,7 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, EFA_WARN(FI_LOG_AV, "ibv_create_ah failed! 
errno: %d\n", errno); } - goto err_free_efa_ah; + goto err_free; } err = efadv_query_ah(efa_ah->ibv_ah, &efa_ah_attr, sizeof(efa_ah_attr)); @@ -124,11 +101,7 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, goto err_destroy_ibv_ah; } - dlist_init(&efa_ah->implicit_conn_list); - dlist_insert_tail(&efa_ah->domain_lru_ah_list_entry, &domain->ah_lru_list); - efa_ah->implicit_refcnt = 0; - efa_ah->explicit_refcnt = 0; - insert_implicit_av ? efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++; + efa_ah->refcnt = 1; efa_ah->ahn = efa_ah_attr.ahn; memcpy(efa_ah->gid, gid, EFA_GID_LEN); HASH_ADD(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah); @@ -137,30 +110,27 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, err_destroy_ibv_ah: ibv_destroy_ah(efa_ah->ibv_ah); -err_free_efa_ah: +err_free: free(efa_ah); ofi_genlock_unlock(&domain->util_domain.lock); return NULL; } /** - * @brief destroy an efa_ah object + * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free) * - * Removes AH from hash map and LRU list, destroys ibv_ah, frees memory. * Caller must hold util_domain.lock. 
* - * @param[in] domain efa_domain - * @param[in] ah efa_ah object to destroy + * @param[in] domain efa domain + * @param[in] ah efa_ah to destroy */ -void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) +void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah) { int err; - assert(ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0); - assert(dlist_empty(&ah->implicit_conn_list)); + assert(ah->refcnt == 0); EFA_INFO(FI_LOG_AV, "Destroying AH for ahn %d\n", ah->ahn); - dlist_remove(&ah->domain_lru_ah_list_entry); HASH_DEL(domain->ah_map, ah); err = ibv_destroy_ah(ah->ibv_ah); @@ -170,33 +140,20 @@ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) } /** - * @brief release an efa_ah object after acquiring the util domain lock + * @brief release an efa_ah, destroying it when refcount reaches zero * - * Decrements the appropriate refcount. If both refcounts reach zero, - * destroys the AH. - * - * @param[in] domain efa_domain - * @param[in] ah efa_ah object pointer - * @param[in] release_from_implicit_av whether releasing from implicit AV + * @param[in] domain efa domain + * @param[in] ah efa_ah to release */ -void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av) +void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah) { ofi_genlock_lock(&domain->util_domain.lock); -#if ENABLE_DEBUG - struct efa_ah *tmp; - HASH_FIND(hh, domain->ah_map, ah->gid, EFA_GID_LEN, tmp); - assert(tmp == ah); -#endif - assert((release_from_implicit_av && ah->implicit_refcnt > 0) || - (!release_from_implicit_av && ah->explicit_refcnt > 0)); + assert(ah->refcnt > 0); + ah->refcnt--; - release_from_implicit_av ? 
ah->implicit_refcnt-- : - ah->explicit_refcnt--; + if (ah->refcnt == 0) + efa_ah_destroy(domain, ah); - if (ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0) { - efa_ah_destroy_ah(domain, ah); - } ofi_genlock_unlock(&domain->util_domain.lock); } diff --git a/prov/efa/src/efa_ah.h b/prov/efa/src/efa_ah.h index fe7feafe51c..25a81ffac1a 100644 --- a/prov/efa/src/efa_ah.h +++ b/prov/efa/src/efa_ah.h @@ -9,33 +9,55 @@ #define EFA_GID_LEN 16 +/** + * @brief Base address handle — shared by efa-direct and protocol paths + * + * Contains only the ibv_ah, GID, AHN, refcount, and hash handle. + * Protocol-specific fields (implicit_refcnt, implicit_conn_list, + * LRU list entry) are in efa_proto_ah. + * + * pahole: size: 88, cachelines: 2 (2-byte hole after ahn) + * + * TX hot path: ibv_ah (off=16) is passed to ibv post_send/read/write + * on every send. Both ibv_ah and ahn are in cacheline 0. + * All other fields are control path only (AH alloc/release/hash lookup). + */ struct efa_ah { - uint8_t gid[EFA_GID_LEN]; /* efa device GID */ - struct ibv_ah *ibv_ah; /* created by ibv_create_ah() using GID */ - uint16_t ahn; /* adress handle number */ - /* Number of explicit AV entries associated with this AH */ - int explicit_refcnt; - /* Number of implicit AV entries associated with this AH */ - int implicit_refcnt; - /* dlist of all implicit AV entries associated with this AH entry */ - struct dlist_entry implicit_conn_list; - /* dlist entry in domain's LRU AH list */ - struct dlist_entry domain_lru_ah_list_entry; - UT_hash_handle hh; /* hash map handle, link all efa_ah with efa_ep->ah_map */ + uint8_t gid[EFA_GID_LEN]; /* 0 16 */ + struct ibv_ah *ibv_ah; /* 16 8 */ + uint16_t ahn; /* 24 2 */ + /* 2-byte hole */ + int refcnt; /* 28 4 */ + UT_hash_handle hh; /* 32 56 */ }; -void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, - struct efa_ah *ah); - +/** + * @brief allocate an ibv_ah from GID, reusing existing AH if possible + * + * @param[in] domain efa domain + * 
@param[in] gid GID + * @param[in] alloc_size size to allocate (sizeof(efa_ah) or sizeof(efa_proto_ah)) + * @return pointer to efa_ah on success, NULL on failure (errno set) + */ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); - -void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); - -void efa_ah_release_unsafe(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); - -void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah); - -#endif \ No newline at end of file + size_t alloc_size); + +/** + * @brief release an efa_ah, destroying it when refcount reaches zero + * + * @param[in] domain efa domain + * @param[in] ah efa_ah to release + */ +void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah); + +/** + * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free) + * + * Caller must hold util_domain.lock. + * + * @param[in] domain efa domain + * @param[in] ah efa_ah to destroy + */ +void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah); + +#endif diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 4cc34c81db0..6b99c92ba03 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -11,8 +11,15 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "rdm/efa_rdm_pke_utils.h" +/* Stub: this function moved to efa_proto_av.c as efa_proto_ah_lru_move. + * The calls below are in dead code paths (old implicit AV) that will be + * removed in the strip commit. 
*/ +static inline void efa_ah_implicit_av_lru_ah_move( + struct efa_domain *domain, struct efa_ah *ah) { } + static inline struct efa_conn *efa_av_addr_to_conn_impl(struct util_av *util_av, fi_addr_t fi_addr) { @@ -370,11 +377,11 @@ static int efa_conn_implicit_to_explicit(struct efa_av *av, av->used_explicit++; /* Handle AH LRU list and refcnt */ - assert(!dlist_empty(&ah->implicit_conn_list)); + assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list)); dlist_remove(&implicit_conn->ah_implicit_conn_list_entry); efa_ah_implicit_av_lru_ah_move(av->domain, ah); - ah->implicit_refcnt--; - ah->explicit_refcnt++; + efa_proto_ah_from_ah(ah)->implicit_refcnt--; + efa_proto_ah_from_ah(ah)->explicit_refcnt++; EFA_INFO(FI_LOG_AV, "Peer with implicit fi_addr %" PRIu64 diff --git a/prov/efa/src/efa_conn.c b/prov/efa/src/efa_conn.c index 6b191ac22f3..35b528e498a 100644 --- a/prov/efa/src/efa_conn.c +++ b/prov/efa/src/efa_conn.c @@ -275,13 +275,13 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, conn->implicit_fi_addr = FI_ADDR_NOTAVAIL; } - conn->ah = efa_ah_alloc(av->domain, raw_addr->raw, insert_implicit_av); + conn->ah = efa_ah_alloc(av->domain, raw_addr->raw, sizeof(struct efa_ah)); if (!conn->ah) goto err_release; if (insert_implicit_av) dlist_insert_tail(&conn->ah_implicit_conn_list_entry, - &conn->ah->implicit_conn_list); + &efa_proto_ah_from_ah(conn->ah)->implicit_conn_list); conn->shm_fi_addr = FI_ADDR_NOTAVAIL; /* @@ -321,7 +321,7 @@ struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, err_release: if (conn->ah) - efa_ah_release(av->domain, conn->ah, insert_implicit_av); + efa_ah_release(av->domain, conn->ah); conn->ep_addr = NULL; err = ofi_av_remove_addr(util_av, fi_addr); @@ -411,7 +411,7 @@ void efa_conn_release(struct efa_av *av, struct efa_conn *conn, if (release_from_implicit_av) dlist_remove(&conn->ah_implicit_conn_list_entry); - efa_ah_release(av->domain, conn->ah, 
release_from_implicit_av); + efa_ah_release(av->domain, conn->ah); efa_conn_release_util_av(av, conn, release_from_implicit_av); @@ -450,8 +450,8 @@ void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn, efa_conn_release_util_av(av, conn, release_from_implicit_av); - release_from_implicit_av ? conn->ah->implicit_refcnt-- : - conn->ah->explicit_refcnt--; + release_from_implicit_av ? efa_proto_ah_from_ah(conn->ah)->implicit_refcnt-- : + efa_proto_ah_from_ah(conn->ah)->explicit_refcnt--; release_from_implicit_av ? av->used_implicit-- : av->used_explicit--; } diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c index a78d02d8e8f..b76cd28c8af 100644 --- a/prov/efa/src/rdm/efa_proto_av.c +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -57,6 +57,9 @@ static bool efa_is_local_peer(struct efa_av *av, const void *addr) return 0; } +/* Forward declaration for static helper defined after entry release */ +static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah); + /* ---- Address lookup ---- */ /** @@ -357,7 +360,7 @@ void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av, dlist_insert_tail(&entry->implicit_av_lru_entry, &av->implicit_av_lru_list); - efa_ah_implicit_av_lru_ah_move(av->efa_av.domain, entry->ah); + efa_proto_ah_lru_move(av->efa_av.domain, entry->ah); } /* ---- Reverse lookup (protocol, connid-aware) ---- */ @@ -568,7 +571,7 @@ void efa_proto_av_entry_release(struct efa_proto_av *av, dlist_remove(&entry->implicit_av_lru_entry); } - efa_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av); + efa_proto_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av); efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); release_from_implicit_av ? 
av->used_implicit-- : av->efa_av.used_explicit--; @@ -600,15 +603,34 @@ void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, } /* Decrement refcnts before release_util_av which NULLs entry->ah */ - release_from_implicit_av ? entry->ah->implicit_refcnt-- : - entry->ah->explicit_refcnt--; + release_from_implicit_av ? efa_proto_ah_from_ah(entry->ah)->implicit_refcnt-- : + efa_proto_ah_from_ah(entry->ah)->explicit_refcnt--; + entry->ah->refcnt--; efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; } -/* ---- AH alloc with eviction ---- */ +/* ---- Protocol AH helpers ---- */ + +/** + * @brief Move the AH to the end of the LRU list (most recently used) + * + * @param[in] domain efa domain + * @param[in] ah base AH (must be embedded in efa_proto_ah) + */ +static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah) +{ + struct efa_proto_ah *proto_ah = efa_proto_ah_from_ah(ah); + + assert(efa_proto_ah_from_ah(ah)->implicit_refcnt > 0 || efa_proto_ah_from_ah(ah)->explicit_refcnt > 0); + assert(dlist_entry_in_list(&domain->ah_lru_list, + &proto_ah->lru_list_entry)); + + dlist_remove(&proto_ah->lru_list_entry); + dlist_insert_tail(&proto_ah->lru_list_entry, &domain->ah_lru_list); +} /** * @brief Evict the least recently used AH that has no explicit AV entries. 
@@ -625,22 +647,22 @@ void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, static int efa_proto_ah_evict(struct efa_domain *domain) { struct efa_proto_av_entry *entry_to_release; - struct efa_ah *ah_tmp, *ah_to_release = NULL; + struct efa_proto_ah *proto_ah_tmp, *proto_ah_to_release = NULL; struct dlist_entry *tmp; assert(ofi_genlock_held(&domain->srx_lock)); ofi_genlock_lock(&domain->util_domain.lock); - dlist_foreach_container(&domain->ah_lru_list, struct efa_ah, ah_tmp, - domain_lru_ah_list_entry) { - if (ah_tmp->explicit_refcnt == 0) { - ah_to_release = ah_tmp; + dlist_foreach_container(&domain->ah_lru_list, struct efa_proto_ah, + proto_ah_tmp, lru_list_entry) { + if (proto_ah_tmp->explicit_refcnt == 0) { + proto_ah_to_release = proto_ah_tmp; break; } } - if (!ah_to_release) { + if (!proto_ah_to_release) { ofi_genlock_unlock(&domain->util_domain.lock); EFA_WARN(FI_LOG_AV, "AH creation for implicit AV entry failed with ENOMEM " @@ -648,9 +670,9 @@ static int efa_proto_ah_evict(struct efa_domain *domain) return -FI_ENOMEM; } - assert(ah_to_release->implicit_refcnt > 0); + assert(proto_ah_to_release->implicit_refcnt > 0); - dlist_foreach_container_safe(&ah_to_release->implicit_conn_list, + dlist_foreach_container_safe(&proto_ah_to_release->implicit_conn_list, struct efa_proto_av_entry, entry_to_release, ah_implicit_conn_list_entry, tmp) { @@ -661,9 +683,12 @@ static int efa_proto_ah_evict(struct efa_domain *domain) entry_to_release, true); } - if (ah_to_release->implicit_refcnt == 0 && - ah_to_release->explicit_refcnt == 0) { - efa_ah_destroy_ah(domain, ah_to_release); + if (proto_ah_to_release->implicit_refcnt == 0 && + proto_ah_to_release->explicit_refcnt == 0) { + dlist_remove(&proto_ah_to_release->lru_list_entry); + assert(dlist_empty(&proto_ah_to_release->implicit_conn_list)); + assert(proto_ah_to_release->ah.refcnt == 0); + efa_ah_destroy(domain, &proto_ah_to_release->ah); } ofi_genlock_unlock(&domain->util_domain.lock); @@ -672,40 +697,137 @@ 
static int efa_proto_ah_evict(struct efa_domain *domain) } /** - * @brief Allocate an AH with eviction retry for protocol AV. + * @brief Allocate a protocol AH with eviction retry. * - * Wraps efa_ah_alloc with ENOMEM handling: if ibv_create_ah fails due - * to too many AH entries, evicts an AH with only implicit references - * and retries. + * Calls efa_ah_alloc with sizeof(efa_proto_ah) to allocate the + * protocol wrapper. Initializes implicit_refcnt, explicit_refcnt, + * implicit_conn_list, and inserts into the domain LRU list. + * On ENOMEM, evicts an AH with only implicit references and retries. + * + * Protocol refcnts and the LRU list are shared across all AVs sharing + * the same PD (domain), but per-AV call sites only hold their own + * util_av lock. This function takes util_domain.lock around the proto + * field mutations to serialize against concurrent efa_proto_ah_alloc + * / efa_proto_ah_release on a different AV. * * @param[in] domain efa domain * @param[in] gid GID * @param[in] insert_implicit_av whether this is for an implicit AV entry - * @return pointer to efa_ah on success, NULL on failure + * @return pointer to base efa_ah on success, NULL on failure */ -static struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain, +struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain, const uint8_t *gid, bool insert_implicit_av) { struct efa_ah *ah; + struct efa_proto_ah *proto_ah; int err; + bool first_proto_user; - ah = efa_ah_alloc(domain, gid, insert_implicit_av); - if (ah) - return ah; + ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah)); + if (!ah) { + if (errno != FI_ENOMEM) + return NULL; - if (errno != FI_ENOMEM) - return NULL; + EFA_INFO(FI_LOG_AV, + "ibv_create_ah failed with ENOMEM. " + "Attempting to evict AH entry\n"); - EFA_INFO(FI_LOG_AV, - "ibv_create_ah failed with ENOMEM. 
" - "Attempting to evict AH entry\n"); + err = efa_proto_ah_evict(domain); + if (err) + return NULL; - err = efa_proto_ah_evict(domain); - if (err) - return NULL; + ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah)); + if (!ah) + return NULL; + } + + /* + * efa_ah_alloc released util_domain.lock on return. Reacquire it + * before touching the protocol-specific fields (refcnts, LRU list, + * implicit_conn_list) so concurrent allocators on a different AV's + * lock don't race on a shared AH. + * + * Between efa_ah_alloc returning and reacquiring the lock, a + * concurrent efa_proto_ah_release could have dropped both proto + * refcnts to zero and removed the AH from the LRU list, even though + * the base ah->refcnt stayed > 0. Detect "first proto user" by + * checking the proto refcnts directly rather than ah->refcnt. + */ + ofi_genlock_lock(&domain->util_domain.lock); + + proto_ah = efa_proto_ah_from_ah(ah); + + /* + * first_proto_user is true when both proto refcnts are zero — either + * this is a brand-new AH (refcnt just incremented from 0 to 1 inside + * efa_ah_alloc) or an AH where the last proto user released (and + * removed it from the LRU list) but the base layer kept it alive. + * Either way we need to (re)init the proto fields and (re)insert + * into the LRU list. + */ + first_proto_user = (proto_ah->implicit_refcnt == 0 && + proto_ah->explicit_refcnt == 0); + if (first_proto_user) { + dlist_init(&proto_ah->implicit_conn_list); + dlist_insert_tail(&proto_ah->lru_list_entry, + &domain->ah_lru_list); + } + + insert_implicit_av ? proto_ah->implicit_refcnt++ : + proto_ah->explicit_refcnt++; + + if (!first_proto_user) + efa_proto_ah_lru_move(domain, ah); + + ofi_genlock_unlock(&domain->util_domain.lock); + + return ah; +} + +/** + * @brief Release a protocol AH reference. + * + * Decrements the appropriate protocol refcount. 
When both protocol + * refcounts reach zero, removes the AH from the LRU list. Always calls + * efa_ah_release to drop the base refcount (destroying the AH at zero). + * + * Protocol refcnts and the LRU list are shared across all AVs sharing + * the same PD (domain), but per-AV call sites only hold their own + * util_av lock. This function takes util_domain.lock around the proto + * field mutations to serialize against concurrent efa_proto_ah_alloc + * / efa_proto_ah_release on a different AV. + * + * @param[in] domain efa domain + * @param[in] ah base AH + * @param[in] release_from_implicit_av whether releasing implicit ref + */ +void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah, + bool release_from_implicit_av) +{ + struct efa_proto_ah *proto_ah = efa_proto_ah_from_ah(ah); + + /* + * Protocol refcnts and LRU list are shared across AVs sharing the + * same PD (domain), so mutations must be serialized by + * util_domain.lock — the same lock efa_ah_release acquires. + */ + ofi_genlock_lock(&domain->util_domain.lock); + + assert((release_from_implicit_av && proto_ah->implicit_refcnt > 0) || + (!release_from_implicit_av && proto_ah->explicit_refcnt > 0)); + + release_from_implicit_av ?
proto_ah->implicit_refcnt-- : + proto_ah->explicit_refcnt--; + + if (proto_ah->implicit_refcnt == 0 && proto_ah->explicit_refcnt == 0) { + dlist_remove(&proto_ah->lru_list_entry); + assert(dlist_empty(&proto_ah->implicit_conn_list)); + } + + ofi_genlock_unlock(&domain->util_domain.lock); - return efa_ah_alloc(domain, gid, insert_implicit_av); + efa_ah_release(domain, ah); } /* ---- Entry alloc ---- */ @@ -790,7 +912,7 @@ struct efa_proto_av_entry *efa_proto_av_entry_alloc( if (insert_implicit_av) dlist_insert_tail(&entry->ah_implicit_conn_list_entry, - &entry->ah->implicit_conn_list); + &efa_proto_ah_from_ah(entry->ah)->implicit_conn_list); entry->shm_fi_addr = FI_ADDR_NOTAVAIL; @@ -826,7 +948,7 @@ struct efa_proto_av_entry *efa_proto_av_entry_alloc( dlist_remove(&entry->implicit_av_lru_entry); if (entry->ah) - efa_ah_release(av->efa_av.domain, entry->ah, insert_implicit_av); + efa_proto_ah_release(av->efa_av.domain, entry->ah, insert_implicit_av); entry->ah = NULL; memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN); @@ -967,11 +1089,11 @@ int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, av->efa_av.used_explicit++; /* Handle AH LRU list and refcnt */ - assert(!dlist_empty(&ah->implicit_conn_list)); + assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list)); dlist_remove(&implicit_entry->ah_implicit_conn_list_entry); - efa_ah_implicit_av_lru_ah_move(av->efa_av.domain, ah); - ah->implicit_refcnt--; - ah->explicit_refcnt++; + efa_proto_ah_lru_move(av->efa_av.domain, ah); + efa_proto_ah_from_ah(ah)->implicit_refcnt--; + efa_proto_ah_from_ah(ah)->explicit_refcnt++; EFA_INFO(FI_LOG_AV, "Peer with implicit fi_addr %" PRIu64 diff --git a/prov/efa/src/rdm/efa_proto_av.h b/prov/efa/src/rdm/efa_proto_av.h index 85e1ff0df29..6cf8f330383 100644 --- a/prov/efa/src/rdm/efa_proto_av.h +++ b/prov/efa/src/rdm/efa_proto_av.h @@ -9,6 +9,30 @@ struct efa_rdm_ep; struct efa_rdm_peer; +/** + * @brief Protocol AH — wraps base efa_ah with implicit refcount and LRU + 
* + * The base efa_ah has a single refcount and no LRU knowledge. + * efa_proto_ah adds the implicit/explicit refcount split, the + * implicit_conn_list (entries using this AH), and the LRU list + * entry for AH eviction. + * + * pahole: size: 128, cachelines: 2 + * + * All efa_proto_ah fields are control path only (AV insert/remove/eviction). + * The TX hot fields (ibv_ah, ahn) are in the embedded efa_ah at cacheline 0. + * The protocol extension fields start at offset 88 (cacheline 1), so + * accessing them on the eviction path does not pollute the TX cache line. + */ +struct efa_proto_ah { + struct efa_ah ah; /* 0 88 must be first (castable) */ + /* --- cacheline 1 boundary (64 bytes) was 24 bytes ago --- */ + int implicit_refcnt; /* 88 4 */ + int explicit_refcnt; /* 92 4 */ + struct dlist_entry implicit_conn_list; /* 96 16 */ + struct dlist_entry lru_list_entry; /* 112 16 */ +}; + /** * @brief Protocol AV entry — flat layout with same field prefix as efa_av_entry * @@ -109,6 +133,17 @@ struct efa_proto_av { struct efa_ep_addr_hashable *evicted_peers_hashset; /* 664 8 */ }; +/** + * @brief get the protocol AH wrapper from a base AH pointer + * + * @param[in] ah base AH (must be embedded in efa_proto_ah) + * @return pointer to the containing efa_proto_ah + */ +static inline struct efa_proto_ah *efa_proto_ah_from_ah(struct efa_ah *ah) +{ + return container_of(ah, struct efa_proto_ah, ah); +} + /** * @brief typed accessor for the ep_addr field of a proto AV entry * @@ -139,10 +174,19 @@ struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup( void efa_proto_av_entry_ep_peer_map_remove( struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep); +/* Protocol AH allocation / release (shared base AH + proto wrapper) */ +struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain, + const uint8_t *gid, + bool insert_implicit_av); + +void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah, + bool release_from_implicit_av); + /* SHM AV operations */ int 
efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av, struct efa_proto_av_entry *entry); +/* Entry deinit (tears down peers on the entry) */ void efa_proto_av_entry_deinit(struct efa_proto_av *av, struct efa_proto_av_entry *entry); diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 75a0e922426..64e8b4c83d4 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1080,7 +1080,7 @@ static int efa_rdm_ep_close(struct fid *fid) efa_rdm_ep_remove_cntr_ibv_cq_poll_list(&efa_rdm_ep->base_ep); if (efa_rdm_ep->self_ah) - efa_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false); + efa_proto_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false); efa_rdm_ep_deregister_ibv_cqs(efa_rdm_ep); @@ -1350,7 +1350,7 @@ static inline int efa_rdm_ep_create_self_ah(struct efa_rdm_ep *rdm_ep) { - rdm_ep->self_ah = efa_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false); + rdm_ep->self_ah = efa_proto_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false); return rdm_ep->self_ah ? 0 : -FI_EINVAL; } diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 22967a28b7e..376023913af 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -79,7 +79,7 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) assert_int_not_equal(addr1, addr2); } -static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, bool multi_av) +static void efa_ah_cnt_av_efa_impl(struct efa_resource **state, bool multi_av) { struct efa_resource *resource = *state; struct efa_ep_addr raw_addr = {0}; @@ -91,23 +91,19 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo struct fi_av_attr av_attr = {0}; struct fid_av *av1 = NULL, *av2 = NULL; - efa_unit_test_resource_construct(resource, FI_EP_RDM, efa_fabric ? 
EFA_FABRIC_NAME : EFA_DIRECT_FABRIC_NAME); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(err, 0); - /* So far we should only have 1 ah from ep self ah, and its refcnt is 1 for efa fabric */ - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 1 : 0); + /* So far we should only have 1 ah from ep self ah, and its refcnt is 1 */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); - if (efa_fabric) { - assert_non_null(efa_ah); - assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 1 : 0); - assert_int_equal(efa_ah->implicit_refcnt, 0); - } else { - assert_null(efa_ah); - } + assert_non_null(efa_ah); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); if (multi_av) { /* We open 2 avs with the same domain (PD) so they should share same AH given the same GID */ @@ -133,15 +129,10 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo assert_int_not_equal(addr1, addr2); } - if (!efa_fabric) { - HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); - assert_non_null(efa_ah); - } - - /* So far we should still have 1 ah, and its refcnt is 3 for efa fabric (including self AH) and 2 for efa-direct fabric) */ + /* So far we should still have 1 ah, and its refcnt is 3 (including self AH) */ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 
3 : 2); - assert_int_equal(efa_ah->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 3); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); if (multi_av) { /* ah refcnt should be decremented to 1 after av close */ @@ -153,15 +144,87 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0); } - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 1 : 0); - if (efa_fabric) { - /* efa_ah is still alive because self-AH holds a reference */ - assert_int_equal(efa_ah->explicit_refcnt, 1); - assert_int_equal(efa_ah->implicit_refcnt, 0); + /* efa_ah is still alive because self-AH holds a reference */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + + /* ah map should be empty now after closing ep which destroys the self ah */ + assert_int_equal(fi_close(&resource->ep->fid), 0); + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + /* Reset to NULL to avoid test reaper closing again */ + resource->ep = NULL; +} + +static void efa_ah_cnt_av_efa_direct_impl(struct efa_resource **state, bool multi_av) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t addr1, addr2; + int err, num_addr; + struct efa_domain *efa_domain; + struct efa_ah *efa_ah = NULL; + struct fi_av_attr av_attr = {0}; + struct fid_av *av1 = NULL, *av2 = NULL; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + /* efa-direct does not create a self AH, so ah_map should be empty */ + 
assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); + assert_null(efa_ah); + + if (multi_av) { + /* We open 2 avs with the same domain (PD) so they should share same AH given the same GID */ + assert_int_equal(fi_av_open(resource->domain, &av_attr, &av1, NULL), 0); + assert_int_equal(fi_av_open(resource->domain, &av_attr, &av2, NULL), 0); + } + + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + + num_addr = fi_av_insert(multi_av ? av1 : resource->av, &raw_addr, 1, &addr1, 0, NULL); + assert_int_equal(num_addr, 1); + + raw_addr.qpn = 2; + raw_addr.qkey = 0x5678; + num_addr = fi_av_insert(multi_av ? av2 : resource->av, &raw_addr, 1, &addr2, 0, NULL); + assert_int_equal(num_addr, 1); + + if (multi_av) { + /* They should be equal as 0 since they are in different avs */ + assert_int_equal(addr1, addr2); + } else { + assert_int_not_equal(addr1, addr2); } - /* else: efa_ah has been freed, do not dereference */ - /* ah map should be empty now after closing ep which destroys the self ah for efa fabric */ + HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); + assert_non_null(efa_ah); + + /* So far we should still have 1 ah, and its refcnt is 2 */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_ah->refcnt, 2); + + if (multi_av) { + /* ah refcnt should be decremented to 0 after av close */ + assert_int_equal(fi_close(&av1->fid), 0); + assert_int_equal(fi_close(&av2->fid), 0); + } else { + /* ah refcnt should be decremented to 0 after av entry removals */ + assert_int_equal(fi_av_remove(resource->av, &addr1, 1, 0), 0); + assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0); + } + + /* efa_ah has been freed (no self AH holding a reference on efa-direct) */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + + /* ah map should still be empty after closing ep */ assert_int_equal(fi_close(&resource->ep->fid), 0); assert_int_equal(HASH_CNT(hh, 
efa_domain->ah_map), 0); /* Reset to NULL to avoid test reaper closing again */ @@ -170,22 +233,22 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo void test_efa_ah_cnt_one_av_efa(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, true, false); + efa_ah_cnt_av_efa_impl(state, false); } void test_efa_ah_cnt_one_av_efa_direct(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, false, false); + efa_ah_cnt_av_efa_direct_impl(state, false); } void test_efa_ah_cnt_multi_av_efa(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, true, true); + efa_ah_cnt_av_efa_impl(state, true); } void test_efa_ah_cnt_multi_av_efa_direct(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, false, true); + efa_ah_cnt_av_efa_direct_impl(state, true); } /** @@ -809,8 +872,8 @@ void test_ah_refcnt(struct efa_resource **state) assert_int_equal(g_ibv_ah_cnt, 2); assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, 0); - assert_int_equal(efa_ah->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); /* Move implicit AV entry to explicit AV entry */ err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); @@ -819,8 +882,8 @@ void test_ah_refcnt(struct efa_resource **state) assert_int_equal(g_ibv_ah_cnt, 2); assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, 1); - assert_int_equal(efa_ah->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); err = fi_av_remove(resource->av, &fi_addr, 1, 0); assert_int_equal(err, 0); @@ -931,8 +994,8 @@ void test_ah_lru_eviction_impl(bool explicit) assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); efa_ah = peer->av_entry->ah; - assert_int_equal(efa_ah->implicit_refcnt, 1); - 
assert_int_equal(efa_ah->explicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); if (explicit) { err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL); @@ -949,11 +1012,11 @@ void test_ah_lru_eviction_impl(bool explicit) efa_ah = peer->av_entry->ah; if (explicit) { - assert_int_equal(efa_ah->implicit_refcnt, 0); - assert_int_equal(efa_ah->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); } else { - assert_int_equal(efa_ah->implicit_refcnt, 1); - assert_int_equal(efa_ah->explicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); } if (explicit) { diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 9f4875d4246..144e7f2d7f8 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -21,7 +21,7 @@ int g_ibv_ah_limit = 1024; int g_ibv_ah_cnt = 0; int g_self_ah_cnt = 1; struct ibv_ah g_dummy_ah; -struct efa_ah g_dummy_efa_ah = {0}; +struct efa_proto_ah g_dummy_proto_ah = {0}; void efa_ibv_ah_limit_cnt_reset() { @@ -74,40 +74,49 @@ int efa_mock_ibv_destroy_ah_dont_create_self_ah(struct ibv_ah *ibv_ah) } struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { return NULL; } struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { /* Intercept the self AH call in efa_ah_alloc and do not call * ibv_create_ah or modify the AH map etc */ if (g_ibv_ah_cnt < g_self_ah_cnt) { g_ibv_ah_cnt++; - g_dummy_efa_ah.ibv_ah = &g_dummy_ah; - g_dummy_efa_ah.ahn = -1; - memset(g_dummy_efa_ah.gid, 0, 
sizeof(g_dummy_efa_ah.gid)); - g_dummy_efa_ah.explicit_refcnt = 1; - g_dummy_efa_ah.implicit_refcnt = 0; - return &g_dummy_efa_ah; + g_dummy_proto_ah.ah.ibv_ah = &g_dummy_ah; + g_dummy_proto_ah.ah.ahn = -1; + memset(g_dummy_proto_ah.ah.gid, 0, sizeof(g_dummy_proto_ah.ah.gid)); + g_dummy_proto_ah.ah.refcnt = 1; + /* + * Reset protocol fields so efa_proto_ah_alloc sees a fresh AH + * regardless of prior test state. Without this reset, stale + * proto refcnts or a stale lru_list_entry from a freed domain + * would carry forward into the current test. + */ + g_dummy_proto_ah.implicit_refcnt = 0; + g_dummy_proto_ah.explicit_refcnt = 0; + memset(&g_dummy_proto_ah.lru_list_entry, 0, + sizeof(g_dummy_proto_ah.lru_list_entry)); + dlist_init(&g_dummy_proto_ah.implicit_conn_list); + return &g_dummy_proto_ah.ah; } else { - return __real_efa_ah_alloc(domain, gid, insert_implicit_av); + return __real_efa_ah_alloc(domain, gid, alloc_size); } } void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain, - struct efa_ah *ah, - bool release_from_implicit_av) + struct efa_ah *ah) { /* Intercept the self AH destruct call in efa_ah_release and do not call * ibv_destroy_ah or modify the AH map etc */ if (g_ibv_ah_cnt <= g_self_ah_cnt) g_ibv_ah_cnt--; else - return __real_efa_ah_release(domain, ah, release_from_implicit_av); + return __real_efa_ah_release(domain, ah); } int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibv_ctx, @@ -516,16 +525,14 @@ int __wrap_efadv_query_device(struct ibv_context *ibv_ctx, struct efadv_device_a } struct efa_ah *__wrap_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { - return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, insert_implicit_av); + return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, alloc_size); } -void __wrap_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av) +void __wrap_efa_ah_release(struct efa_domain 
*domain, struct efa_ah *ah) { - return g_efa_unit_test_mocks.efa_ah_release(domain, ah, - release_from_implicit_av); + return g_efa_unit_test_mocks.efa_ah_release(domain, ah); } struct ibv_cq_ex *efa_mock_create_cq_ex_return_null(struct ibv_context *context, struct ibv_cq_init_attr_ex *init_attr) diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index ae68e77935f..96a618886b2 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -35,20 +35,18 @@ int __real_efadv_query_device(struct ibv_context *ibvctx, struct efadv_device_at uint32_t inlen); struct efa_ah *__real_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); -void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); +void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah); void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain, - struct efa_ah *ah, - bool release_from_implicit_av); + struct efa_ah *ah); int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibvctx, struct efadv_device_attr *attr, uint32_t inlen); @@ -166,9 +164,8 @@ struct efa_unit_test_mocks uint32_t inlen); struct efa_ah *(*efa_ah_alloc)(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); - void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); + size_t alloc_size); + void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah); #if HAVE_EFADV_CQ_EX struct ibv_cq_ex *(*efadv_create_cq)(struct ibv_context *ibvctx, From 
451a3a765cc0c439e48e625add04baa28f3c9a00 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 6 May 2026 10:56:42 -0600 Subject: [PATCH 13/16] prov/efa: strip efa_av to base-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With every protocol-specific AV concern now living in efa_proto_av.c, remove the remnants from the base AV layer. efa_av.h and efa_av.c are reduced to what the efa-direct path actually needs: the efa_av struct now contains only domain, used, type, cur_reverse_av, prv_reverse_av, and util_av. All implicit-AV fields, LRU tracking, peer-map logic, SHM AV handling, and implicit reverse-lookup functions are gone from this file. struct efa_av_entry becomes a flat 48-byte record of ep_addr[32], a struct efa_ah *ah, and fi_addr_t fi_addr. struct efa_conn is no longer embedded, and the base path's callers switch from av_entry->conn.ah to av_entry->ah directly. The _Static_assert that previously tied efa_av_entry.conn.ah and efa_proto_av_entry.ah to a common offset is updated to reference efa_av_entry.ah directly. Two new static helpers efa_av_entry_init and efa_av_entry_release replace the old efa_conn_alloc / efa_conn_release paths for the base-only lifecycle. efa_av_insert_one and efa_av_insert lose their external linkage because they are only called from within efa_av.c now (RDM uses efa_proto_av_insert_one). Drop efa_conn.c and efa_conn.h from prov/efa/Makefile.include and libfabric.vcxproj. The files themselves are left on disk and deleted in a follow-up commit so this commit is strictly a functionality change and the next one is a pure file deletion. Leaving the files out of the build list means their compile errors against the new efa_av / efa_av_entry shapes are harmless — nothing includes or compiles them from this commit forward. Update efa_ah.c to drop the now-unused #include "efa_conn.h". 
Update the efa_unit_test_proto_av.c test file to call efa_proto_av_reverse_lookup in place of the deleted efa_av_reverse_lookup_rdm. Signed-off-by: Seth Zegelstein --- libfabric.vcxproj | 2 - prov/efa/Makefile.include | 2 - prov/efa/src/efa_ah.c | 1 - prov/efa/src/efa_av.c | 680 +++++++++---------------------- prov/efa/src/efa_av.h | 162 ++++---- prov/efa/src/efa_domain.c | 4 +- prov/efa/src/efa_msg.c | 2 +- prov/efa/src/efa_rma.c | 8 +- prov/efa/src/rdm/efa_proto_av.c | 25 +- prov/efa/test/efa_unit_test_av.c | 8 +- prov/efa/test/efa_unit_test_cq.c | 2 +- 11 files changed, 304 insertions(+), 592 deletions(-) diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 88106138abc..d59ea6e0a68 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -873,7 +873,6 @@ - @@ -1019,7 +1018,6 @@ - diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index b8faaedfd40..92f5b210a5f 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -36,7 +36,6 @@ _efa_files = \ prov/efa/src/efa_shm.c \ prov/efa/src/efa_av.c \ prov/efa/src/efa_ah.c \ - prov/efa/src/efa_conn.c \ prov/efa/src/rdm/efa_proto_av.c \ prov/efa/src/efa_domain.c \ prov/efa/src/efa_fabric.c \ @@ -90,7 +89,6 @@ _efa_headers = \ prov/efa/src/efa.h \ prov/efa/src/efa_av.h \ prov/efa/src/efa_ah.h \ - prov/efa/src/efa_conn.h \ prov/efa/src/efa_mr.h \ prov/efa/src/efa_shm.h \ prov/efa/src/efa_hmem.h \ diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index 2a4e5c61fb6..12d2167d835 100644 --- a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -5,7 +5,6 @@ #include "efa.h" #include "efa_ah.h" -#include "efa_conn.h" #include /** diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 6b99c92ba03..a4bd828b926 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -11,89 +11,39 @@ #include "efa.h" #include "efa_av.h" -#include "rdm/efa_proto_av.h" -#include "rdm/efa_rdm_pke_utils.h" - -/* Stub: this function moved to efa_proto_av.c as efa_proto_ah_lru_move. 
- * The calls below are in dead code paths (old implicit AV) that will be - * removed in the strip commit. */ -static inline void efa_ah_implicit_av_lru_ah_move( - struct efa_domain *domain, struct efa_ah *ah) { } - -static inline struct efa_conn *efa_av_addr_to_conn_impl(struct util_av *util_av, - fi_addr_t fi_addr) -{ - struct util_av_entry *util_av_entry; - struct efa_av_entry *efa_av_entry; - - if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) - return NULL; - - if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr))) - util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); - else - return NULL; - - efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - return efa_av_entry->conn.ep_addr ? &efa_av_entry->conn : NULL; -} /** - * @brief find efa_conn struct using fi_addr in the explicit AV + * @brief find efa_av_entry using fi_addr in the explicit AV * * @param[in] av efa av - * @param[in] addr fi_addr - * @return if address is valid, return pointer to efa_conn struct + * @param[in] fi_addr libfabric address + * @return if address is valid, return pointer to efa_av_entry * otherwise, return NULL */ -struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) -{ - return efa_av_addr_to_conn_impl(&av->util_av, fi_addr); -} - -/** - * @brief find efa_conn struct using fi_addr in the implicit AV - * - * @param[in] av efa av - * @param[in] addr fi_addr - * @return if address is valid, return pointer to efa_conn struct - * otherwise, return NULL - */ -struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, fi_addr_t fi_addr) -{ - return efa_av_addr_to_conn_impl(&av->util_av_implicit, fi_addr); -} - -/** - * @brief Look up an efa_av_entry by fi_addr in the base (explicit) AV - * - * Wrapper around efa_av_addr_to_conn that returns the containing - * efa_av_entry via container_of. 
Exposed as the base-layer lookup - * primitive for callers that need to work with efa_av_entry rather - * than the embedded efa_conn. - * - * @param[in] av address vector - * @param[in] fi_addr libfabric address - * @return pointer to efa_av_entry, or NULL if not found - */ struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr) { - struct efa_conn *conn; + struct util_av_entry *util_av_entry; + struct efa_av_entry *av_entry; - conn = efa_av_addr_to_conn(av, fi_addr); - if (!conn) + if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) return NULL; - return container_of(conn, struct efa_av_entry, conn); + if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(av->util_av.av_entry_pool, fi_addr))) + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); + else + return NULL; + + av_entry = (struct efa_av_entry *)util_av_entry->data; + return av_entry->ah ? av_entry : NULL; } /** - * @brief find fi_addr for efa endpoint + * @brief find fi_addr for efa endpoint (base, AHN+QPN only) * * @param[in] av address vector * @param[in] ahn address handle number * @param[in] qpn QP number - * @return On success, return fi_addr to the peer who send the packet + * @return On success, return fi_addr to the peer * If no such peer exist, return FI_ADDR_NOTAVAIL */ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) @@ -106,89 +56,29 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) cur_key.qpn = qpn; HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); - return (OFI_LIKELY(!!cur_entry)) ? 
cur_entry->av_entry->conn.fi_addr : FI_ADDR_NOTAVAIL; -} - -/** - * @brief find fi_addr for rdm endpoint in the explicit AV - * - * @param[in] av address vector - * @param[in] ahn address handle number - * @param[in] qpn QP number - * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid - * @return On success, return fi_addr to the peer who send the packet - * If no such peer exist, return FI_ADDR_NOTAVAIL - */ -fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, - uint16_t qpn, struct efa_rdm_pke *pkt_entry) -{ - struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); - - return efa_proto_av_reverse_lookup(proto_av, ahn, qpn, pkt_entry); -} - -/** - * @brief find fi_addr for rdm endpoint in the implicit AV - * - * @param[in] av address vector - * @param[in] ahn address handle number - * @param[in] qpn QP number - * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid - * @return On success, return fi_addr to the peer who send the packet - * If no such peer exist, return FI_ADDR_NOTAVAIL - */ -fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, - uint16_t qpn, - struct efa_rdm_pke *pkt_entry) -{ - struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); - - return efa_proto_av_reverse_lookup_implicit(proto_av, ahn, qpn, pkt_entry); -} - -/** - * @brief Move the conn to the front of the LRU list to indicate that it is the - * most recently used entry - * - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list - */ -void efa_av_implicit_av_lru_conn_move(struct efa_av *av, - struct efa_conn *conn) -{ - assert(av->implicit_av_size == 0 || - HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size); - assert(dlist_entry_in_list(&av->implicit_av_lru_list, - &conn->implicit_av_lru_entry)); - - dlist_remove(&conn->implicit_av_lru_entry); - dlist_insert_tail(&conn->implicit_av_lru_entry, - 
&av->implicit_av_lru_list); - - efa_ah_implicit_av_lru_ah_move(av->domain, conn->ah); + return (OFI_LIKELY(!!cur_entry)) ? cur_entry->av_entry->fi_addr : FI_ADDR_NOTAVAIL; } /* - * @brief Add newly insert address to the reverse AVs + * @brief Add newly inserted address to the reverse AVs * * @param[in] av EFA AV object - * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key - * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] av_entry efa_av_entry object - * @return On success, return 0. - * Otherwise, return a negative libfabric error code + * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key + * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key + * @param[in] av_entry AV entry to add + * @return 0 on success, negative libfabric error code on failure */ int efa_av_reverse_av_add(struct efa_av *av, - struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_av_entry *av_entry) + struct efa_cur_reverse_av **cur_reverse_av, + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_entry; struct efa_prv_reverse_av *prv_entry; struct efa_cur_reverse_av_key cur_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = av_entry->conn.ah->ahn; + cur_key.ahn = av_entry->ah->ahn; cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; cur_entry = NULL; @@ -208,10 +98,7 @@ int efa_av_reverse_av_add(struct efa_av *av, return 0; } - /* We used a static connid for all dgram endpoints, therefore cur_entry should always be NULL, - * and only RDM endpoint can reach here. 
hence the following assertion - */ - assert(av->domain->info_type == EFA_INFO_RDM); + /* Only RDM endpoint can reach here (dgram uses static connid) */ prv_entry = malloc(sizeof(*prv_entry)); if (!prv_entry) { EFA_WARN(FI_LOG_AV, "Cannot allocate memory for prv_reverse_av entry\n"); @@ -235,13 +122,13 @@ int efa_av_reverse_av_add(struct efa_av *av, * cur_reverse_av. Keeping the address in prv_reverse_av helps avoid QPN * collisions. * - * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key - * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] av_entry efa_av_entry object + * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key + * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key + * @param[in] av_entry AV entry to remove */ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_av_entry *av_entry) + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_reverse_av_entry; struct efa_prv_reverse_av *prv_reverse_av_entry; @@ -249,7 +136,7 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av_key prv_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = av_entry->conn.ah->ahn; + cur_key.ahn = av_entry->ah->ahn; cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_reverse_av_entry); @@ -258,7 +145,7 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, free(cur_reverse_av_entry); } else { memset(&prv_key, 0, sizeof(prv_key)); - prv_key.ahn = av_entry->conn.ah->ahn; + prv_key.ahn = av_entry->ah->ahn; prv_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; prv_key.connid = efa_av_entry_ep_addr(av_entry)->qkey; HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), @@ -270,161 +157,116 @@ void efa_av_reverse_av_remove(struct 
efa_cur_reverse_av **cur_reverse_av, } } - -static fi_addr_t -efa_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry) -{ - struct efa_rdm_pke *pke; - - pke = (struct efa_rdm_pke *) rx_entry->peer_context; - - return pke->peer->av_entry->fi_addr; -} - -static int efa_conn_implicit_to_explicit(struct efa_av *av, - struct efa_ep_addr *raw_addr, - fi_addr_t implicit_fi_addr, - fi_addr_t *fi_addr) +/** + * @brief Initialize an efa_av_entry (base path) + * + * Caller must hold util_av.lock. + * + * @param[in] av address vector + * @param[in] raw_addr raw efa address + * @param[in] flags flags from fi_av_insert + * @param[in] context context from fi_av_insert + * @return pointer to initialized entry on success, NULL on failure + */ +static struct efa_av_entry *efa_av_entry_init(struct efa_av *av, + struct efa_ep_addr *raw_addr, + uint64_t flags, void *context) { + struct util_av_entry *util_av_entry = NULL; + struct efa_av_entry *av_entry = NULL; + fi_addr_t fi_addr; int err; - struct efa_ah *ah; - struct efa_conn *implicit_conn, *explicit_conn; - struct efa_rdm_ep *ep; - struct dlist_entry *entry; - struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry; - struct efa_conn_ep_peer_map_entry *map_entry, *tmp; - struct efa_av_entry *implicit_av_entry, *explicit_av_entry; - struct fid_peer_srx *peer_srx; - - EFA_INFO(FI_LOG_AV, - "Moving peer with implicit fi_addr %" PRIu64 - " to explicit AV\n", - implicit_fi_addr); assert(ofi_genlock_held(&av->util_av.lock)); - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - - /* Get implicit util AV entry and conn */ - implicit_util_av_entry = - ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr); - - implicit_av_entry = (struct efa_av_entry *) implicit_util_av_entry->data; - assert(implicit_av_entry); - assert(efa_is_same_addr( - raw_addr, (struct efa_ep_addr *) implicit_av_entry->ep_addr)); + if (flags & FI_SYNC_ERR) + memset(context, 0, sizeof(int)); - implicit_conn = 
&implicit_av_entry->conn; - assert(implicit_conn->fi_addr == FI_ADDR_NOTAVAIL && - implicit_conn->implicit_fi_addr == implicit_fi_addr); - - ah = implicit_conn->ah; - - /* Create explicit util AV entry and conn */ - err = ofi_av_insert_addr(&av->util_av, raw_addr, fi_addr); + err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr); if (err) { - EFA_WARN(FI_LOG_AV, - "ofi_av_insert_addr into explicit AV failed! Error " - "message: %s\n", + EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n", fi_strerror(err)); - return err; + return NULL; } - explicit_util_av_entry = - ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, *fi_addr); - explicit_av_entry = (struct efa_av_entry *) explicit_util_av_entry->data; - assert(efa_is_same_addr( - raw_addr, (struct efa_ep_addr *) explicit_av_entry->ep_addr)); + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); + av_entry = (struct efa_av_entry *)util_av_entry->data; + assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)av_entry->ep_addr)); - /* Copy information from implicit conn to explicit conn */ - explicit_conn = &explicit_av_entry->conn; - memset(explicit_conn, 0, sizeof(*explicit_conn)); - explicit_conn->ep_addr = (struct efa_ep_addr *) explicit_av_entry->ep_addr; + av_entry->fi_addr = fi_addr; assert(av->type == FI_AV_TABLE); - explicit_conn->ah = implicit_conn->ah; - explicit_conn->fi_addr = *fi_addr; - explicit_conn->shm_fi_addr = implicit_conn->shm_fi_addr; - explicit_conn->implicit_fi_addr = FI_ADDR_NOTAVAIL; - HASH_ITER(hh, implicit_conn->ep_peer_map, map_entry, tmp) { - HASH_DELETE(hh, implicit_conn->ep_peer_map, map_entry); - HASH_ADD_PTR(explicit_conn->ep_peer_map, ep_ptr, map_entry); - map_entry->peer.av_entry = (struct efa_proto_av_entry *)explicit_conn; - } - assert(HASH_CNT(hh, implicit_conn->ep_peer_map) == 0); - - /* Handle reverse AV and AV ref counts */ - efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, implicit_av_entry); - 
dlist_remove(&implicit_av_entry->conn.implicit_av_lru_entry); + av_entry->ah = efa_ah_alloc(av->domain, raw_addr->raw, sizeof(struct efa_ah)); + if (!av_entry->ah) + goto err_release; - err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, - "ofi_av_remove_addr from implicit AV failed! Error " - "message: %s\n", - fi_strerror(err)); - return err; - } + err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av, + av_entry); + if (err) + goto err_release_ah; - av->used_implicit--; + av->used++; + return av_entry; - err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av, - explicit_av_entry); +err_release_ah: + efa_ah_release(av->domain, av_entry->ah); +err_release: + av_entry->ah = NULL; + memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); + err = ofi_av_remove_addr(&av->util_av, fi_addr); if (err) - return err; + EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", + err); + return NULL; +} + +/** + * @brief Release an efa_av_entry (base path) + * + * Caller must hold util_av.lock. + * + * @param[in] av address vector + * @param[in] av_entry entry to release + */ +static void efa_av_entry_release(struct efa_av *av, struct efa_av_entry *av_entry) +{ + char gidstr[INET6_ADDRSTRLEN]; + int err; - av->used_explicit++; + assert(ofi_genlock_held(&av->util_av.lock)); - /* Handle AH LRU list and refcnt */ - assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list)); - dlist_remove(&implicit_conn->ah_implicit_conn_list_entry); - efa_ah_implicit_av_lru_ah_move(av->domain, ah); - efa_proto_ah_from_ah(ah)->implicit_refcnt--; - efa_proto_ah_from_ah(ah)->explicit_refcnt++; + efa_av_reverse_av_remove(&av->cur_reverse_av, &av->prv_reverse_av, av_entry); + efa_ah_release(av->domain, av_entry->ah); - EFA_INFO(FI_LOG_AV, - "Peer with implicit fi_addr %" PRIu64 - " moved to explicit AV. 
Explicit fi_addr: %" PRIu64 "\n", - implicit_fi_addr, *fi_addr); - - /* Call foreach_unspec_addr to move unexpected messages - * from the unspecified queue to the specified queues - * - * util_ep is bound to the explicit util_av, so the explicit util_av's - * ep_list contains all of the endpoints bound to this AV */ - ofi_genlock_lock(&av->util_av.ep_list_lock); - dlist_foreach(&av->util_av.ep_list, entry) { - ep = container_of(entry, struct efa_rdm_ep, base_ep.util_ep.av_entry); - peer_srx = util_get_peer_srx(ep->peer_srx_ep); - peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_av_get_addr_from_peer_rx_entry); - } - ofi_genlock_unlock(&av->util_av.ep_list_lock); + inet_ntop(AF_INET6, efa_av_entry_ep_addr(av_entry)->raw, gidstr, INET6_ADDRSTRLEN); + EFA_INFO(FI_LOG_AV, "efa_av_entry released! entry[%p] GID[%s] QP[%u]\n", + av_entry, gidstr, efa_av_entry_ep_addr(av_entry)->qpn); - return FI_SUCCESS; + err = ofi_av_remove_addr(&av->util_av, av_entry->fi_addr); + if (err) + EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); + + av_entry->ah = NULL; + memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); + av->used--; } /** - * @brief insert one address into address vector (AV) + * @brief insert one address into AV (base, efa-direct path) * * @param[in] av address vector * @param[in] addr raw address, in the format of gid:qpn:qkey - * @param[out] fi_addr pointer to the output fi address. This address is used by fi_send - * @param[in] flags flags user passed to fi_av_insert. 
+ * @param[out] fi_addr pointer to the output fi address + * @param[in] flags flags user passed to fi_av_insert * @param[in] context context user passed to fi_av_insert - * @param[in] insert_shm_av whether insert address to shm av - * @param[in] insert_implicit_av whether insert address to implicit AV * @return 0 on success, a negative error code on failure */ -int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, - fi_addr_t *fi_addr, uint64_t flags, void *context, - bool insert_shm_av, bool insert_implicit_av) +static int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context) { - struct efa_conn *conn; + struct efa_av_entry *av_entry; char raw_gid_str[INET6_ADDRSTRLEN]; fi_addr_t efa_fiaddr; - fi_addr_t implicit_fi_addr; - int ret = 0; if (!efa_av_is_valid_address(addr)) { EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n"); @@ -435,94 +277,57 @@ int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, if (av->domain->info_type == EFA_INFO_DGRAM) addr->qkey = EFA_DGRAM_CONNID; - if (av->domain->info_type == EFA_INFO_RDM) - assert(ofi_genlock_held(&av->domain->srx_lock)); - ofi_genlock_lock(&av->util_av_implicit.lock); ofi_genlock_lock(&av->util_av.lock); memset(raw_gid_str, 0, sizeof(raw_gid_str)); if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) { EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno); - ret = -FI_EINVAL; *fi_addr = FI_ADDR_NOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return -FI_EINVAL; } EFA_INFO(FI_LOG_AV, - "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n", - raw_gid_str, addr->qpn, addr->qkey, - insert_implicit_av ? "implicit" : "explicit"); + "Inserting address GID[%s] QP[%u] QKEY[%u] to explicit AV ....\n", + raw_gid_str, addr->qpn, addr->qkey); - /* - * Check if this address already has been inserted, if so set *fi_addr - * to existing address, and return 0 for success. 
- */ + /* Check if already inserted */ efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->util_av, addr); if (efa_fiaddr != FI_ADDR_NOTAVAIL) { - /* We should never try to insert into the implicit AV an address - * that's already in the explicit AV */ - assert(!insert_implicit_av); - EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %ld\n", efa_fiaddr); *fi_addr = efa_fiaddr; - ret = 0; - goto out; - } - - implicit_fi_addr = - ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr); - if (implicit_fi_addr != FI_ADDR_NOTAVAIL) { - EFA_INFO(FI_LOG_AV, - "Found implicit AV entry id %ld for the same " - "address\n", - implicit_fi_addr); - - if (insert_implicit_av) { - /* Move to the end of the LRU list */ - conn = efa_av_addr_to_conn_implicit(av, - implicit_fi_addr); - efa_av_implicit_av_lru_conn_move(av, conn); - - *fi_addr = implicit_fi_addr; - goto out; - } - - ret = efa_conn_implicit_to_explicit(av, addr, implicit_fi_addr, - fi_addr); - if (ret) - *fi_addr = FI_ADDR_NOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return 0; } - conn = efa_conn_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av); - if (!conn) { + av_entry = efa_av_entry_init(av, addr, flags, context); + if (!av_entry) { *fi_addr = FI_ADDR_NOTAVAIL; - ret = -FI_EADDRNOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return -FI_EADDRNOTAVAIL; } - if (insert_implicit_av) { - *fi_addr = conn->implicit_fi_addr; - EFA_INFO(FI_LOG_AV, - "Successfully inserted address GID[%s] QP[%u] " - "QKEY[%u] to implicit AV. fi_addr: %ld\n", - raw_gid_str, addr->qpn, addr->qkey, *fi_addr); - } else { - *fi_addr = conn->fi_addr; - EFA_INFO(FI_LOG_AV, - "Successfully inserted address GID[%s] QP[%u] " - "QKEY[%u] to explicit AV. fi_addr: %ld\n", - raw_gid_str, addr->qpn, addr->qkey, *fi_addr); - } - ret = 0; + *fi_addr = av_entry->fi_addr; + EFA_INFO(FI_LOG_AV, + "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. 
fi_addr: %ld\n", + raw_gid_str, addr->qpn, addr->qkey, *fi_addr); -out: ofi_genlock_unlock(&av->util_av.lock); - ofi_genlock_unlock(&av->util_av_implicit.lock); - return ret; + return 0; } -int efa_av_insert(struct fid_av *av_fid, const void *addr, +/** + * @brief insert addresses into AV (fi_av_insert implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr buffer containing one or more addresses to insert + * @param[in] count number of addresses to insert + * @param[out] fi_addr array where returned fabric addresses will be written + * @param[in] flags operation flags + * @param[in] context user context + * @return number of addresses successfully inserted + */ +static int efa_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -538,25 +343,16 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT))) return -FI_EINVAL; - /* - * Providers are allowed to ignore FI_MORE. - */ flags &= ~FI_MORE; if (flags) return -FI_ENOSYS; - /* The order in which the util AV and SRX locks are acquired must match - * in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); - for (i = 0; i < count; i++) { addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN); - ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false); + ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context); if (ret) { - EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n", - ret); + EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! 
ret=%d\n", ret); break; } @@ -565,9 +361,6 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, success_cnt++; } - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_unlock(&av->domain->srx_lock); - /* cancel remaining request and log to event queue */ for (; i < count ; i++) { if (fi_addr) @@ -577,11 +370,20 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, return success_cnt; } +/** + * @brief retrieve an address stored in the AV (fi_av_lookup implementation) + * + * @param[in] av_fid fid of AV + * @param[in] fi_addr fabric address to look up + * @param[out] addr buffer to store the returned address + * @param[in,out] addrlen on input, size of addr buffer; on output, bytes written + * @return 0 on success, negative libfabric error code on failure + */ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, void *addr, size_t *addrlen) { struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); - struct efa_conn *conn = NULL; + struct efa_av_entry *av_entry = NULL; if (av->type != FI_AV_TABLE) return -FI_EINVAL; @@ -590,13 +392,13 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, return -FI_EINVAL; ofi_genlock_lock(&av->util_av.lock); - conn = efa_av_addr_to_conn(av, fi_addr); - if (!conn) { + av_entry = efa_av_addr_to_entry(av, fi_addr); + if (!av_entry) { ofi_genlock_unlock(&av->util_av.lock); return -FI_EINVAL; } - memcpy(addr, (void *)conn->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen)); + memcpy(addr, (void *)av_entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen)); ofi_genlock_unlock(&av->util_av.lock); if (*addrlen > EFA_EP_ADDR_LEN) *addrlen = EFA_EP_ADDR_LEN; @@ -617,16 +419,16 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, * was set to FI_ADDR_NOTAVAIL. The TX completion handler will * ignore TX packet whose address is FI_ADDR_NOTAVAIL. * - * Meanwhile, lower provider will set a packet's address to - * FI_ADDR_NOTAVAIL from it is from a removed address. 
RX completion + * Meanwhile, lower provider will set a packet's address to + * FI_ADDR_NOTAVAIL if it is from a removed address. RX completion * handler will ignore such packets. * * @param[in] av_fid fid of AV (address vector) - * @param[in] fi_addr pointer to an array of libfabric addresses - * @param[in] counter number of libfabric addresses in the array + * @param[in] fi_addr pointer to an array of libfabric addresses + * @param[in] count number of libfabric addresses in the array * @param[in] flags flags * @return 0 if all addresses have been removed successfully, - * negative libfabric error code if error was encoutnered. + * negative libfabric error code if error was encountered. */ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count, uint64_t flags) @@ -634,7 +436,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, int err = 0; size_t i; struct efa_av *av; - struct efa_conn *conn; + struct efa_av_entry *av_entry; if (!fi_addr) return -FI_EINVAL; @@ -643,19 +445,15 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, if (av->type != FI_AV_TABLE) return -FI_EINVAL; - /* The order in which the util AV and SRX locks are acquired must match - in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); ofi_genlock_lock(&av->util_av.lock); for (i = 0; i < count; i++) { - conn = efa_av_addr_to_conn(av, fi_addr[i]); - if (!conn) { + av_entry = efa_av_addr_to_entry(av, fi_addr[i]); + if (!av_entry) { err = -FI_EINVAL; break; } - efa_conn_release(av, conn, false); + efa_av_entry_release(av, av_entry); } if (i < count) { @@ -664,11 +462,18 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, } ofi_genlock_unlock(&av->util_av.lock); - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_unlock(&av->domain->srx_lock); return err; } +/** + * @brief convert an address into a printable 
string (fi_av_straddr implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr address to convert + * @param[out] buf buffer to store the string + * @param[in,out] len on input, size of buf; on output, bytes written + * @return pointer to buf + */ static const char *efa_av_straddr(struct fid_av *av_fid, const void *addr, char *buf, size_t *len) { @@ -685,81 +490,37 @@ static struct fi_ops_av efa_av_ops = { .straddr = efa_av_straddr }; -static void efa_av_close_reverse_av(struct efa_av *av) +/** + * @brief close an AV and release all resources (fi_close implementation) + * + * @param[in] fid fid of AV + * @return 0 on success, negative libfabric error code on failure + */ +static int efa_av_close(struct fid *fid) { + struct efa_av *av; struct efa_cur_reverse_av *cur_entry, *curtmp; struct efa_prv_reverse_av *prv_entry, *prvtmp; + int err = 0; - /* The order in which the util AV and SRX locks are acquired must match - in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); + av = container_of(fid, struct efa_av, util_av.av_fid.fid); ofi_genlock_lock(&av->util_av.lock); HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) { - efa_conn_release(av, &cur_entry->av_entry->conn, false); + efa_av_entry_release(av, cur_entry->av_entry); } HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) { - efa_conn_release(av, &prv_entry->av_entry->conn, false); + efa_av_entry_release(av, prv_entry->av_entry); } ofi_genlock_unlock(&av->util_av.lock); - ofi_genlock_lock(&av->util_av_implicit.lock); - - HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) { - efa_conn_release(av, &cur_entry->av_entry->conn, true); - } - - HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) { - efa_conn_release(av, &prv_entry->av_entry->conn, true); - } - - ofi_genlock_unlock(&av->util_av_implicit.lock); - - if (av->domain->info_type == EFA_INFO_RDM) - 
ofi_genlock_unlock(&av->domain->srx_lock); -} - -static int efa_av_close(struct fid *fid) -{ - struct efa_av *av; - int err = 0; - struct efa_ep_addr_hashable *ep_addr_hashable, *tmp; - - av = container_of(fid, struct efa_av, util_av.av_fid.fid); - - efa_av_close_reverse_av(av); - err = ofi_av_close(&av->util_av); - if (OFI_UNLIKELY(err)) { + if (OFI_UNLIKELY(err)) EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n", fi_strerror(err)); - } - - err = ofi_av_close(&av->util_av_implicit); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n", - fi_strerror(err)); - } - - if (av->domain->info_type == EFA_INFO_RDM) { - if (av->shm_rdm_av) { - err = fi_close(&av->shm_rdm_av->fid); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_AV, - "Failed to close shm av: %s\n", - fi_strerror(err)); - } - } - } - - HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) { - HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable); - free(ep_addr_hashable); - } free(av); return err; @@ -774,12 +535,12 @@ static struct fi_ops efa_av_fi_ops = { }; /** - * @brief initialize the util_av field in efa_av + * @brief initialize a util_av * - * @param[in] util_domain util_domain which is part of efa_domain_base + * @param[in] efa_domain efa domain * @param[in] attr AV attr application passed to fi_av_open - * @param[out] util_av util_av field in efa_av - * @param[in] context contexted application passed to fi_av_open + * @param[out] util_av util_av to initialize + * @param[in] context context application passed to fi_av_open * @param[in] context_len size of provider-specific context per AV entry * @return On success, return 0. * On failure, return a negative libfabric error code. 
@@ -799,14 +560,22 @@ int efa_av_init_util_av(struct efa_domain *efa_domain, util_av, context); } +/** + * @brief open an address vector (fi_av_open implementation for efa-direct/dgram) + * + * @param[in] domain_fid fid of domain + * @param[in] attr AV attributes + * @param[out] av_fid pointer to store the opened AV fid + * @param[in] context user context + * @return 0 on success, negative libfabric error code on failure + */ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context) { struct efa_domain *efa_domain; struct efa_av *av; - struct fi_av_attr av_attr = { 0 }; - int ret, retv; size_t universe_size; + int ret; if (!attr) return -FI_EINVAL; @@ -843,49 +612,16 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, &universe_size) == FI_SUCCESS) attr->count = MAX(attr->count, universe_size); - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context, - sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN); - if (ret) - goto err; - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context, sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN); if (ret) - goto err_close_util_av_implicit; - - if (efa_domain->info_type == EFA_INFO_RDM && efa_domain->fabric && - efa_domain->fabric->shm_fabric) { - /* - * shm av supports maximum 256 entries - * Reset the count to 128 to reduce memory footprint and satisfy - * the need of the instances with more CPUs. 
- */ - av_attr = *attr; - if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) { - ret = -FI_ENOSYS; - EFA_WARN(FI_LOG_AV, - "The requested av size is beyond" - " shm supported maximum av size: %s\n", - fi_strerror(-ret)); - goto err_close_util_av; - } - av_attr.count = efa_env.shm_av_size; - assert(av_attr.type == FI_AV_TABLE); - ret = fi_av_open(efa_domain->shm_domain, &av_attr, - &av->shm_rdm_av, context); - if (ret) - goto err_close_util_av; - } + goto err; - EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", - attr->flags); + EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags); av->domain = efa_domain; av->type = attr->type; - av->implicit_av_size = efa_env.implicit_av_size; - av->used_implicit = 0; - av->used_explicit = 0; - av->shm_used = 0; + av->used = 0; *av_fid = &av->util_av.av_fid; (*av_fid)->fid.fclass = FI_CLASS_AV; @@ -893,22 +629,8 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, (*av_fid)->fid.ops = &efa_av_fi_ops; (*av_fid)->ops = &efa_av_ops; - dlist_init(&av->implicit_av_lru_list); - return 0; -err_close_util_av: - retv = ofi_av_close(&av->util_av); - if (retv) - EFA_WARN(FI_LOG_AV, - "Unable to close util_av: %s\n", fi_strerror(-retv)); - -err_close_util_av_implicit: - retv = ofi_av_close(&av->util_av_implicit); - if (retv) - EFA_WARN(FI_LOG_AV, - "Unable to close util_av_implicit: %s\n", fi_strerror(-retv)); - err: free(av); return ret; diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 35650d5c725..b92eebb91e6 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -5,10 +5,7 @@ #define EFA_AV_H #include -#include "rdm/efa_rdm_protocol.h" -#include "rdm/efa_rdm_peer.h" #include "efa_ah.h" -#include "efa_conn.h" #define EFA_MIN_AV_SIZE (16384) #define EFA_SHM_MAX_AV_COUNT (256) @@ -28,104 +25,94 @@ struct efa_ep_addr_hashable { #define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) -/* util_av implementation requires the first element of efa_av_entry to be - * ep_addr */ -struct efa_av_entry { - 
uint8_t ep_addr[EFA_EP_ADDR_LEN]; - struct efa_conn conn; -}; - -/** - * @brief typed accessor for the ep_addr field of an AV entry - * - * @param[in] entry AV entry - * @return pointer to the efa_ep_addr embedded in the entry - */ -static inline struct efa_ep_addr *efa_av_entry_ep_addr(struct efa_av_entry *entry) -{ - return (struct efa_ep_addr *)entry->ep_addr; -} - /** - * @brief check if an efa_ep_addr has a non-zero GID + * @brief Base AV entry (efa-direct) * - * @param[in] addr address to check - * @return non-zero if valid, 0 if all-zeros + * pahole: + * size: 48, cachelines: 1, members: 3 + * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20) + * ah* off=32 — TX hot + * fi_addr off=40 — RX hot */ -static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) -{ - struct efa_ep_addr all_zeros = { 0 }; - - return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); -} +struct efa_av_entry { + uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */ + struct efa_ah *ah; /* 32 8 */ + fi_addr_t fi_addr; /* 40 8 */ +}; +/* pahole: size: 4, no holes */ struct efa_cur_reverse_av_key { uint16_t ahn; uint16_t qpn; }; +/** + * @brief Reverse AV entry keyed by (AHN, QPN) — points to current peer + * + * pahole: size: 72, cachelines: 2 (4-byte hole after key) + */ struct efa_cur_reverse_av { - struct efa_cur_reverse_av_key key; - struct efa_av_entry *av_entry; - UT_hash_handle hh; + struct efa_cur_reverse_av_key key; /* 0 4 */ + /* 4-byte hole */ + struct efa_av_entry *av_entry; /* 8 8 */ + UT_hash_handle hh; /* 16 56 */ }; +/* pahole: size: 8, no holes */ struct efa_prv_reverse_av_key { uint16_t ahn; uint16_t qpn; uint32_t connid; }; +/** + * @brief Reverse AV entry keyed by (AHN, QPN, connid) — points to previous peer + * + * pahole: size: 72, cachelines: 2 + */ struct efa_prv_reverse_av { - struct efa_prv_reverse_av_key key; - struct efa_av_entry *av_entry; - UT_hash_handle hh; + struct efa_prv_reverse_av_key key; /* 0 8 */ + struct efa_av_entry 
*av_entry; /* 8 8 */ + UT_hash_handle hh; /* 16 56 */ }; +/** + * @brief Base AV — contains only what efa-direct needs + * + * pahole: + * size: 320, cachelines: 5 + * domain* off=0 — cacheline 0 + * used off=8 + * type off=16 + * (4-byte hole) off=20 + * cur_reverse_av* off=24 — RX hot: reverse lookup hash head + * prv_reverse_av* off=32 — RX hot: QPN reuse fallback hash head + * util_av off=40 — 280 bytes (contains bufpool, locks, ep_list) + */ struct efa_av { - struct fid_av *shm_rdm_av; - struct efa_domain *domain; - size_t used_explicit; - size_t used_implicit; - size_t shm_used; - enum fi_av_type type; - /* cur_reverse_av is a map from (ahn + qpn) to current (latest) efa_conn. - * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_conns. - * cur_reverse_av is faster to search because its key size is smaller - */ - struct efa_cur_reverse_av *cur_reverse_av; - struct efa_prv_reverse_av *prv_reverse_av; - struct util_av util_av; - - /* implicit AV is used when receiving messages from peers not explicity - * inserted by the application + struct efa_domain *domain; /* 0 8 */ + size_t used; /* 8 8 */ + enum fi_av_type type; /* 16 4 */ + /* 4-byte hole */ + /* cur_reverse_av is a map from (ahn + qpn) to current (latest) efa_av_entry. + * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_av_entries. + * cur_reverse_av is faster to search because its key size is smaller. 
*/ - struct util_av util_av_implicit; - struct efa_cur_reverse_av *cur_reverse_av_implicit; - struct efa_prv_reverse_av *prv_reverse_av_implicit; - - size_t implicit_av_size; - struct dlist_entry implicit_av_lru_list; - struct efa_ep_addr_hashable *evicted_peers_hashset; + struct efa_cur_reverse_av *cur_reverse_av; /* 24 8 */ + struct efa_prv_reverse_av *prv_reverse_av; /* 32 8 */ + struct util_av util_av; /* 40 280 */ }; int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context); -int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, - fi_addr_t *fi_addr, uint64_t flags, void *context, - bool insert_shm_av, bool insert_implicit_av); - -struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr); -struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, - fi_addr_t fi_addr); - -fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, - uint16_t qpn, struct efa_rdm_pke *pkt_entry); +int efa_av_init_util_av(struct efa_domain *efa_domain, + struct fi_av_attr *attr, + struct util_av *util_av, + void *context, + size_t context_len); -fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, - uint16_t qpn, - struct efa_rdm_pke *pkt_entry); +struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr); fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn); @@ -138,15 +125,28 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av **prv_reverse_av, struct efa_av_entry *av_entry); -void efa_av_implicit_av_lru_conn_move(struct efa_av *av, - struct efa_conn *conn); +/** + * @brief typed accessor for the ep_addr field of an AV entry + * + * @param[in] entry AV entry + * @return pointer to the efa_ep_addr embedded in the entry + */ +static inline struct efa_ep_addr *efa_av_entry_ep_addr(struct efa_av_entry *entry) +{ + return (struct efa_ep_addr *)entry->ep_addr; +} 
-struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr); +/** + * @brief check if an efa_ep_addr has a non-zero GID + * + * @param[in] addr address to check + * @return non-zero if valid, 0 if all-zeros + */ +static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) +{ + struct efa_ep_addr all_zeros = { 0 }; -int efa_av_init_util_av(struct efa_domain *efa_domain, - struct fi_av_attr *attr, - struct util_av *util_av, - void *context, - size_t context_len); + return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); +} -#endif \ No newline at end of file +#endif diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 7f093cb2ed0..b734b199c1c 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -498,11 +498,11 @@ static int efa_domain_query_addr(struct fid_ep *ep_fid, fi_addr_t addr, { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); struct efa_av_entry *av_entry = efa_av_addr_to_entry(base_ep->av, addr); - if (!av_entry || !av_entry->conn.ah || !efa_av_entry_ep_addr(av_entry)) { + if (!av_entry || !av_entry->ah || !efa_av_entry_ep_addr(av_entry)) { EFA_WARN(FI_LOG_EP_CTRL, "Failed to find connection for addr %lu\n", addr); return -FI_EINVAL; } - *ahn = av_entry->conn.ah->ahn; + *ahn = av_entry->ah->ahn; *remote_qpn = efa_av_entry_ep_addr(av_entry)->qpn; *remote_qkey = efa_av_entry_ep_addr(av_entry)->qkey; diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index 96850325031..ab3935ab6f6 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -330,7 +330,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi /* Use consolidated send function */ ret = efa_qp_post_send(qp, sg_list, inline_data_list, iov_count, use_inline, wr_id, msg->data, flags, - av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, 
efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(ret)) ret = (ret == ENOMEM) ? -FI_EAGAIN : -ret; diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index 151b0353553..cb0f3283697 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -110,7 +110,7 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, err = efa_qp_post_read(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, flags, - av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; @@ -265,7 +265,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, err = efa_qp_post_write(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, msg->data, flags, - av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; @@ -391,7 +391,7 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, 0, 0, av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, + wr_id, 0, 0, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; @@ -433,7 +433,7 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, data, IBV_SEND_INLINE, av_entry->conn.ah, efa_av_entry_ep_addr(av_entry)->qpn, + wr_id, data, IBV_SEND_INLINE, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c index b76cd28c8af..8b84f315bff 100644 --- a/prov/efa/src/rdm/efa_proto_av.c +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -10,21 +10,16 @@ #include "rdm/efa_rdm_pke_utils.h" /* - * During the coexistence period between efa_conn and efa_proto_av_entry, - * efa_av_reverse_av_add / _remove read av_entry->conn.ah->ahn on a - * pointer that may actually be an efa_proto_av_entry *. This relies on - * a layout coincidence: the 'ah' field sits at offset 32 in both - * struct efa_av_entry (inside the embedded efa_conn, which itself starts - * with struct efa_ah *ah) and struct efa_proto_av_entry (directly). - * If either struct is ever reordered, these static asserts break loudly - * instead of silently reading the wrong field. + * efa_av_entry and efa_proto_av_entry share the same cache-line-0 layout + * (ep_addr, ah) so reverse_av entries and util_av contexts work across + * both. Break loudly if anyone ever reorders either struct. 
*/ _Static_assert(offsetof(struct efa_proto_av_entry, ep_addr) == offsetof(struct efa_av_entry, ep_addr), "efa_av_entry and efa_proto_av_entry must share ep_addr offset"); _Static_assert(offsetof(struct efa_proto_av_entry, ah) == - offsetof(struct efa_av_entry, conn) + offsetof(struct efa_conn, ah), - "efa_av_entry->conn.ah and efa_proto_av_entry->ah must be at the same offset"); + offsetof(struct efa_av_entry, ah), + "efa_av_entry and efa_proto_av_entry must share ah offset"); /** * @brief Local/remote peer detection by comparing peer GID with stored local GIDs @@ -574,7 +569,7 @@ void efa_proto_av_entry_release(struct efa_proto_av *av, efa_proto_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av); efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); - release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; + release_from_implicit_av ? av->used_implicit-- : av->efa_av.used--; } /** @@ -609,7 +604,7 @@ void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av); - release_from_implicit_av ? av->used_implicit-- : av->efa_av.used_explicit--; + release_from_implicit_av ? av->used_implicit-- : av->efa_av.used--; } /* ---- Protocol AH helpers ---- */ @@ -939,7 +934,7 @@ struct efa_proto_av_entry *efa_proto_av_entry_alloc( goto err_release; } - insert_implicit_av ? av->used_implicit++ : av->efa_av.used_explicit++; + insert_implicit_av ? 
av->used_implicit++ : av->efa_av.used++; return entry; @@ -1086,7 +1081,7 @@ int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, if (err) return err; - av->efa_av.used_explicit++; + av->efa_av.used++; /* Handle AH LRU list and refcnt */ assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list)); @@ -1562,7 +1557,7 @@ int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, av->efa_av.domain = efa_domain; av->efa_av.type = attr->type; - av->efa_av.used_explicit = 0; + av->efa_av.used = 0; av->implicit_av_size = efa_env.implicit_av_size; av->used_implicit = 0; av->shm_used = 0; diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 376023913af..8072e52e2ad 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -434,7 +434,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) assert_int_equal(err, 1); test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); /* cur_reverse_av (ahn, 100) -> conn1 (fi_addr1) */ - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), fi_addr1); /* Insert peer2: same GID and qpn, different qkey. This pushes peer1's @@ -447,7 +447,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) test_av_verify_av_hash_cnt(av, proto_av, 1, 1, 0, 0); /* cur_reverse_av (ahn, 100) now points to conn2 (fi_addr2); peer1 is * in prv_reverse_av keyed by its own qkey. */ - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), fi_addr2); /* Remove peer1 first. Without the fix this would incorrectly delete @@ -456,7 +456,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) assert_int_equal(err, 0); /* peer1's prv entry is gone; peer2's cur entry must still be intact. 
*/ test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), fi_addr2); /* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry @@ -464,7 +464,7 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) err = fi_av_remove(resource->av, &fi_addr2, 1, 0); assert_int_equal(err, 0); test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), FI_ADDR_NOTAVAIL); } diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 3f4b2d3ddc8..672d4a863b9 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -1084,7 +1084,7 @@ static void test_efa_cq_read_prep(struct efa_resource *resource, will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_imm_data_return_mock, 0x1); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_qp_num_return_mock, base_ep->qp->qp_num); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_byte_len_return_mock, 4096); - will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_entry(base_ep->av, addr)->conn.ah->ahn); + will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_entry(base_ep->av, addr)->ah->ahn); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_src_qp_return_mock, raw_addr.qpn); From 7af9f8afa27b8b35804a0a8b33a2c005096f0ab9 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Wed, 6 May 2026 10:56:57 -0600 Subject: [PATCH 14/16] prov/efa: delete efa_conn.c and efa_conn.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit removed the last references to efa_conn from the tree (both as source and as a build target). The files are no longer compiled or included by anything. 
Delete them. Pure file deletion — no other changes. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_conn.c | 481 ---------------------------------------- prov/efa/src/efa_conn.h | 55 ----- 2 files changed, 536 deletions(-) delete mode 100644 prov/efa/src/efa_conn.c delete mode 100644 prov/efa/src/efa_conn.h diff --git a/prov/efa/src/efa_conn.c b/prov/efa/src/efa_conn.c deleted file mode 100644 index 35b528e498a..00000000000 --- a/prov/efa/src/efa_conn.c +++ /dev/null @@ -1,481 +0,0 @@ - -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include - -#include "efa.h" - -/* - * Local/remote peer detection by comparing peer GID with stored local GIDs - */ -static bool efa_is_local_peer(struct efa_av *av, const void *addr) -{ - int i; - uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw; - -#if ENABLE_DEBUG - char raw_gid_str[INET6_ADDRSTRLEN] = { 0 }; - - if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) { - EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno); - return 0; - } - EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str); -#endif - for (i = 0; i < g_efa_ibv_gid_cnt; ++i) { - if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) { - EFA_INFO(FI_LOG_AV, "The peer is local.\n"); - return 1; - } - } - - return 0; -} - -/** - * @brief Add the conn to the LRU list. 
If the list is full, evict the least - * recently used entry at the front of the LRU list and add the latest one - * - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list - */ -static inline int efa_av_implicit_av_lru_insert(struct efa_av *av, - struct efa_conn *conn) -{ - size_t cur_size; - struct efa_ep_addr_hashable *ep_addr_hashable; - struct efa_conn *conn_to_release; - - /* Implicit AV size of 0 means we allow the implicit AV to grow without - * bound */ - if (av->implicit_av_size == 0) - goto out; - - cur_size = HASH_CNT(hh, av->util_av_implicit.hash); - if (cur_size <= av->implicit_av_size) - goto out; - - assert(ofi_genlock_held(&av->domain->srx_lock)); - - dlist_pop_front(&av->implicit_av_lru_list, struct efa_conn, - conn_to_release, implicit_av_lru_entry); - EFA_INFO(FI_LOG_AV, - "Evicting AV entry for peer implicit fi_addr %" PRIu64 - " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from " - "implicit AV\n", - conn_to_release->implicit_fi_addr, conn_to_release->ah->ahn, - conn_to_release->ep_addr->qpn, conn_to_release->ep_addr->qkey); - - /* Add to hashset with list of evicted peers */ - ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable)); - if (!ep_addr_hashable) { - EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n"); - return FI_ENOMEM; - } - memcpy(ep_addr_hashable, conn->ep_addr, sizeof(struct efa_ep_addr)); - HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable); - - assert(ofi_genlock_held(&av->domain->srx_lock)); - efa_conn_release(av, conn_to_release, true); - - assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size); - -out: - dlist_insert_tail(&conn->implicit_av_lru_entry, - &av->implicit_av_lru_list); - return FI_SUCCESS; -} - -/** - * @brief Insert the address into SHM provider's AV for RDM endpoints - * - * If shm transfer is enabled and the addr comes from local peer, - * 1. 
convert addr to format 'gid_qpn', which will be set as shm's ep name later. - * 2. insert gid_qpn into shm's av - * 3. store returned fi_addr from shm into the hash table - * - * @param[in] av address vector - * @param[in] conn efa_conn object - * @return On success return 0, otherwise return a negative error code - */ -int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn) -{ - int err, ret; - char smr_name[EFA_SHM_NAME_MAX]; - size_t smr_name_len; - - - assert(av->domain->info_type == EFA_INFO_RDM); - assert(conn->ep_addr); - - if (efa_is_local_peer(av, conn->ep_addr) && av->shm_rdm_av) { - if (av->shm_used >= efa_env.shm_av_size) { - EFA_WARN(FI_LOG_AV, - "Max number of shm AV entry (%d) has been reached.\n", - efa_env.shm_av_size); - return -FI_ENOMEM; - } - - smr_name_len = EFA_SHM_NAME_MAX; - err = efa_shm_ep_name_construct(smr_name, &smr_name_len, conn->ep_addr); - if (err != FI_SUCCESS) { - EFA_WARN(FI_LOG_AV, - "efa_rdm_ep_efa_addr_to_str() failed! err=%d\n", err); - return err; - } - - /* - * The shm provider supports FI_AV_USER_ID flag. This flag - * associates a user-assigned identifier with each av entry that is - * returned with any completion entry in place of the AV's address. - * In the fi_av_insert call below, the &conn->shm_fi_addr is both an input - * and an output. conn->shm_fi_addr is passed in the function with value as - * conn->fi_addr, which is the address of peer in efa provider's av. shm - * records this value as user id in its internal hashmap for the use of cq - * write, and then overwrite conn->shm_fi_addr as the actual fi_addr in shm's - * av. The efa provider should still use conn->shm_fi_addr for transmissions - * through shm ep. 
- */ - conn->shm_fi_addr = conn->fi_addr; - ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &conn->shm_fi_addr, FI_AV_USER_ID, NULL); - if (OFI_UNLIKELY(ret != 1)) { - EFA_WARN(FI_LOG_AV, - "Failed to insert address to shm provider's av: %s\n", - fi_strerror(-ret)); - return ret; - } - - EFA_INFO(FI_LOG_AV, - "Successfully inserted %s to shm provider's av. efa_fiaddr: %ld shm_fiaddr = %ld\n", - smr_name, conn->fi_addr, conn->shm_fi_addr); - - assert(conn->shm_fi_addr < efa_env.shm_av_size); - av->shm_used++; - } - - return 0; -} - -/** - * @brief release the rdm related resources of an efa_conn object. This function - * requires the caller to take the SRX lock because this function modifies the - * peer map and destroys peers which are accessed and modified in the CQ read - * path. - * - * this function release the shm av entry and rdm peer; - * - * @param[in] av address vector - * @param[in] conn efa_conn object - * peer - */ -void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn) -{ - int err; - struct efa_conn_ep_peer_map_entry *peer_map_entry, *tmp; - - assert(av->domain->info_type == EFA_INFO_RDM); - - assert((conn->fi_addr != FI_ADDR_NOTAVAIL && - conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) || - (conn->implicit_fi_addr != FI_ADDR_NOTAVAIL && - conn->fi_addr == FI_ADDR_NOTAVAIL)); - - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) { - err = fi_av_remove(av->shm_rdm_av, &conn->shm_fi_addr, 1, 0); - if (err) { - EFA_WARN(FI_LOG_AV, - "remove address from shm av failed! 
err=%d\n", - err); - } else { - av->shm_used--; - assert(conn->shm_fi_addr < efa_env.shm_av_size); - } - } - - assert(ofi_genlock_held(&av->domain->srx_lock)); - HASH_ITER(hh, conn->ep_peer_map, peer_map_entry, tmp) { - dlist_remove(&peer_map_entry->peer.ep_peer_list_entry); - efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr); - HASH_DEL(conn->ep_peer_map, peer_map_entry); - ofi_buf_free(peer_map_entry); - } - assert(HASH_CNT(hh, conn->ep_peer_map) == 0); -} - -/** - * @brief allocate an efa_conn object - * caller of this function must obtain av->util_av.lock or av->util_av_implicit.lock - * - * @param[in] av efa address vector - * @param[in] raw_addr raw efa address - * @param[in] flags flags application passed to fi_av_insert - * @param[in] context context application passed to fi_av_insert - * @param[in] insert_shm_av whether insert address to shm av - * @param[in] insert_implicit_av whether insert address to implicit AV - * @return on success, return a pointer to an efa_conn object - * otherwise, return NULL. errno will be set to a positive error code. 
- */ -struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, - uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av) -{ - struct util_av *util_av; - struct efa_cur_reverse_av **cur_reverse_av; - struct efa_prv_reverse_av **prv_reverse_av; - struct util_av_entry *util_av_entry = NULL; - struct efa_av_entry *efa_av_entry = NULL; - struct efa_conn *conn; - fi_addr_t fi_addr; - int err; - - if (flags & FI_SYNC_ERR) - memset(context, 0, sizeof(int)); - - if (insert_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - util_av = &av->util_av_implicit; - cur_reverse_av = &av->cur_reverse_av_implicit; - prv_reverse_av = &av->prv_reverse_av_implicit; - } else { - assert(ofi_genlock_held(&av->util_av.lock)); - util_av = &av->util_av; - cur_reverse_av = &av->cur_reverse_av; - prv_reverse_av = &av->prv_reverse_av; - } - - err = ofi_av_insert_addr(util_av, raw_addr, &fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! 
Error message: %s\n", - fi_strerror(err)); - return NULL; - } - - util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, - fi_addr); - efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr)); - - conn = &efa_av_entry->conn; - memset(conn, 0, sizeof(*conn)); - conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr; - assert(av->type == FI_AV_TABLE); - - conn->av = av; - - if (insert_implicit_av) { - conn->fi_addr = FI_ADDR_NOTAVAIL; - conn->implicit_fi_addr = fi_addr; - err = efa_av_implicit_av_lru_insert(av, conn); - if (err) - return NULL; - } else { - conn->fi_addr = fi_addr; - conn->implicit_fi_addr = FI_ADDR_NOTAVAIL; - } - - conn->ah = efa_ah_alloc(av->domain, raw_addr->raw, sizeof(struct efa_ah)); - if (!conn->ah) - goto err_release; - - if (insert_implicit_av) - dlist_insert_tail(&conn->ah_implicit_conn_list_entry, - &efa_proto_ah_from_ah(conn->ah)->implicit_conn_list); - - conn->shm_fi_addr = FI_ADDR_NOTAVAIL; - /* - * The efa_conn_alloc() call can be made in two situations: - * 1. application calls fi_av_insert API - * 2. efa progress engine get a message from unknown peer through efa device, - * which means peer is not local or shm is disabled for transmission. - * For situation 1, the shm av insertion should happen when the peer is local (insert_shm_av=1) - * For situation 2, the shm av insertion shouldn't happen anyway (insert_shm_av=0). 
- */ - if (av->domain->info_type == EFA_INFO_RDM && insert_shm_av) { - err = efa_conn_rdm_insert_shm_av(av, conn); - if (err) { - errno = -err; - goto err_release; - } - } - - err = efa_av_reverse_av_add(av, cur_reverse_av, prv_reverse_av, - container_of(conn, struct efa_av_entry, conn)); - if (err) { - if (av->domain->info_type == EFA_INFO_RDM) { - /* insert_implicit_av is only true for the CQ read path - * which already has the SRX lock */ - if (insert_implicit_av) - ofi_genlock_lock(&av->domain->srx_lock); - efa_conn_rdm_deinit(av, conn); - if (insert_implicit_av) - ofi_genlock_unlock(&av->domain->srx_lock); - } - goto err_release; - } - - insert_implicit_av ? av->used_implicit++ : av->used_explicit++; - - return conn; - -err_release: - if (conn->ah) - efa_ah_release(av->domain, conn->ah); - - conn->ep_addr = NULL; - err = ofi_av_remove_addr(util_av, fi_addr); - if (err) - EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", - err); - - return NULL; -} - -void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - struct efa_av_entry *av_entry = container_of(conn, struct efa_av_entry, conn); - - if (release_from_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, av_entry); - } else { - assert(ofi_genlock_held(&av->util_av.lock)); - efa_av_reverse_av_remove(&av->cur_reverse_av, - &av->prv_reverse_av, av_entry); - } -} - -void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - struct util_av *util_av; - struct util_av_entry *util_av_entry; - struct efa_av_entry *efa_av_entry; - char gidstr[INET6_ADDRSTRLEN]; - fi_addr_t fi_addr; - int err; - - if (release_from_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - util_av = &av->util_av_implicit; - fi_addr = conn->implicit_fi_addr; - } else { - 
assert(ofi_genlock_held(&av->util_av.lock)); - util_av = &av->util_av; - fi_addr = conn->fi_addr; - } - - util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); - assert(util_av_entry); - efa_av_entry = (struct efa_av_entry *) util_av_entry->data; - - err = ofi_av_remove_addr(util_av, fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); - } - - inet_ntop(AF_INET6, conn->ep_addr->raw, gidstr, INET6_ADDRSTRLEN); - EFA_INFO(FI_LOG_AV, "efa_conn released! conn[%p] GID[%s] QP[%u]\n", - conn, gidstr, conn->ep_addr->qpn); - - conn->ep_addr = NULL; - memset(efa_av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); -} - -/** - * @brief release an efa conn object - * Caller of this function must obtain av->util_av.lock or - * av->util_av_implicit.lock. This function obtains the SRX lock and is called - * from the AV removal path. - * - * @param[in] av address vector - * @param[in] conn efa_conn object pointer - * @param[in] release_from_implicit_av whether to release conn - * from implicit AV - * @param[in] grab_srx_lock whether to get the SRX lock before - * destroying the peer struct - */ -void efa_conn_release(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - assert(av->domain->info_type != EFA_INFO_RDM || - ofi_genlock_held(&av->domain->srx_lock)); - - efa_conn_release_reverse_av(av, conn, release_from_implicit_av); - if (av->domain->info_type == EFA_INFO_RDM) - efa_conn_rdm_deinit(av, conn); - - if (release_from_implicit_av) - dlist_remove(&conn->ah_implicit_conn_list_entry); - - efa_ah_release(av->domain, conn->ah); - - efa_conn_release_util_av(av, conn, release_from_implicit_av); - - release_from_implicit_av ? av->used_implicit-- : av->used_explicit--; -} - -/** - * @brief release an efa conn object - * Caller of this function must obtain av->util_av.lock or - * av->util_av_implicit.lock and the SRX lock. 
It also calls - * efa_ah_release_unsafe which does not acquire the util_domain lock the - * protects the AH map. This function is called when evicting an AH entry in the - * CQ read path which already has the SRX lock and the util_domain lock. - * - * @param[in] av address vector - * @param[in] conn efa_conn object pointer - * @param[in] release_from_implicit_av whether to release conn - * from implicit AV - * @param[in] grab_srx_lock whether to get the SRX lock before - * destroying the peer struct - */ -void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - assert(av->domain->info_type != EFA_INFO_RDM || - ofi_genlock_held(&av->domain->srx_lock)); - - assert(ofi_genlock_held(&av->domain->util_domain.lock)); - - efa_conn_release_reverse_av(av, conn, release_from_implicit_av); - if (av->domain->info_type == EFA_INFO_RDM) - efa_conn_rdm_deinit(av, conn); - - if (release_from_implicit_av) - dlist_remove(&conn->ah_implicit_conn_list_entry); - - efa_conn_release_util_av(av, conn, release_from_implicit_av); - - release_from_implicit_av ? efa_proto_ah_from_ah(conn->ah)->implicit_refcnt-- : - efa_proto_ah_from_ah(conn->ah)->explicit_refcnt--; - release_from_implicit_av ? av->used_implicit-- : av->used_explicit--; -} - -void efa_conn_ep_peer_map_insert(struct efa_conn *conn, struct efa_conn_ep_peer_map_entry *map_entry) -{ - HASH_ADD_PTR(conn->ep_peer_map, ep_ptr, map_entry); -} - -struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn, - struct efa_rdm_ep *ep) -{ - struct efa_conn_ep_peer_map_entry *map_entry; - - HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry); - - return map_entry ? 
&map_entry->peer : NULL; -} - -void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep) -{ - struct efa_conn_ep_peer_map_entry *map_entry; - - HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry); - assert(map_entry); - HASH_DELETE(hh, conn->ep_peer_map, map_entry); - ofi_buf_free(map_entry); -} diff --git a/prov/efa/src/efa_conn.h b/prov/efa/src/efa_conn.h deleted file mode 100644 index bafa293da5f..00000000000 --- a/prov/efa/src/efa_conn.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#ifndef EFA_CONN_H -#define EFA_CONN_H - -#include "ofi_util.h" -#include "rdm/efa_rdm_peer.h" - -struct efa_conn { - struct efa_ah *ah; - struct efa_ep_addr *ep_addr; - struct efa_av *av; - fi_addr_t implicit_fi_addr; - fi_addr_t fi_addr; - fi_addr_t shm_fi_addr; - struct dlist_entry implicit_av_lru_entry; - struct dlist_entry ah_implicit_conn_list_entry; - struct efa_conn_ep_peer_map_entry *ep_peer_map; -}; - -struct efa_conn_ep_peer_map_entry { - struct efa_rdm_ep *ep_ptr; - struct efa_rdm_peer peer; - UT_hash_handle hh; -}; - -void efa_conn_ep_peer_map_insert(struct efa_conn *conn, - struct efa_conn_ep_peer_map_entry *map_entry); - -struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn, - struct efa_rdm_ep *ep); - -void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep); - -int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn); - -void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn); - -struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, - uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av); - -void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn, - bool 
release_from_implicit_av); - -void efa_conn_release(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -#endif \ No newline at end of file From 2b8e742cf2bec3cee069f76af85fd918d3d09ff8 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 10 Apr 2026 15:01:05 -0600 Subject: [PATCH 15/16] prov/efa/test: split AV tests into base and protocol files Split efa_unit_test_av.c into two files: - efa_unit_test_av.c: base AV tests (efa-direct, AH counting, EP binding) - efa_unit_test_proto_av.c: protocol AV tests (implicit AV, LRU, peer map, reverse lookup, implicit-to-explicit migration, AH eviction) The new efa_unit_test_proto_av.c also contains the body of test_av_reverse_av_remove_qpn_collision, whose declaration and registration were added earlier but whose implementation had not been defined. Co-locating it with the other protocol AV tests keeps declaration, registration, and definition in sync. Update prov/efa/Makefile.include to build the new file. 
Signed-off-by: Seth Zegelstein --- prov/efa/Makefile.include | 1 + prov/efa/test/efa_unit_test_av.c | 756 ------------------------- prov/efa/test/efa_unit_test_proto_av.c | 741 ++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 5 +- prov/efa/test/efa_unit_tests.h | 5 +- 5 files changed, 750 insertions(+), 758 deletions(-) create mode 100644 prov/efa/test/efa_unit_test_proto_av.c diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 92f5b210a5f..a4d3197de8a 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -159,6 +159,7 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_domain.c \ prov/efa/test/efa_unit_test_ep.c \ prov/efa/test/efa_unit_test_av.c \ + prov/efa/test/efa_unit_test_proto_av.c \ prov/efa/test/efa_unit_test_cq.c \ prov/efa/test/efa_unit_test_cntr.c \ prov/efa/test/efa_unit_test_device.c \ diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 8072e52e2ad..4c68f2db12f 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -2,10 +2,7 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" -#include "efa_rdm_cq.h" -#include "efa_rdm_pke_req.h" #include "efa_av.h" -#include "rdm/efa_proto_av.h" /** * @brief Only works on nodes with EFA devices @@ -287,7 +284,6 @@ void test_av_multiple_ep_impl(struct efa_resource **state, char *fabric_name) fi_close(&ep2->fid); } - /** * @brief This test verifies that multiple endpoints can bind to the same AV * for the efa fabric @@ -309,755 +305,3 @@ void test_av_multiple_ep_efa_direct(struct efa_resource **state) { return test_av_multiple_ep_impl(state, EFA_DIRECT_FABRIC_NAME); } - -static void test_av_verify_av_hash_cnt(struct efa_av *av, struct efa_proto_av *proto_av, - int explicit_cur_av_count, - int explicit_prv_av_count, - int implicit_cur_av_count, - int implicit_prv_av_count) -{ - assert_int_equal(HASH_CNT(hh, av->util_av.hash), - explicit_cur_av_count + explicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, av->cur_reverse_av), - explicit_cur_av_count); - assert_int_equal(HASH_CNT(hh, av->prv_reverse_av), - explicit_prv_av_count); - - assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), - implicit_cur_av_count + implicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, proto_av->cur_reverse_av_implicit), - implicit_cur_av_count); - assert_int_equal(HASH_CNT(hh, proto_av->prv_reverse_av_implicit), - implicit_prv_av_count); -} - -/** - * @brief This test removes a peer and inserts it again - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_reinsertion(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_rdm_peer *peer; - struct efa_ep_addr raw_addr, raw_addr_2; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t fi_addr; - struct efa_av *av; - struct efa_proto_av *proto_av; - struct efa_rdm_ep *efa_rdm_ep; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - 
assert_int_equal(err, 0); - raw_addr.qpn = 174; - raw_addr.qkey = 0x1234; - - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); - - peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->av_entry->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); - - err = fi_av_remove(resource->av, &fi_addr, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); - - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); - - peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->av_entry->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); - - err = fi_av_remove(resource->av, &fi_addr, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); -} - -/** - * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then - * remove the first-inserted peer before the second. 
This reproduces the bug - * in efa_av_reverse_av_remove() where the code blindly deletes the - * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to - * a different (newer) conn. Removing the surviving second peer afterwards - * then hits a NULL prv_reverse_av_entry and SEGVs. - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t fi_addr1, fi_addr2; - struct efa_av *av; - struct efa_proto_av *proto_av; - struct efa_rdm_ep *efa_rdm_ep; - uint32_t ahn; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, - base_ep.util_ep.ep_fid); - ahn = efa_rdm_ep->self_ah->ahn; - - /* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */ - raw_addr.qpn = 100; - raw_addr.qkey = 0xAAAA; - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL); - assert_int_equal(err, 1); - test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - /* cur_reverse_av (ahn, 100) -> conn1 (fi_addr1) */ - assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), - fi_addr1); - - /* Insert peer2: same GID and qpn, different qkey. This pushes peer1's - * reverse-AV entry from cur_reverse_av into prv_reverse_av. 
*/ - raw_addr.qpn = 100; - raw_addr.qkey = 0xBBBB; - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL); - assert_int_equal(err, 1); - assert_int_not_equal(fi_addr1, fi_addr2); - test_av_verify_av_hash_cnt(av, proto_av, 1, 1, 0, 0); - /* cur_reverse_av (ahn, 100) now points to conn2 (fi_addr2); peer1 is - * in prv_reverse_av keyed by its own qkey. */ - assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), - fi_addr2); - - /* Remove peer1 first. Without the fix this would incorrectly delete - * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. */ - err = fi_av_remove(resource->av, &fi_addr1, 1, 0); - assert_int_equal(err, 0); - /* peer1's prv entry is gone; peer2's cur entry must still be intact. */ - test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), - fi_addr2); - - /* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry - * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */ - err = fi_av_remove(resource->av, &fi_addr2, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); - assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), - FI_ADDR_NOTAVAIL); -} - -/** - * @brief Generate a peer with a unique QPN and a random QKEY and insert it - * into the implicit AV - * - * The QPN is drawn from a static monotonic counter so every peer minted by - * this helper has a distinct (ahn, qpn) key in the reverse AV. Callers rely - * on this uniqueness to exercise LRU ordering and eviction behavior without - * tripping over the provider's QPN-collision path. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource) -{ - struct efa_ep_addr raw_addr; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - fi_addr_t implicit_fi_addr, test_addr; - struct efa_av *av; - struct efa_proto_av *proto_av; - uint32_t ahn; - int err; - - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - static uint16_t next_qpn = 0; - raw_addr.qpn = next_qpn++; - raw_addr.qkey = rand(); - ahn = efa_rdm_ep->self_ah->ahn; - - /* Manually insert into implicit AV */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - - err = efa_proto_av_insert_one(proto_av, &raw_addr, &implicit_fi_addr, 0, NULL, true, true); - - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); - - assert_int_equal(peer->av_entry->implicit_fi_addr, implicit_fi_addr); - assert_int_equal(peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); - assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); - - test_addr = efa_proto_av_reverse_lookup_implicit(proto_av, ahn, raw_addr.qpn, NULL); - assert_int_equal(test_addr, implicit_fi_addr); - - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - - return peer; -} - -/** - * @brief This test fakes a peer in the implicit AV and closes the AV with an - * implicit peer in it - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - 
test_av_get_peer_from_implicit_av(resource); -} - -/** - * @brief This test fakes a peer in the implicit AV and verifies that the peer - * is moved to the explicit AV when fi_av_insert is called - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_to_explicit(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr, raw_addr_2; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - fi_addr_t explicit_fi_addr, test_addr; - struct efa_av *av; - struct efa_proto_av *proto_av; - uint32_t ahn; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */ - peer = test_av_get_peer_from_implicit_av(resource); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - /* Modify the peer and verify that the peer is moved as-is */ - peer->next_msg_id = 355; - peer->flags |= EFA_RDM_PEER_IN_BACKOFF; - - /* Insert explicitly */ - raw_addr.qpn = efa_proto_av_entry_ep_addr(peer->av_entry)->qpn; - raw_addr.qkey = efa_proto_av_entry_ep_addr(peer->av_entry)->qkey; - err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); - test_av_verify_av_hash_cnt(av, proto_av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); - - peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); - assert_int_equal(peer->av_entry->fi_addr, explicit_fi_addr); - assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); - 
assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); - - ahn = efa_rdm_ep->self_ah->ahn; - test_addr = efa_proto_av_reverse_lookup(proto_av, ahn, raw_addr.qpn, NULL); - assert_int_equal(test_addr, explicit_fi_addr); - - /* Verify the manually set peer properties above */ - assert_int_equal(peer->next_msg_id, 355); - assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF); - - /* Unset the flag to make fi_av_remove easier */ - peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF; - - err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 0, 0); -} - -static void test_av_implicit_av_verify_lru_list_first_last_elements( - struct efa_proto_av *proto_av, struct efa_proto_av_entry *first_entry_expected, - struct efa_proto_av_entry *last_entry_expected) -{ - struct dlist_entry *first_entry, *last_entry; - struct efa_proto_av_entry *first_entry_actual, *last_entry_actual; - - first_entry = proto_av->implicit_av_lru_list.next; - last_entry = proto_av->implicit_av_lru_list.prev; - - first_entry_actual = container_of(first_entry, struct efa_proto_av_entry, - implicit_av_lru_entry); - last_entry_actual = container_of(last_entry, struct efa_proto_av_entry, - implicit_av_lru_entry); - - assert_ptr_equal(first_entry_actual, first_entry_expected); - assert_ptr_equal(last_entry_actual, last_entry_expected); -} - -/** - * @brief This test inserts three implicit peers and verifies that the last - * inserted and/or accessed peer is at the tail of the LRU list - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_av_lru_insertion(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer0, *peer1, *peer2; - struct efa_av *av; - struct efa_proto_av *proto_av; - fi_addr_t implicit_fi_addr; - uint32_t ahn; - int err; - - 
efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Manually insert first address into implicit AV */ - peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 1, 0); - - /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer0->av_entry); - - /* Manually insert second address into implicit AV */ - peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); - - /* Manually insert third address into implicit AV */ - peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); - - - /* Access peer0 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( - proto_av, ahn, efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 0); - - /* Expected LRU list: HEAD->peer1->peer2->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer0->av_entry); - - /* Access peer2 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( - proto_av, ahn, 
efa_proto_av_entry_ep_addr(peer2->av_entry)->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 2); - - /* Expected LRU list: HEAD->peer1->peer0->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer2->av_entry); - - - /* Access peer1 through repeated AV insertion path */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer1->av_entry), &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 1); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer2->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); - - /* Access peer2 through repeated AV insertion path */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer2->av_entry), &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 2); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); -} - -/** - * @brief This test sets the implicit AV size to 2 and inserts four implicit - * peers. It verifies that the least recently used peer is evicted. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_av_lru_eviction(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3; - struct efa_ep_addr peer1_ep_addr, peer2_ep_addr; - struct efa_ep_addr_hashable *efa_ep_addr_hashable; - struct efa_av *av; - struct efa_proto_av *proto_av; - fi_addr_t implicit_fi_addr; - uint32_t ahn; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Modify implicit AV size */ - proto_av->implicit_av_size = 2; - - /* Manually insert first address into implicit AV */ - peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 1, 0); - - /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer0->av_entry); - - /* Manually insert second address into implicit AV */ - peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); - - /* - * Snapshot peer1/peer2 ep_addr before they are evicted. After - * eviction the enclosing peer_map_entry is returned to the bufpool - * and peer->av_entry becomes stale memory (entry->ep_addr is zeroed - * by efa_proto_av_entry_release_util_av). 
- */ - memcpy(&peer1_ep_addr, efa_proto_av_entry_ep_addr(peer1->av_entry), - sizeof(struct efa_ep_addr)); - - /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer1->av_entry); - - /* Access peer0 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( - proto_av, ahn, efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 0); - - /* Expected LRU list: HEAD->peer1->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer1->av_entry, peer0->av_entry); - - /* Manually insert third address into implicit AV */ - peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); - /* Snapshot peer2 ep_addr before it too gets evicted later. 
*/ - memcpy(&peer2_ep_addr, efa_proto_av_entry_ep_addr(peer2->av_entry), - sizeof(struct efa_ep_addr)); - - /* Expected LRU list: HEAD->peer0->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer2->av_entry); - - /* Verify that peer1 is evicted and added to the evicted hashmap */ - assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 1); - HASH_FIND(hh, proto_av->evicted_peers_hashset, &peer1_ep_addr, - sizeof(struct efa_ep_addr), efa_ep_addr_hashable); - assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(&peer1_ep_addr, - &efa_ep_addr_hashable->addr), - 1); - - /* Access peer0 through repeated AV insertion path */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, efa_proto_av_entry_ep_addr(peer0->av_entry), &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 0); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer2->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer2->av_entry, peer0->av_entry); - - /* Manually insert fourth address into implicit AV */ - peer3 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, proto_av, 0, 0, 2, 0); - - /* Verify that peer2 is evicted and added to the evicted hashmap */ - assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 2); - HASH_FIND(hh, proto_av->evicted_peers_hashset, &peer2_ep_addr, - sizeof(struct efa_ep_addr), efa_ep_addr_hashable); - assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(&peer2_ep_addr, - &efa_ep_addr_hashable->addr), - 1); - - /* Expected LRU list: HEAD->peer0->peer3 */ - test_av_implicit_av_verify_lru_list_first_last_elements(proto_av, peer0->av_entry, peer3->av_entry); -} - -/** - * @brief This test tests the implicit_refcnt and 
explicit_refcnt fields of AH - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_refcnt(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - fi_addr_t fi_addr; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_domain *efa_domain; - struct efa_rdm_peer *peer; - struct efa_av *av; - struct efa_proto_av *proto_av; - struct efa_ah *efa_ah = NULL; - int err; - - int allowed_ahs = 1; - - g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; - g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; - - g_self_ah_cnt = 1; - g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; - assert_int_equal(g_ibv_ah_cnt, 0); - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - proto_av = container_of(av, struct efa_proto_av, efa_av); - - /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */ - assert_int_equal(g_ibv_ah_cnt, 1); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); - - /* Manually insert into implicit AV */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, &raw_addr, &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - - efa_ah = peer->av_entry->ah; - - 
assert_int_equal(g_ibv_ah_cnt, 2); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); - - /* Move implicit AV entry to explicit AV entry */ - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - - assert_int_equal(g_ibv_ah_cnt, 2); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); - - err = fi_av_remove(resource->av, &fi_addr, 1, 0); - assert_int_equal(err, 0); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); - - /* Only the self AH should be left */ - assert_int_equal(g_ibv_ah_cnt, 1); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when an explicit AV entry is inserted. It - * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_impl(bool explicit) -{ - fi_addr_t fi_addr; - struct efa_ep_addr raw_addr[2] = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct fid_fabric *fabric_fid[2]; - struct fid_domain *domain_fid[2]; - struct fid_ep *ep_fid[2]; - struct fid_cq *cq_fid[2]; - struct fid_av *av_fid[2]; - struct efa_domain *efa_domain[2]; - struct efa_rdm_ep *efa_rdm_ep[2]; - struct efa_rdm_peer *peer; - struct efa_av *efa_av[2]; - struct efa_proto_av *proto_av; - struct efa_ah *efa_ah = NULL; - int err; - struct fi_av_attr av_attr = {0}; - struct fi_cq_attr cq_attr = { - .format = FI_CQ_FORMAT_DATA - }; - struct fi_info *hints, *info, *cur; - int num_nic = 0; - - int allowed_ahs = 1; - - g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; - g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; - - hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME); - fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info); - for (cur = info; cur; cur = cur->next) { - num_nic++; - } - - if (num_nic < 2) { - fi_freeinfo(info); - fi_freeinfo(hints); - return; - } - - g_self_ah_cnt = 2; - g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */ - assert_int_equal(g_ibv_ah_cnt, 0); - - cur = info; - for (int i = 0; i < 2; i++) { - err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL); - assert_int_equal(err, 0); - - err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL); - assert_int_equal(err, 0); - - efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid); - - err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL); - assert_int_equal(err, 0); - - efa_av[i] = container_of(av_fid[i], struct efa_av, 
util_av.av_fid); - - err = fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL); - assert_int_equal(err, 0); - - err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL); - assert_int_equal(err, 0); - - efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0); - fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV); - - err = fi_enable(ep_fid[i]); - assert_int_equal(err, 0); - - err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len); - assert_int_equal(err, 0); - - cur = cur->next; - } - - proto_av = container_of(efa_av[0], struct efa_proto_av, efa_av); - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); - - /* Manually insert into implicit AV in first domain */ - ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, &raw_addr[0], &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); - ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - efa_ah = peer->av_entry->ah; - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); - - if (explicit) { - err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr); - } else { - ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_proto_av_insert_one(proto_av, &raw_addr[1], &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); - ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - } - - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - - efa_ah = peer->av_entry->ah; - if (explicit) { - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); - } else 
{ - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); - assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); - } - - if (explicit) { - err = fi_av_remove(av_fid[0], &fi_addr, 1, 0); - assert_int_equal(err, 0); - proto_av = container_of(efa_av[0], struct efa_proto_av, efa_av); - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); - } - - for (int i = 0; i < 2; i++) { - efa_rdm_ep[i]->self_ah = NULL; - fi_close(&ep_fid[i]->fid); - fi_close(&cq_fid[i]->fid); - fi_close(&av_fid[i]->fid); - fi_close(&domain_fid[i]->fid); - fi_close(&fabric_fid[i]->fid); - } - fi_freeinfo(hints); - fi_freeinfo(info); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when an explicit AV entry is inserted. It - * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state) -{ - test_ah_lru_eviction_impl(true); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when another implicit AV entry is inserted. - * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs. - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state) -{ - test_ah_lru_eviction_impl(false); -} diff --git a/prov/efa/test/efa_unit_test_proto_av.c b/prov/efa/test/efa_unit_test_proto_av.c new file mode 100644 index 00000000000..d6b96093293 --- /dev/null +++ b/prov/efa/test/efa_unit_test_proto_av.c @@ -0,0 +1,741 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include "efa_unit_tests.h" +#include "efa_rdm_cq.h" +#include "efa_rdm_pke_req.h" +#include "efa_av.h" + +static void test_av_verify_av_hash_cnt(struct efa_av *av, + int explicit_cur_av_count, + int explicit_prv_av_count, + int implicit_cur_av_count, + int implicit_prv_av_count) +{ + struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); + + assert_int_equal(HASH_CNT(hh, av->util_av.hash), + explicit_cur_av_count + explicit_prv_av_count); + assert_int_equal(HASH_CNT(hh, av->cur_reverse_av), + explicit_cur_av_count); + assert_int_equal(HASH_CNT(hh, av->prv_reverse_av), + explicit_prv_av_count); + + assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), + implicit_cur_av_count + implicit_prv_av_count); + assert_int_equal(HASH_CNT(hh, proto_av->cur_reverse_av_implicit), + implicit_cur_av_count); + assert_int_equal(HASH_CNT(hh, proto_av->prv_reverse_av_implicit), + implicit_prv_av_count); +} + +/** + * @brief This test removes a peer and inserts it again + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_reinsertion(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_peer *peer; + struct efa_ep_addr raw_addr, raw_addr_2; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_av *av; + struct efa_rdm_ep *efa_rdm_ep; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 174; + raw_addr.qkey = 0x1234; + + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + assert_int_equal(fi_addr, 0); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, fi_addr, 
&raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + assert_int_equal(fi_addr, 0); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Generate a peer with random QPN and QKEY and insert it into the implicit AV + * + * @param[in] resource struct efa_resource that is managed by the framework + */ +static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource) +{ + struct efa_ep_addr raw_addr; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer; + fi_addr_t implicit_fi_addr, test_addr; + struct efa_av *av; + uint32_t ahn; + int err; + + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + raw_addr.qpn = rand(); + raw_addr.qkey = rand(); + ahn = efa_rdm_ep->self_ah->ahn; + + /* Manually insert
into implicit AV */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &implicit_fi_addr, 0, NULL, true, true); + + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); + + assert_int_equal(peer->av_entry->implicit_fi_addr, implicit_fi_addr); + assert_int_equal(peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + test_addr = efa_proto_av_reverse_lookup_implicit(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL); + assert_int_equal(test_addr, implicit_fi_addr); + + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + + return peer; +} + +/** + * @brief This test fakes a peer in the implicit AV and closes the AV with an + * implicit peer in it + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + test_av_get_peer_from_implicit_av(resource); +} + +/** + * @brief This test fakes a peer in the implicit AV and verifies that the peer + * is moved to the explicit AV when fi_av_insert is called + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_to_explicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr, raw_addr_2; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer; + fi_addr_t explicit_fi_addr, test_addr; + struct efa_av *av; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, 
base_ep.util_ep.ep_fid); + + /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */ + peer = test_av_get_peer_from_implicit_av(resource); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + /* Modify the peer and verify that the peer is moved as-is */ + peer->next_msg_id = 355; + peer->flags |= EFA_RDM_PEER_IN_BACKOFF; + + /* Insert explicitly */ + raw_addr.qpn = efa_proto_av_entry_ep_addr(peer->av_entry)->qpn; + raw_addr.qkey = efa_proto_av_entry_ep_addr(peer->av_entry)->qkey; + err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); + assert_int_equal(peer->av_entry->fi_addr, explicit_fi_addr); + assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + ahn = efa_rdm_ep->self_ah->ahn; + test_addr = efa_proto_av_reverse_lookup(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL); + assert_int_equal(test_addr, explicit_fi_addr); + + /* Verify the manually set peer properties above */ + assert_int_equal(peer->next_msg_id, 355); + assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF); + + /* Unset the flag to make fi_av_remove easier */ + peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF; + + err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +static void test_av_implicit_av_verify_lru_list_first_last_elements( + struct efa_av *av, struct efa_proto_av_entry *first_conn_expected, + struct efa_proto_av_entry *last_conn_expected) +{ + struct dlist_entry *first_entry, *last_entry; + struct 
efa_proto_av_entry *first_conn_actual, *last_conn_actual; + + first_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.next; + last_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.prev; + + first_conn_actual = container_of(first_entry, struct efa_proto_av_entry, + implicit_av_lru_entry); + last_conn_actual = container_of(last_entry, struct efa_proto_av_entry, + implicit_av_lru_entry); + + assert_ptr_equal(first_conn_actual, first_conn_expected); + assert_ptr_equal(last_conn_actual, last_conn_expected); +} + +/** + * @brief This test inserts three implicit peers and verifies that the last + * inserted and/or accessed peer is at the tail of the LRU list + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_av_lru_insertion(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer0, *peer1, *peer2; + struct efa_av *av; + fi_addr_t implicit_fi_addr; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Manually insert first address into implicit AV */ + peer0 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Expected LRU list: HEAD->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry); + + /* Manually insert second address into implicit AV */ + peer1 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Expected LRU list: HEAD->peer0->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* Manually insert third address into implicit AV */ + peer2 = 
test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer1->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); + + + /* Access peer0 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 0); + + /* Expected LRU list: HEAD->peer1->peer2->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry); + + /* Access peer2 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer2->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 2); + + /* Expected LRU list: HEAD->peer1->peer0->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer2->av_entry); + + + /* Access peer1 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer1->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 1); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer2->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* 
Access peer2 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer2->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 2); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer1->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); +} + +/** + * @brief This test sets the implicit AV size to 2 and inserts four implicit + * peers. It verifies that the least recently used peer is evicted. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_av_lru_eviction(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3; + struct efa_ep_addr peer1_ep_addr, peer2_ep_addr; + struct efa_ep_addr_hashable *efa_ep_addr_hashable; + struct efa_av *av; + fi_addr_t implicit_fi_addr; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Modify implicit AV size */ + container_of(av, struct efa_proto_av, efa_av)->implicit_av_size = 2; + + /* Manually insert first address into implicit AV */ + peer0 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Expected LRU list: HEAD->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry); + + /* Manually insert second address into implicit AV */ + peer1 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 
0, 0, 2, 0); + + /* + * Snapshot peer1/peer2 ep_addr before they are evicted. After + * eviction the enclosing peer_map_entry is returned to the bufpool + * and peer1->av_entry / peer2->av_entry become stale memory. + */ + memcpy(&peer1_ep_addr, efa_proto_av_entry_ep_addr(peer1->av_entry), + sizeof(struct efa_ep_addr)); + + /* Expected LRU list: HEAD->peer0->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* Access peer0 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 0); + + /* Expected LRU list: HEAD->peer1->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry); + + /* Manually insert third address into implicit AV */ + peer2 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + /* Snapshot peer2 ep_addr before it too gets evicted later. 
*/ + memcpy(&peer2_ep_addr, efa_proto_av_entry_ep_addr(peer2->av_entry), + sizeof(struct efa_ep_addr)); + + /* Expected LRU list: HEAD->peer0->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); + + /* Verify that peer1 is evicted and added to the evicted hashmap */ + assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 1); + HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer1_ep_addr, + sizeof(struct efa_ep_addr), efa_ep_addr_hashable); + assert_non_null(efa_ep_addr_hashable); + assert_int_equal(efa_is_same_addr(&peer1_ep_addr, + &efa_ep_addr_hashable->addr), + 1); + + /* Access peer0 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer0->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Expected LRU list: HEAD->peer2->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->av_entry, peer0->av_entry); + + /* Manually insert fourth address into implicit AV */ + peer3 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Verify that peer2 is evicted and added to the evicted hashmap */ + assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 2); + HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer2_ep_addr, + sizeof(struct efa_ep_addr), efa_ep_addr_hashable); + assert_non_null(efa_ep_addr_hashable); + assert_int_equal(efa_is_same_addr(&peer2_ep_addr, + &efa_ep_addr_hashable->addr), + 1); + + /* Expected LRU list: HEAD->peer0->peer3 */ + 
test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer3->av_entry); +} + +/** + * @brief This test tests the implicit_refcnt and explicit_refcnt fields of AH + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_refcnt(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + fi_addr_t fi_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_domain *efa_domain; + struct efa_rdm_peer *peer; + struct efa_av *av; + struct efa_ah *efa_ah = NULL; + int err; + + int allowed_ahs = 1; + + g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; + g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; + + g_self_ah_cnt = 1; + g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; + assert_int_equal(g_ibv_ah_cnt, 0); + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + + /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */ + assert_int_equal(g_ibv_ah_cnt, 1); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + + /* Manually insert into implicit AV */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr); + 
ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + + efa_ah = peer->av_entry->ah; + + assert_int_equal(g_ibv_ah_cnt, 2); + + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + + /* Move implicit AV entry to explicit AV entry */ + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + + assert_int_equal(g_ibv_ah_cnt, 2); + + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + + /* Only the self AH should be left */ + assert_int_equal(g_ibv_ah_cnt, 1); +} + +/** + * @brief This test inserts one implicit AV entry and verifies that the + * implicitly created AH is evicted when an explicit AV entry is inserted. It + * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. 
+ * + * @param[in] explicit insert the second address explicitly via fi_av_insert (true) or into the implicit AV (false) + */ +void test_ah_lru_eviction_impl(bool explicit) +{ + fi_addr_t fi_addr; + struct efa_ep_addr raw_addr[2] = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct fid_fabric *fabric_fid[2]; + struct fid_domain *domain_fid[2]; + struct fid_ep *ep_fid[2]; + struct fid_cq *cq_fid[2]; + struct fid_av *av_fid[2]; + struct efa_domain *efa_domain[2]; + struct efa_rdm_ep *efa_rdm_ep[2]; + struct efa_rdm_peer *peer; + struct efa_av *efa_av[2]; + struct efa_ah *efa_ah = NULL; + int err; + struct fi_av_attr av_attr = {0}; + struct fi_cq_attr cq_attr = { + .format = FI_CQ_FORMAT_DATA + }; + struct fi_info *hints, *info, *cur; + int num_nic = 0; + + int allowed_ahs = 1; + + g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; + g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; + + hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME); + fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info); + for (cur = info; cur; cur = cur->next) { + num_nic++; + } + + if (num_nic < 2) { + fi_freeinfo(info); + fi_freeinfo(hints); + return; + } + + g_self_ah_cnt = 2; + g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */ + assert_int_equal(g_ibv_ah_cnt, 0); + + cur = info; + for (int i = 0; i < 2; i++) { + err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL); + assert_int_equal(err, 0); + + err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL); + assert_int_equal(err, 0); + + efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid); + + err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL); + assert_int_equal(err, 0); + + efa_av[i] = container_of(av_fid[i], struct efa_av, util_av.av_fid); + + err = 
fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL); + assert_int_equal(err, 0); + + err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL); + assert_int_equal(err, 0); + + efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0); + fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV); + + err = fi_enable(ep_fid[i]); + assert_int_equal(err, 0); + + err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len); + assert_int_equal(err, 0); + + cur = cur->next; + } + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); + + /* Manually insert into implicit AV in first domain */ + ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[0], &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); + ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); + efa_ah = peer->av_entry->ah; + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + + if (explicit) { + err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr); + } else { + ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[1], &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); + ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + } + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); + + efa_ah = peer->av_entry->ah; + if (explicit) { + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + } else { + 
assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + } + + if (explicit) { + err = fi_av_remove(av_fid[0], &fi_addr, 1, 0); + assert_int_equal(err, 0); + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); + } + + for (int i = 0; i < 2; i++) { + efa_rdm_ep[i]->self_ah = NULL; + fi_close(&ep_fid[i]->fid); + fi_close(&cq_fid[i]->fid); + fi_close(&av_fid[i]->fid); + fi_close(&domain_fid[i]->fid); + fi_close(&fabric_fid[i]->fid); + } + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/** + * @brief This test inserts one implicit AV entry and verifies that the + * implicitly created AH is evicted when an explicit AV entry is inserted. It + * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state) +{ + test_ah_lru_eviction_impl(true); +} + +/** + * @brief This test inserts one implicit AV entry and verifies that the + * implicitly created AH is evicted when another implicit AV entry is inserted. + * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state) +{ + test_ah_lru_eviction_impl(false); +} + +/** + * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then + * remove the first-inserted peer before the second. This reproduces the bug + * in efa_av_reverse_av_remove() where the code blindly deletes the + * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to + * a different (newer) conn. Removing the surviving second peer afterwards + * then hits a NULL prv_reverse_av_entry and SEGVs. 
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr1, fi_addr2; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_rdm_ep *efa_rdm_ep; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, + base_ep.util_ep.ep_fid); + ahn = efa_rdm_ep->self_ah->ahn; + + /* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */ + raw_addr.qpn = 100; + raw_addr.qkey = 0xAAAA; + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL); + assert_int_equal(err, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + /* cur_reverse_av (ahn, 100) -> entry1 (fi_addr1) */ + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), + fi_addr1); + + /* Insert peer2: same GID and qpn, different qkey. This pushes peer1's + * reverse-AV entry from cur_reverse_av into prv_reverse_av. */ + raw_addr.qpn = 100; + raw_addr.qkey = 0xBBBB; + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL); + assert_int_equal(err, 1); + assert_int_not_equal(fi_addr1, fi_addr2); + test_av_verify_av_hash_cnt(av, 1, 1, 0, 0); + /* cur_reverse_av (ahn, 100) now points to entry2 (fi_addr2); peer1 is + * in prv_reverse_av keyed by its own qkey. */ + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), + fi_addr2); + + /* Remove peer1 first. Without the fix this would incorrectly delete + * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. 
*/ + err = fi_av_remove(resource->av, &fi_addr1, 1, 0); + assert_int_equal(err, 0); + /* peer1's prv entry is gone; peer2's cur entry must still be intact. */ + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), + fi_addr2); + + /* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry + * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */ + err = fi_av_remove(resource->av, &fi_addr2, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), + FI_ADDR_NOTAVAIL); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 6f6f7771361..dc1d3ec8c5d 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -147,6 +147,9 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_ah_cnt_multi_av_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + /* end efa_unit_test_av.c */ + + /* begin efa_unit_test_proto_av.c */ cmocka_unit_test_setup_teardown(test_av_reinsertion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_reverse_av_remove_qpn_collision, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_av_implicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -156,7 +159,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_ah_refcnt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ah_lru_eviction_explicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), 
cmocka_unit_test_setup_teardown(test_ah_lru_eviction_implicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - /* end efa_unit_test_av.c */ + /* end efa_unit_test_proto_av.c */ /* begin efa_unit_test_ep.c */ cmocka_unit_test_setup_teardown(test_efa_device_construct_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 1c9b021f051..4151c4da858 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -117,6 +117,9 @@ void test_efa_ah_cnt_multi_av_efa(); void test_efa_ah_cnt_multi_av_efa_direct(); void test_av_multiple_ep_efa(); void test_av_multiple_ep_efa_direct(); +/* end efa_unit_test_av.c */ + +/* begin efa_unit_test_proto_av.c */ void test_av_reinsertion(); void test_av_reverse_av_remove_qpn_collision(); void test_av_implicit(); @@ -126,7 +129,7 @@ void test_av_implicit_av_lru_eviction(); void test_ah_refcnt(); void test_ah_lru_eviction_explicit_av_insert(); void test_ah_lru_eviction_implicit_av_insert(); -/* end efa_unit_test_av.c */ +/* end efa_unit_test_proto_av.c */ void test_efa_device_construct_error_handling(); void test_efa_rdm_ep_ignore_missing_host_id_file(); From 3e8d13033471937f05a1312e59d6e7b2f0e63085 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 10 Apr 2026 15:01:17 -0600 Subject: [PATCH 16/16] prov/efa/test: add new unit tests for AV coverage Add thirteen new unit tests that exercise the refactored AV code paths. The existing suite focused mostly on AH refcount and endpoint-binding behavior; these tests add direct coverage of entry insert/remove/lookup, protocol-AV-only behaviors, and a handful of error paths that previously had no assertions. 
Base AV coverage adds test_av_insert_remove_lookup_efa_direct, which walks a full insert/lookup/remove cycle against the base efa_av, and test_av_base_addr_to_entry_invalid, which confirms that efa_av_addr_to_entry returns NULL for FI_ADDR_NOTAVAIL and FI_ADDR_UNSPEC inputs. Protocol AV coverage exercises lookup and lifetime of efa_proto_av_entry: test_av_proto_reverse_lookup_explicit covers reverse lookup after insert and remove, test_av_proto_prv_reverse_av covers QPN reuse falling through to prv_reverse_av, test_av_proto_addr_to_entry_after_remove checks the validity bit (ah != NULL) after a remove, test_av_proto_insert_remove_with_peer drives the per-endpoint peer-map lifecycle alongside insert and remove, test_av_proto_batch_insert covers a multi-address fi_av_insert, and test_av_proto_remove_nonexistent covers the error path for a fi_addr that was never inserted. Implicit-AV behavior adds test_av_implicit_to_explicit_peer_updated, which verifies that migrating a peer from the implicit AV to the explicit AV preserves peer and AH identity; test_av_implicit_av_unbounded, which confirms the implicit AV grows without an explicit cap; and test_av_implicit_av_lru_move_single, which exercises LRU move when only a single implicit entry exists. Input validation adds test_av_proto_insert_invalid_address for inserts of all-zero endpoint addresses and test_av_proto_open_unsupported_attrs for fi_av_open with attributes the protocol AV does not accept. 
Signed-off-by: Seth Zegelstein --- prov/efa/test/efa_unit_test_av.c | 79 ++++ prov/efa/test/efa_unit_test_proto_av.c | 495 +++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 14 + prov/efa/test/efa_unit_tests.h | 13 + 4 files changed, 601 insertions(+) diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index 4c68f2db12f..927f130d552 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -305,3 +305,82 @@ void test_av_multiple_ep_efa_direct(struct efa_resource **state) { return test_av_multiple_ep_impl(state, EFA_DIRECT_FABRIC_NAME); } + +/** + * @brief Test base AV (efa-direct) insert, lookup, remove cycle + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_insert_remove_lookup_efa_direct(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}, raw_addr_out = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_av *av; + struct efa_av_entry *entry; + int err, num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 7; + raw_addr.qkey = 0xABCD; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + assert_int_equal(fi_addr, 0); + assert_int_equal(av->used, 1); + + /* Verify entry is accessible and fields are correct */ + entry = efa_av_addr_to_entry(av, fi_addr); + assert_non_null(entry); + assert_non_null(entry->ah); + assert_int_equal(entry->fi_addr, fi_addr); + assert_int_equal(efa_av_entry_ep_addr(entry)->qpn, 7); + assert_int_equal(efa_av_entry_ep_addr(entry)->qkey, 0xABCD); + + /* Lookup should return the same address */ + raw_addr_len = sizeof(raw_addr_out); + err = 
fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(raw_addr_out.qpn, 7); + assert_int_equal(raw_addr_out.qkey, 0xABCD); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_out), 1); + + /* Remove and verify */ + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + assert_int_equal(av->used, 0); + + /* Entry should be NULL after remove */ + entry = efa_av_addr_to_entry(av, fi_addr); + assert_null(entry); + + /* Lookup should fail after remove */ + err = fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len); + assert_int_not_equal(err, 0); +} + +/** + * @brief Test base AV (efa-direct) addr_to_entry returns NULL for invalid fi_addr + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_base_addr_to_entry_invalid(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_av *av; + struct efa_av_entry *entry; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + + entry = efa_av_addr_to_entry(av, FI_ADDR_NOTAVAIL); + assert_null(entry); + + entry = efa_av_addr_to_entry(av, FI_ADDR_UNSPEC); + assert_null(entry); +} diff --git a/prov/efa/test/efa_unit_test_proto_av.c b/prov/efa/test/efa_unit_test_proto_av.c index d6b96093293..a7a898e9e01 100644 --- a/prov/efa/test/efa_unit_test_proto_av.c +++ b/prov/efa/test/efa_unit_test_proto_av.c @@ -120,6 +120,7 @@ static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resourc ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &implicit_fi_addr, 0, NULL, true, true); + assert_int_equal(err, 0); peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); @@ -666,6 +667,375 @@ void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state) 
test_ah_lru_eviction_impl(false); } +/** + * @brief Test proto AV explicit reverse lookup returns correct fi_addr + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_reverse_lookup_explicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr, lookup_addr; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_rdm_ep *efa_rdm_ep; + uint32_t ahn; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + ahn = efa_rdm_ep->self_ah->ahn; + + /* Reverse lookup on empty AV should return NOTAVAIL */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 42; + raw_addr.qkey = 0x5678; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Reverse lookup should find the entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, fi_addr); + + /* Lookup with wrong QPN should return NOTAVAIL */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 99, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); + + /* After remove, reverse lookup should return FI_ADDR_NOTAVAIL */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); +} + +/** + * @brief Test that proto 
AV addr_to_entry returns NULL after entry is removed + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_addr_to_entry_after_remove(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + + /* addr_to_entry on empty AV should return NULL */ + entry = efa_proto_av_addr_to_entry(proto_av, 0); + assert_null(entry); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 99; + raw_addr.qkey = 0x9999; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + + /* Entry should be found with correct fields */ + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_non_null(entry); + assert_non_null(entry->ah); + assert_int_equal(entry->fi_addr, fi_addr); + assert_int_equal(entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 99); + assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x9999); + + /* Remove and verify entry is no longer valid */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_null(entry); +} + +/** + * @brief Test proto AV insert/remove with peer creation via get_peer + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_insert_remove_with_peer(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + 
fi_addr_t fi_addr; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer, *peer2; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 55; + raw_addr.qkey = 0x5555; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Create peer via get_peer */ + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_non_null(peer); + assert_non_null(peer->av_entry); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qpn, 55); + assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qkey, 0x5555); + assert_ptr_equal(peer->ep, efa_rdm_ep); + + /* Peer map lookup should find the same peer */ + peer2 = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_ptr_equal(peer2, peer); + + /* Verify peer map on the entry itself */ + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_non_null(entry); + assert_ptr_equal(efa_proto_av_entry_ep_peer_map_lookup(entry, efa_rdm_ep), peer); + + /* Remove — peer is destroyed during av_remove */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Test proto AV implicit insert followed by explicit insert of same addr + * verifies the peer's av_entry pointer is updated to the explicit entry + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void 
test_av_implicit_to_explicit_peer_updated(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *implicit_peer, *explicit_peer; + struct efa_av *av; + struct efa_proto_av *proto_av; + fi_addr_t implicit_fi_addr, explicit_fi_addr; + struct efa_ah *ah_before; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Insert implicit peer */ + implicit_peer = test_av_get_peer_from_implicit_av(resource); + assert_non_null(implicit_peer); + implicit_fi_addr = implicit_peer->av_entry->implicit_fi_addr; + assert_int_equal(implicit_peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); + assert_int_not_equal(implicit_fi_addr, FI_ADDR_NOTAVAIL); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Remember the AH — it should be reused after migration */ + ah_before = implicit_peer->av_entry->ah; + assert_non_null(ah_before); + + /* Now insert explicitly with the same address */ + struct efa_ep_addr raw_addr; + memcpy(&raw_addr, implicit_peer->av_entry->ep_addr, EFA_EP_ADDR_LEN); + + err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); + assert_int_equal(err, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Implicit entry should be gone */ + assert_null(efa_proto_av_addr_to_entry_implicit(proto_av, implicit_fi_addr)); + + /* Get peer via explicit addr — should be the same peer with updated av_entry */ + explicit_peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); + assert_non_null(explicit_peer); + assert_ptr_equal(explicit_peer, implicit_peer); + assert_int_equal(explicit_peer->av_entry->fi_addr, explicit_fi_addr); + assert_int_equal(explicit_peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + + /* AH should be the same object 
(reused, not reallocated) */ + assert_ptr_equal(explicit_peer->av_entry->ah, ah_before); + + fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Test proto AV batch insert of multiple addresses in one fi_av_insert call + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_batch_insert(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addrs[3] = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addrs[3]; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr, i; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addrs[0], &raw_addr_len), 0); + memcpy(&raw_addrs[1], &raw_addrs[0], sizeof(struct efa_ep_addr)); + memcpy(&raw_addrs[2], &raw_addrs[0], sizeof(struct efa_ep_addr)); + raw_addrs[0].qpn = 10; raw_addrs[0].qkey = 0x1000; + raw_addrs[1].qpn = 11; raw_addrs[1].qkey = 0x1001; + raw_addrs[2].qpn = 12; raw_addrs[2].qkey = 0x1002; + + num_addr = fi_av_insert(resource->av, raw_addrs, 3, fi_addrs, 0, NULL); + assert_int_equal(num_addr, 3); + + /* All three should have distinct fi_addrs */ + assert_int_not_equal(fi_addrs[0], fi_addrs[1]); + assert_int_not_equal(fi_addrs[1], fi_addrs[2]); + assert_int_not_equal(fi_addrs[0], fi_addrs[2]); + + test_av_verify_av_hash_cnt(av, 3, 0, 0, 0); + + /* Verify each entry is accessible with correct QPN */ + for (i = 0; i < 3; i++) { + entry = efa_proto_av_addr_to_entry(proto_av, fi_addrs[i]); + assert_non_null(entry); + assert_non_null(entry->ah); + assert_int_equal(entry->fi_addr, fi_addrs[i]); + assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 10 + i); + 
assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x1000 + i); + } + + /* Remove one at a time and verify counts */ + fi_av_remove(resource->av, &fi_addrs[0], 1, 0); + test_av_verify_av_hash_cnt(av, 2, 0, 0, 0); + assert_null(efa_proto_av_addr_to_entry(proto_av, fi_addrs[0])); + + fi_av_remove(resource->av, &fi_addrs[1], 1, 0); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + fi_av_remove(resource->av, &fi_addrs[2], 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Test proto AV remove of non-existent address returns error + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_remove_nonexistent(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + fi_addr_t bad_addr = 9999; + fi_addr_t notavail = FI_ADDR_NOTAVAIL; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + /* Remove with out-of-range fi_addr */ + err = fi_av_remove(resource->av, &bad_addr, 1, 0); + assert_int_not_equal(err, 0); + + /* Remove with FI_ADDR_NOTAVAIL */ + err = fi_av_remove(resource->av, &notavail, 1, 0); + assert_int_not_equal(err, 0); +} + +/** + * @brief Test proto AV prv_reverse_av path: insert two addresses with the same + * GID and QPN but different QKEYs. The second insert takes over the + * cur_reverse_av slot and the first entry moves to prv_reverse_av. 
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_prv_reverse_av(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr1 = {0}, raw_addr2 = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr1, fi_addr2; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_proto_av_entry *entry1, *entry2; + fi_addr_t lookup_addr; + uint32_t ahn; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + ahn = efa_rdm_ep->self_ah->ahn; + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr1, &raw_addr_len), 0); + memcpy(&raw_addr2, &raw_addr1, sizeof(struct efa_ep_addr)); + + /* Insert first address with qpn=20, qkey=0xAAAA */ + raw_addr1.qpn = 20; + raw_addr1.qkey = 0xAAAA; + num_addr = fi_av_insert(resource->av, &raw_addr1, 1, &fi_addr1, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Verify first entry */ + entry1 = efa_proto_av_addr_to_entry(proto_av, fi_addr1); + assert_non_null(entry1); + assert_int_equal(efa_proto_av_entry_ep_addr(entry1)->qkey, 0xAAAA); + + /* Reverse lookup should find first entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL); + assert_int_equal(lookup_addr, fi_addr1); + + /* Insert second address with same qpn=20 but different qkey=0xBBBB. 
+ * This simulates QPN reuse — the first entry moves to prv_reverse_av */ + raw_addr2.qpn = 20; + raw_addr2.qkey = 0xBBBB; + num_addr = fi_av_insert(resource->av, &raw_addr2, 1, &fi_addr2, 0, NULL); + assert_int_equal(num_addr, 1); + assert_int_not_equal(fi_addr1, fi_addr2); + + /* cur_reverse_av has 1 entry (the latest), prv_reverse_av has 1 (the old) */ + test_av_verify_av_hash_cnt(av, 1, 1, 0, 0); + + /* Verify second entry */ + entry2 = efa_proto_av_addr_to_entry(proto_av, fi_addr2); + assert_non_null(entry2); + assert_int_equal(efa_proto_av_entry_ep_addr(entry2)->qkey, 0xBBBB); + + /* Both entries should share the same AH (same GID) */ + assert_ptr_equal(entry1->ah, entry2->ah); + + /* Reverse lookup without connid should return the current (latest) entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL); + assert_int_equal(lookup_addr, fi_addr2); + + /* Remove in reverse order: current entry first, then previous */ + fi_av_remove(resource->av, &fi_addr2, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 1, 0, 0); + + fi_av_remove(resource->av, &fi_addr1, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + /** * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then * remove the first-inserted peer before the second. This reproduces the bug @@ -739,3 +1109,128 @@ void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL), FI_ADDR_NOTAVAIL); } + +/** + * @brief Inserting an all-zero GID into the protocol AV must be rejected. + * + * efa_av_is_valid_address() returns 0 for all-zero GIDs. fi_av_insert + * should skip the bad address and return 0 (no address inserted), and + * the output fi_addr should be FI_ADDR_NOTAVAIL. 
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_proto_insert_invalid_address(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr bad_addr = {0};
+	fi_addr_t out_addr = 0;
+	int inserted;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* Only qpn/qkey are filled in; bad_addr.raw stays all-zero */
+	bad_addr.qkey = 0x1234;
+	bad_addr.qpn = 5;
+
+	inserted = fi_av_insert(resource->av, &bad_addr, 1, &out_addr, 0, NULL);
+	assert_int_equal(inserted, 0);
+	assert_int_equal(out_addr, FI_ADDR_NOTAVAIL);
+}
+
+/**
+ * @brief Setting implicit_av_size to 0 removes the cap on the implicit AV,
+ * so no entry is ever evicted.
+ *
+ * Adds several implicit peers, then checks that util_av_implicit holds every
+ * one of them and that evicted_peers_hashset is still empty.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_unbounded(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	const int peer_cnt = 10;
+	struct efa_av *base_av;
+	struct efa_proto_av *proto_av;
+	int left = peer_cnt;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	base_av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(base_av, struct efa_proto_av, efa_av);
+
+	/* A size of 0 switches the eviction limit off */
+	proto_av->implicit_av_size = 0;
+
+	while (left-- > 0)
+		test_av_get_peer_from_implicit_av(resource);
+
+	/* Every inserted peer must still be present in the implicit AV ... */
+	assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), peer_cnt);
+	/* ... and nothing may have landed in the evicted set */
+	assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 0);
+}
+
+/**
+ * @brief efa_proto_av_open rejects attr->name and attr->flags (both unsupported)
+ *
+ * Ensures the early-return error paths in efa_proto_av_open are exercised.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_proto_open_unsupported_attrs(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct fid_av *new_av = NULL;
+	struct fi_av_attr attr = {0};
+	int ret;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* Case 1: an AV with attr->name set must be refused */
+	attr.name = "foo";
+	ret = fi_av_open(resource->domain, &attr, &new_av, NULL);
+	assert_int_equal(ret, -FI_ENOSYS);
+	assert_null(new_av);
+	attr.name = NULL;
+
+	/* Case 2: a non-zero attr->flags must be refused as well */
+	attr.flags = 1;
+	ret = fi_av_open(resource->domain, &attr, &new_av, NULL);
+	assert_int_equal(ret, -FI_ENOSYS);
+	assert_null(new_av);
+}
+
+/**
+ * @brief efa_proto_av_implicit_av_lru_entry_move on a single-element list
+ *
+ * Insert exactly one implicit peer; the LRU list has exactly one node.
+ * Call efa_proto_av_implicit_av_lru_entry_move on it — this exercises the
+ * dlist_entry_in_list assertion on the smallest non-empty list.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_lru_move_single(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_peer *peer;
+	struct efa_proto_av *proto_av;
+	struct efa_av *av;
+	struct efa_rdm_ep *efa_rdm_ep;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
+				  base_ep.util_ep.ep_fid);
+
+	peer = test_av_get_peer_from_implicit_av(resource);
+	assert_non_null(peer);
+
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	efa_proto_av_implicit_av_lru_entry_move(proto_av, peer->av_entry);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+	/* The lone entry is expected as both first and last LRU element */
+	test_av_implicit_av_verify_lru_list_first_last_elements(
+		av, peer->av_entry, peer->av_entry);
+}
diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c
index dc1d3ec8c5d..49fd5672326 100644
--- a/prov/efa/test/efa_unit_tests.c
+++ b/prov/efa/test/efa_unit_tests.c
@@ -147,6 +147,8 @@ int main(void)
 		cmocka_unit_test_setup_teardown(test_efa_ah_cnt_multi_av_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_insert_remove_lookup_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_base_addr_to_entry_invalid, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		/* end efa_unit_test_av.c */
 
 		/* begin efa_unit_test_proto_av.c */
@@ -159,6 +161,17 @@ int main(void)
 		cmocka_unit_test_setup_teardown(test_ah_refcnt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_ah_lru_eviction_explicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_ah_lru_eviction_implicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_reverse_lookup_explicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_addr_to_entry_after_remove, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_insert_remove_with_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_to_explicit_peer_updated, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_batch_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_remove_nonexistent, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_prv_reverse_av, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_insert_invalid_address, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_av_unbounded, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_open_unsupported_attrs, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_av_lru_move_single, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		/* end efa_unit_test_proto_av.c */
 
 		/* begin efa_unit_test_ep.c */
diff --git 
a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index 4151c4da858..06f0405f911 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -117,6 +117,8 @@ void test_efa_ah_cnt_multi_av_efa(); void test_efa_ah_cnt_multi_av_efa_direct(); void test_av_multiple_ep_efa(); void test_av_multiple_ep_efa_direct(); +void test_av_insert_remove_lookup_efa_direct(); +void test_av_base_addr_to_entry_invalid(); /* end efa_unit_test_av.c */ /* begin efa_unit_test_proto_av.c */ @@ -129,6 +131,17 @@ void test_av_implicit_av_lru_eviction(); void test_ah_refcnt(); void test_ah_lru_eviction_explicit_av_insert(); void test_ah_lru_eviction_implicit_av_insert(); +void test_av_proto_reverse_lookup_explicit(); +void test_av_proto_addr_to_entry_after_remove(); +void test_av_proto_insert_remove_with_peer(); +void test_av_implicit_to_explicit_peer_updated(); +void test_av_proto_batch_insert(); +void test_av_proto_remove_nonexistent(); +void test_av_proto_prv_reverse_av(); +void test_av_proto_insert_invalid_address(); +void test_av_implicit_av_unbounded(); +void test_av_proto_open_unsupported_attrs(); +void test_av_implicit_av_lru_move_single(); /* end efa_unit_test_proto_av.c */ void test_efa_device_construct_error_handling();