diff --git a/libfabric.vcxproj b/libfabric.vcxproj
index 76d1f73ba48..d59ea6e0a68 100644
--- a/libfabric.vcxproj
+++ b/libfabric.vcxproj
@@ -873,7 +873,7 @@
-
+
@@ -1018,7 +1018,7 @@
-
+
diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include
index 3d7780e389f..a4d3197de8a 100644
--- a/prov/efa/Makefile.include
+++ b/prov/efa/Makefile.include
@@ -36,7 +36,7 @@ _efa_files = \
prov/efa/src/efa_shm.c \
prov/efa/src/efa_av.c \
prov/efa/src/efa_ah.c \
- prov/efa/src/efa_conn.c \
+ prov/efa/src/rdm/efa_proto_av.c \
prov/efa/src/efa_domain.c \
prov/efa/src/efa_fabric.c \
prov/efa/src/efa_mr.c \
@@ -89,7 +89,6 @@ _efa_headers = \
prov/efa/src/efa.h \
prov/efa/src/efa_av.h \
prov/efa/src/efa_ah.h \
- prov/efa/src/efa_conn.h \
prov/efa/src/efa_mr.h \
prov/efa/src/efa_shm.h \
prov/efa/src/efa_hmem.h \
@@ -117,6 +116,7 @@ _efa_headers = \
prov/efa/src/efa_data_path_direct_internal.h \
prov/efa/src/efa_mmio.h \
prov/efa/src/rdm/efa_rdm_peer.h \
+ prov/efa/src/rdm/efa_proto_av.h \
prov/efa/src/rdm/efa_rdm_cq.h \
prov/efa/src/rdm/efa_rdm_cntr.h \
prov/efa/src/rdm/efa_rdm_ep.h \
@@ -159,6 +159,7 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \
prov/efa/test/efa_unit_test_domain.c \
prov/efa/test/efa_unit_test_ep.c \
prov/efa/test/efa_unit_test_av.c \
+ prov/efa/test/efa_unit_test_proto_av.c \
prov/efa/test/efa_unit_test_cq.c \
prov/efa/test/efa_unit_test_cntr.c \
prov/efa/test/efa_unit_test_device.c \
diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h
index 7e85962cbee..b59d796cbc6 100644
--- a/prov/efa/src/efa.h
+++ b/prov/efa/src/efa.h
@@ -52,6 +52,7 @@
#include "rdm/efa_rdm_ope.h"
#include "rdm/efa_rdm_pke.h"
#include "rdm/efa_rdm_peer.h"
+#include "rdm/efa_proto_av.h"
#include "rdm/efa_rdm_util.h"
#include "fi_ext_efa.h"
diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c
index 53bf736f1fd..12d2167d835 100644
--- a/prov/efa/src/efa_ah.c
+++ b/prov/efa/src/efa_ah.c
@@ -5,75 +5,18 @@
#include "efa.h"
#include "efa_ah.h"
-#include "efa_conn.h"
#include
-void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah);
-
/**
- * @brief Move the AH to the end of the LRU list to indicate that it is the
- * most recently used entry
+ * @brief Emit a detailed warning for ibv_create_ah EINVAL.
*
- * This function is not called in the efa_rdm_ep_get_peer so that we don't add
- * extra latency to the critical path with explicit AV insertion. We use the LRU
- * list to remove AH entries with only implicit AV entries, so it is OK to do
- * that.
+ * The most common reasons for EINVAL are cross-AZ addressing, invalid
+ * remote GID, and invalid PD. Log both local and remote GIDs plus the
+ * PD pointer to help operators diagnose failures from logs alone.
*
- * @param[in] av efa address vector
- * @param[in] conn efa conn to be added to the LRU list
+ * @param[in] domain efa domain (for local GID and PD)
+ * @param[in] gid remote GID that failed
*/
-void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain,
- struct efa_ah *ah)
-{
- assert(ah->implicit_refcnt > 0 || ah->explicit_refcnt > 0);
- assert(dlist_entry_in_list(&domain->ah_lru_list,
- &ah->domain_lru_ah_list_entry));
-
- dlist_remove(&ah->domain_lru_ah_list_entry);
- dlist_insert_tail(&ah->domain_lru_ah_list_entry,
- &domain->ah_lru_list);
-}
-
-static inline int efa_ah_implicit_av_evict_ah(struct efa_domain *domain) {
- struct efa_conn *conn_to_release;
- struct efa_ah *ah_tmp, *ah_to_release = NULL;
- struct dlist_entry *tmp;
-
- dlist_foreach_container (&domain->ah_lru_list, struct efa_ah, ah_tmp,
- domain_lru_ah_list_entry) {
- if (ah_tmp->explicit_refcnt == 0) {
- ah_to_release = ah_tmp;
- break;
- }
- }
-
- if (!ah_to_release) {
- EFA_WARN(FI_LOG_AV,
- "AH creation for implicit AV entry failed with ENOMEM "
- "but no AH entries available to evict\n");
- return -FI_ENOMEM;
- }
-
- assert(ah_to_release->implicit_refcnt > 0);
-
- dlist_foreach_container_safe(&ah_to_release->implicit_conn_list,
- struct efa_conn, conn_to_release,
- ah_implicit_conn_list_entry, tmp) {
-
- assert(conn_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
- conn_to_release->fi_addr == FI_ADDR_NOTAVAIL);
-
- efa_conn_release_ah_unsafe(conn_to_release->av, conn_to_release, true);
- }
-
- if (ah_to_release->implicit_refcnt == 0 &&
- ah_to_release->explicit_refcnt == 0) {
- efa_ah_destroy_ah(domain, ah_to_release);
- }
-
- return FI_SUCCESS;
-}
-
static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t *gid)
{
char remote_gid_str[INET6_ADDRSTRLEN] = {0};
@@ -95,15 +38,20 @@ static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t *
}
/**
- * @brief allocate an ibv_ah object from GID.
- * This function use a hash map to store GID to ibv_ah map,
- * and re-use ibv_ah for same GID
+ * @brief allocate an ibv_ah from GID, reusing existing AH if possible
+ *
+ * Uses a hash map to store GID to ibv_ah mapping and reuses ibv_ah for
+ * the same GID. If ibv_create_ah fails, returns NULL with errno set.
+ * The caller is responsible for handling ENOMEM (e.g. by evicting AH
+ * entries and retrying).
*
- * @param[in] domain efa_domain
- * @param[in] gid GID
+ * @param[in] domain efa domain
+ * @param[in] gid GID
+ * @param[in] alloc_size size to allocate (sizeof(efa_ah) or larger for protocol wrapper)
+ * @return pointer to efa_ah on success, NULL on failure (errno set)
*/
struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av)
+ size_t alloc_size)
{
struct ibv_pd *ibv_pd = domain->ibv_pd;
struct efa_ah *efa_ah;
@@ -111,21 +59,23 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
struct efadv_ah_attr efa_ah_attr = { 0 };
int err;
+ assert(alloc_size >= sizeof(struct efa_ah));
+
efa_ah = NULL;
ofi_genlock_lock(&domain->util_domain.lock);
HASH_FIND(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah);
if (efa_ah) {
- insert_implicit_av ? efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++;
- efa_ah_implicit_av_lru_ah_move(domain, efa_ah);
+ efa_ah->refcnt++;
ofi_genlock_unlock(&domain->util_domain.lock);
return efa_ah;
}
- efa_ah = malloc(sizeof(struct efa_ah));
+ efa_ah = calloc(1, alloc_size);
if (!efa_ah) {
errno = FI_ENOMEM;
EFA_WARN(FI_LOG_AV, "cannot allocate memory for efa_ah\n");
+ ofi_genlock_unlock(&domain->util_domain.lock);
return NULL;
}
@@ -134,39 +84,13 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
memcpy(ibv_ah_attr.grh.dgid.raw, gid, EFA_GID_LEN);
efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr);
if (!efa_ah->ibv_ah) {
- /* If the failure is because we have too many AH entries, try to
- * evict an AH entry with no explicit AV entries and try AH
- * creation again */
- if (errno == FI_ENOMEM) {
- EFA_INFO(
- FI_LOG_AV,
- "ibv_create_ah failed with ENOMEM for implicit "
- "AV insertion. Attempting to evict AH entry\n");
-
- err = efa_ah_implicit_av_evict_ah(domain);
- if (err)
- goto err_free_efa_ah;
-
- efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr);
- if (!efa_ah->ibv_ah) {
- if (errno == EINVAL) {
- efa_ah_warn_create_einval(domain, gid);
- } else {
- EFA_WARN(FI_LOG_AV,
- "ibv_create_ah failed for implicit AV "
- "insertion! errno: %d\n",
- errno);
- }
- goto err_free_efa_ah;
- }
- } else if (errno == EINVAL) {
+ if (errno == EINVAL) {
efa_ah_warn_create_einval(domain, gid);
- goto err_free_efa_ah;
} else {
EFA_WARN(FI_LOG_AV,
- "ibv_create_ah failed! errno: %s\n", strerror(errno));
- goto err_free_efa_ah;
+ "ibv_create_ah failed! errno: %d\n", errno);
}
+ goto err_free;
}
err = efadv_query_ah(efa_ah->ibv_ah, &efa_ah_attr, sizeof(efa_ah_attr));
@@ -176,11 +100,7 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
goto err_destroy_ibv_ah;
}
- dlist_init(&efa_ah->implicit_conn_list);
- dlist_insert_tail(&efa_ah->domain_lru_ah_list_entry, &domain->ah_lru_list);
- efa_ah->implicit_refcnt = 0;
- efa_ah->explicit_refcnt = 0;
- insert_implicit_av ? efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++;
+ efa_ah->refcnt = 1;
efa_ah->ahn = efa_ah_attr.ahn;
memcpy(efa_ah->gid, gid, EFA_GID_LEN);
HASH_ADD(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah);
@@ -189,21 +109,27 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
err_destroy_ibv_ah:
ibv_destroy_ah(efa_ah->ibv_ah);
-err_free_efa_ah:
+err_free:
free(efa_ah);
ofi_genlock_unlock(&domain->util_domain.lock);
return NULL;
}
-void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah)
+/**
+ * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free)
+ *
+ * Caller must hold util_domain.lock.
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah efa_ah to destroy
+ */
+void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah)
{
int err;
- assert(ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0);
- assert(dlist_empty(&ah->implicit_conn_list));
+ assert(ah->refcnt == 0);
EFA_INFO(FI_LOG_AV, "Destroying AH for ahn %d\n", ah->ahn);
- dlist_remove(&ah->domain_lru_ah_list_entry);
HASH_DEL(domain->ah_map, ah);
err = ibv_destroy_ah(ah->ibv_ah);
@@ -213,29 +139,20 @@ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah)
}
/**
- * @brief release an efa_ah object after acquiring the util domain lock
+ * @brief release an efa_ah, destroying it when refcount reaches zero
*
- * @param[in] domain efa_domain
- * @param[in] ah efa_ah object pointer
+ * @param[in] domain efa domain
+ * @param[in] ah efa_ah to release
*/
-void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av)
+void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah)
{
ofi_genlock_lock(&domain->util_domain.lock);
-#if ENABLE_DEBUG
- struct efa_ah *tmp;
- HASH_FIND(hh, domain->ah_map, ah->gid, EFA_GID_LEN, tmp);
- assert(tmp == ah);
-#endif
- assert((release_from_implicit_av && ah->implicit_refcnt > 0) ||
- (!release_from_implicit_av && ah->explicit_refcnt > 0));
+ assert(ah->refcnt > 0);
+ ah->refcnt--;
- release_from_implicit_av ? ah->implicit_refcnt-- :
- ah->explicit_refcnt--;
+ if (ah->refcnt == 0)
+ efa_ah_destroy(domain, ah);
- if (ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0) {
- efa_ah_destroy_ah(domain, ah);
- }
ofi_genlock_unlock(&domain->util_domain.lock);
}
diff --git a/prov/efa/src/efa_ah.h b/prov/efa/src/efa_ah.h
index b04b53a0114..25a81ffac1a 100644
--- a/prov/efa/src/efa_ah.h
+++ b/prov/efa/src/efa_ah.h
@@ -9,31 +9,55 @@
#define EFA_GID_LEN 16
+/**
+ * @brief Base address handle — shared by efa-direct and protocol paths
+ *
+ * Contains only the ibv_ah, GID, AHN, refcount, and hash handle.
+ * Protocol-specific fields (implicit_refcnt, implicit_conn_list,
+ * LRU list entry) are in efa_proto_ah.
+ *
+ * pahole: size: 88, cachelines: 2 (2-byte hole after ahn)
+ *
+ * TX hot path: ibv_ah (off=16) is passed to ibv post_send/read/write
+ * on every send. Both ibv_ah and ahn are in cacheline 0.
+ * All other fields are control path only (AH alloc/release/hash lookup).
+ */
struct efa_ah {
- uint8_t gid[EFA_GID_LEN]; /* efa device GID */
- struct ibv_ah *ibv_ah; /* created by ibv_create_ah() using GID */
- uint16_t ahn; /* adress handle number */
- /* Number of explicit AV entries associated with this AH */
- int explicit_refcnt;
- /* Number of implicit AV entries associated with this AH */
- int implicit_refcnt;
- /* dlist of all implicit AV entries associated with this AH entry */
- struct dlist_entry implicit_conn_list;
- /* dlist entry in domain's LRU AH list */
- struct dlist_entry domain_lru_ah_list_entry;
- UT_hash_handle hh; /* hash map handle, link all efa_ah with efa_ep->ah_map */
+ uint8_t gid[EFA_GID_LEN]; /* 0 16 */
+ struct ibv_ah *ibv_ah; /* 16 8 */
+ uint16_t ahn; /* 24 2 */
+ /* 2-byte hole */
+ int refcnt; /* 28 4 */
+ UT_hash_handle hh; /* 32 56 */
};
-void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain,
- struct efa_ah *ah);
-
+/**
+ * @brief allocate an ibv_ah from GID, reusing existing AH if possible
+ *
+ * @param[in] domain efa domain
+ * @param[in] gid GID
+ * @param[in] alloc_size size to allocate (sizeof(efa_ah) or sizeof(efa_proto_ah))
+ * @return pointer to efa_ah on success, NULL on failure (errno set)
+ */
struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av);
-
-void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av);
-
-void efa_ah_release_unsafe(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av);
-
-#endif
\ No newline at end of file
+ size_t alloc_size);
+
+/**
+ * @brief release an efa_ah, destroying it when refcount reaches zero
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah efa_ah to release
+ */
+void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah);
+
+/**
+ * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free)
+ *
+ * Caller must hold util_domain.lock.
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah efa_ah to destroy
+ */
+void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah);
+
+#endif
diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c
index 4bb8e5b680d..a4bd828b926 100644
--- a/prov/efa/src/efa_av.c
+++ b/prov/efa/src/efa_av.c
@@ -11,59 +11,39 @@
#include "efa.h"
#include "efa_av.h"
-#include "rdm/efa_rdm_pke_utils.h"
-static inline struct efa_conn *efa_av_addr_to_conn_impl(struct util_av *util_av,
- fi_addr_t fi_addr)
+/**
+ * @brief find efa_av_entry using fi_addr in the explicit AV
+ *
+ * @param[in] av efa av
+ * @param[in] fi_addr libfabric address
+ * @return if address is valid, return pointer to efa_av_entry
+ * otherwise, return NULL
+ */
+struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr)
{
struct util_av_entry *util_av_entry;
- struct efa_av_entry *efa_av_entry;
+ struct efa_av_entry *av_entry;
if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL))
return NULL;
- if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr)))
- util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr);
+ if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(av->util_av.av_entry_pool, fi_addr)))
+ util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr);
else
return NULL;
- efa_av_entry = (struct efa_av_entry *)util_av_entry->data;
- return efa_av_entry->conn.ep_addr ? &efa_av_entry->conn : NULL;
-}
-
-/**
- * @brief find efa_conn struct using fi_addr in the explicit AV
- *
- * @param[in] av efa av
- * @param[in] addr fi_addr
- * @return if address is valid, return pointer to efa_conn struct
- * otherwise, return NULL
- */
-struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr)
-{
- return efa_av_addr_to_conn_impl(&av->util_av, fi_addr);
+ av_entry = (struct efa_av_entry *)util_av_entry->data;
+ return av_entry->ah ? av_entry : NULL;
}
/**
- * @brief find efa_conn struct using fi_addr in the implicit AV
- *
- * @param[in] av efa av
- * @param[in] addr fi_addr
- * @return if address is valid, return pointer to efa_conn struct
- * otherwise, return NULL
- */
-struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, fi_addr_t fi_addr)
-{
- return efa_av_addr_to_conn_impl(&av->util_av_implicit, fi_addr);
-}
-
-/**
- * @brief find fi_addr for efa endpoint
+ * @brief find fi_addr for efa endpoint (base, AHN+QPN only)
*
* @param[in] av address vector
* @param[in] ahn address handle number
* @param[in] qpn QP number
- * @return On success, return fi_addr to the peer who send the packet
+ * @return On success, return fi_addr to the peer
* If no such peer exist, return FI_ADDR_NOTAVAIL
*/
fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn)
@@ -76,170 +56,30 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn)
cur_key.qpn = qpn;
HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
- return (OFI_LIKELY(!!cur_entry)) ? cur_entry->conn->fi_addr : FI_ADDR_NOTAVAIL;
-}
-
-static inline struct efa_conn *
-efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av,
- struct efa_prv_reverse_av **prv_reverse_av,
- uint16_t ahn, uint16_t qpn,
- struct efa_rdm_pke *pkt_entry)
-{
- uint32_t *connid;
- struct efa_cur_reverse_av *cur_entry;
- struct efa_prv_reverse_av *prv_entry;
- struct efa_cur_reverse_av_key cur_key;
- struct efa_prv_reverse_av_key prv_key;
-
- cur_key.ahn = ahn;
- cur_key.qpn = qpn;
-
- HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
-
- if (OFI_UNLIKELY(!cur_entry))
- return NULL;
-
- if (!pkt_entry) {
- /**
- * There is no packet entry to extract connid from when we get
- * an IBV_WC_RECV_RDMA_WITH_IMM completion from rdma-core. Or
- * the pkt_entry is allocated from a buffer user posted that
- * doesn't expect any pkt hdr.
- */
- return cur_entry->conn;
- }
-
- connid = efa_rdm_pke_connid_ptr(pkt_entry);
- if (!connid) {
- EFA_WARN_ONCE(FI_LOG_EP_CTRL,
- "An incoming packet does NOT have connection ID "
- "in its header.\n"
- "This means the peer is using an older version "
- "of libfabric.\n"
- "The communication can continue but it is "
- "encouraged to use\n"
- "a newer version of libfabric\n");
- return cur_entry->conn;
- }
-
- if (OFI_LIKELY(*connid == cur_entry->conn->ep_addr->qkey))
- return cur_entry->conn;
-
- /* the packet is from a previous peer, look for its address from the
- * prv_reverse_av */
- prv_key.ahn = ahn;
- prv_key.qpn = qpn;
- prv_key.connid = *connid;
- HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry);
-
- return OFI_LIKELY(!!prv_entry) ? prv_entry->conn : NULL;
-};
-
-/**
- * @brief find fi_addr for rdm endpoint in the explicit AV
- *
- * @param[in] av address vector
- * @param[in] ahn address handle number
- * @param[in] qpn QP number
- * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid
- * @return On success, return fi_addr to the peer who send the packet
- * If no such peer exist, return FI_ADDR_NOTAVAIL
- */
-fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn,
- uint16_t qpn, struct efa_rdm_pke *pkt_entry)
-{
- struct efa_conn *conn;
-
- conn = efa_av_reverse_lookup_rdm_conn(
- &av->cur_reverse_av, &av->prv_reverse_av, ahn, qpn, pkt_entry);
-
- if (OFI_LIKELY(!!conn))
- return conn->fi_addr;
-
- return FI_ADDR_NOTAVAIL;
-}
-
-/**
- * @brief find fi_addr for rdm endpoint in the implicit AV
- *
- * @param[in] av address vector
- * @param[in] ahn address handle number
- * @param[in] qpn QP number
- * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid
- * @return On success, return fi_addr to the peer who send the packet
- * If no such peer exist, return FI_ADDR_NOTAVAIL
- */
-fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn,
- uint16_t qpn,
- struct efa_rdm_pke *pkt_entry)
-{
- struct efa_conn *conn;
-
- assert(ofi_genlock_held(&av->domain->srx_lock));
-
- conn = efa_av_reverse_lookup_rdm_conn(&av->cur_reverse_av_implicit,
- &av->prv_reverse_av_implicit, ahn,
- qpn, pkt_entry);
-
- if (OFI_LIKELY(!!conn)) {
- efa_av_implicit_av_lru_conn_move(av, conn);
- return conn->implicit_fi_addr;
- }
-
- return FI_ADDR_NOTAVAIL;
-}
-
-static inline int efa_av_is_valid_address(struct efa_ep_addr *addr)
-{
- struct efa_ep_addr all_zeros = { 0 };
-
- return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw));
-}
-
-/**
- * @brief Move the conn to the front of the LRU list to indicate that it is the
- * most recently used entry
- *
- * @param[in] av efa address vector
- * @param[in] conn efa conn to be added to the LRU list
- */
-void efa_av_implicit_av_lru_conn_move(struct efa_av *av,
- struct efa_conn *conn)
-{
- assert(av->implicit_av_size == 0 ||
- HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size);
- assert(dlist_entry_in_list(&av->implicit_av_lru_list,
- &conn->implicit_av_lru_entry));
-
- dlist_remove(&conn->implicit_av_lru_entry);
- dlist_insert_tail(&conn->implicit_av_lru_entry,
- &av->implicit_av_lru_list);
-
- efa_ah_implicit_av_lru_ah_move(av->domain, conn->ah);
+ return (OFI_LIKELY(!!cur_entry)) ? cur_entry->av_entry->fi_addr : FI_ADDR_NOTAVAIL;
}
/*
- * @brief Add newly insert address to the reverse AVs
+ * @brief Add newly inserted address to the reverse AVs
*
* @param[in] av EFA AV object
- * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key
- * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key
- * @param[in] conn efa_conn object
- * @return On success, return 0.
- * Otherwise, return a negative libfabric error code
+ * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key
+ * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key
+ * @param[in] av_entry AV entry to add
+ * @return 0 on success, negative libfabric error code on failure
*/
int efa_av_reverse_av_add(struct efa_av *av,
- struct efa_cur_reverse_av **cur_reverse_av,
- struct efa_prv_reverse_av **prv_reverse_av,
- struct efa_conn *conn)
+ struct efa_cur_reverse_av **cur_reverse_av,
+ struct efa_prv_reverse_av **prv_reverse_av,
+ struct efa_av_entry *av_entry)
{
struct efa_cur_reverse_av *cur_entry;
struct efa_prv_reverse_av *prv_entry;
struct efa_cur_reverse_av_key cur_key;
memset(&cur_key, 0, sizeof(cur_key));
- cur_key.ahn = conn->ah->ahn;
- cur_key.qpn = conn->ep_addr->qpn;
+ cur_key.ahn = av_entry->ah->ahn;
+ cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn;
cur_entry = NULL;
HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
@@ -252,16 +92,13 @@ int efa_av_reverse_av_add(struct efa_av *av,
cur_entry->key.ahn = cur_key.ahn;
cur_entry->key.qpn = cur_key.qpn;
- cur_entry->conn = conn;
+ cur_entry->av_entry = av_entry;
HASH_ADD(hh, *cur_reverse_av, key, sizeof(cur_key), cur_entry);
return 0;
}
- /* We used a static connid for all dgram endpoints, therefore cur_entry should always be NULL,
- * and only RDM endpoint can reach here. hence the following assertion
- */
- assert(av->domain->info_type == EFA_INFO_RDM);
+ /* Only RDM endpoint can reach here (dgram uses static connid) */
prv_entry = malloc(sizeof(*prv_entry));
if (!prv_entry) {
EFA_WARN(FI_LOG_AV, "Cannot allocate memory for prv_reverse_av entry\n");
@@ -270,11 +107,11 @@ int efa_av_reverse_av_add(struct efa_av *av,
prv_entry->key.ahn = cur_key.ahn;
prv_entry->key.qpn = cur_key.qpn;
- prv_entry->key.connid = cur_entry->conn->ep_addr->qkey;
- prv_entry->conn = cur_entry->conn;
+ prv_entry->key.connid = efa_av_entry_ep_addr(cur_entry->av_entry)->qkey;
+ prv_entry->av_entry = cur_entry->av_entry;
HASH_ADD(hh, *prv_reverse_av, key, sizeof(prv_entry->key), prv_entry);
- cur_entry->conn = conn;
+ cur_entry->av_entry = av_entry;
return 0;
}
@@ -285,16 +122,13 @@ int efa_av_reverse_av_add(struct efa_av *av,
* cur_reverse_av. Keeping the address in prv_reverse_av helps avoid QPN
* collisions.
*
- * @param[in] av EFA AV object
- * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key
- * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key
- * @param[in] conn efa_conn object
- * @return On success, return 0.
- * Otherwise, return a negative libfabric error code
+ * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key
+ * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key
+ * @param[in] av_entry AV entry to remove
*/
void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av,
- struct efa_prv_reverse_av **prv_reverse_av,
- struct efa_conn *conn)
+ struct efa_prv_reverse_av **prv_reverse_av,
+ struct efa_av_entry *av_entry)
{
struct efa_cur_reverse_av *cur_reverse_av_entry;
struct efa_prv_reverse_av *prv_reverse_av_entry;
@@ -302,182 +136,137 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av,
struct efa_prv_reverse_av_key prv_key;
memset(&cur_key, 0, sizeof(cur_key));
- cur_key.ahn = conn->ah->ahn;
- cur_key.qpn = conn->ep_addr->qpn;
+ cur_key.ahn = av_entry->ah->ahn;
+ cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn;
HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key),
cur_reverse_av_entry);
- if (cur_reverse_av_entry && cur_reverse_av_entry->conn == conn) {
+ if (cur_reverse_av_entry && cur_reverse_av_entry->av_entry == av_entry) {
HASH_DEL(*cur_reverse_av, cur_reverse_av_entry);
free(cur_reverse_av_entry);
} else {
memset(&prv_key, 0, sizeof(prv_key));
- prv_key.ahn = conn->ah->ahn;
- prv_key.qpn = conn->ep_addr->qpn;
- prv_key.connid = conn->ep_addr->qkey;
+ prv_key.ahn = av_entry->ah->ahn;
+ prv_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn;
+ prv_key.connid = efa_av_entry_ep_addr(av_entry)->qkey;
HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key),
prv_reverse_av_entry);
assert(prv_reverse_av_entry &&
- prv_reverse_av_entry->conn == conn);
+ prv_reverse_av_entry->av_entry == av_entry);
HASH_DEL(*prv_reverse_av, prv_reverse_av_entry);
free(prv_reverse_av_entry);
}
}
-
-static fi_addr_t
-efa_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry)
-{
- struct efa_rdm_pke *pke;
-
- pke = (struct efa_rdm_pke *) rx_entry->peer_context;
-
- return pke->peer->conn->fi_addr;
-}
-
-static int efa_conn_implicit_to_explicit(struct efa_av *av,
- struct efa_ep_addr *raw_addr,
- fi_addr_t implicit_fi_addr,
- fi_addr_t *fi_addr)
+/**
+ * @brief Initialize an efa_av_entry (base path)
+ *
+ * Caller must hold util_av.lock.
+ *
+ * @param[in] av address vector
+ * @param[in] raw_addr raw efa address
+ * @param[in] flags flags from fi_av_insert
+ * @param[in] context context from fi_av_insert
+ * @return pointer to initialized entry on success, NULL on failure
+ */
+static struct efa_av_entry *efa_av_entry_init(struct efa_av *av,
+ struct efa_ep_addr *raw_addr,
+ uint64_t flags, void *context)
{
+ struct util_av_entry *util_av_entry = NULL;
+ struct efa_av_entry *av_entry = NULL;
+ fi_addr_t fi_addr;
int err;
- struct efa_ah *ah;
- struct efa_conn *implicit_conn, *explicit_conn;
- struct efa_rdm_ep *ep;
- struct dlist_entry *entry;
- struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry;
- struct efa_conn_ep_peer_map_entry *map_entry, *tmp;
- struct efa_av_entry *implicit_av_entry, *explicit_av_entry;
- struct fid_peer_srx *peer_srx;
-
- EFA_INFO(FI_LOG_AV,
- "Moving peer with implicit fi_addr %" PRIu64
- " to explicit AV\n",
- implicit_fi_addr);
assert(ofi_genlock_held(&av->util_av.lock));
- assert(ofi_genlock_held(&av->util_av_implicit.lock));
-
- /* Get implicit util AV entry and conn */
- implicit_util_av_entry =
- ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr);
- implicit_av_entry = (struct efa_av_entry *) implicit_util_av_entry->data;
+ if (flags & FI_SYNC_ERR)
+ memset(context, 0, sizeof(int));
- assert(implicit_av_entry);
- assert(efa_is_same_addr(
- raw_addr, (struct efa_ep_addr *) implicit_av_entry->ep_addr));
-
- implicit_conn = &implicit_av_entry->conn;
- assert(implicit_conn->fi_addr == FI_ADDR_NOTAVAIL &&
- implicit_conn->implicit_fi_addr == implicit_fi_addr);
-
- ah = implicit_conn->ah;
-
- /* Create explicit util AV entry and conn */
- err = ofi_av_insert_addr(&av->util_av, raw_addr, fi_addr);
+ err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr);
if (err) {
- EFA_WARN(FI_LOG_AV,
- "ofi_av_insert_addr into explicit AV failed! Error "
- "message: %s\n",
+ EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
fi_strerror(err));
- return err;
+ return NULL;
}
- explicit_util_av_entry =
- ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, *fi_addr);
- explicit_av_entry = (struct efa_av_entry *) explicit_util_av_entry->data;
- assert(efa_is_same_addr(
- raw_addr, (struct efa_ep_addr *) explicit_av_entry->ep_addr));
+ util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr);
+ av_entry = (struct efa_av_entry *)util_av_entry->data;
+ assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)av_entry->ep_addr));
- /* Copy information from implicit conn to explicit conn */
- explicit_conn = &explicit_av_entry->conn;
- memset(explicit_conn, 0, sizeof(*explicit_conn));
- explicit_conn->ep_addr = (struct efa_ep_addr *) explicit_av_entry->ep_addr;
+ av_entry->fi_addr = fi_addr;
assert(av->type == FI_AV_TABLE);
- explicit_conn->ah = implicit_conn->ah;
- explicit_conn->fi_addr = *fi_addr;
- explicit_conn->shm_fi_addr = implicit_conn->shm_fi_addr;
- explicit_conn->implicit_fi_addr = FI_ADDR_NOTAVAIL;
- HASH_ITER(hh, implicit_conn->ep_peer_map, map_entry, tmp) {
- HASH_DELETE(hh, implicit_conn->ep_peer_map, map_entry);
- HASH_ADD_PTR(explicit_conn->ep_peer_map, ep_ptr, map_entry);
- map_entry->peer.conn = explicit_conn;
- }
- assert(HASH_CNT(hh, implicit_conn->ep_peer_map) == 0);
-
- /* Handle reverse AV and AV ref counts */
- efa_av_reverse_av_remove(&av->cur_reverse_av_implicit,
- &av->prv_reverse_av_implicit, implicit_conn);
- dlist_remove(&implicit_av_entry->conn.implicit_av_lru_entry);
+ av_entry->ah = efa_ah_alloc(av->domain, raw_addr->raw, sizeof(struct efa_ah));
+ if (!av_entry->ah)
+ goto err_release;
- err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr);
- if (err) {
- EFA_WARN(FI_LOG_AV,
- "ofi_av_remove_addr from implicit AV failed! Error "
- "message: %s\n",
- fi_strerror(err));
- return err;
- }
+ err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av,
+ av_entry);
+ if (err)
+ goto err_release_ah;
- av->used_implicit--;
+ av->used++;
+ return av_entry;
- err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av,
- explicit_conn);
+err_release_ah:
+ efa_ah_release(av->domain, av_entry->ah);
+err_release:
+ av_entry->ah = NULL;
+ memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+ err = ofi_av_remove_addr(&av->util_av, fi_addr);
if (err)
- return err;
+ EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
+ err);
+ return NULL;
+}
- av->used_explicit++;
+/**
+ * @brief Release an efa_av_entry (base path)
+ *
+ * Caller must hold util_av.lock.
+ *
+ * @param[in] av address vector
+ * @param[in] av_entry entry to release
+ */
+static void efa_av_entry_release(struct efa_av *av, struct efa_av_entry *av_entry)
+{
+ char gidstr[INET6_ADDRSTRLEN];
+ int err;
- /* Handle AH LRU list and refcnt */
- assert(!dlist_empty(&ah->implicit_conn_list));
- dlist_remove(&implicit_conn->ah_implicit_conn_list_entry);
- efa_ah_implicit_av_lru_ah_move(av->domain, ah);
- ah->implicit_refcnt--;
- ah->explicit_refcnt++;
+ assert(ofi_genlock_held(&av->util_av.lock));
- EFA_INFO(FI_LOG_AV,
- "Peer with implicit fi_addr %" PRIu64
- " moved to explicit AV. Explicit fi_addr: %" PRIu64 "\n",
- implicit_fi_addr, *fi_addr);
-
- /* Call foreach_unspec_addr to move unexpected messages
- * from the unspecified queue to the specified queues
- *
- * util_ep is bound to the explicit util_av, so the explicit util_av's
- * ep_list contains all of the endpoints bound to this AV */
- ofi_genlock_lock(&av->util_av.ep_list_lock);
- dlist_foreach(&av->util_av.ep_list, entry) {
- ep = container_of(entry, struct efa_rdm_ep, base_ep.util_ep.av_entry);
- peer_srx = util_get_peer_srx(ep->peer_srx_ep);
- peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_av_get_addr_from_peer_rx_entry);
- }
- ofi_genlock_unlock(&av->util_av.ep_list_lock);
+ efa_av_reverse_av_remove(&av->cur_reverse_av, &av->prv_reverse_av, av_entry);
+ efa_ah_release(av->domain, av_entry->ah);
+
+ inet_ntop(AF_INET6, efa_av_entry_ep_addr(av_entry)->raw, gidstr, INET6_ADDRSTRLEN);
+ EFA_INFO(FI_LOG_AV, "efa_av_entry released! entry[%p] GID[%s] QP[%u]\n",
+ av_entry, gidstr, efa_av_entry_ep_addr(av_entry)->qpn);
+
+ err = ofi_av_remove_addr(&av->util_av, av_entry->fi_addr);
+ if (err)
+ EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err);
- return FI_SUCCESS;
+ av_entry->ah = NULL;
+ memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+ av->used--;
}
/**
- * @brief insert one address into address vector (AV)
+ * @brief insert one address into AV (base, efa-direct path)
*
* @param[in] av address vector
* @param[in] addr raw address, in the format of gid:qpn:qkey
- * @param[out] fi_addr pointer to the output fi address. This address is used by fi_send
- * @param[in] flags flags user passed to fi_av_insert.
+ * @param[out] fi_addr pointer to the output fi address
+ * @param[in] flags flags user passed to fi_av_insert
* @param[in] context context user passed to fi_av_insert
- * @param[in] insert_shm_av whether insert address to shm av
- * @param[in] insert_implicit_av whether insert address to implicit AV
* @return 0 on success, a negative error code on failure
*/
-int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
- fi_addr_t *fi_addr, uint64_t flags, void *context,
- bool insert_shm_av, bool insert_implicit_av)
+static int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
+ fi_addr_t *fi_addr, uint64_t flags, void *context)
{
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
char raw_gid_str[INET6_ADDRSTRLEN];
fi_addr_t efa_fiaddr;
- fi_addr_t implicit_fi_addr;
- int ret = 0;
if (!efa_av_is_valid_address(addr)) {
EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n");
@@ -488,94 +277,57 @@ int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
if (av->domain->info_type == EFA_INFO_DGRAM)
addr->qkey = EFA_DGRAM_CONNID;
- if (av->domain->info_type == EFA_INFO_RDM)
- assert(ofi_genlock_held(&av->domain->srx_lock));
- ofi_genlock_lock(&av->util_av_implicit.lock);
ofi_genlock_lock(&av->util_av.lock);
memset(raw_gid_str, 0, sizeof(raw_gid_str));
if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) {
EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno);
- ret = -FI_EINVAL;
*fi_addr = FI_ADDR_NOTAVAIL;
- goto out;
+ ofi_genlock_unlock(&av->util_av.lock);
+ return -FI_EINVAL;
}
EFA_INFO(FI_LOG_AV,
- "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n",
- raw_gid_str, addr->qpn, addr->qkey,
- insert_implicit_av ? "implicit" : "explicit");
+ "Inserting address GID[%s] QP[%u] QKEY[%u] to explicit AV ....\n",
+ raw_gid_str, addr->qpn, addr->qkey);
- /*
- * Check if this address already has been inserted, if so set *fi_addr
- * to existing address, and return 0 for success.
- */
+ /* Check if already inserted */
efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->util_av, addr);
if (efa_fiaddr != FI_ADDR_NOTAVAIL) {
- /* We should never try to insert into the implicit AV an address
- * that's already in the explicit AV */
- assert(!insert_implicit_av);
-
EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %ld\n", efa_fiaddr);
*fi_addr = efa_fiaddr;
- ret = 0;
- goto out;
- }
-
- implicit_fi_addr =
- ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr);
- if (implicit_fi_addr != FI_ADDR_NOTAVAIL) {
- EFA_INFO(FI_LOG_AV,
- "Found implicit AV entry id %ld for the same "
- "address\n",
- implicit_fi_addr);
-
- if (insert_implicit_av) {
- /* Move to the end of the LRU list */
- conn = efa_av_addr_to_conn_implicit(av,
- implicit_fi_addr);
- efa_av_implicit_av_lru_conn_move(av, conn);
-
- *fi_addr = implicit_fi_addr;
- goto out;
- }
-
- ret = efa_conn_implicit_to_explicit(av, addr, implicit_fi_addr,
- fi_addr);
- if (ret)
- *fi_addr = FI_ADDR_NOTAVAIL;
- goto out;
+ ofi_genlock_unlock(&av->util_av.lock);
+ return 0;
}
- conn = efa_conn_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av);
- if (!conn) {
+ av_entry = efa_av_entry_init(av, addr, flags, context);
+ if (!av_entry) {
*fi_addr = FI_ADDR_NOTAVAIL;
- ret = -FI_EADDRNOTAVAIL;
- goto out;
+ ofi_genlock_unlock(&av->util_av.lock);
+ return -FI_EADDRNOTAVAIL;
}
- if (insert_implicit_av) {
- *fi_addr = conn->implicit_fi_addr;
- EFA_INFO(FI_LOG_AV,
- "Successfully inserted address GID[%s] QP[%u] "
- "QKEY[%u] to implicit AV. fi_addr: %ld\n",
- raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
- } else {
- *fi_addr = conn->fi_addr;
- EFA_INFO(FI_LOG_AV,
- "Successfully inserted address GID[%s] QP[%u] "
- "QKEY[%u] to explicit AV. fi_addr: %ld\n",
- raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
- }
- ret = 0;
+ *fi_addr = av_entry->fi_addr;
+ EFA_INFO(FI_LOG_AV,
+ "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. fi_addr: %ld\n",
+ raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
-out:
ofi_genlock_unlock(&av->util_av.lock);
- ofi_genlock_unlock(&av->util_av_implicit.lock);
- return ret;
+ return 0;
}
-int efa_av_insert(struct fid_av *av_fid, const void *addr,
+/**
+ * @brief insert addresses into AV (fi_av_insert implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr buffer containing one or more addresses to insert
+ * @param[in] count number of addresses to insert
+ * @param[out] fi_addr array where returned fabric addresses will be written
+ * @param[in] flags operation flags
+ * @param[in] context user context
+ * @return number of addresses successfully inserted
+ */
+static int efa_av_insert(struct fid_av *av_fid, const void *addr,
size_t count, fi_addr_t *fi_addr,
uint64_t flags, void *context)
{
@@ -591,25 +343,16 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
return -FI_EINVAL;
- /*
- * Providers are allowed to ignore FI_MORE.
- */
flags &= ~FI_MORE;
if (flags)
return -FI_ENOSYS;
- /* The order in which the util AV and SRX locks are acquired must match
- * in the AV insertion, removal and CQ read paths to prevent deadlocks */
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_lock(&av->domain->srx_lock);
-
for (i = 0; i < count; i++) {
addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
- ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false);
+ ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context);
if (ret) {
- EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n",
- ret);
+ EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n", ret);
break;
}
@@ -618,9 +361,6 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
success_cnt++;
}
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_unlock(&av->domain->srx_lock);
-
/* cancel remaining request and log to event queue */
for (; i < count ; i++) {
if (fi_addr)
@@ -630,11 +370,20 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr,
return success_cnt;
}
+/**
+ * @brief retrieve an address stored in the AV (fi_av_lookup implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] fi_addr fabric address to look up
+ * @param[out] addr buffer to store the returned address
+ * @param[in,out] addrlen on input, size of addr buffer; on output, bytes written
+ * @return 0 on success, negative libfabric error code on failure
+ */
static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
void *addr, size_t *addrlen)
{
struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid);
- struct efa_conn *conn = NULL;
+ struct efa_av_entry *av_entry = NULL;
if (av->type != FI_AV_TABLE)
return -FI_EINVAL;
@@ -643,13 +392,13 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
return -FI_EINVAL;
ofi_genlock_lock(&av->util_av.lock);
- conn = efa_av_addr_to_conn(av, fi_addr);
- if (!conn) {
+ av_entry = efa_av_addr_to_entry(av, fi_addr);
+ if (!av_entry) {
ofi_genlock_unlock(&av->util_av.lock);
return -FI_EINVAL;
}
- memcpy(addr, (void *)conn->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
+ memcpy(addr, (void *)av_entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
ofi_genlock_unlock(&av->util_av.lock);
if (*addrlen > EFA_EP_ADDR_LEN)
*addrlen = EFA_EP_ADDR_LEN;
@@ -670,16 +419,16 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
* was set to FI_ADDR_NOTAVAIL. The TX completion handler will
* ignore TX packet whose address is FI_ADDR_NOTAVAIL.
*
- * Meanwhile, lower provider will set a packet's address to
- * FI_ADDR_NOTAVAIL from it is from a removed address. RX completion
+ * Meanwhile, lower provider will set a packet's address to
+ * FI_ADDR_NOTAVAIL if it is from a removed address. RX completion
* handler will ignore such packets.
*
* @param[in] av_fid fid of AV (address vector)
- * @param[in] fi_addr pointer to an array of libfabric addresses
- * @param[in] counter number of libfabric addresses in the array
+ * @param[in] fi_addr pointer to an array of libfabric addresses
+ * @param[in] count number of libfabric addresses in the array
* @param[in] flags flags
* @return 0 if all addresses have been removed successfully,
- * negative libfabric error code if error was encoutnered.
+ * negative libfabric error code if error was encountered.
*/
static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
size_t count, uint64_t flags)
@@ -687,7 +436,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
int err = 0;
size_t i;
struct efa_av *av;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
if (!fi_addr)
return -FI_EINVAL;
@@ -696,19 +445,15 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
if (av->type != FI_AV_TABLE)
return -FI_EINVAL;
- /* The order in which the util AV and SRX locks are acquired must match
- in the AV insertion, removal and CQ read paths to prevent deadlocks */
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_lock(&av->domain->srx_lock);
ofi_genlock_lock(&av->util_av.lock);
for (i = 0; i < count; i++) {
- conn = efa_av_addr_to_conn(av, fi_addr[i]);
- if (!conn) {
+ av_entry = efa_av_addr_to_entry(av, fi_addr[i]);
+ if (!av_entry) {
err = -FI_EINVAL;
break;
}
- efa_conn_release(av, conn, false);
+ efa_av_entry_release(av, av_entry);
}
if (i < count) {
@@ -717,11 +462,18 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
}
ofi_genlock_unlock(&av->util_av.lock);
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_unlock(&av->domain->srx_lock);
return err;
}
+/**
+ * @brief convert an address into a printable string (fi_av_straddr implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr address to convert
+ * @param[out] buf buffer to store the string
+ * @param[in,out] len on input, size of buf; on output, bytes written
+ * @return pointer to buf
+ */
static const char *efa_av_straddr(struct fid_av *av_fid, const void *addr,
char *buf, size_t *len)
{
@@ -738,81 +490,37 @@ static struct fi_ops_av efa_av_ops = {
.straddr = efa_av_straddr
};
-static void efa_av_close_reverse_av(struct efa_av *av)
+/**
+ * @brief close an AV and release all resources (fi_close implementation)
+ *
+ * @param[in] fid fid of AV
+ * @return 0 on success, negative libfabric error code on failure
+ */
+static int efa_av_close(struct fid *fid)
{
+ struct efa_av *av;
struct efa_cur_reverse_av *cur_entry, *curtmp;
struct efa_prv_reverse_av *prv_entry, *prvtmp;
+ int err = 0;
- /* The order in which the util AV and SRX locks are acquired must match
- in the AV insertion, removal and CQ read paths to prevent deadlocks */
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_lock(&av->domain->srx_lock);
+ av = container_of(fid, struct efa_av, util_av.av_fid.fid);
ofi_genlock_lock(&av->util_av.lock);
HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) {
- efa_conn_release(av, cur_entry->conn, false);
+ efa_av_entry_release(av, cur_entry->av_entry);
}
HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) {
- efa_conn_release(av, prv_entry->conn, false);
+ efa_av_entry_release(av, prv_entry->av_entry);
}
ofi_genlock_unlock(&av->util_av.lock);
- ofi_genlock_lock(&av->util_av_implicit.lock);
-
- HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) {
- efa_conn_release(av, cur_entry->conn, true);
- }
-
- HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) {
- efa_conn_release(av, prv_entry->conn, true);
- }
-
- ofi_genlock_unlock(&av->util_av_implicit.lock);
-
- if (av->domain->info_type == EFA_INFO_RDM)
- ofi_genlock_unlock(&av->domain->srx_lock);
-}
-
-static int efa_av_close(struct fid *fid)
-{
- struct efa_av *av;
- int err = 0;
- struct efa_ep_addr_hashable *ep_addr_hashable, *tmp;
-
- av = container_of(fid, struct efa_av, util_av.av_fid.fid);
-
- efa_av_close_reverse_av(av);
-
err = ofi_av_close(&av->util_av);
- if (OFI_UNLIKELY(err)) {
+ if (OFI_UNLIKELY(err))
EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n",
fi_strerror(err));
- }
-
- err = ofi_av_close(&av->util_av_implicit);
- if (OFI_UNLIKELY(err)) {
- EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n",
- fi_strerror(err));
- }
-
- if (av->domain->info_type == EFA_INFO_RDM) {
- if (av->shm_rdm_av) {
- err = fi_close(&av->shm_rdm_av->fid);
- if (OFI_UNLIKELY(err)) {
- EFA_WARN(FI_LOG_AV,
- "Failed to close shm av: %s\n",
- fi_strerror(err));
- }
- }
- }
-
- HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) {
- HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable);
- free(ep_addr_hashable);
- }
free(av);
return err;
@@ -827,37 +535,47 @@ static struct fi_ops efa_av_fi_ops = {
};
/**
- * @brief initialize the util_av field in efa_av
+ * @brief initialize a util_av
*
- * @param[in] util_domain util_domain which is part of efa_domain_base
+ * @param[in] efa_domain efa domain
* @param[in] attr AV attr application passed to fi_av_open
- * @param[out] util_av util_av field in efa_av
- * @param[in] context contexted application passed to fi_av_open
+ * @param[out] util_av util_av to initialize
+ * @param[in] context context application passed to fi_av_open
+ * @param[in] context_len size of provider-specific context per AV entry
* @return On success, return 0.
* On failure, return a negative libfabric error code.
*/
int efa_av_init_util_av(struct efa_domain *efa_domain,
struct fi_av_attr *attr,
struct util_av *util_av,
- void *context)
+ void *context,
+ size_t context_len)
{
struct util_av_attr util_attr;
util_attr.addrlen = EFA_EP_ADDR_LEN;
- util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN;
+ util_attr.context_len = context_len;
util_attr.flags = 0;
return ofi_av_init(&efa_domain->util_domain, attr, &util_attr,
util_av, context);
}
+/**
+ * @brief open an address vector (fi_av_open implementation for efa-direct/dgram)
+ *
+ * @param[in] domain_fid fid of domain
+ * @param[in] attr AV attributes
+ * @param[out] av_fid pointer to store the opened AV fid
+ * @param[in] context user context
+ * @return 0 on success, negative libfabric error code on failure
+ */
int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
struct fid_av **av_fid, void *context)
{
struct efa_domain *efa_domain;
struct efa_av *av;
- struct fi_av_attr av_attr = { 0 };
- int ret, retv;
size_t universe_size;
+ int ret;
if (!attr)
return -FI_EINVAL;
@@ -894,47 +612,16 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
&universe_size) == FI_SUCCESS)
attr->count = MAX(attr->count, universe_size);
- ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context);
+ ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context,
+ sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN);
if (ret)
goto err;
- ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context);
- if (ret)
- goto err_close_util_av_implicit;
-
- if (efa_domain->info_type == EFA_INFO_RDM && efa_domain->fabric &&
- efa_domain->fabric->shm_fabric) {
- /*
- * shm av supports maximum 256 entries
- * Reset the count to 128 to reduce memory footprint and satisfy
- * the need of the instances with more CPUs.
- */
- av_attr = *attr;
- if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) {
- ret = -FI_ENOSYS;
- EFA_WARN(FI_LOG_AV,
- "The requested av size is beyond"
- " shm supported maximum av size: %s\n",
- fi_strerror(-ret));
- goto err_close_util_av;
- }
- av_attr.count = efa_env.shm_av_size;
- assert(av_attr.type == FI_AV_TABLE);
- ret = fi_av_open(efa_domain->shm_domain, &av_attr,
- &av->shm_rdm_av, context);
- if (ret)
- goto err_close_util_av;
- }
-
- EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n",
- attr->flags);
+ EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags);
av->domain = efa_domain;
av->type = attr->type;
- av->implicit_av_size = efa_env.implicit_av_size;
- av->used_implicit = 0;
- av->used_explicit = 0;
- av->shm_used = 0;
+ av->used = 0;
*av_fid = &av->util_av.av_fid;
(*av_fid)->fid.fclass = FI_CLASS_AV;
@@ -942,22 +629,8 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
(*av_fid)->fid.ops = &efa_av_fi_ops;
(*av_fid)->ops = &efa_av_ops;
- dlist_init(&av->implicit_av_lru_list);
-
return 0;
-err_close_util_av:
- retv = ofi_av_close(&av->util_av);
- if (retv)
- EFA_WARN(FI_LOG_AV,
- "Unable to close util_av: %s\n", fi_strerror(-retv));
-
-err_close_util_av_implicit:
- retv = ofi_av_close(&av->util_av_implicit);
- if (retv)
- EFA_WARN(FI_LOG_AV,
- "Unable to close util_av_implicit: %s\n", fi_strerror(-retv));
-
err:
free(av);
return ret;
diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h
index 6cbe7b506ea..b92eebb91e6 100644
--- a/prov/efa/src/efa_av.h
+++ b/prov/efa/src/efa_av.h
@@ -5,10 +5,7 @@
#define EFA_AV_H
#include
-#include "rdm/efa_rdm_protocol.h"
-#include "rdm/efa_rdm_peer.h"
#include "efa_ah.h"
-#include "efa_conn.h"
#define EFA_MIN_AV_SIZE (16384)
#define EFA_SHM_MAX_AV_COUNT (256)
@@ -28,93 +25,128 @@ struct efa_ep_addr_hashable {
#define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr)
-/* util_av implementation requires the first element of efa_av_entry to be
- * ep_addr */
+/**
+ * @brief Base AV entry (efa-direct)
+ *
+ * pahole:
+ * size: 48, cachelines: 1, members: 3
+ * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20)
+ * ah* off=32 — TX hot
+ * fi_addr off=40 — RX hot
+ */
struct efa_av_entry {
- uint8_t ep_addr[EFA_EP_ADDR_LEN];
- struct efa_conn conn;
+ uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */
+ struct efa_ah *ah; /* 32 8 */
+ fi_addr_t fi_addr; /* 40 8 */
};
+/* pahole: size: 4, no holes */
struct efa_cur_reverse_av_key {
uint16_t ahn;
uint16_t qpn;
};
+/**
+ * @brief Reverse AV entry keyed by (AHN, QPN) — points to current peer
+ *
+ * pahole: size: 72, cachelines: 2 (4-byte hole after key)
+ */
struct efa_cur_reverse_av {
- struct efa_cur_reverse_av_key key;
- struct efa_conn *conn;
- UT_hash_handle hh;
+ struct efa_cur_reverse_av_key key; /* 0 4 */
+ /* 4-byte hole */
+ struct efa_av_entry *av_entry; /* 8 8 */
+ UT_hash_handle hh; /* 16 56 */
};
+/* pahole: size: 8, no holes */
struct efa_prv_reverse_av_key {
uint16_t ahn;
uint16_t qpn;
uint32_t connid;
};
+/**
+ * @brief Reverse AV entry keyed by (AHN, QPN, connid) — points to previous peer
+ *
+ * pahole: size: 72, cachelines: 2
+ */
struct efa_prv_reverse_av {
- struct efa_prv_reverse_av_key key;
- struct efa_conn *conn;
- UT_hash_handle hh;
+ struct efa_prv_reverse_av_key key; /* 0 8 */
+ struct efa_av_entry *av_entry; /* 8 8 */
+ UT_hash_handle hh; /* 16 56 */
};
+/**
+ * @brief Base AV — contains only what efa-direct needs
+ *
+ * pahole:
+ * size: 320, cachelines: 5
+ * domain* off=0 — cacheline 0
+ * used off=8
+ * type off=16
+ * (4-byte hole) off=20
+ * cur_reverse_av* off=24 — RX hot: reverse lookup hash head
+ * prv_reverse_av* off=32 — RX hot: QPN reuse fallback hash head
+ * util_av off=40 — 280 bytes (contains bufpool, locks, ep_list)
+ */
struct efa_av {
- struct fid_av *shm_rdm_av;
- struct efa_domain *domain;
- size_t used_explicit;
- size_t used_implicit;
- size_t shm_used;
- enum fi_av_type type;
- /* cur_reverse_av is a map from (ahn + qpn) to current (latest) efa_conn.
- * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_conns.
- * cur_reverse_av is faster to search because its key size is smaller
+ struct efa_domain *domain; /* 0 8 */
+ size_t used; /* 8 8 */
+ enum fi_av_type type; /* 16 4 */
+ /* 4-byte hole */
+ /* cur_reverse_av is a map from (ahn + qpn) to current (latest) efa_av_entry.
+ * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_av_entries.
+ * cur_reverse_av is faster to search because its key size is smaller.
*/
- struct efa_cur_reverse_av *cur_reverse_av;
- struct efa_prv_reverse_av *prv_reverse_av;
- struct util_av util_av;
-
- /* implicit AV is used when receiving messages from peers not explicity
- * inserted by the application
- */
- struct util_av util_av_implicit;
- struct efa_cur_reverse_av *cur_reverse_av_implicit;
- struct efa_prv_reverse_av *prv_reverse_av_implicit;
-
- size_t implicit_av_size;
- struct dlist_entry implicit_av_lru_list;
- struct efa_ep_addr_hashable *evicted_peers_hashset;
+ struct efa_cur_reverse_av *cur_reverse_av; /* 24 8 */
+ struct efa_prv_reverse_av *prv_reverse_av; /* 32 8 */
+ struct util_av util_av; /* 40 280 */
};
int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
struct fid_av **av_fid, void *context);
-int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr,
- fi_addr_t *fi_addr, uint64_t flags, void *context,
- bool insert_shm_av, bool insert_implicit_av);
-
-struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr);
-struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av,
- fi_addr_t fi_addr);
+int efa_av_init_util_av(struct efa_domain *efa_domain,
+ struct fi_av_attr *attr,
+ struct util_av *util_av,
+ void *context,
+ size_t context_len);
-fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn,
- uint16_t qpn, struct efa_rdm_pke *pkt_entry);
-
-fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn,
- uint16_t qpn,
- struct efa_rdm_pke *pkt_entry);
+struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr);
fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn);
int efa_av_reverse_av_add(struct efa_av *av,
struct efa_cur_reverse_av **cur_reverse_av,
struct efa_prv_reverse_av **prv_reverse_av,
- struct efa_conn *conn);
+ struct efa_av_entry *av_entry);
void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av,
- struct efa_prv_reverse_av **prv_reverse_av,
- struct efa_conn *conn);
-
-void efa_av_implicit_av_lru_conn_move(struct efa_av *av,
- struct efa_conn *conn);
-
-#endif
\ No newline at end of file
+ struct efa_prv_reverse_av **prv_reverse_av,
+ struct efa_av_entry *av_entry);
+
+/**
+ * @brief typed accessor for the ep_addr field of an AV entry
+ *
+ * @param[in] entry AV entry
+ * @return pointer to the efa_ep_addr embedded in the entry
+ */
+static inline struct efa_ep_addr *efa_av_entry_ep_addr(struct efa_av_entry *entry)
+{
+ return (struct efa_ep_addr *)entry->ep_addr;
+}
+
+/**
+ * @brief check if an efa_ep_addr has a non-zero GID
+ *
+ * @param[in] addr address to check
+ * @return non-zero if valid, 0 if all-zeros
+ */
+static inline int efa_av_is_valid_address(struct efa_ep_addr *addr)
+{
+ struct efa_ep_addr all_zeros = { 0 };
+
+ return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw));
+}
+
+#endif
diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c
index a429be8e26c..1b18879c72b 100644
--- a/prov/efa/src/efa_base_ep.c
+++ b/prov/efa/src/efa_base_ep.c
@@ -693,11 +693,11 @@ const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, siz
struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, fi_addr_t addr)
{
struct efa_av *efa_av;
- struct efa_conn *efa_conn;
+ struct efa_av_entry *av_entry;
efa_av = base_ep->av;
- efa_conn = efa_av_addr_to_conn(efa_av, addr);
- return efa_conn ? efa_conn->ep_addr : NULL;
+ av_entry = efa_av_addr_to_entry(efa_av, addr);
+ return av_entry ? efa_av_entry_ep_addr(av_entry) : NULL;
}
/**
diff --git a/prov/efa/src/efa_conn.c b/prov/efa/src/efa_conn.c
deleted file mode 100644
index a58f1f6e333..00000000000
--- a/prov/efa/src/efa_conn.c
+++ /dev/null
@@ -1,478 +0,0 @@
-
-/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. */
-/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */
-/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
-
-#include
-
-#include "efa.h"
-
-/*
- * Local/remote peer detection by comparing peer GID with stored local GIDs
- */
-static bool efa_is_local_peer(struct efa_av *av, const void *addr)
-{
- int i;
- uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw;
-
-#if ENABLE_DEBUG
- char raw_gid_str[INET6_ADDRSTRLEN] = { 0 };
-
- if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) {
- EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno);
- return 0;
- }
- EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str);
-#endif
- for (i = 0; i < g_efa_ibv_gid_cnt; ++i) {
- if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) {
- EFA_INFO(FI_LOG_AV, "The peer is local.\n");
- return 1;
- }
- }
-
- return 0;
-}
-
-/**
- * @brief Add the conn to the LRU list. If the list is full, evict the least
- * recently used entry at the front of the LRU list and add the latest one
- *
- * @param[in] av efa address vector
- * @param[in] conn efa conn to be added to the LRU list
- */
-static inline int efa_av_implicit_av_lru_insert(struct efa_av *av,
- struct efa_conn *conn)
-{
- size_t cur_size;
- struct efa_ep_addr_hashable *ep_addr_hashable;
- struct efa_conn *conn_to_release;
-
- /* Implicit AV size of 0 means we allow the implicit AV to grow without
- * bound */
- if (av->implicit_av_size == 0)
- goto out;
-
- cur_size = HASH_CNT(hh, av->util_av_implicit.hash);
- if (cur_size <= av->implicit_av_size)
- goto out;
-
- assert(ofi_genlock_held(&av->domain->srx_lock));
-
- dlist_pop_front(&av->implicit_av_lru_list, struct efa_conn,
- conn_to_release, implicit_av_lru_entry);
- EFA_INFO(FI_LOG_AV,
- "Evicting AV entry for peer implicit fi_addr %" PRIu64
- " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from "
- "implicit AV\n",
- conn_to_release->implicit_fi_addr, conn_to_release->ah->ahn,
- conn_to_release->ep_addr->qpn, conn_to_release->ep_addr->qkey);
-
- /* Add to hashset with list of evicted peers */
- ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable));
- if (!ep_addr_hashable) {
- EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n");
- return FI_ENOMEM;
- }
- memcpy(ep_addr_hashable, conn->ep_addr, sizeof(struct efa_ep_addr));
- HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable);
-
- assert(ofi_genlock_held(&av->domain->srx_lock));
- efa_conn_release(av, conn_to_release, true);
-
- assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size);
-
-out:
- dlist_insert_tail(&conn->implicit_av_lru_entry,
- &av->implicit_av_lru_list);
- return FI_SUCCESS;
-}
-
-/**
- * @brief Insert the address into SHM provider's AV for RDM endpoints
- *
- * If shm transfer is enabled and the addr comes from local peer,
- * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later.
- * 2. insert gid_qpn into shm's av
- * 3. store returned fi_addr from shm into the hash table
- *
- * @param[in] av address vector
- * @param[in] conn efa_conn object
- * @return On success return 0, otherwise return a negative error code
- */
-int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn)
-{
- int err, ret;
- char smr_name[EFA_SHM_NAME_MAX];
- size_t smr_name_len;
-
-
- assert(av->domain->info_type == EFA_INFO_RDM);
- assert(conn->ep_addr);
-
- if (efa_is_local_peer(av, conn->ep_addr) && av->shm_rdm_av) {
- if (av->shm_used >= efa_env.shm_av_size) {
- EFA_WARN(FI_LOG_AV,
- "Max number of shm AV entry (%d) has been reached.\n",
- efa_env.shm_av_size);
- return -FI_ENOMEM;
- }
-
- smr_name_len = EFA_SHM_NAME_MAX;
- err = efa_shm_ep_name_construct(smr_name, &smr_name_len, conn->ep_addr);
- if (err != FI_SUCCESS) {
- EFA_WARN(FI_LOG_AV,
- "efa_rdm_ep_efa_addr_to_str() failed! err=%d\n", err);
- return err;
- }
-
- /*
- * The shm provider supports FI_AV_USER_ID flag. This flag
- * associates a user-assigned identifier with each av entry that is
- * returned with any completion entry in place of the AV's address.
- * In the fi_av_insert call below, the &conn->shm_fi_addr is both an input
- * and an output. conn->shm_fi_addr is passed in the function with value as
- * conn->fi_addr, which is the address of peer in efa provider's av. shm
- * records this value as user id in its internal hashmap for the use of cq
- * write, and then overwrite conn->shm_fi_addr as the actual fi_addr in shm's
- * av. The efa provider should still use conn->shm_fi_addr for transmissions
- * through shm ep.
- */
- conn->shm_fi_addr = conn->fi_addr;
- ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &conn->shm_fi_addr, FI_AV_USER_ID, NULL);
- if (OFI_UNLIKELY(ret != 1)) {
- EFA_WARN(FI_LOG_AV,
- "Failed to insert address to shm provider's av: %s\n",
- fi_strerror(-ret));
- return ret;
- }
-
- EFA_INFO(FI_LOG_AV,
- "Successfully inserted %s to shm provider's av. efa_fiaddr: %ld shm_fiaddr = %ld\n",
- smr_name, conn->fi_addr, conn->shm_fi_addr);
-
- assert(conn->shm_fi_addr < efa_env.shm_av_size);
- av->shm_used++;
- }
-
- return 0;
-}
-
-/**
- * @brief release the rdm related resources of an efa_conn object. This function
- * requires the caller to take the SRX lock because this function modifies the
- * peer map and destroys peers which are accessed and modified in the CQ read
- * path.
- *
- * this function release the shm av entry and rdm peer;
- *
- * @param[in] av address vector
- * @param[in] conn efa_conn object
- * peer
- */
-void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn)
-{
- int err;
- struct efa_conn_ep_peer_map_entry *peer_map_entry, *tmp;
-
- assert(av->domain->info_type == EFA_INFO_RDM);
-
- assert((conn->fi_addr != FI_ADDR_NOTAVAIL &&
- conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) ||
- (conn->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
- conn->fi_addr == FI_ADDR_NOTAVAIL));
-
- if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) {
- err = fi_av_remove(av->shm_rdm_av, &conn->shm_fi_addr, 1, 0);
- if (err) {
- EFA_WARN(FI_LOG_AV,
- "remove address from shm av failed! err=%d\n",
- err);
- } else {
- av->shm_used--;
- assert(conn->shm_fi_addr < efa_env.shm_av_size);
- }
- }
-
- assert(ofi_genlock_held(&av->domain->srx_lock));
- HASH_ITER(hh, conn->ep_peer_map, peer_map_entry, tmp) {
- dlist_remove(&peer_map_entry->peer.ep_peer_list_entry);
- efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr);
- HASH_DEL(conn->ep_peer_map, peer_map_entry);
- ofi_buf_free(peer_map_entry);
- }
- assert(HASH_CNT(hh, conn->ep_peer_map) == 0);
-}
-
-/**
- * @brief allocate an efa_conn object
- * caller of this function must obtain av->util_av.lock or av->util_av_implicit.lock
- *
- * @param[in] av efa address vector
- * @param[in] raw_addr raw efa address
- * @param[in] flags flags application passed to fi_av_insert
- * @param[in] context context application passed to fi_av_insert
- * @param[in] insert_shm_av whether insert address to shm av
- * @param[in] insert_implicit_av whether insert address to implicit AV
- * @return on success, return a pointer to an efa_conn object
- * otherwise, return NULL. errno will be set to a positive error code.
- */
-struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,
- uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av)
-{
- struct util_av *util_av;
- struct efa_cur_reverse_av **cur_reverse_av;
- struct efa_prv_reverse_av **prv_reverse_av;
- struct util_av_entry *util_av_entry = NULL;
- struct efa_av_entry *efa_av_entry = NULL;
- struct efa_conn *conn;
- fi_addr_t fi_addr;
- int err;
-
- if (flags & FI_SYNC_ERR)
- memset(context, 0, sizeof(int));
-
- if (insert_implicit_av) {
- assert(ofi_genlock_held(&av->util_av_implicit.lock));
- util_av = &av->util_av_implicit;
- cur_reverse_av = &av->cur_reverse_av_implicit;
- prv_reverse_av = &av->prv_reverse_av_implicit;
- } else {
- assert(ofi_genlock_held(&av->util_av.lock));
- util_av = &av->util_av;
- cur_reverse_av = &av->cur_reverse_av;
- prv_reverse_av = &av->prv_reverse_av;
- }
-
- err = ofi_av_insert_addr(util_av, raw_addr, &fi_addr);
- if (err) {
- EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
- fi_strerror(err));
- return NULL;
- }
-
- util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool,
- fi_addr);
- efa_av_entry = (struct efa_av_entry *)util_av_entry->data;
- assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr));
-
- conn = &efa_av_entry->conn;
- memset(conn, 0, sizeof(*conn));
- conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr;
- assert(av->type == FI_AV_TABLE);
-
- conn->av = av;
-
- if (insert_implicit_av) {
- conn->fi_addr = FI_ADDR_NOTAVAIL;
- conn->implicit_fi_addr = fi_addr;
- err = efa_av_implicit_av_lru_insert(av, conn);
- if (err)
- return NULL;
- } else {
- conn->fi_addr = fi_addr;
- conn->implicit_fi_addr = FI_ADDR_NOTAVAIL;
- }
-
- conn->ah = efa_ah_alloc(av->domain, raw_addr->raw, insert_implicit_av);
- if (!conn->ah)
- goto err_release;
-
- if (insert_implicit_av)
- dlist_insert_tail(&conn->ah_implicit_conn_list_entry,
- &conn->ah->implicit_conn_list);
-
- conn->shm_fi_addr = FI_ADDR_NOTAVAIL;
- /*
- * The efa_conn_alloc() call can be made in two situations:
- * 1. application calls fi_av_insert API
- * 2. efa progress engine get a message from unknown peer through efa device,
- * which means peer is not local or shm is disabled for transmission.
- * For situation 1, the shm av insertion should happen when the peer is local (insert_shm_av=1)
- * For situation 2, the shm av insertion shouldn't happen anyway (insert_shm_av=0).
- */
- if (av->domain->info_type == EFA_INFO_RDM && insert_shm_av) {
- err = efa_conn_rdm_insert_shm_av(av, conn);
- if (err) {
- errno = -err;
- goto err_release;
- }
- }
-
- err = efa_av_reverse_av_add(av, cur_reverse_av, prv_reverse_av, conn);
- if (err) {
- if (av->domain->info_type == EFA_INFO_RDM) {
- /* insert_implicit_av is only true for the CQ read path
- * which already has the SRX lock */
- if (insert_implicit_av)
- ofi_genlock_lock(&av->domain->srx_lock);
- efa_conn_rdm_deinit(av, conn);
- if (insert_implicit_av)
- ofi_genlock_unlock(&av->domain->srx_lock);
- }
- goto err_release;
- }
-
- insert_implicit_av ? av->used_implicit++ : av->used_explicit++;
-
- return conn;
-
-err_release:
- if (conn->ah)
- efa_ah_release(av->domain, conn->ah, insert_implicit_av);
-
- conn->ep_addr = NULL;
- err = ofi_av_remove_addr(util_av, fi_addr);
- if (err)
- EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
- err);
-
- return NULL;
-}
-
-void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av)
-{
- if (release_from_implicit_av) {
- assert(ofi_genlock_held(&av->util_av_implicit.lock));
- efa_av_reverse_av_remove(&av->cur_reverse_av_implicit,
- &av->prv_reverse_av_implicit, conn);
- } else {
- assert(ofi_genlock_held(&av->util_av.lock));
- efa_av_reverse_av_remove(&av->cur_reverse_av,
- &av->prv_reverse_av, conn);
- }
-}
-
-void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av)
-{
- struct util_av *util_av;
- struct util_av_entry *util_av_entry;
- struct efa_av_entry *efa_av_entry;
- char gidstr[INET6_ADDRSTRLEN];
- fi_addr_t fi_addr;
- int err;
-
- if (release_from_implicit_av) {
- assert(ofi_genlock_held(&av->util_av_implicit.lock));
- util_av = &av->util_av_implicit;
- fi_addr = conn->implicit_fi_addr;
- } else {
- assert(ofi_genlock_held(&av->util_av.lock));
- util_av = &av->util_av;
- fi_addr = conn->fi_addr;
- }
-
- util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr);
- assert(util_av_entry);
- efa_av_entry = (struct efa_av_entry *) util_av_entry->data;
-
- err = ofi_av_remove_addr(util_av, fi_addr);
- if (err) {
- EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err);
- }
-
- inet_ntop(AF_INET6, conn->ep_addr->raw, gidstr, INET6_ADDRSTRLEN);
- EFA_INFO(FI_LOG_AV, "efa_conn released! conn[%p] GID[%s] QP[%u]\n",
- conn, gidstr, conn->ep_addr->qpn);
-
- conn->ep_addr = NULL;
- memset(efa_av_entry->ep_addr, 0, EFA_EP_ADDR_LEN);
-}
-
-/**
- * @brief release an efa conn object
- * Caller of this function must obtain av->util_av.lock or
- * av->util_av_implicit.lock. This function obtains the SRX lock and is called
- * from the AV removal path.
- *
- * @param[in] av address vector
- * @param[in] conn efa_conn object pointer
- * @param[in] release_from_implicit_av whether to release conn
- * from implicit AV
- * @param[in] grab_srx_lock whether to get the SRX lock before
- * destroying the peer struct
- */
-void efa_conn_release(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av)
-{
- assert(av->domain->info_type != EFA_INFO_RDM ||
- ofi_genlock_held(&av->domain->srx_lock));
-
- efa_conn_release_reverse_av(av, conn, release_from_implicit_av);
- if (av->domain->info_type == EFA_INFO_RDM)
- efa_conn_rdm_deinit(av, conn);
-
- if (release_from_implicit_av)
- dlist_remove(&conn->ah_implicit_conn_list_entry);
-
- efa_ah_release(av->domain, conn->ah, release_from_implicit_av);
-
- efa_conn_release_util_av(av, conn, release_from_implicit_av);
-
- release_from_implicit_av ? av->used_implicit-- : av->used_explicit--;
-}
-
-/**
- * @brief release an efa conn object
- * Caller of this function must obtain av->util_av.lock or
- * av->util_av_implicit.lock and the SRX lock. It also calls
- * efa_ah_release_unsafe which does not acquire the util_domain lock the
- * protects the AH map. This function is called when evicting an AH entry in the
- * CQ read path which already has the SRX lock and the util_domain lock.
- *
- * @param[in] av address vector
- * @param[in] conn efa_conn object pointer
- * @param[in] release_from_implicit_av whether to release conn
- * from implicit AV
- * @param[in] grab_srx_lock whether to get the SRX lock before
- * destroying the peer struct
- */
-void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av)
-{
- assert(av->domain->info_type != EFA_INFO_RDM ||
- ofi_genlock_held(&av->domain->srx_lock));
-
- assert(ofi_genlock_held(&av->domain->util_domain.lock));
-
- efa_conn_release_reverse_av(av, conn, release_from_implicit_av);
- if (av->domain->info_type == EFA_INFO_RDM)
- efa_conn_rdm_deinit(av, conn);
-
- if (release_from_implicit_av)
- dlist_remove(&conn->ah_implicit_conn_list_entry);
-
- efa_conn_release_util_av(av, conn, release_from_implicit_av);
-
- release_from_implicit_av ? conn->ah->implicit_refcnt-- :
- conn->ah->explicit_refcnt--;
- release_from_implicit_av ? av->used_implicit-- : av->used_explicit--;
-}
-
-void efa_conn_ep_peer_map_insert(struct efa_conn *conn, struct efa_conn_ep_peer_map_entry *map_entry)
-{
- HASH_ADD_PTR(conn->ep_peer_map, ep_ptr, map_entry);
-}
-
-struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn,
- struct efa_rdm_ep *ep)
-{
- struct efa_conn_ep_peer_map_entry *map_entry;
-
- HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry);
-
- return map_entry ? &map_entry->peer : NULL;
-}
-
-void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep)
-{
- struct efa_conn_ep_peer_map_entry *map_entry;
-
- HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry);
- assert(map_entry);
- HASH_DELETE(hh, conn->ep_peer_map, map_entry);
- ofi_buf_free(map_entry);
-}
diff --git a/prov/efa/src/efa_conn.h b/prov/efa/src/efa_conn.h
deleted file mode 100644
index bafa293da5f..00000000000
--- a/prov/efa/src/efa_conn.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
-
-#ifndef EFA_CONN_H
-#define EFA_CONN_H
-
-#include "ofi_util.h"
-#include "rdm/efa_rdm_peer.h"
-
-struct efa_conn {
- struct efa_ah *ah;
- struct efa_ep_addr *ep_addr;
- struct efa_av *av;
- fi_addr_t implicit_fi_addr;
- fi_addr_t fi_addr;
- fi_addr_t shm_fi_addr;
- struct dlist_entry implicit_av_lru_entry;
- struct dlist_entry ah_implicit_conn_list_entry;
- struct efa_conn_ep_peer_map_entry *ep_peer_map;
-};
-
-struct efa_conn_ep_peer_map_entry {
- struct efa_rdm_ep *ep_ptr;
- struct efa_rdm_peer peer;
- UT_hash_handle hh;
-};
-
-void efa_conn_ep_peer_map_insert(struct efa_conn *conn,
- struct efa_conn_ep_peer_map_entry *map_entry);
-
-struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn,
- struct efa_rdm_ep *ep);
-
-void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep);
-
-int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn);
-
-void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn);
-
-struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr,
- uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av);
-
-void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av);
-
-void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av);
-
-void efa_conn_release(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av);
-
-void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn,
- bool release_from_implicit_av);
-
-#endif
\ No newline at end of file
diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c
index 3eab28231d4..b734b199c1c 100644
--- a/prov/efa/src/efa_domain.c
+++ b/prov/efa/src/efa_domain.c
@@ -8,6 +8,7 @@
#include "config.h"
#include "efa.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_cntr.h"
#include "rdm/efa_rdm_cntr.h"
#include "rdm/efa_rdm_cq.h"
@@ -46,7 +47,7 @@ static struct fi_ops_domain efa_domain_ops = {
static struct fi_ops_domain efa_domain_ops_rdm = {
.size = sizeof(struct fi_ops_domain),
- .av_open = efa_av_open,
+ .av_open = efa_proto_av_open,
.cq_open = efa_rdm_cq_open,
.endpoint = efa_rdm_ep_open,
.scalable_ep = fi_no_scalable_ep,
@@ -496,14 +497,14 @@ static int efa_domain_query_addr(struct fid_ep *ep_fid, fi_addr_t addr,
uint32_t *remote_qkey)
{
struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid);
- struct efa_conn *conn = efa_av_addr_to_conn(base_ep->av, addr);
- if (!conn || !conn->ah || !conn->ep_addr) {
+ struct efa_av_entry *av_entry = efa_av_addr_to_entry(base_ep->av, addr);
+ if (!av_entry || !av_entry->ah || !efa_av_entry_ep_addr(av_entry)) {
EFA_WARN(FI_LOG_EP_CTRL, "Failed to find connection for addr %lu\n", addr);
return -FI_EINVAL;
}
- *ahn = conn->ah->ahn;
- *remote_qpn = conn->ep_addr->qpn;
- *remote_qkey = conn->ep_addr->qkey;
+ *ahn = av_entry->ah->ahn;
+ *remote_qpn = efa_av_entry_ep_addr(av_entry)->qpn;
+ *remote_qkey = efa_av_entry_ep_addr(av_entry)->qkey;
return FI_SUCCESS;
}
@@ -824,8 +825,8 @@ void efa_domain_progress_rdm_peers_and_queues(struct efa_domain *domain)
EFA_WARN(FI_LOG_EP_CTRL,
"Failed to post HANDSHAKE to peer fi_addr: "
"%ld implicit fi_addr: %ld. %s\n",
- peer->conn->fi_addr,
- peer->conn->implicit_fi_addr,
+ peer->av_entry->fi_addr,
+ peer->av_entry->implicit_fi_addr,
fi_strerror(-ret));
efa_base_ep_write_eq_error(&peer->ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE);
continue;
diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c
index b6a7e83b864..ab3935ab6f6 100644
--- a/prov/efa/src/efa_msg.c
+++ b/prov/efa/src/efa_msg.c
@@ -206,7 +206,7 @@ static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void
static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags)
{
struct efa_qp *qp = base_ep->qp;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */
struct ibv_data_buf inline_data_list[2];
struct efa_context *efa_ctx;
@@ -227,8 +227,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi
dump_msg(msg, "send");
- conn = efa_av_addr_to_conn(base_ep->av, msg->addr);
- assert(conn && conn->ep_addr);
+ av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr);
+ assert(av_entry && efa_av_entry_ep_addr(av_entry));
assert(msg->iov_count <= base_ep->info->tx_attr->iov_limit);
@@ -330,7 +330,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi
/* Use consolidated send function */
ret = efa_qp_post_send(qp, sg_list, inline_data_list, iov_count,
use_inline, wr_id, msg->data, flags,
- conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey);
+ av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey);
if (OFI_UNLIKELY(ret))
ret = (ret == ENOMEM) ? -FI_EAGAIN : -ret;
diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c
index cf136e623b5..cb0f3283697 100644
--- a/prov/efa/src/efa_rma.c
+++ b/prov/efa/src/efa_rma.c
@@ -38,7 +38,7 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep,
{
struct efa_domain *domain = base_ep->domain;
struct efa_mr *efa_mr;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
size_t iov_count = msg->iov_count;
struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */
uintptr_t wr_id;
@@ -102,15 +102,15 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep,
}
}
- conn = efa_av_addr_to_conn(base_ep->av, msg->addr);
- assert(conn && conn->ep_addr);
+ av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr);
+ assert(av_entry && efa_av_entry_ep_addr(av_entry));
/* Use consolidated RDMA read function */
/* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */
err = efa_qp_post_read(base_ep->qp, sge_list, iov_count,
msg->rma_iov[0].key, msg->rma_iov[0].addr,
wr_id, flags,
- conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey);
+ av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey);
if (OFI_UNLIKELY(err))
err = (err == ENOMEM) ? -FI_EAGAIN : -err;
@@ -197,7 +197,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep,
uint64_t flags)
{
struct efa_domain *domain = base_ep->domain;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
size_t iov_count = msg->iov_count;
struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */
uintptr_t wr_id;
@@ -258,14 +258,14 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep,
}
}
- conn = efa_av_addr_to_conn(base_ep->av, msg->addr);
- assert(conn && conn->ep_addr);
+ av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr);
+ assert(av_entry && efa_av_entry_ep_addr(av_entry));
/* Use consolidated RDMA write function */
err = efa_qp_post_write(base_ep->qp, sge_list, iov_count,
msg->rma_iov[0].key, msg->rma_iov[0].addr,
wr_id, msg->data, flags,
- conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey);
+ av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey);
if (OFI_UNLIKELY(err))
err = (err == ENOMEM) ? -FI_EAGAIN : -err;
@@ -365,7 +365,7 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len,
struct efa_base_ep *base_ep;
struct efa_domain *domain;
struct ibv_sge sge;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
uintptr_t wr_id;
int err;
@@ -387,12 +387,12 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len,
sge.length = 0;
sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey;
- conn = efa_av_addr_to_conn(base_ep->av, dest_addr);
- assert(conn && conn->ep_addr);
+ av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr);
+ assert(av_entry && efa_av_entry_ep_addr(av_entry));
err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr,
- wr_id, 0, 0, conn->ah, conn->ep_addr->qpn,
- conn->ep_addr->qkey);
+ wr_id, 0, 0, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn,
+ efa_av_entry_ep_addr(av_entry)->qkey);
if (OFI_UNLIKELY(err))
err = (err == ENOMEM) ? -FI_EAGAIN : -err;
@@ -406,7 +406,7 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size
{
struct efa_base_ep *base_ep;
struct efa_domain *domain;
- struct efa_conn *conn;
+ struct efa_av_entry *av_entry;
struct ibv_sge sge;
uintptr_t wr_id;
int err;
@@ -429,12 +429,12 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size
sge.length = 0;
sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey;
- conn = efa_av_addr_to_conn(base_ep->av, dest_addr);
- assert(conn && conn->ep_addr);
+ av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr);
+ assert(av_entry && efa_av_entry_ep_addr(av_entry));
err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr,
- wr_id, data, IBV_SEND_INLINE, conn->ah, conn->ep_addr->qpn,
- conn->ep_addr->qkey);
+ wr_id, data, IBV_SEND_INLINE, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn,
+ efa_av_entry_ep_addr(av_entry)->qkey);
if (OFI_UNLIKELY(err))
err = (err == ENOMEM) ? -FI_EAGAIN : -err;
diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c
new file mode 100644
index 00000000000..8b84f315bff
--- /dev/null
+++ b/prov/efa/src/rdm/efa_proto_av.c
@@ -0,0 +1,1590 @@
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+
+#include <stdlib.h>
+#include <arpa/inet.h>
+
+#include "efa.h"
+#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
+#include "rdm/efa_rdm_pke_utils.h"
+
+/*
+ * efa_av_entry and efa_proto_av_entry share the same cache-line-0 layout
+ * (ep_addr, ah) so reverse_av entries and util_av contexts work across
+ * both. Break loudly if anyone ever reorders either struct.
+ */
+_Static_assert(offsetof(struct efa_proto_av_entry, ep_addr) ==
+ offsetof(struct efa_av_entry, ep_addr),
+ "efa_av_entry and efa_proto_av_entry must share ep_addr offset");
+_Static_assert(offsetof(struct efa_proto_av_entry, ah) ==
+ offsetof(struct efa_av_entry, ah),
+ "efa_av_entry and efa_proto_av_entry must share ah offset");
+
+/**
+ * @brief Local/remote peer detection by comparing peer GID with stored local GIDs
+ *
+ * @param[in] av efa AV
+ * @param[in] addr peer address to check
+ * @return true if local, false otherwise
+ */
+static bool efa_is_local_peer(struct efa_av *av, const void *addr)
+{
+ int i;
+ uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw;
+
+#if ENABLE_DEBUG
+ char raw_gid_str[INET6_ADDRSTRLEN] = { 0 };
+
+ if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) {
+ EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno);
+ return 0;
+ }
+ EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str);
+#endif
+ for (i = 0; i < g_efa_ibv_gid_cnt; ++i) {
+ if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) {
+ EFA_INFO(FI_LOG_AV, "The peer is local.\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Forward declaration for static helper defined after entry release */
+static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah);
+
+/* ---- Address lookup ---- */
+
+/**
+ * @brief find proto AV entry using fi_addr in the given util_av
+ *
+ * @param[in] util_av util AV to search
+ * @param[in] fi_addr fabric address to look up
+ * @return pointer to entry if valid, NULL otherwise
+ */
+static inline struct efa_proto_av_entry *
+efa_proto_av_addr_to_entry_impl(struct util_av *util_av, fi_addr_t fi_addr)
+{
+ struct util_av_entry *util_av_entry;
+ struct efa_proto_av_entry *entry;
+
+ if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL))
+ return NULL;
+
+ if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr)))
+ util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr);
+ else
+ return NULL;
+
+ entry = (struct efa_proto_av_entry *)util_av_entry->data;
+ return entry->ah ? entry : NULL;
+}
+
+/**
+ * @brief find proto AV entry using fi_addr in the explicit AV
+ *
+ * @param[in] av protocol AV
+ * @param[in] fi_addr fabric address
+ * @return pointer to entry if valid, NULL otherwise
+ */
+struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av,
+ fi_addr_t fi_addr)
+{
+ return efa_proto_av_addr_to_entry_impl(&av->efa_av.util_av, fi_addr);
+}
+
+/**
+ * @brief find proto AV entry using fi_addr in the implicit AV
+ *
+ * @param[in] av protocol AV
+ * @param[in] fi_addr fabric address
+ * @return pointer to entry if valid, NULL otherwise
+ */
+struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit(
+ struct efa_proto_av *av, fi_addr_t fi_addr)
+{
+ return efa_proto_av_addr_to_entry_impl(&av->util_av_implicit, fi_addr);
+}
+
+/* ---- Peer map operations ---- */
+
+/**
+ * @brief insert an entry into the peer map for a given AV entry
+ *
+ * @param[in] entry proto AV entry
+ * @param[in] map_entry peer map entry to insert
+ */
+void efa_proto_av_entry_ep_peer_map_insert(
+ struct efa_proto_av_entry *entry,
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry)
+{
+ HASH_ADD_PTR(entry->ep_peer_map, ep_ptr, map_entry);
+}
+
+/**
+ * @brief look up a peer in the peer map for a given AV entry and endpoint
+ *
+ * @param[in] entry proto AV entry
+ * @param[in] ep RDM endpoint
+ * @return pointer to peer if found, NULL otherwise
+ */
+struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup(
+ struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep)
+{
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry;
+
+ HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry);
+ return map_entry ? &map_entry->peer : NULL;
+}
+
+/**
+ * @brief remove an endpoint's peer from the peer map for a given AV entry
+ *
+ * @param[in] entry proto AV entry
+ * @param[in] ep RDM endpoint whose peer to remove
+ */
+void efa_proto_av_entry_ep_peer_map_remove(
+ struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep)
+{
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry;
+
+ HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry);
+ assert(map_entry);
+ HASH_DELETE(hh, entry->ep_peer_map, map_entry);
+ ofi_buf_free(map_entry);
+}
+
+/* ---- SHM AV operations ---- */
+
+/**
+ * @brief Insert the address into SHM provider's AV
+ *
+ * If shm transfer is enabled and the addr comes from local peer,
+ * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later.
+ * 2. insert gid_qpn into shm's av
+ * 3. store returned fi_addr from shm into the hash table
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry
+ * @return On success return 0, otherwise return a negative error code
+ */
+int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry)
+{
+ int err, ret;
+ char smr_name[EFA_SHM_NAME_MAX];
+ size_t smr_name_len;
+ struct efa_ep_addr *ep_addr = efa_proto_av_entry_ep_addr(entry);
+
+ assert(ep_addr);
+
+ if (efa_is_local_peer(&av->efa_av, ep_addr) && av->shm_rdm_av) {
+ if (av->shm_used >= efa_env.shm_av_size) {
+ EFA_WARN(FI_LOG_AV,
+ "Max number of shm AV entry (%d) has been reached.\n",
+ efa_env.shm_av_size);
+ return -FI_ENOMEM;
+ }
+
+ smr_name_len = EFA_SHM_NAME_MAX;
+ err = efa_shm_ep_name_construct(smr_name, &smr_name_len, ep_addr);
+ if (err != FI_SUCCESS) {
+ EFA_WARN(FI_LOG_AV,
+ "efa_rdm_ep_efa_addr_to_str() failed! err=%d\n", err);
+ return err;
+ }
+
+ /*
+ * The shm provider supports FI_AV_USER_ID flag. This flag
+ * associates a user-assigned identifier with each av entry that
+ * is returned with any completion entry in place of the AV's
+ * address. Below, &entry->shm_fi_addr is both input and output.
+ * It is passed in with value entry->fi_addr (the efa provider's
+ * fi_addr). shm records this as user id for cq write, then
+ * overwrites shm_fi_addr with the actual fi_addr in shm's av.
+ * The efa provider uses shm_fi_addr for transmissions through
+ * the shm ep.
+ */
+ entry->shm_fi_addr = entry->fi_addr;
+ ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &entry->shm_fi_addr, FI_AV_USER_ID, NULL);
+ if (OFI_UNLIKELY(ret != 1)) {
+ EFA_WARN(FI_LOG_AV,
+ "Failed to insert address to shm provider's av: %s\n",
+ fi_strerror(-ret));
+ entry->shm_fi_addr = FI_ADDR_NOTAVAIL;
+ return ret;
+ }
+
+ EFA_INFO(FI_LOG_AV,
+ "Successfully inserted %s to shm provider's av. efa_fiaddr: %ld shm_fiaddr = %ld\n",
+ smr_name, entry->fi_addr, entry->shm_fi_addr);
+
+ assert(entry->shm_fi_addr < efa_env.shm_av_size);
+ av->shm_used++;
+ }
+
+ return 0;
+}
+
+/**
+ * @brief Release the protocol-specific resources of an AV entry.
+ *
+ * Releases the shm av entry and destroys rdm peers. Caller must hold
+ * the SRX lock because this function modifies the peer map and destroys
+ * peers which are accessed and modified in the CQ read path.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry
+ */
+void efa_proto_av_entry_deinit(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry)
+{
+ int err;
+ struct efa_proto_av_entry_ep_peer_map_entry *peer_map_entry, *tmp;
+
+ assert((entry->fi_addr != FI_ADDR_NOTAVAIL &&
+ entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) ||
+ (entry->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
+ entry->fi_addr == FI_ADDR_NOTAVAIL));
+
+ if (entry->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) {
+ err = fi_av_remove(av->shm_rdm_av, &entry->shm_fi_addr, 1, 0);
+ if (err) {
+ EFA_WARN(FI_LOG_AV,
+ "remove address from shm av failed! err=%d\n",
+ err);
+ } else {
+ av->shm_used--;
+ assert(entry->shm_fi_addr < efa_env.shm_av_size);
+ }
+ }
+
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+ HASH_ITER(hh, entry->ep_peer_map, peer_map_entry, tmp) {
+ dlist_remove(&peer_map_entry->peer.ep_peer_list_entry);
+ efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr);
+ HASH_DEL(entry->ep_peer_map, peer_map_entry);
+ ofi_buf_free(peer_map_entry);
+ }
+ assert(HASH_CNT(hh, entry->ep_peer_map) == 0);
+}
+
+/* ---- Implicit AV LRU ---- */
+
+/**
+ * @brief Add entry to the LRU list. If the list is full, evict the least
+ * recently used entry at the front of the LRU list and add the latest one.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to be added to the LRU list
+ */
+static inline int efa_proto_av_implicit_av_lru_insert(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry)
+{
+ size_t cur_size;
+ struct efa_ep_addr_hashable *ep_addr_hashable;
+ struct efa_proto_av_entry *entry_to_release;
+
+ if (av->implicit_av_size == 0)
+ goto out;
+
+ cur_size = HASH_CNT(hh, av->util_av_implicit.hash);
+ if (cur_size <= av->implicit_av_size)
+ goto out;
+
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+
+ dlist_pop_front(&av->implicit_av_lru_list, struct efa_proto_av_entry,
+ entry_to_release, implicit_av_lru_entry);
+ /*
+ * dlist_pop_front leaves entry_to_release's dlist_entry pointing at its
+ * old neighbors. Re-init so that efa_proto_av_entry_release's call to
+ * dlist_remove is a no-op on the already-popped node and does not
+ * disturb the surrounding list.
+ */
+ dlist_init(&entry_to_release->implicit_av_lru_entry);
+
+ EFA_INFO(FI_LOG_AV,
+ "Evicting AV entry for peer implicit fi_addr %" PRIu64
+ " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from "
+ "implicit AV\n",
+ entry_to_release->implicit_fi_addr, entry_to_release->ah->ahn,
+ efa_proto_av_entry_ep_addr(entry_to_release)->qpn,
+ efa_proto_av_entry_ep_addr(entry_to_release)->qkey);
+
+ ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable));
+ if (!ep_addr_hashable) {
+ EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n");
+ /* Re-insert the victim at the head so it remains tracked in the LRU. */
+ dlist_insert_head(&entry_to_release->implicit_av_lru_entry,
+ &av->implicit_av_lru_list);
+ return -FI_ENOMEM;
+ }
+ memcpy(ep_addr_hashable, entry_to_release->ep_addr, sizeof(struct efa_ep_addr));
+ HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable);
+
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+ efa_proto_av_entry_release(av, entry_to_release, true);
+
+ assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size);
+
+out:
+ dlist_insert_tail(&entry->implicit_av_lru_entry,
+ &av->implicit_av_lru_list);
+ return FI_SUCCESS;
+}
+
+/**
+ * @brief Move entry to the end of the LRU list (most recently used)
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to move
+ */
+void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry)
+{
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+ assert(av->implicit_av_size == 0 ||
+ HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size);
+ assert(dlist_entry_in_list(&av->implicit_av_lru_list,
+ &entry->implicit_av_lru_entry));
+
+ dlist_remove(&entry->implicit_av_lru_entry);
+ dlist_insert_tail(&entry->implicit_av_lru_entry,
+ &av->implicit_av_lru_list);
+
+ efa_proto_ah_lru_move(av->efa_av.domain, entry->ah);
+}
+
+/* ---- Reverse lookup (protocol, connid-aware) ---- */
+
+/**
+ * @brief reverse lookup a proto AV entry by AHN, QPN, and optional connid
+ *
+ * @param[in] cur_reverse_av current reverse AV hash table
+ * @param[in] prv_reverse_av previous reverse AV hash table
+ * @param[in] ahn address handle number
+ * @param[in] qpn QP number
+ * @param[in] pkt_entry NULL or packet entry to extract connid from
+ * @return pointer to entry if found, NULL otherwise
+ */
+static inline struct efa_proto_av_entry *
+efa_proto_av_reverse_lookup_entry(struct efa_cur_reverse_av **cur_reverse_av,
+ struct efa_prv_reverse_av **prv_reverse_av,
+ uint16_t ahn, uint16_t qpn,
+ struct efa_rdm_pke *pkt_entry)
+{
+ uint32_t *connid;
+ struct efa_cur_reverse_av *cur_entry;
+ struct efa_prv_reverse_av *prv_entry;
+ struct efa_cur_reverse_av_key cur_key;
+ struct efa_prv_reverse_av_key prv_key;
+
+ cur_key.ahn = ahn;
+ cur_key.qpn = qpn;
+
+ HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry);
+
+ if (OFI_UNLIKELY(!cur_entry))
+ return NULL;
+
+ /*
+ * Cast is safe: in protocol path, av_entry points to the ep_addr field
+ * of a efa_proto_av_entry which has the same layout prefix.
+ */
+ if (!pkt_entry) {
+ return (struct efa_proto_av_entry *)cur_entry->av_entry;
+ }
+
+ connid = efa_rdm_pke_connid_ptr(pkt_entry);
+ if (!connid) {
+ EFA_WARN_ONCE(FI_LOG_EP_CTRL,
+ "An incoming packet does NOT have connection ID "
+ "in its header.\n"
+ "This means the peer is using an older version "
+ "of libfabric.\n"
+ "The communication can continue but it is "
+ "encouraged to use\n"
+ "a newer version of libfabric\n");
+ return (struct efa_proto_av_entry *)cur_entry->av_entry;
+ }
+
+ if (OFI_LIKELY(*connid == efa_av_entry_ep_addr(cur_entry->av_entry)->qkey))
+ return (struct efa_proto_av_entry *)cur_entry->av_entry;
+
+ prv_key.ahn = ahn;
+ prv_key.qpn = qpn;
+ prv_key.connid = *connid;
+ HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry);
+
+ return OFI_LIKELY(!!prv_entry) ? (struct efa_proto_av_entry *)prv_entry->av_entry : NULL;
+}
+
+/**
+ * @brief find fi_addr for RDM endpoint in the explicit AV (connid-aware)
+ *
+ * @param[in] av protocol AV
+ * @param[in] ahn address handle number
+ * @param[in] qpn QP number
+ * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid
+ * @return fi_addr on success, FI_ADDR_NOTAVAIL if not found
+ */
+fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av,
+ uint16_t ahn, uint16_t qpn,
+ struct efa_rdm_pke *pkt_entry)
+{
+ struct efa_proto_av_entry *entry;
+
+ entry = efa_proto_av_reverse_lookup_entry(
+ &av->efa_av.cur_reverse_av, &av->efa_av.prv_reverse_av,
+ ahn, qpn, pkt_entry);
+
+ if (OFI_LIKELY(!!entry))
+ return entry->fi_addr;
+
+ return FI_ADDR_NOTAVAIL;
+}
+
+/**
+ * @brief find fi_addr for RDM endpoint in the implicit AV (connid-aware)
+ *
+ * Caller must hold srx_lock. Updates LRU list on hit.
+ *
+ * @param[in] av protocol AV
+ * @param[in] ahn address handle number
+ * @param[in] qpn QP number
+ * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid
+ * @return implicit fi_addr on success, FI_ADDR_NOTAVAIL if not found
+ */
+fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av,
+ uint16_t ahn, uint16_t qpn,
+ struct efa_rdm_pke *pkt_entry)
+{
+ struct efa_proto_av_entry *entry;
+
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+
+ entry = efa_proto_av_reverse_lookup_entry(
+ &av->cur_reverse_av_implicit, &av->prv_reverse_av_implicit,
+ ahn, qpn, pkt_entry);
+
+ if (OFI_LIKELY(!!entry)) {
+ efa_proto_av_implicit_av_lru_entry_move(av, entry);
+ return entry->implicit_fi_addr;
+ }
+
+ return FI_ADDR_NOTAVAIL;
+}
+
+/* ---- Entry release helpers ---- */
+
+/**
+ * @brief remove entry from the appropriate reverse AV hash tables
+ *
+ * @param[in] av protocol AV
+ * @param[in] entry entry to remove
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+static void efa_proto_av_entry_release_reverse_av(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av)
+{
+ if (release_from_implicit_av) {
+ assert(ofi_genlock_held(&av->util_av_implicit.lock));
+ efa_av_reverse_av_remove(&av->cur_reverse_av_implicit,
+ &av->prv_reverse_av_implicit,
+ (struct efa_av_entry *)entry);
+ } else {
+ assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+ efa_av_reverse_av_remove(&av->efa_av.cur_reverse_av,
+ &av->efa_av.prv_reverse_av,
+ (struct efa_av_entry *)entry);
+ }
+}
+
+/**
+ * @brief remove entry from the appropriate util_av and clear its fields
+ *
+ * @param[in] av protocol AV
+ * @param[in] entry entry to remove
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+static void efa_proto_av_entry_release_util_av(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av)
+{
+ struct util_av *util_av;
+ char gidstr[INET6_ADDRSTRLEN];
+ fi_addr_t fi_addr;
+ int err;
+
+ if (release_from_implicit_av) {
+ assert(ofi_genlock_held(&av->util_av_implicit.lock));
+ util_av = &av->util_av_implicit;
+ fi_addr = entry->implicit_fi_addr;
+ } else {
+ assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+ util_av = &av->efa_av.util_av;
+ fi_addr = entry->fi_addr;
+ }
+
+ err = ofi_av_remove_addr(util_av, fi_addr);
+ if (err)
+ EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err);
+
+ inet_ntop(AF_INET6, efa_proto_av_entry_ep_addr(entry)->raw, gidstr, INET6_ADDRSTRLEN);
+ EFA_INFO(FI_LOG_AV, "efa_proto_av_entry released! entry[%p] GID[%s] QP[%u]\n",
+ entry, gidstr, efa_proto_av_entry_ep_addr(entry)->qpn);
+
+ entry->ah = NULL;
+ memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+}
+
+/**
+ * @brief Release a proto AV entry.
+ *
+ * Caller must hold srx_lock. Acquires util_domain.lock internally
+ * via efa_proto_ah_release. Called from the AV removal path.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to release
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+void efa_proto_av_entry_release(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av)
+{
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+
+ efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av);
+ efa_proto_av_entry_deinit(av, entry);
+
+ if (release_from_implicit_av) {
+ dlist_remove(&entry->ah_implicit_conn_list_entry);
+ dlist_remove(&entry->implicit_av_lru_entry);
+ }
+
+ efa_proto_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av);
+ efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av);
+
+ release_from_implicit_av ? av->used_implicit-- : av->efa_av.used--;
+}
+
+/**
+ * @brief Release a proto AV entry without acquiring util_domain.lock.
+ *
+ * Caller must hold srx_lock AND util_domain.lock. Called from the AH
+ * eviction path (efa_proto_ah_evict), which already holds both locks.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to release
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av)
+{
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+ assert(ofi_genlock_held(&av->efa_av.domain->util_domain.lock));
+
+ efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av);
+ efa_proto_av_entry_deinit(av, entry);
+
+ if (release_from_implicit_av) {
+ dlist_remove(&entry->ah_implicit_conn_list_entry);
+ dlist_remove(&entry->implicit_av_lru_entry);
+ }
+
+ /* Decrement refcnts before release_util_av which NULLs entry->ah */
+ release_from_implicit_av ? efa_proto_ah_from_ah(entry->ah)->implicit_refcnt-- :
+ efa_proto_ah_from_ah(entry->ah)->explicit_refcnt--;
+ entry->ah->refcnt--;
+
+ efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av);
+
+ release_from_implicit_av ? av->used_implicit-- : av->efa_av.used--;
+}
+
+/* ---- Protocol AH helpers ---- */
+
+/**
+ * @brief Move the AH to the end of the LRU list (most recently used)
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah base AH (must be embedded in efa_proto_ah)
+ */
+static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah)
+{
+ struct efa_proto_ah *proto_ah = efa_proto_ah_from_ah(ah);
+
+ assert(efa_proto_ah_from_ah(ah)->implicit_refcnt > 0 || efa_proto_ah_from_ah(ah)->explicit_refcnt > 0);
+ assert(dlist_entry_in_list(&domain->ah_lru_list,
+ &proto_ah->lru_list_entry));
+
+ dlist_remove(&proto_ah->lru_list_entry);
+ dlist_insert_tail(&proto_ah->lru_list_entry, &domain->ah_lru_list);
+}
+
+/**
+ * @brief Evict the least recently used AH that has no explicit AV entries.
+ *
+ * Finds the LRU AH with only implicit references, releases all its
+ * implicit AV entries, and destroys the AH. Called when ibv_create_ah
+ * fails with ENOMEM.
+ *
+ * Caller must hold srx_lock. This function acquires util_domain.lock.
+ *
+ * @param[in] domain efa domain
+ * @return 0 on success, -FI_ENOMEM if no AH is available to evict
+ */
+static int efa_proto_ah_evict(struct efa_domain *domain)
+{
+ struct efa_proto_av_entry *entry_to_release;
+ struct efa_proto_ah *proto_ah_tmp, *proto_ah_to_release = NULL;
+ struct dlist_entry *tmp;
+
+ assert(ofi_genlock_held(&domain->srx_lock));
+
+ ofi_genlock_lock(&domain->util_domain.lock);
+
+ dlist_foreach_container(&domain->ah_lru_list, struct efa_proto_ah,
+ proto_ah_tmp, lru_list_entry) {
+ if (proto_ah_tmp->explicit_refcnt == 0) {
+ proto_ah_to_release = proto_ah_tmp;
+ break;
+ }
+ }
+
+ if (!proto_ah_to_release) {
+ ofi_genlock_unlock(&domain->util_domain.lock);
+ EFA_WARN(FI_LOG_AV,
+ "AH creation for implicit AV entry failed with ENOMEM "
+ "but no AH entries available to evict\n");
+ return -FI_ENOMEM;
+ }
+
+ assert(proto_ah_to_release->implicit_refcnt > 0);
+
+ dlist_foreach_container_safe(&proto_ah_to_release->implicit_conn_list,
+ struct efa_proto_av_entry, entry_to_release,
+ ah_implicit_conn_list_entry, tmp) {
+
+ assert(entry_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
+ entry_to_release->fi_addr == FI_ADDR_NOTAVAIL);
+
+ efa_proto_av_entry_release_ah_unsafe(entry_to_release->av,
+ entry_to_release, true);
+ }
+
+ if (proto_ah_to_release->implicit_refcnt == 0 &&
+ proto_ah_to_release->explicit_refcnt == 0) {
+ dlist_remove(&proto_ah_to_release->lru_list_entry);
+ assert(dlist_empty(&proto_ah_to_release->implicit_conn_list));
+ assert(proto_ah_to_release->ah.refcnt == 0);
+ efa_ah_destroy(domain, &proto_ah_to_release->ah);
+ }
+
+ ofi_genlock_unlock(&domain->util_domain.lock);
+
+ return FI_SUCCESS;
+}
+
+/**
+ * @brief Allocate a protocol AH with eviction retry.
+ *
+ * Calls efa_ah_alloc with sizeof(efa_proto_ah) to allocate the
+ * protocol wrapper. Initializes implicit_refcnt, explicit_refcnt,
+ * implicit_conn_list, and inserts into the domain LRU list.
+ * On ENOMEM, evicts an AH with only implicit references and retries.
+ *
+ * Protocol refcnts and the LRU list are shared across all AVs sharing
+ * the same PD (domain), but per-AV call sites only hold their own
+ * util_av lock. This function takes util_domain.lock around the proto
+ * field mutations to serialize against concurrent efa_proto_ah_alloc
+ * / efa_proto_ah_release on a different AV.
+ *
+ * @param[in] domain efa domain
+ * @param[in] gid GID
+ * @param[in] insert_implicit_av whether this is for an implicit AV entry
+ * @return pointer to base efa_ah on success, NULL on failure
+ */
+struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain,
+ const uint8_t *gid,
+ bool insert_implicit_av)
+{
+ struct efa_ah *ah;
+ struct efa_proto_ah *proto_ah;
+ int err;
+ bool first_proto_user;
+
+ ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah));
+ if (!ah) {
+ if (errno != FI_ENOMEM)
+ return NULL;
+
+ EFA_INFO(FI_LOG_AV,
+ "ibv_create_ah failed with ENOMEM. "
+ "Attempting to evict AH entry\n");
+
+ err = efa_proto_ah_evict(domain);
+ if (err)
+ return NULL;
+
+ ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah));
+ if (!ah)
+ return NULL;
+ }
+
+ /*
+ * efa_ah_alloc released util_domain.lock on return. Reacquire it
+ * before touching the protocol-specific fields (refcnts, LRU list,
+ * implicit_conn_list) so concurrent allocators on a different AV's
+ * lock don't race on a shared AH.
+ *
+ * Between efa_ah_alloc returning and reacquiring the lock, a
+ * concurrent efa_proto_ah_release could have dropped both proto
+ * refcnts to zero and removed the AH from the LRU list, even though
+ * the base ah->refcnt stayed > 0. Detect "first proto user" by
+ * checking the proto refcnts directly rather than ah->refcnt.
+ */
+ ofi_genlock_lock(&domain->util_domain.lock);
+
+ proto_ah = efa_proto_ah_from_ah(ah);
+
+ /*
+ * first_proto_user is true when both proto refcnts are zero — either
+ * this is a brand-new AH (refcnt just incremented from 0 to 1 inside
+ * efa_ah_alloc) or an AH where the last proto user released (and
+ * removed it from the LRU list) but the base layer kept it alive.
+ * Either way we need to (re)init the proto fields and (re)insert
+ * into the LRU list.
+ */
+ first_proto_user = (proto_ah->implicit_refcnt == 0 &&
+ proto_ah->explicit_refcnt == 0);
+ if (first_proto_user) {
+ dlist_init(&proto_ah->implicit_conn_list);
+ dlist_insert_tail(&proto_ah->lru_list_entry,
+ &domain->ah_lru_list);
+ }
+
+ insert_implicit_av ? proto_ah->implicit_refcnt++ :
+ proto_ah->explicit_refcnt++;
+
+ if (!first_proto_user)
+ efa_proto_ah_lru_move(domain, ah);
+
+ ofi_genlock_unlock(&domain->util_domain.lock);
+
+ return ah;
+}
+
+/**
+ * @brief Release a protocol AH reference.
+ *
+ * Decrements the appropriate protocol refcount; when both protocol
+ * refcounts reach zero, removes the AH from the LRU list. Always calls
+ * efa_ah_release to drop the base refcount (destroying the AH at zero).
+ *
+ * Protocol refcnts and the LRU list are shared across all AVs sharing
+ * the same PD (domain), but per-AV call sites only hold their own
+ * util_av lock. This function takes util_domain.lock around the proto
+ * field mutations to serialize against concurrent efa_proto_ah_alloc
+ * / efa_proto_ah_release on a different AV.
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah base AH
+ * @param[in] release_from_implicit_av whether releasing implicit ref
+ */
+void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah,
+ bool release_from_implicit_av)
+{
+ struct efa_proto_ah *proto_ah = efa_proto_ah_from_ah(ah);
+
+ /*
+ * Protocol refcnts and LRU list are shared across AVs sharing the
+ * same PD (domain), so mutations must be serialized by
+ * util_domain.lock — the same lock efa_ah_release acquires.
+ */
+ ofi_genlock_lock(&domain->util_domain.lock);
+
+ assert((release_from_implicit_av && proto_ah->implicit_refcnt > 0) ||
+ (!release_from_implicit_av && proto_ah->explicit_refcnt > 0));
+
+ release_from_implicit_av ? proto_ah->implicit_refcnt-- :
+ proto_ah->explicit_refcnt--;
+
+ if (proto_ah->implicit_refcnt == 0 && proto_ah->explicit_refcnt == 0) {
+ dlist_remove(&proto_ah->lru_list_entry);
+ assert(dlist_empty(&proto_ah->implicit_conn_list));
+ }
+
+ ofi_genlock_unlock(&domain->util_domain.lock);
+
+ efa_ah_release(domain, ah);
+}
+
+/* ---- Entry alloc ---- */
+
+/**
+ * @brief Allocate and initialize a proto AV entry.
+ *
+ * Caller must hold util_av.lock (explicit) or util_av_implicit.lock (implicit),
+ * and must hold srx_lock. srx_lock is required because this function calls
+ * efa_proto_av_entry_deinit on the error path, which walks the per-entry
+ * ep_peer_map and destructs peers under srx_lock.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] raw_addr raw efa address
+ * @param[in] flags flags application passed to fi_av_insert
+ * @param[in] context context application passed to fi_av_insert
+ * @param[in] insert_shm_av whether to insert address into shm av
+ * @param[in] insert_implicit_av whether to insert into implicit AV
+ * @return on success, return a pointer to the entry; otherwise NULL
+ */
+struct efa_proto_av_entry *efa_proto_av_entry_alloc(
+ struct efa_proto_av *av, struct efa_ep_addr *raw_addr,
+ uint64_t flags, void *context, bool insert_shm_av,
+ bool insert_implicit_av)
+{
+ struct util_av *util_av;
+ struct efa_cur_reverse_av **cur_reverse_av;
+ struct efa_prv_reverse_av **prv_reverse_av;
+ struct util_av_entry *util_av_entry = NULL;
+ struct efa_proto_av_entry *entry;
+ fi_addr_t fi_addr;
+ int err;
+ bool on_lru_list = false;
+
+ if (flags & FI_SYNC_ERR)
+ memset(context, 0, sizeof(int));
+
+ if (insert_implicit_av) {
+ assert(ofi_genlock_held(&av->util_av_implicit.lock));
+ util_av = &av->util_av_implicit;
+ cur_reverse_av = &av->cur_reverse_av_implicit;
+ prv_reverse_av = &av->prv_reverse_av_implicit;
+ } else {
+ assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+ util_av = &av->efa_av.util_av;
+ cur_reverse_av = &av->efa_av.cur_reverse_av;
+ prv_reverse_av = &av->efa_av.prv_reverse_av;
+ }
+
+ err = ofi_av_insert_addr(util_av, raw_addr, &fi_addr);
+ if (err) {
+ EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
+ fi_strerror(err));
+ return NULL;
+ }
+
+ util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr);
+ entry = (struct efa_proto_av_entry *)util_av_entry->data;
+ assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)entry->ep_addr));
+
+ memset((char *)entry + EFA_EP_ADDR_LEN, 0,
+ sizeof(*entry) - EFA_EP_ADDR_LEN);
+ assert(av->efa_av.type == FI_AV_TABLE);
+
+ entry->av = av;
+
+ if (insert_implicit_av) {
+ entry->fi_addr = FI_ADDR_NOTAVAIL;
+ entry->implicit_fi_addr = fi_addr;
+ err = efa_proto_av_implicit_av_lru_insert(av, entry);
+ if (err)
+ goto err_release;
+ on_lru_list = true;
+ } else {
+ entry->fi_addr = fi_addr;
+ entry->implicit_fi_addr = FI_ADDR_NOTAVAIL;
+ }
+
+ entry->ah = efa_proto_ah_alloc(av->efa_av.domain, raw_addr->raw, insert_implicit_av);
+ if (!entry->ah)
+ goto err_release;
+
+ if (insert_implicit_av)
+ dlist_insert_tail(&entry->ah_implicit_conn_list_entry,
+ &efa_proto_ah_from_ah(entry->ah)->implicit_conn_list);
+
+ entry->shm_fi_addr = FI_ADDR_NOTAVAIL;
+
+ /*
+ * This function is called in two situations:
+ * 1. application calls fi_av_insert API
+ * 2. efa progress engine gets a message from unknown peer through
+ * efa device, meaning peer is not local or shm is disabled.
+ * For situation 1, shm av insertion should happen when peer is local
+ * (insert_shm_av=1). For situation 2, it shouldn't (insert_shm_av=0).
+ */
+ if (insert_shm_av) {
+ err = efa_proto_av_entry_insert_shm_av(av, entry);
+ if (err) {
+ errno = -err;
+ goto err_release;
+ }
+ }
+
+ err = efa_av_reverse_av_add(&av->efa_av, cur_reverse_av, prv_reverse_av,
+ (struct efa_av_entry *)entry);
+ if (err) {
+ efa_proto_av_entry_deinit(av, entry);
+ goto err_release;
+ }
+
+ insert_implicit_av ? av->used_implicit++ : av->efa_av.used++;
+
+ return entry;
+
+err_release:
+ if (insert_implicit_av && on_lru_list)
+ dlist_remove(&entry->implicit_av_lru_entry);
+
+ if (entry->ah)
+ efa_proto_ah_release(av->efa_av.domain, entry->ah, insert_implicit_av);
+
+ entry->ah = NULL;
+ memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+ err = ofi_av_remove_addr(util_av, fi_addr);
+ if (err)
+ EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
+ err);
+
+ return NULL;
+}
+
+/* ---- Implicit to explicit migration ---- */
+
+/**
+ * @brief get the fi_addr from a peer rx entry's packet context
+ *
+ * Used as a callback for foreach_unspec_addr during implicit-to-explicit
+ * migration.
+ *
+ * @param[in] rx_entry peer rx entry
+ * @return fi_addr of the peer
+ */
+static fi_addr_t
+efa_proto_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry)
+{
+ struct efa_rdm_pke *pke;
+
+ pke = (struct efa_rdm_pke *) rx_entry->peer_context;
+
+ return pke->peer->av_entry->fi_addr;
+}
+
+/**
+ * @brief migrate an implicit AV entry to the explicit AV
+ *
+ * Moves the entry, its peer map, AH, and SHM fi_addr from the implicit
+ * AV to the explicit AV. Updates reverse AVs and notifies the SRX to
+ * move unexpected messages from the unspecified queue.
+ *
+ * Caller must hold util_av.lock and util_av_implicit.lock.
+ *
+ * @param[in] av protocol AV
+ * @param[in] raw_addr raw efa address
+ * @param[in] implicit_fi_addr fi_addr in the implicit AV
+ * @param[out] fi_addr fi_addr assigned in the explicit AV
+ * @return 0 on success, negative error code on failure
+ */
+int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av,
+ struct efa_ep_addr *raw_addr,
+ fi_addr_t implicit_fi_addr,
+ fi_addr_t *fi_addr)
+{
+ int err;
+ struct efa_ah *ah;
+ struct efa_proto_av_entry *implicit_entry, *explicit_entry;
+ struct efa_rdm_ep *ep;
+ struct dlist_entry *list_entry;
+ struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry;
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry, *tmp;
+ struct fid_peer_srx *peer_srx;
+
+ EFA_INFO(FI_LOG_AV,
+ "Moving peer with implicit fi_addr %" PRIu64
+ " to explicit AV\n",
+ implicit_fi_addr);
+
+ assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+ assert(ofi_genlock_held(&av->util_av_implicit.lock));
+
+ implicit_util_av_entry =
+ ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr);
+ implicit_entry = (struct efa_proto_av_entry *) implicit_util_av_entry->data;
+
+ assert(implicit_entry);
+ assert(efa_is_same_addr(
+ raw_addr, (struct efa_ep_addr *) implicit_entry->ep_addr));
+ assert(implicit_entry->fi_addr == FI_ADDR_NOTAVAIL &&
+ implicit_entry->implicit_fi_addr == implicit_fi_addr);
+
+ ah = implicit_entry->ah;
+
+ /* Create explicit util AV entry */
+ err = ofi_av_insert_addr(&av->efa_av.util_av, raw_addr, fi_addr);
+ if (err) {
+ EFA_WARN(FI_LOG_AV,
+ "ofi_av_insert_addr into explicit AV failed! Error "
+ "message: %s\n",
+ fi_strerror(err));
+ return err;
+ }
+
+ explicit_util_av_entry =
+ ofi_bufpool_get_ibuf(av->efa_av.util_av.av_entry_pool, *fi_addr);
+ explicit_entry = (struct efa_proto_av_entry *) explicit_util_av_entry->data;
+ assert(efa_is_same_addr(
+ raw_addr, (struct efa_ep_addr *) explicit_entry->ep_addr));
+
+ /* Copy information from implicit to explicit */
+ memset((char *)explicit_entry + EFA_EP_ADDR_LEN, 0,
+ sizeof(*explicit_entry) - EFA_EP_ADDR_LEN);
+ assert(av->efa_av.type == FI_AV_TABLE);
+ explicit_entry->av = av;
+ explicit_entry->ah = implicit_entry->ah;
+ explicit_entry->fi_addr = *fi_addr;
+ explicit_entry->shm_fi_addr = implicit_entry->shm_fi_addr;
+ explicit_entry->implicit_fi_addr = FI_ADDR_NOTAVAIL;
+ HASH_ITER(hh, implicit_entry->ep_peer_map, map_entry, tmp) {
+ HASH_DELETE(hh, implicit_entry->ep_peer_map, map_entry);
+ HASH_ADD_PTR(explicit_entry->ep_peer_map, ep_ptr, map_entry);
+ map_entry->peer.av_entry = explicit_entry;
+ }
+ assert(HASH_CNT(hh, implicit_entry->ep_peer_map) == 0);
+
+ /* Handle reverse AV and AV ref counts */
+ efa_av_reverse_av_remove(&av->cur_reverse_av_implicit,
+ &av->prv_reverse_av_implicit,
+ (struct efa_av_entry *)implicit_entry);
+
+ dlist_remove(&implicit_entry->implicit_av_lru_entry);
+
+ err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr);
+ if (err) {
+ EFA_WARN(FI_LOG_AV,
+ "ofi_av_remove_addr from implicit AV failed! Error "
+ "message: %s\n",
+ fi_strerror(err));
+ return err;
+ }
+
+ av->used_implicit--;
+
+ err = efa_av_reverse_av_add(&av->efa_av, &av->efa_av.cur_reverse_av,
+ &av->efa_av.prv_reverse_av,
+ (struct efa_av_entry *)explicit_entry);
+ if (err)
+ return err;
+
+ av->efa_av.used++;
+
+ /* Handle AH LRU list and refcnt */
+ assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list));
+ dlist_remove(&implicit_entry->ah_implicit_conn_list_entry);
+ efa_proto_ah_lru_move(av->efa_av.domain, ah);
+ efa_proto_ah_from_ah(ah)->implicit_refcnt--;
+ efa_proto_ah_from_ah(ah)->explicit_refcnt++;
+
+ EFA_INFO(FI_LOG_AV,
+ "Peer with implicit fi_addr %" PRIu64
+ " moved to explicit AV. Explicit fi_addr: %" PRIu64 "\n",
+ implicit_fi_addr, *fi_addr);
+
+ ofi_genlock_lock(&av->efa_av.util_av.ep_list_lock);
+ dlist_foreach(&av->efa_av.util_av.ep_list, list_entry) {
+ ep = container_of(list_entry, struct efa_rdm_ep, base_ep.util_ep.av_entry);
+ peer_srx = util_get_peer_srx(ep->peer_srx_ep);
+ peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_proto_av_get_addr_from_peer_rx_entry);
+ }
+ ofi_genlock_unlock(&av->efa_av.util_av.ep_list_lock);
+
+ return FI_SUCCESS;
+}
+
+/* ---- Protocol AV insert_one ---- */
+
+/**
+ * @brief insert one address into the protocol AV
+ *
+ * Checks explicit and implicit AVs for duplicates. Handles
+ * implicit-to-explicit migration when an implicit entry exists.
+ *
+ * Caller must hold srx_lock.
+ *
+ * @param[in] av protocol AV
+ * @param[in] addr raw address (gid:qpn:qkey)
+ * @param[out] fi_addr output fi_addr
+ * @param[in] flags flags from fi_av_insert
+ * @param[in] context context from fi_av_insert
+ * @param[in] insert_shm_av whether to insert into SHM AV
+ * @param[in] insert_implicit_av whether to insert into implicit AV
+ * @return 0 on success, negative error code on failure
+ */
+int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr,
+ fi_addr_t *fi_addr, uint64_t flags, void *context,
+ bool insert_shm_av, bool insert_implicit_av)
+{
+ struct efa_proto_av_entry *entry;
+ char raw_gid_str[INET6_ADDRSTRLEN];
+ fi_addr_t efa_fiaddr;
+ fi_addr_t implicit_fi_addr;
+ int ret = 0;
+
+ if (!efa_av_is_valid_address(addr)) {
+ EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n");
+ *fi_addr = FI_ADDR_NOTAVAIL;
+ return -FI_EADDRNOTAVAIL;
+ }
+
+ assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+ ofi_genlock_lock(&av->util_av_implicit.lock);
+ ofi_genlock_lock(&av->efa_av.util_av.lock);
+
+ memset(raw_gid_str, 0, sizeof(raw_gid_str));
+ if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) {
+ EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno);
+ ret = -FI_EINVAL;
+ *fi_addr = FI_ADDR_NOTAVAIL;
+ goto out;
+ }
+
+ EFA_INFO(FI_LOG_AV,
+ "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n",
+ raw_gid_str, addr->qpn, addr->qkey,
+ insert_implicit_av ? "implicit" : "explicit");
+
+ /* Check explicit AV */
+ efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->efa_av.util_av, addr);
+ if (efa_fiaddr != FI_ADDR_NOTAVAIL) {
+ assert(!insert_implicit_av);
+ EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %ld\n", efa_fiaddr);
+ *fi_addr = efa_fiaddr;
+ ret = 0;
+ goto out;
+ }
+
+ /* Check implicit AV */
+ implicit_fi_addr =
+ ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr);
+ if (implicit_fi_addr != FI_ADDR_NOTAVAIL) {
+ EFA_INFO(FI_LOG_AV,
+ "Found implicit AV entry id %ld for the same address\n",
+ implicit_fi_addr);
+
+ if (insert_implicit_av) {
+ entry = efa_proto_av_addr_to_entry_implicit(av, implicit_fi_addr);
+ efa_proto_av_implicit_av_lru_entry_move(av, entry);
+ *fi_addr = implicit_fi_addr;
+ goto out;
+ }
+
+ ret = efa_proto_av_entry_implicit_to_explicit(av, addr, implicit_fi_addr, fi_addr);
+ if (ret)
+ *fi_addr = FI_ADDR_NOTAVAIL;
+ goto out;
+ }
+
+ entry = efa_proto_av_entry_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av);
+ if (!entry) {
+ *fi_addr = FI_ADDR_NOTAVAIL;
+ ret = -FI_EADDRNOTAVAIL;
+ goto out;
+ }
+
+ if (insert_implicit_av) {
+ *fi_addr = entry->implicit_fi_addr;
+ EFA_INFO(FI_LOG_AV,
+ "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to implicit AV. fi_addr: %ld\n",
+ raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
+ } else {
+ *fi_addr = entry->fi_addr;
+ EFA_INFO(FI_LOG_AV,
+ "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. fi_addr: %ld\n",
+ raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
+ }
+ ret = 0;
+
+out:
+ ofi_genlock_unlock(&av->efa_av.util_av.lock);
+ ofi_genlock_unlock(&av->util_av_implicit.lock);
+ return ret;
+}
+
+/* ---- Protocol AV fi_ops ---- */
+
+/**
+ * @brief insert addresses into protocol AV (fi_av_insert implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr buffer containing addresses to insert
+ * @param[in] count number of addresses
+ * @param[out] fi_addr array for returned fabric addresses
+ * @param[in] flags operation flags
+ * @param[in] context user context
+ * @return number of addresses successfully inserted
+ */
+static int efa_proto_av_insert(struct fid_av *av_fid, const void *addr,
+ size_t count, fi_addr_t *fi_addr,
+ uint64_t flags, void *context)
+{
+ struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+ struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av);
+ int ret = 0, success_cnt = 0;
+ size_t i = 0;
+ struct efa_ep_addr *addr_i;
+ fi_addr_t fi_addr_res;
+
+ if (av->efa_av.util_av.flags & FI_EVENT)
+ return -FI_ENOEQ;
+
+ if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
+ return -FI_EINVAL;
+
+ flags &= ~FI_MORE;
+ if (flags)
+ return -FI_ENOSYS;
+
+ ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+
+ for (i = 0; i < count; i++) {
+ addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
+
+ ret = efa_proto_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false);
+ if (ret) {
+ EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n", ret);
+ break;
+ }
+
+ if (fi_addr)
+ fi_addr[i] = fi_addr_res;
+ success_cnt++;
+ }
+
+ ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+
+ for (; i < count ; i++) {
+ if (fi_addr)
+ fi_addr[i] = FI_ADDR_NOTAVAIL;
+ }
+
+ return success_cnt;
+}
+
+/**
+ * @brief retrieve an address from the protocol AV (fi_av_lookup implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] fi_addr fabric address to look up
+ * @param[out] addr buffer to store the returned address
+ * @param[in,out] addrlen on input, size of addr buffer; on output, bytes written
+ * @return 0 on success, negative error code on failure
+ */
+static int efa_proto_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
+ void *addr, size_t *addrlen)
+{
+ struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+ struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av);
+ struct efa_proto_av_entry *entry = NULL;
+
+ if (av->efa_av.type != FI_AV_TABLE)
+ return -FI_EINVAL;
+
+ if (fi_addr == FI_ADDR_NOTAVAIL)
+ return -FI_EINVAL;
+
+ ofi_genlock_lock(&av->efa_av.util_av.lock);
+ entry = efa_proto_av_addr_to_entry(av, fi_addr);
+ if (!entry) {
+ ofi_genlock_unlock(&av->efa_av.util_av.lock);
+ return -FI_EINVAL;
+ }
+
+ memcpy(addr, (void *)entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
+ ofi_genlock_unlock(&av->efa_av.util_av.lock);
+ if (*addrlen > EFA_EP_ADDR_LEN)
+ *addrlen = EFA_EP_ADDR_LEN;
+ return 0;
+}
+
+/**
+ * @brief remove addresses from the protocol AV (fi_av_remove implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] fi_addr array of fabric addresses to remove
+ * @param[in] count number of addresses
+ * @param[in] flags operation flags
+ * @return 0 on success, negative error code on failure
+ */
+static int efa_proto_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
+ size_t count, uint64_t flags)
+{
+ int err = 0;
+ size_t i;
+ struct efa_av *base_av;
+ struct efa_proto_av *av;
+ struct efa_proto_av_entry *entry;
+
+ if (!fi_addr)
+ return -FI_EINVAL;
+
+ base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+ av = container_of(base_av, struct efa_proto_av, efa_av);
+ if (av->efa_av.type != FI_AV_TABLE)
+ return -FI_EINVAL;
+
+ ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+ ofi_genlock_lock(&av->efa_av.util_av.lock);
+ for (i = 0; i < count; i++) {
+ entry = efa_proto_av_addr_to_entry(av, fi_addr[i]);
+ if (!entry) {
+ err = -FI_EINVAL;
+ break;
+ }
+
+ efa_proto_av_entry_release(av, entry, false);
+ }
+
+ if (i < count)
+ assert(err);
+
+ ofi_genlock_unlock(&av->efa_av.util_av.lock);
+ ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+ return err;
+}
+
+/**
+ * @brief convert an address to a printable string (fi_av_straddr implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr address to convert
+ * @param[out] buf buffer to store the string
+ * @param[in,out] len on input, size of buf; on output, bytes written
+ * @return pointer to buf
+ */
+static const char *efa_proto_av_straddr(struct fid_av *av_fid, const void *addr,
+ char *buf, size_t *len)
+{
+ return ofi_straddr(buf, len, FI_ADDR_EFA, addr);
+}
+
+static struct fi_ops_av efa_proto_av_ops = {
+ .size = sizeof(struct fi_ops_av),
+ .insert = efa_proto_av_insert,
+ .insertsvc = fi_no_av_insertsvc,
+ .insertsym = fi_no_av_insertsym,
+ .remove = efa_proto_av_remove,
+ .lookup = efa_proto_av_lookup,
+ .straddr = efa_proto_av_straddr
+};
+
+/**
+ * @brief release all entries in the explicit and implicit reverse AVs
+ *
+ * @param[in] av protocol AV
+ */
+static void efa_proto_av_close_reverse_av(struct efa_proto_av *av)
+{
+ struct efa_cur_reverse_av *cur_entry, *curtmp;
+ struct efa_prv_reverse_av *prv_entry, *prvtmp;
+
+ ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+
+ ofi_genlock_lock(&av->efa_av.util_av.lock);
+
+ HASH_ITER(hh, av->efa_av.cur_reverse_av, cur_entry, curtmp) {
+ efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, false);
+ }
+
+ HASH_ITER(hh, av->efa_av.prv_reverse_av, prv_entry, prvtmp) {
+ efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, false);
+ }
+
+ ofi_genlock_unlock(&av->efa_av.util_av.lock);
+
+ ofi_genlock_lock(&av->util_av_implicit.lock);
+
+ HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) {
+ efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, true);
+ }
+
+ HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) {
+ efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, true);
+ }
+
+ ofi_genlock_unlock(&av->util_av_implicit.lock);
+
+ ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+}
+
+/**
+ * @brief close the protocol AV and release all resources (fi_close implementation)
+ *
+ * @param[in] fid fid of AV
+ * @return 0 on success, negative error code on failure
+ */
+static int efa_proto_av_close(struct fid *fid)
+{
+ struct efa_av *base_av;
+ struct efa_proto_av *av;
+ int err = 0;
+ struct efa_ep_addr_hashable *ep_addr_hashable, *tmp;
+
+ base_av = container_of(fid, struct efa_av, util_av.av_fid.fid);
+ av = container_of(base_av, struct efa_proto_av, efa_av);
+
+ efa_proto_av_close_reverse_av(av);
+
+ err = ofi_av_close(&av->efa_av.util_av);
+ if (OFI_UNLIKELY(err))
+ EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n",
+ fi_strerror(err));
+
+ err = ofi_av_close(&av->util_av_implicit);
+ if (OFI_UNLIKELY(err))
+ EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n",
+ fi_strerror(err));
+
+ if (av->shm_rdm_av) {
+ err = fi_close(&av->shm_rdm_av->fid);
+ if (OFI_UNLIKELY(err))
+ EFA_WARN(FI_LOG_AV,
+ "Failed to close shm av: %s\n",
+ fi_strerror(err));
+ }
+
+ HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) {
+ HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable);
+ free(ep_addr_hashable);
+ }
+
+ free(av);
+ return err;
+}
+
+static struct fi_ops efa_proto_av_fi_ops = {
+ .size = sizeof(struct fi_ops),
+ .close = efa_proto_av_close,
+ .bind = fi_no_bind,
+ .control = fi_no_control,
+ .ops_open = fi_no_ops_open,
+};
+
+/**
+ * @brief open a protocol AV (fi_av_open implementation for RDM)
+ *
+ * @param[in] domain_fid fid of domain
+ * @param[in] attr AV attributes
+ * @param[out] av_fid pointer to store the opened AV fid
+ * @param[in] context user context
+ * @return 0 on success, negative error code on failure
+ */
+int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
+ struct fid_av **av_fid, void *context)
+{
+ struct efa_domain *efa_domain;
+ struct efa_proto_av *av;
+ struct fi_av_attr av_attr = { 0 };
+ size_t context_len;
+ size_t universe_size;
+ int ret, retv;
+
+ if (!attr)
+ return -FI_EINVAL;
+
+ if (attr->name)
+ return -FI_ENOSYS;
+
+ if (attr->flags)
+ return -FI_ENOSYS;
+
+ if (!attr->count)
+ attr->count = EFA_MIN_AV_SIZE;
+ else
+ attr->count = MAX(attr->count, EFA_MIN_AV_SIZE);
+
+ av = calloc(1, sizeof(*av));
+ if (!av)
+ return -FI_ENOMEM;
+
+ if (attr->type == FI_AV_MAP) {
+ EFA_INFO(FI_LOG_AV, "FI_AV_MAP is deprecated in Libfabric 2.x. Please use FI_AV_TABLE. "
+ "EFA provider will now switch to using FI_AV_TABLE.\n");
+ }
+ attr->type = FI_AV_TABLE;
+
+ efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid);
+
+ if (fi_param_get_size_t(NULL, "universe_size",
+ &universe_size) == FI_SUCCESS)
+ attr->count = MAX(attr->count, universe_size);
+
+ context_len = sizeof(struct efa_proto_av_entry) - EFA_EP_ADDR_LEN;
+
+ ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context,
+ context_len);
+ if (ret)
+ goto err;
+
+ ret = efa_av_init_util_av(efa_domain, attr, &av->efa_av.util_av, context,
+ context_len);
+ if (ret)
+ goto err_close_util_av_implicit;
+
+ if (efa_domain->fabric && efa_domain->fabric->shm_fabric) {
+ av_attr = *attr;
+ if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) {
+ ret = -FI_ENOSYS;
+ EFA_WARN(FI_LOG_AV,
+ "The requested av size is beyond"
+ " shm supported maximum av size: %s\n",
+ fi_strerror(-ret));
+ goto err_close_util_av;
+ }
+ av_attr.count = efa_env.shm_av_size;
+ assert(av_attr.type == FI_AV_TABLE);
+ ret = fi_av_open(efa_domain->shm_domain, &av_attr,
+ &av->shm_rdm_av, context);
+ if (ret)
+ goto err_close_util_av;
+ }
+
+ EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags);
+
+ av->efa_av.domain = efa_domain;
+ av->efa_av.type = attr->type;
+ av->efa_av.used = 0;
+ av->implicit_av_size = efa_env.implicit_av_size;
+ av->used_implicit = 0;
+ av->shm_used = 0;
+
+ *av_fid = &av->efa_av.util_av.av_fid;
+ (*av_fid)->fid.fclass = FI_CLASS_AV;
+ (*av_fid)->fid.context = context;
+ (*av_fid)->fid.ops = &efa_proto_av_fi_ops;
+ (*av_fid)->ops = &efa_proto_av_ops;
+
+ dlist_init(&av->implicit_av_lru_list);
+
+ return 0;
+
+err_close_util_av:
+ retv = ofi_av_close(&av->efa_av.util_av);
+ if (retv)
+ EFA_WARN(FI_LOG_AV,
+ "Unable to close util_av: %s\n", fi_strerror(-retv));
+
+err_close_util_av_implicit:
+ retv = ofi_av_close(&av->util_av_implicit);
+ if (retv)
+ EFA_WARN(FI_LOG_AV,
+ "Unable to close util_av_implicit: %s\n", fi_strerror(-retv));
+
+err:
+ free(av);
+ return ret;
+}
diff --git a/prov/efa/src/rdm/efa_proto_av.h b/prov/efa/src/rdm/efa_proto_av.h
new file mode 100644
index 00000000000..6cf8f330383
--- /dev/null
+++ b/prov/efa/src/rdm/efa_proto_av.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+
+#ifndef EFA_PROTO_AV_H
+#define EFA_PROTO_AV_H
+
+#include "efa_av.h"
+
+struct efa_rdm_ep;
+struct efa_rdm_peer;
+
+/**
+ * @brief Protocol AH — wraps base efa_ah with implicit refcount and LRU
+ *
+ * The base efa_ah has a single refcount and no LRU knowledge.
+ * efa_proto_ah adds the implicit/explicit refcount split, the
+ * implicit_conn_list (entries using this AH), and the LRU list
+ * entry for AH eviction.
+ *
+ * pahole: size: 128, cachelines: 2
+ *
+ * All efa_proto_ah fields are control path only (AV insert/remove/eviction).
+ * The TX hot fields (ibv_ah, ahn) are in the embedded efa_ah at cacheline 0.
+ * The protocol extension fields start at offset 88 (cacheline 1), so
+ * accessing them on the eviction path does not pollute the TX cache line.
+ */
+struct efa_proto_ah {
+ struct efa_ah ah; /* 0 88 must be first (castable) */
+ /* --- cacheline 1 boundary (64 bytes) was 24 bytes ago --- */
+ int implicit_refcnt; /* 88 4 */
+ int explicit_refcnt; /* 92 4 */
+ struct dlist_entry implicit_conn_list; /* 96 16 */
+ struct dlist_entry lru_list_entry; /* 112 16 */
+};
+
+/**
+ * @brief Protocol AV entry — flat layout with same field prefix as efa_av_entry
+ *
+ * pahole:
+ * size: 112, cachelines: 2, members: 9
+ *
+ * Cache line 0 (64 bytes): data-path hot fields
+ * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20)
+ * ah* off=32 — TX hot (EFA send path)
+ * fi_addr off=40 — RX hot (explicit peer lookup, CQ poll)
+ * implicit_fi_addr off=48 — RX hot (implicit peer lookup, CQ poll)
+ * ep_peer_map* off=56 — TX+RX hot (peer lookup on every op)
+ *
+ * Cache line 1 (48 bytes): SHM-only TX / control-path fields
+ * shm_fi_addr off=64 — SHM TX only
+ * implicit_av_lru_entry off=72 — implicit RX LRU bookkeeping
+ * ah_implicit_conn_list_entry off=88 — implicit AV insert/release
+ * av* off=104 — back-pointer for AH eviction
+ */
+struct efa_proto_av_entry {
+ uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */
+ struct efa_ah *ah; /* 32 8 */
+ fi_addr_t fi_addr; /* 40 8 */
+ fi_addr_t implicit_fi_addr; /* 48 8 */
+ struct efa_proto_av_entry_ep_peer_map_entry *ep_peer_map; /* 56 8 */
+ /* --- cacheline 1 boundary (64 bytes) --- */
+ fi_addr_t shm_fi_addr; /* 64 8 */
+ struct dlist_entry implicit_av_lru_entry; /* 72 16 */
+ struct dlist_entry ah_implicit_conn_list_entry; /* 88 16 */
+ struct efa_proto_av *av; /* 104 8 */
+};
+
+/**
+ * @brief Peer map entry — maps (ep_ptr) to efa_rdm_peer for a given AV entry
+ *
+ * pahole: size: 328, cachelines: 6
+ */
+struct efa_proto_av_entry_ep_peer_map_entry {
+ struct efa_rdm_ep *ep_ptr; /* 0 8 */
+ struct efa_rdm_peer peer; /* 8 264 */
+ UT_hash_handle hh; /* 272 56 */
+};
+
+/**
+ * @brief Protocol AV — embeds efa_av as first member (castable)
+ *
+ * pahole:
+ * size: 672, cachelines: 11, members: 10
+ *
+ * efa_av off=0 size=320 (cachelines 0-4)
+ * domain* off=0 — cacheline 0
+ * cur_reverse_av* off=24 — RX hot: explicit peer reverse lookup
+ * prv_reverse_av* off=32 — RX hot: QPN reuse fallback
+ * util_av off=40 size=280
+ * --- cacheline 5 boundary (320 bytes) ---
+ * shm_rdm_av* off=320 — control path only
+ * util_av_implicit off=328 size=280
+ * --- cacheline 9 boundary (576 bytes) + 32 ---
+ * cur_reverse_av_implicit* off=608 — RX hot (implicit peers only)
+ * prv_reverse_av_implicit* off=616 — RX hot (implicit peers only)
+ * implicit_av_lru_list off=624 — implicit RX: LRU reorder
+ * --- cacheline 10 boundary (640 bytes) ---
+ * used_implicit off=640
+ * shm_used off=648
+ * implicit_av_size off=656
+ * evicted_peers_hashset* off=664
+ *
+ * RX hot path (every RX completion):
+ * efa_av.cur_reverse_av (off=24) — HASH_FIND for explicit peer reverse lookup
+ * efa_av.prv_reverse_av (off=32) — HASH_FIND fallback for QPN reuse (connid mismatch)
+ * These are in cacheline 0 — explicit peer reverse lookup stays in one line.
+ *
+ * RX hot path for implicit (unknown) peers:
+ * cur_reverse_av_implicit (off=608) — HASH_FIND for implicit peer reverse lookup
+ * prv_reverse_av_implicit (off=616) — HASH_FIND fallback
+ * implicit_av_lru_list (off=624) — LRU reorder on every implicit RX
+ * All three are in cacheline 9 — implicit peer reverse lookup + LRU
+ * update stays in one cache line.
+ *
+ * Control path only (AV insert/remove/close):
+ * shm_rdm_av, util_av_implicit, used_implicit, shm_used,
+ * implicit_av_size, evicted_peers_hashset
+ */
+struct efa_proto_av {
+ struct efa_av efa_av; /* 0 320 */
+ /* --- cacheline 5 boundary (320 bytes) --- */
+ struct fid_av *shm_rdm_av; /* 320 8 */
+ /* implicit AV is used when receiving messages from peers not
+ * explicitly inserted by the application */
+ struct util_av util_av_implicit; /* 328 280 */
+ struct efa_cur_reverse_av *cur_reverse_av_implicit; /* 608 8 */
+ struct efa_prv_reverse_av *prv_reverse_av_implicit; /* 616 8 */
+ struct dlist_entry implicit_av_lru_list; /* 624 16 */
+ /* --- cacheline 10 boundary (640 bytes) --- */
+ size_t used_implicit; /* 640 8 */
+ size_t shm_used; /* 648 8 */
+ size_t implicit_av_size; /* 656 8 */
+ struct efa_ep_addr_hashable *evicted_peers_hashset; /* 664 8 */
+};
+
+/**
+ * @brief get the protocol AH wrapper from a base AH pointer
+ *
+ * @param[in] ah base AH (must be embedded in efa_proto_ah)
+ * @return pointer to the containing efa_proto_ah
+ */
+static inline struct efa_proto_ah *efa_proto_ah_from_ah(struct efa_ah *ah)
+{
+ return container_of(ah, struct efa_proto_ah, ah);
+}
+
+/**
+ * @brief typed accessor for the ep_addr field of a proto AV entry
+ *
+ * @param[in] entry proto AV entry
+ * @return pointer to the efa_ep_addr embedded in the entry
+ */
+static inline struct efa_ep_addr *
+efa_proto_av_entry_ep_addr(struct efa_proto_av_entry *entry)
+{
+ return (struct efa_ep_addr *)entry->ep_addr;
+}
+
+/* Address lookup */
+struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av,
+ fi_addr_t fi_addr);
+
+struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit(
+ struct efa_proto_av *av, fi_addr_t fi_addr);
+
+/* Peer map operations */
+void efa_proto_av_entry_ep_peer_map_insert(
+ struct efa_proto_av_entry *entry,
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry);
+
+struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup(
+ struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep);
+
+void efa_proto_av_entry_ep_peer_map_remove(
+ struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep);
+
+/* Protocol AH allocation / release (shared base AH + proto wrapper) */
+struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain,
+ const uint8_t *gid,
+ bool insert_implicit_av);
+
+void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah,
+ bool release_from_implicit_av);
+
+/* SHM AV operations */
+int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry);
+
+/* Entry deinit (tears down peers on the entry) */
+void efa_proto_av_entry_deinit(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry);
+
+/* Implicit AV LRU */
+void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry);
+
+/* Reverse lookup for protocol path */
+fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av,
+ uint16_t ahn, uint16_t qpn,
+ struct efa_rdm_pke *pkt_entry);
+
+fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av,
+ uint16_t ahn, uint16_t qpn,
+ struct efa_rdm_pke *pkt_entry);
+
+/* Entry alloc/release */
+struct efa_proto_av_entry *efa_proto_av_entry_alloc(
+ struct efa_proto_av *av, struct efa_ep_addr *raw_addr,
+ uint64_t flags, void *context, bool insert_shm_av,
+ bool insert_implicit_av);
+
+void efa_proto_av_entry_release(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av);
+
+void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av,
+ struct efa_proto_av_entry *entry,
+ bool release_from_implicit_av);
+
+/* Implicit to explicit migration */
+int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av,
+ struct efa_ep_addr *raw_addr,
+ fi_addr_t implicit_fi_addr,
+ fi_addr_t *fi_addr);
+
+/* AV open/close/insert/remove for protocol path */
+int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
+ struct fid_av **av_fid, void *context);
+
+int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr,
+ fi_addr_t *fi_addr, uint64_t flags, void *context,
+ bool insert_shm_av, bool insert_implicit_av);
+
+#endif
diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c
index 1999e540520..1170861498e 100644
--- a/prov/efa/src/rdm/efa_rdm_cq.c
+++ b/prov/efa/src/rdm/efa_rdm_cq.c
@@ -6,6 +6,7 @@
#include "efa_data_path_ops.h"
#include "ofi_util.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_cntr.h"
#include "efa_rdm_pke_cmd.h"
#include "efa_rdm_pke_utils.h"
@@ -190,17 +191,15 @@ static void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion(
struct util_cq *target_cq;
int ret;
fi_addr_t src_addr;
- struct efa_av *efa_av;
uint32_t imm_data = efa_ibv_cq_wc_read_imm_data(ibv_cq);
uint32_t len = efa_ibv_cq_wc_read_byte_len(ibv_cq);
target_cq = ep->base_ep.util_ep.rx_cq;
- efa_av = ep->base_ep.av;
if (ep->base_ep.util_ep.caps & FI_SOURCE) {
/* Only check the explicit AV when writing completions */
- src_addr = efa_av_reverse_lookup_rdm(efa_av,
+ src_addr = efa_proto_av_reverse_lookup(ep->proto_av,
efa_ibv_cq_wc_read_slid(ibv_cq),
efa_ibv_cq_wc_read_src_qp(ibv_cq),
NULL);
@@ -361,7 +360,7 @@ efa_rdm_cq_lookup_raw_addr(struct efa_rdm_pke *pke,
}
/* Next check implicit AV */
- addr = ofi_av_lookup_fi_addr(&ep->base_ep.av->util_av_implicit,
+ addr = ofi_av_lookup_fi_addr(&ep->proto_av->util_av_implicit,
(void *) efa_ep_addr);
if (addr != FI_ADDR_NOTAVAIL) {
implicit = true;
@@ -401,7 +400,6 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
struct efa_ibv_cq *efa_ibv_cq,
struct efa_rdm_pke *pkt_entry)
{
- struct efa_av *efa_av = ep->base_ep.av;
fi_addr_t explicit_fi_addr, implicit_fi_addr;
struct efa_ep_addr efa_ep_addr = {0};
struct efa_ep_addr_hashable *efa_ep_addr_hashable = NULL;
@@ -433,7 +431,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
* behavior is fixed
*/
explicit_fi_addr =
- efa_av_reverse_lookup_rdm(efa_av, gid, qpn, pkt_entry);
+ efa_proto_av_reverse_lookup(ep->proto_av, gid, qpn, pkt_entry);
if (explicit_fi_addr != FI_ADDR_NOTAVAIL) {
EFA_DBG(FI_LOG_CQ,
@@ -445,7 +443,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
}
implicit_fi_addr =
- efa_av_reverse_lookup_rdm_implicit(efa_av, gid, qpn, pkt_entry);
+ efa_proto_av_reverse_lookup_implicit(ep->proto_av, gid, qpn, pkt_entry);
if (implicit_fi_addr != FI_ADDR_NOTAVAIL) {
EFA_DBG(FI_LOG_CQ,
@@ -473,7 +471,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
* TODO: continue communication with peer by saving the previous state
* and restoring it
*/
- HASH_FIND(hh, ep->base_ep.av->evicted_peers_hashset, &efa_ep_addr,
+ HASH_FIND(hh, ep->proto_av->evicted_peers_hashset, &efa_ep_addr,
sizeof(struct efa_ep_addr), efa_ep_addr_hashable);
if (OFI_UNLIKELY(!!efa_ep_addr_hashable)) {
EFA_WARN(FI_LOG_CQ, "Received packet from peer already evicted "
@@ -494,7 +492,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
* not local or shm is disabled for transmission. We shouldn't insert
* in to shm av in this case.
*/
- ret = efa_av_insert_one(ep->base_ep.av, &efa_ep_addr, &implicit_fi_addr,
+ ret = efa_proto_av_insert_one(ep->proto_av, &efa_ep_addr, &implicit_fi_addr,
0, NULL, false, true);
if (OFI_UNLIKELY(ret != 0)) {
efa_base_ep_write_eq_error(&ep->base_ep, ret,
@@ -506,10 +504,10 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep,
out:
assert(peer);
- assert((peer->conn->fi_addr != FI_ADDR_NOTAVAIL &&
- peer->conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) ||
- (peer->conn->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
- peer->conn->fi_addr == FI_ADDR_NOTAVAIL));
+ assert((peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL &&
+ peer->av_entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) ||
+ (peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
+ peer->av_entry->fi_addr == FI_ADDR_NOTAVAIL));
return peer;
}
@@ -584,8 +582,8 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
EFA_WARN(FI_LOG_CQ,
"Peer fi_addr: %ld implicit fi_addr %ld is requesting "
"feature %d, which this EP does not support.\n",
- pkt_entry->peer->conn->fi_addr,
- pkt_entry->peer->conn->implicit_fi_addr,
+ pkt_entry->peer->av_entry->fi_addr,
+ pkt_entry->peer->av_entry->implicit_fi_addr,
base_hdr->type);
assert(0 && "invalid REQ packet type");
@@ -690,7 +688,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc_closing_ep(struct efa_ibv_cq *cq, struc
efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id,
(size_t) pkt_entry->ope->cq_entry.op_context,
pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag,
- pkt_entry->ope->peer ? pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL,
+ pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL,
efa_rdm_pkt_type_of_pke(pkt_entry));
#endif
@@ -756,7 +754,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc(struct efa_ibv_cq *cq, struct efa_rdm_e
efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id,
(size_t) pkt_entry->ope->cq_entry.op_context,
pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag,
- pkt_entry->ope->peer ? pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL,
+ pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL,
efa_rdm_pkt_type_of_pke(pkt_entry));
#endif
diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h
index 8684bdf7305..2e05fcad221 100644
--- a/prov/efa/src/rdm/efa_rdm_ep.h
+++ b/prov/efa/src/rdm/efa_rdm_ep.h
@@ -54,6 +54,7 @@ struct efa_rdm_ep_queued_copy {
struct efa_rdm_ep {
struct efa_base_ep base_ep;
+ struct efa_proto_av *proto_av; /* set during fi_ep_bind, avoids container_of on hot path */
/* self_ah necessary for local reads when application does not insert
* its own address into the AV */
@@ -561,15 +562,7 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep);
char ep_addr_str[OFI_ADDRSTRLEN] = {0}; \
efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &(size_t){sizeof ep_addr_str});
-static inline
-fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr)
-{
- struct efa_conn *conn;
-
- assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock));
- conn = efa_av_addr_to_conn(ep->base_ep.av, addr);
- return conn ? conn->shm_fi_addr : FI_ADDR_NOTAVAIL;
-}
+fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr);
static inline size_t efa_rdm_ep_get_available_tx_pkts(struct efa_rdm_ep *ep)
{
diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c
index 5020c487bc6..64e8b4c83d4 100644
--- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c
+++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c
@@ -3,6 +3,7 @@
#include "efa.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_rdm_ep.h"
#include "efa_rdm_cq.h"
#include "efa_rdm_srx.h"
@@ -259,7 +260,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep)
goto err_free;
ret = ofi_bufpool_create(&ep->peer_map_entry_pool,
- sizeof(struct efa_conn_ep_peer_map_entry),
+ sizeof(struct efa_proto_av_entry_ep_peer_map_entry),
EFA_RDM_BUFPOOL_ALIGNMENT,
0, /* no limit to max_cnt */
EFA_RDM_EP_MIN_PEER_POOL_SIZE,
@@ -660,10 +661,12 @@ static int efa_rdm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags)
if (ret)
return ret;
+ efa_rdm_ep->proto_av = container_of(av, struct efa_proto_av, efa_av);
+
/* Bind shm provider endpoint & shm av */
if (efa_rdm_ep->shm_ep) {
- assert(av->shm_rdm_av);
- ret = fi_ep_bind(efa_rdm_ep->shm_ep, &av->shm_rdm_av->fid, flags);
+ assert(efa_rdm_ep->proto_av->shm_rdm_av);
+ ret = fi_ep_bind(efa_rdm_ep->shm_ep, &efa_rdm_ep->proto_av->shm_rdm_av->fid, flags);
if (ret)
return ret;
}
@@ -722,9 +725,9 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep)
struct efa_rdm_ope *rxe;
struct efa_rdm_ope *txe;
struct efa_rdm_peer *peer;
- struct util_av_entry *util_av_entry;
- struct efa_av_entry *av_entry;
- struct efa_conn_ep_peer_map_entry *peer_map_entry;
+ struct efa_proto_av_entry *proto_entry;
+ struct efa_proto_av_entry_ep_peer_map_entry *pm_entry;
+
/*
* Destruct peers first so overflow packets are properly
@@ -735,28 +738,24 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep)
struct efa_rdm_peer, peer,
ep_peer_list_entry, tmp) {
- if (peer->conn->fi_addr != FI_ADDR_UNSPEC) {
- util_av_entry = ofi_bufpool_get_ibuf(
- efa_rdm_ep->base_ep.av->util_av.av_entry_pool,
- peer->conn->fi_addr);
+ if (peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL) {
+ proto_entry = efa_proto_av_addr_to_entry(
+ efa_rdm_ep->proto_av, peer->av_entry->fi_addr);
} else {
- assert(peer->conn->implicit_fi_addr != FI_ADDR_UNSPEC);
+ assert(peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL);
- util_av_entry = ofi_bufpool_get_ibuf(
- efa_rdm_ep->base_ep.av->util_av_implicit.av_entry_pool,
- peer->conn->implicit_fi_addr);
+ proto_entry = efa_proto_av_addr_to_entry_implicit(
+ efa_rdm_ep->proto_av, peer->av_entry->implicit_fi_addr);
}
dlist_remove(&peer->ep_peer_list_entry);
efa_rdm_peer_destruct(peer, efa_rdm_ep);
- peer_map_entry = container_of(
- peer, struct efa_conn_ep_peer_map_entry, peer);
-
- av_entry = (struct efa_av_entry *) util_av_entry->data;
- HASH_DEL(av_entry->conn.ep_peer_map, peer_map_entry);
- ofi_buf_free(peer_map_entry);
+ pm_entry = container_of(
+ peer, struct efa_proto_av_entry_ep_peer_map_entry, peer);
+ HASH_DEL(proto_entry->ep_peer_map, pm_entry);
+ ofi_buf_free(pm_entry);
}
#if ENABLE_DEBUG
@@ -803,7 +802,6 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep)
efa_rdm_txe_release(txe);
}
-
if (efa_rdm_ep->ope_pool)
ofi_bufpool_destroy(efa_rdm_ep->ope_pool);
@@ -1082,7 +1080,7 @@ static int efa_rdm_ep_close(struct fid *fid)
efa_rdm_ep_remove_cntr_ibv_cq_poll_list(&efa_rdm_ep->base_ep);
if (efa_rdm_ep->self_ah)
- efa_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false);
+ efa_proto_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false);
efa_rdm_ep_deregister_ibv_cqs(efa_rdm_ep);
@@ -1184,7 +1182,6 @@ int efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep)
{
int ret, retv = 0;
struct efa_domain *efa_domain;
- struct efa_av *efa_av;
struct efa_rdm_cq *efa_rdm_cq;
@@ -1194,14 +1191,13 @@ int efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep)
retv = ret;
}
- efa_av = efa_rdm_ep->base_ep.av;
- if (efa_av->shm_rdm_av) {
- ret = fi_close(&efa_av->shm_rdm_av->fid);
+ if (efa_rdm_ep->proto_av->shm_rdm_av) {
+ ret = fi_close(&efa_rdm_ep->proto_av->shm_rdm_av->fid);
if (ret) {
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm av: %s\n", fi_strerror(-ret));
retv = ret;
}
- efa_av->shm_rdm_av = NULL;
+ efa_rdm_ep->proto_av->shm_rdm_av = NULL;
}
efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq);
@@ -1354,7 +1350,7 @@ static inline
int efa_rdm_ep_create_self_ah(struct efa_rdm_ep *rdm_ep)
{
- rdm_ep->self_ah = efa_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false);
+ rdm_ep->self_ah = efa_proto_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false);
return rdm_ep->self_ah ? 0 : -FI_EINVAL;
}
diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c
index 834519802bd..5e6e76da1c0 100644
--- a/prov/efa/src/rdm/efa_rdm_ep_utils.c
+++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c
@@ -9,6 +9,7 @@
#include
#include "efa.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_rdm_msg.h"
#include "efa_rdm_rma.h"
#include "efa_rdm_atomic.h"
@@ -37,12 +38,10 @@ struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep)
*/
int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr)
{
- struct efa_av *efa_av;
- struct efa_conn *efa_conn;
+ struct efa_proto_av_entry *entry;
- efa_av = ep->base_ep.av;
- efa_conn = efa_av_addr_to_conn(efa_av, addr);
- return efa_conn ? efa_conn->ah->ahn : -1;
+ entry = efa_proto_av_addr_to_entry(ep->proto_av, addr);
+ return entry ? entry->ah->ahn : -1;
}
@@ -74,18 +73,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr)
*/
struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr_t addr)
{
- struct efa_conn *conn;
- struct efa_conn_ep_peer_map_entry *map_entry;
+ struct efa_proto_av_entry *entry;
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry;
struct efa_rdm_peer *peer;
assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock));
- conn = efa_av_addr_to_conn(ep->base_ep.av, addr);
-
- if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL))
+ entry = efa_proto_av_addr_to_entry(ep->proto_av, addr);
+ if (!entry)
return NULL;
- peer = efa_conn_ep_peer_map_lookup(conn, ep);
+ peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep);
if (peer)
return peer;
@@ -100,9 +98,9 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr
memset(map_entry, 0, sizeof(*map_entry));
map_entry->ep_ptr = ep;
- efa_rdm_peer_construct(&map_entry->peer, ep, conn);
+ efa_rdm_peer_construct(&map_entry->peer, ep, entry);
- efa_conn_ep_peer_map_insert(conn, map_entry);
+ efa_proto_av_entry_ep_peer_map_insert(entry, map_entry);
dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list);
@@ -119,18 +117,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr
*/
struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr_t addr)
{
- struct efa_conn *conn;
+ struct efa_proto_av_entry *entry;
struct efa_rdm_peer *peer;
- struct efa_conn_ep_peer_map_entry *map_entry;
+ struct efa_proto_av_entry_ep_peer_map_entry *map_entry;
assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock));
- conn = efa_av_addr_to_conn_implicit(ep->base_ep.av, addr);
-
- if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL))
+ entry = efa_proto_av_addr_to_entry_implicit(ep->proto_av, addr);
+ if (!entry)
return NULL;
- peer = efa_conn_ep_peer_map_lookup(conn, ep);
+ peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep);
if (peer)
goto out;
@@ -145,17 +142,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr
memset(map_entry, 0, sizeof(*map_entry));
map_entry->ep_ptr = ep;
- efa_rdm_peer_construct(&map_entry->peer, ep, conn);
+ efa_rdm_peer_construct(&map_entry->peer, ep, entry);
peer = &map_entry->peer;
- efa_conn_ep_peer_map_insert(conn, map_entry);
+ efa_proto_av_entry_ep_peer_map_insert(entry, map_entry);
dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list);
out:
assert(peer);
/* Move to the front of the LRU list */
- efa_av_implicit_av_lru_conn_move(ep->base_ep.av, peer->conn);
+ efa_proto_av_implicit_av_lru_entry_move(ep->proto_av, peer->av_entry);
return peer;
}
@@ -532,7 +529,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent
"initializing backoff timeout for peer fi_addr: "
"%" PRIu64 " implicit fi_addr: %" PRIu64
" timeout: %ld rnr_queued_pkts: %d\n",
- peer->conn->fi_addr, peer->conn->implicit_fi_addr,
+ peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr,
peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt);
} else {
peer->rnr_backoff_wait_time = MIN(peer->rnr_backoff_wait_time * 2,
@@ -541,7 +538,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent
"increasing backoff timeout for peer fi_addr: %" PRIu64
" implicit fi_addr %" PRIu64
" to %ld rnr_queued_pkts: %d\n",
- peer->conn->fi_addr, peer->conn->implicit_fi_addr,
+ peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr,
peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt);
}
}
@@ -575,7 +572,7 @@ static ssize_t efa_rdm_ep_handshake_common(struct efa_rdm_ep *ep, struct efa_rdm
(peer->flags & EFA_RDM_PEER_REQ_SENT)))
return 0;
- msg.addr = peer->conn->fi_addr;
+ msg.addr = peer->av_entry->fi_addr;
txe = ofi_buf_alloc(ep->ope_pool);
if (OFI_UNLIKELY(!txe)) {
@@ -712,7 +709,7 @@ void efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe
if (OFI_UNLIKELY(err)) {
EFA_WARN(FI_LOG_EP_CTRL,
"Failed to post HANDSHAKE to peer fi_addr: %ld implicit fi_addr %ld. %s\n",
- peer->conn->fi_addr, peer->conn->implicit_fi_addr, fi_strerror(-err));
+ peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, fi_strerror(-err));
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE);
return;
}
@@ -1008,3 +1005,12 @@ int efa_rdm_ep_enforce_handshake_for_txe(struct efa_rdm_ep *ep, struct efa_rdm_o
}
return FI_SUCCESS;
}
+
+fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr)
+{
+ struct efa_proto_av_entry *entry;
+
+ assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock));
+ entry = efa_proto_av_addr_to_entry(ep->proto_av, addr);
+ return entry ? entry->shm_fi_addr : FI_ADDR_NOTAVAIL;
+}
diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c
index ab5e0fb8f63..25aa40efd2b 100644
--- a/prov/efa/src/rdm/efa_rdm_msg.c
+++ b/prov/efa/src/rdm/efa_rdm_msg.c
@@ -209,7 +209,7 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, const struct fi_msg *msg
EFA_DBG(FI_LOG_EP_DATA,
"peer: %" PRIu64
": size %lu tag: %lx op: %x flags: %lx msg_id: %" PRIu32 "\n",
- peer->conn->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id);
+ peer->av_entry->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id);
efa_rdm_tracepoint(send_begin, txe->msg_id,
(size_t) txe->cq_entry.op_context, txe->total_len);
@@ -794,7 +794,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep,
peer_srx = util_get_peer_srx(ep->peer_srx_ep);
peer = (*pkt_entry_ptr)->peer;
- attr.addr = peer->conn->fi_addr;
+ attr.addr = peer->av_entry->fi_addr;
attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr);
attr.tag = 0;
ret = peer_srx->owner_ops->get_msg(peer_srx, &attr, &peer_rxe);
@@ -832,7 +832,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep,
efa_rdm_tracepoint(msg_recv_unexpected_nontagged, (uint64_t) orig_pke_ptr,
(*pkt_entry_ptr)->pkt_size, rxe->msg_id,
(size_t) rxe->cq_entry.op_context,
- rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr);
+ rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr);
#endif
} else { /* Unexpected errors */
@@ -882,7 +882,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep,
peer = (*pkt_entry_ptr)->peer;
peer_srx = util_get_peer_srx(ep->peer_srx_ep);
- attr.addr = peer->conn->fi_addr;
+ attr.addr = peer->av_entry->fi_addr;
attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr);
attr.tag = efa_rdm_pke_get_rtm_tag(*pkt_entry_ptr);
@@ -927,7 +927,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep,
efa_rdm_tracepoint(msg_recv_unexpected_tagged, (uint64_t) orig_pke_ptr,
(*pkt_entry_ptr)->pkt_size, rxe->msg_id,
(size_t) rxe->cq_entry.op_context,
- rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr);
+ rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr);
#endif
} else { /* Unexpected errors */
diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c
index feed792c600..a378446c1b2 100644
--- a/prov/efa/src/rdm/efa_rdm_ope.c
+++ b/prov/efa/src/rdm/efa_rdm_ope.c
@@ -876,7 +876,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe)
" implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %" PRIu64
" incoming message size: %" PRIu64
" receiving buffer size: %zu\n",
- rxe->peer->conn->fi_addr, rxe->peer->conn->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag,
+ rxe->peer->av_entry->fi_addr, rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag,
rxe->total_len, rxe->cq_entry.len);
ret = ofi_cq_write_error_trunc(ep->base_ep.util_ep.rx_cq,
@@ -909,13 +909,13 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe)
" implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32
" msg_id: %" PRIu32 " tag: %lx total_len: %" PRIu64
"\n",
- rxe->peer->conn->fi_addr,
- rxe->peer->conn->implicit_fi_addr, rxe->rx_id,
+ rxe->peer->av_entry->fi_addr,
+ rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id,
rxe->msg_id, rxe->cq_entry.tag, rxe->total_len);
efa_rdm_tracepoint(recv_end,
rxe->msg_id, (size_t) rxe->cq_entry.op_context,
- rxe->total_len, rxe->cq_entry.tag, rxe->peer->conn->fi_addr);
+ rxe->total_len, rxe->cq_entry.tag, rxe->peer->av_entry->fi_addr);
if (ep->base_ep.util_ep.caps & FI_SOURCE)
@@ -926,7 +926,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe)
rxe->cq_entry.buf,
rxe->cq_entry.data,
rxe->cq_entry.tag,
- rxe->peer->conn->fi_addr);
+ rxe->peer->av_entry->fi_addr);
else
ret = ofi_cq_write(rx_cq,
rxe->cq_entry.op_context,
@@ -1010,13 +1010,13 @@ void efa_rdm_txe_report_completion(struct efa_rdm_ope *txe)
"Writing send completion for txe to peer: %" PRIu64
" tx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx len: %"
PRIu64 "\n",
- txe->peer->conn->fi_addr, txe->tx_id, txe->msg_id,
+ txe->peer->av_entry->fi_addr, txe->tx_id, txe->msg_id,
txe->cq_entry.tag, txe->total_len);
efa_rdm_tracepoint(send_end,
txe->msg_id, (size_t) txe->cq_entry.op_context,
- txe->total_len, txe->cq_entry.tag, txe->peer->conn->fi_addr);
+ txe->total_len, txe->cq_entry.tag, txe->peer->av_entry->fi_addr);
/* TX completions should not send peer address to util_cq */
if (txe->ep->base_ep.util_ep.caps & FI_SOURCE)
diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c
index 9188f5b96ec..4809bed5f75 100644
--- a/prov/efa/src/rdm/efa_rdm_peer.c
+++ b/prov/efa/src/rdm/efa_rdm_peer.c
@@ -3,6 +3,7 @@
#include "efa.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_rdm_pkt_type.h"
#include "efa_rdm_pke_rtm.h"
#include "efa_rdm_pke_utils.h"
@@ -17,14 +18,14 @@
- * @param[in] conn efa conn object
+ * @param[in] av_entry protocol AV entry for this peer
* @relates efa_rdm_peer
*/
-void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn)
+void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry)
{
int ret;
memset(peer, 0, sizeof(struct efa_rdm_peer));
peer->ep = ep;
- peer->conn = conn;
- peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, conn->ep_addr);
+ peer->av_entry = av_entry;
+ peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, efa_proto_av_entry_ep_addr(av_entry));
peer->host_id = peer->is_self ? ep->host_id : 0; /* Peer host id is exchanged via handshake */
peer->num_runt_bytes_in_flight = 0;
/* allocate the robuf circular queue from the pre-allocated buffer pool */
@@ -39,7 +40,7 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st
dlist_init(&peer->rxe_list);
dlist_init(&peer->overflow_pke_list);
- if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL) {
+ if (av_entry->shm_fi_addr != FI_ADDR_NOTAVAIL) {
peer->is_local = 1;
}
diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h
index caf804111be..ac68d58f54d 100644
--- a/prov/efa/src/rdm/efa_rdm_peer.h
+++ b/prov/efa/src/rdm/efa_rdm_peer.h
@@ -9,6 +9,8 @@
#include "efa_rdm_protocol.h"
#include "efa_rdm_rxe_map.h"
+struct efa_proto_av_entry;
+
#define EFA_RDM_PEER_DEFAULT_REORDER_BUFFER_SIZE (16)
#define EFA_RDM_PEER_REQ_SENT BIT_ULL(0) /**< A REQ packet has been sent to the peer (peer should send a handshake back) */
@@ -90,7 +92,7 @@ struct efa_rdm_peer {
bool is_self; /**< flag indicating whether the peer is the endpoint itself */
bool is_local; /**< flag indicating wehther the peer is local (on the same instance) */
uint32_t device_version; /**< EFA device version */
-	struct efa_conn *conn; /**< pointer to efa_conn struct in the av entry */
+	struct efa_proto_av_entry *av_entry; /**< AV entry backing this peer (explicit or implicit) */
+ struct efa_proto_av_entry *av_entry; /**< pointer to efa_proto_av_entry in the av entry */
uint64_t host_id; /* Optional peer host id. Default 0 */
/**
* @brief reorder buffer
@@ -239,9 +241,9 @@ bool efa_rdm_peer_need_connid(struct efa_rdm_peer *peer)
(peer->extra_info[0] & EFA_RDM_EXTRA_REQUEST_CONNID_HEADER);
}
-struct efa_conn;
+struct efa_proto_av_entry;
-void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn);
+void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry);
void efa_rdm_peer_destruct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep);
@@ -260,6 +262,6 @@ int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_e
/* Macro for getting peer address string */
#define EFA_RDM_GET_PEER_ADDR_STR(ep, peer, peer_addr_str) \
char peer_addr_str[OFI_ADDRSTRLEN] = {0}; \
- efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str});
+ efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str});
#endif /* EFA_RDM_PEER_H */
diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c
index e45456e2cce..4855bcd5e63 100644
--- a/prov/efa/src/rdm/efa_rdm_pke.c
+++ b/prov/efa/src/rdm/efa_rdm_pke.c
@@ -10,6 +10,7 @@
#include "efa.h"
#include "efa_av.h"
+#include "rdm/efa_proto_av.h"
#include "efa_data_path_ops.h"
#include "efa_tp.h"
@@ -189,8 +190,8 @@ void efa_rdm_pke_release_tx(struct efa_rdm_pke *pkt_entry)
EFA_DBG(FI_LOG_EP_DATA,
"reset backoff timer for peer fi_addr: %" PRIu64
" implicit fi_addr: %" PRIu64 "\n",
- pkt_entry->peer->conn->fi_addr,
- pkt_entry->peer->conn->implicit_fi_addr);
+ pkt_entry->peer->av_entry->fi_addr,
+ pkt_entry->peer->av_entry->implicit_fi_addr);
}
efa_rdm_pke_release(pkt_entry);
@@ -454,7 +455,7 @@ static inline uint64_t efa_rdm_pke_get_wr_id(struct efa_rdm_pke *pkt_entry)
ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
int pkt_entry_cnt, uint64_t flags)
{
- struct efa_conn *conn;
+ struct efa_proto_av_entry *av_entry;
struct efa_rdm_ep *ep;
struct efa_rdm_pke *pkt_entry;
struct efa_rdm_peer *peer;
@@ -477,8 +478,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
if (peer->flags & EFA_RDM_PEER_IN_BACKOFF)
return -FI_EAGAIN;
- conn = pkt_entry_vec[0]->peer->conn;
- assert(conn && conn->ep_addr);
+ av_entry = pkt_entry_vec[0]->peer->av_entry;
+ assert(av_entry && efa_proto_av_entry_ep_addr(av_entry));
for (pkt_idx = 0; pkt_idx < pkt_entry_cnt; ++pkt_idx) {
pkt_entry = pkt_entry_vec[pkt_idx];
@@ -521,8 +522,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
qpn = peer->user_recv_qp.qpn;
qkey = peer->user_recv_qp.qkey;
} else {
- qpn = conn->ep_addr->qpn;
- qkey = conn->ep_addr->qkey;
+ qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn;
+ qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey;
}
/* This will make efa_qp_post_send not ring the doorbell until the last itertion of the loop */
@@ -533,7 +534,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec,
ret = efa_qp_post_send(ep->base_ep.qp, sg_list,
inline_data_list, iov_cnt, use_inline,
- wr_id, cq_data, flags_in_loop, conn->ah,
+ wr_id, cq_data, flags_in_loop, av_entry->ah,
qpn, qkey);
if (OFI_UNLIKELY(ret))
@@ -580,7 +581,7 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry,
{
struct efa_rdm_ep *ep;
struct efa_qp *qp;
- struct efa_conn *conn;
+ struct efa_proto_av_entry *av_entry;
struct ibv_sge sge;
struct efa_rdm_ope *txe;
int err = 0;
@@ -599,11 +600,11 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry,
qpn = qp->qp_num;
qkey = qp->qkey;
} else {
- conn = pkt_entry->peer->conn;
- assert(conn && conn->ep_addr);
- ah = conn->ah;
- qpn = conn->ep_addr->qpn;
- qkey = conn->ep_addr->qkey;
+ av_entry = pkt_entry->peer->av_entry;
+ assert(av_entry && efa_proto_av_entry_ep_addr(av_entry));
+ ah = av_entry->ah;
+ qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn;
+ qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey;
}
sge.addr = (uint64_t)local_buf;
@@ -652,7 +653,7 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry)
{
struct efa_rdm_ep *ep;
struct efa_qp *qp;
- struct efa_conn *conn;
+ struct efa_proto_av_entry *av_entry;
struct ibv_sge sge;
struct efa_rdm_rma_context_pkt *rma_context_pkt;
struct efa_rdm_ope *txe;
@@ -689,11 +690,11 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry)
qpn = qp->qp_num;
qkey = qp->qkey;
} else {
- conn = pkt_entry->peer->conn;
- assert(conn && conn->ep_addr);
- ah = conn->ah;
- qpn = conn->ep_addr->qpn;
- qkey = conn->ep_addr->qkey;
+ av_entry = pkt_entry->peer->av_entry;
+ assert(av_entry && efa_proto_av_entry_ep_addr(av_entry));
+ ah = av_entry->ah;
+ qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn;
+ qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey;
}
wr_id = efa_rdm_pke_get_wr_id(pkt_entry);
diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c
index 2ed75f38a00..452d09e5c2e 100644
--- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c
+++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c
@@ -93,8 +93,8 @@ void efa_rdm_pke_handle_handshake_recv(struct efa_rdm_pke *pkt_entry)
EFA_INFO(FI_LOG_CQ,
"HANDSHAKE received from peer with explicit fi_addr %" PRIu64
" implicit fi_addr %" PRIu64 "\n",
- pkt_entry->peer->conn->fi_addr,
- pkt_entry->peer->conn->implicit_fi_addr);
+ pkt_entry->peer->av_entry->fi_addr,
+ pkt_entry->peer->av_entry->implicit_fi_addr);
handshake_pkt = (struct efa_rdm_handshake_hdr *)pkt_entry->wiredata;
diff --git a/prov/efa/src/rdm/efa_rdm_pke_print.c b/prov/efa/src/rdm/efa_rdm_pke_print.c
index 529fddfe0f3..37b80505355 100644
--- a/prov/efa/src/rdm/efa_rdm_pke_print.c
+++ b/prov/efa/src/rdm/efa_rdm_pke_print.c
@@ -154,7 +154,7 @@ static void efa_rdm_pke_print_eager_tag_rtm(char *prefix,
tag_rtm_hdr = (struct efa_rdm_eager_tagrtm_hdr *) pkt_entry->wiredata;
if (pkt_entry->peer)
- fi_addr = pkt_entry->peer->conn->fi_addr;
+ fi_addr = pkt_entry->peer->av_entry->fi_addr;
EFA_DBG(FI_LOG_EP_DATA,
"%s EFA RDM RTM packet - type: %" PRIu32 " version: %" PRIu8
@@ -195,7 +195,7 @@ static void efa_rdm_pke_print_longread_rtw(char *prefix,
" msg_length: %" PRIu64 " send_id: %" PRIu32
" read_iov_count: %" PRIu32 "\n",
prefix, base_hdr->type, base_hdr->version, base_hdr->flags,
- pkt_entry->peer->conn->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count,
+ pkt_entry->peer->av_entry->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count,
rtw_hdr->msg_length, rtw_hdr->send_id, rtw_hdr->read_iov_count);
efa_rdm_pke_print_fi_rma_iov("rma_iov", rtw_hdr->rma_iov_count,
diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c
index de0f3d4c478..894a38b745b 100644
--- a/prov/efa/src/rdm/efa_rdm_util.c
+++ b/prov/efa/src/rdm/efa_rdm_util.c
@@ -119,7 +119,7 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep,
len = sizeof(ep_addr_str);
efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &len);
len = sizeof(peer_addr_str);
- efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &len);
+ efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &len);
if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) {
strcpy(local_host_id_str, "N/A");
diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c
index fdbc2bc71e5..927f130d552 100644
--- a/prov/efa/test/efa_unit_test_av.c
+++ b/prov/efa/test/efa_unit_test_av.c
@@ -2,8 +2,6 @@
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
#include "efa_unit_tests.h"
-#include "efa_rdm_cq.h"
-#include "efa_rdm_pke_req.h"
#include "efa_av.h"
/**
@@ -78,7 +76,7 @@ void test_av_insert_duplicate_gid(struct efa_resource **state)
assert_int_not_equal(addr1, addr2);
}
-static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, bool multi_av)
+static void efa_ah_cnt_av_efa_impl(struct efa_resource **state, bool multi_av)
{
struct efa_resource *resource = *state;
struct efa_ep_addr raw_addr = {0};
@@ -90,23 +88,19 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo
struct fi_av_attr av_attr = {0};
struct fid_av *av1 = NULL, *av2 = NULL;
- efa_unit_test_resource_construct(resource, FI_EP_RDM, efa_fabric ? EFA_FABRIC_NAME : EFA_DIRECT_FABRIC_NAME);
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid);
err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
assert_int_equal(err, 0);
- /* So far we should only have 1 ah from ep self ah, and its refcnt is 1 for efa fabric */
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 1 : 0);
+	/* So far we should only have 1 ah (the ep's self ah), and its refcnt is 1 */
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah);
- if (efa_fabric) {
- assert_non_null(efa_ah);
- assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 1 : 0);
- assert_int_equal(efa_ah->implicit_refcnt, 0);
- } else {
- assert_null(efa_ah);
- }
+ assert_non_null(efa_ah);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
if (multi_av) {
/* We open 2 avs with the same domain (PD) so they should share same AH given the same GID */
@@ -132,15 +126,10 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo
assert_int_not_equal(addr1, addr2);
}
- if (!efa_fabric) {
- HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah);
- assert_non_null(efa_ah);
- }
-
- /* So far we should still have 1 ah, and its refcnt is 3 for efa fabric (including self AH) and 2 for efa-direct fabric) */
+ /* So far we should still have 1 ah, and its refcnt is 3 (including self AH) */
assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
- assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 3 : 2);
- assert_int_equal(efa_ah->implicit_refcnt, 0);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 3);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
if (multi_av) {
/* ah refcnt should be decremented to 1 after av close */
@@ -152,15 +141,87 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo
assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0);
}
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 1 : 0);
- if (efa_fabric) {
- /* efa_ah is still alive because self-AH holds a reference */
- assert_int_equal(efa_ah->explicit_refcnt, 1);
- assert_int_equal(efa_ah->implicit_refcnt, 0);
+ /* efa_ah is still alive because self-AH holds a reference */
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1);
+ assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
+
+ /* ah map should be empty now after closing ep which destroys the self ah */
+ assert_int_equal(fi_close(&resource->ep->fid), 0);
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+ /* Reset to NULL to avoid test reaper closing again */
+ resource->ep = NULL;
+}
+
+static void efa_ah_cnt_av_efa_direct_impl(struct efa_resource **state, bool multi_av)
+{
+ struct efa_resource *resource = *state;
+ struct efa_ep_addr raw_addr = {0};
+ size_t raw_addr_len = sizeof(struct efa_ep_addr);
+ fi_addr_t addr1, addr2;
+ int err, num_addr;
+ struct efa_domain *efa_domain;
+ struct efa_ah *efa_ah = NULL;
+ struct fi_av_attr av_attr = {0};
+ struct fid_av *av1 = NULL, *av2 = NULL;
+
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME);
+
+ efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid);
+
+ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+ assert_int_equal(err, 0);
+
+ /* efa-direct does not create a self AH, so ah_map should be empty */
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+ HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah);
+ assert_null(efa_ah);
+
+ if (multi_av) {
+		/* We open 2 avs with the same domain (PD), so they should share the same AH given the same GID */
+ assert_int_equal(fi_av_open(resource->domain, &av_attr, &av1, NULL), 0);
+ assert_int_equal(fi_av_open(resource->domain, &av_attr, &av2, NULL), 0);
}
- /* else: efa_ah has been freed, do not dereference */
- /* ah map should be empty now after closing ep which destroys the self ah for efa fabric */
+ raw_addr.qpn = 1;
+ raw_addr.qkey = 0x1234;
+
+ num_addr = fi_av_insert(multi_av ? av1 : resource->av, &raw_addr, 1, &addr1, 0, NULL);
+ assert_int_equal(num_addr, 1);
+
+ raw_addr.qpn = 2;
+ raw_addr.qkey = 0x5678;
+ num_addr = fi_av_insert(multi_av ? av2 : resource->av, &raw_addr, 1, &addr2, 0, NULL);
+ assert_int_equal(num_addr, 1);
+
+ if (multi_av) {
+		/* Both fi_addrs should be 0 (and thus equal) because each was inserted into a different av */
+ assert_int_equal(addr1, addr2);
+ } else {
+ assert_int_not_equal(addr1, addr2);
+ }
+
+ HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah);
+ assert_non_null(efa_ah);
+
+ /* So far we should still have 1 ah, and its refcnt is 2 */
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+ assert_int_equal(efa_ah->refcnt, 2);
+
+ if (multi_av) {
+ /* ah refcnt should be decremented to 0 after av close */
+ assert_int_equal(fi_close(&av1->fid), 0);
+ assert_int_equal(fi_close(&av2->fid), 0);
+ } else {
+ /* ah refcnt should be decremented to 0 after av entry removals */
+ assert_int_equal(fi_av_remove(resource->av, &addr1, 1, 0), 0);
+ assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0);
+ }
+
+ /* efa_ah has been freed (no self AH holding a reference on efa-direct) */
+ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+
+ /* ah map should still be empty after closing ep */
assert_int_equal(fi_close(&resource->ep->fid), 0);
assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
/* Reset to NULL to avoid test reaper closing again */
@@ -169,22 +230,22 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo
void test_efa_ah_cnt_one_av_efa(struct efa_resource **state)
{
- efa_ah_cnt_av_impl(state, true, false);
+ efa_ah_cnt_av_efa_impl(state, false);
}
void test_efa_ah_cnt_one_av_efa_direct(struct efa_resource **state)
{
- efa_ah_cnt_av_impl(state, false, false);
+ efa_ah_cnt_av_efa_direct_impl(state, false);
}
void test_efa_ah_cnt_multi_av_efa(struct efa_resource **state)
{
- efa_ah_cnt_av_impl(state, true, true);
+ efa_ah_cnt_av_efa_impl(state, true);
}
void test_efa_ah_cnt_multi_av_efa_direct(struct efa_resource **state)
{
- efa_ah_cnt_av_impl(state, false, true);
+ efa_ah_cnt_av_efa_direct_impl(state, true);
}
/**
@@ -223,7 +284,6 @@ void test_av_multiple_ep_impl(struct efa_resource **state, char *fabric_name)
fi_close(&ep2->fid);
}
-
/**
* @brief This test verifies that multiple endpoints can bind to the same AV
* for the efa fabric
@@ -246,724 +306,81 @@ void test_av_multiple_ep_efa_direct(struct efa_resource **state)
return test_av_multiple_ep_impl(state, EFA_DIRECT_FABRIC_NAME);
}
-static void test_av_verify_av_hash_cnt(struct efa_av *av,
- int explicit_cur_av_count,
- int explicit_prv_av_count,
- int implicit_cur_av_count,
- int implicit_prv_av_count)
-{
- assert_int_equal(HASH_CNT(hh, av->util_av.hash),
- explicit_cur_av_count + explicit_prv_av_count);
- assert_int_equal(HASH_CNT(hh, av->cur_reverse_av),
- explicit_cur_av_count);
- assert_int_equal(HASH_CNT(hh, av->prv_reverse_av),
- explicit_prv_av_count);
-
- assert_int_equal(HASH_CNT(hh, av->util_av_implicit.hash),
- implicit_cur_av_count + implicit_prv_av_count);
- assert_int_equal(HASH_CNT(hh, av->cur_reverse_av_implicit),
- implicit_cur_av_count);
- assert_int_equal(HASH_CNT(hh, av->prv_reverse_av_implicit),
- implicit_prv_av_count);
-}
-
/**
- * @brief This test removes a peer and inserts it again
+ * @brief Test base AV (efa-direct) insert, lookup, remove cycle
*
* @param[in] state struct efa_resource that is managed by the framework
*/
-void test_av_reinsertion(struct efa_resource **state)
+void test_av_insert_remove_lookup_efa_direct(struct efa_resource **state)
{
struct efa_resource *resource = *state;
- struct efa_rdm_peer *peer;
- struct efa_ep_addr raw_addr, raw_addr_2;
+ struct efa_ep_addr raw_addr = {0}, raw_addr_out = {0};
size_t raw_addr_len = sizeof(struct efa_ep_addr);
fi_addr_t fi_addr;
struct efa_av *av;
- struct efa_rdm_ep *efa_rdm_ep;
- int err;
-
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
-
- err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
- assert_int_equal(err, 0);
- raw_addr.qpn = 174;
- raw_addr.qkey = 0x1234;
+ struct efa_av_entry *entry;
+ int err, num_addr;
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME);
av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
- assert_int_equal(err, 1);
- assert_int_equal(fi_addr, 0);
- test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
-
- err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len);
- assert_int_equal(err, 0);
- assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
- peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
- assert_int_equal(peer->conn->fi_addr, fi_addr);
- assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1);
-
- err = fi_av_remove(resource->av, &fi_addr, 1, 0);
+ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
assert_int_equal(err, 0);
- test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+ raw_addr.qpn = 7;
+ raw_addr.qkey = 0xABCD;
- err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
- assert_int_equal(err, 1);
+ num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+ assert_int_equal(num_addr, 1);
assert_int_equal(fi_addr, 0);
- test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
-
- err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len);
+ assert_int_equal(av->used, 1);
+
+ /* Verify entry is accessible and fields are correct */
+ entry = efa_av_addr_to_entry(av, fi_addr);
+ assert_non_null(entry);
+ assert_non_null(entry->ah);
+ assert_int_equal(entry->fi_addr, fi_addr);
+ assert_int_equal(efa_av_entry_ep_addr(entry)->qpn, 7);
+ assert_int_equal(efa_av_entry_ep_addr(entry)->qkey, 0xABCD);
+
+ /* Lookup should return the same address */
+ raw_addr_len = sizeof(raw_addr_out);
+ err = fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len);
assert_int_equal(err, 0);
- assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
-
- peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
- assert_int_equal(peer->conn->fi_addr, fi_addr);
- assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1);
+ assert_int_equal(raw_addr_out.qpn, 7);
+ assert_int_equal(raw_addr_out.qkey, 0xABCD);
+ assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_out), 1);
+ /* Remove and verify */
err = fi_av_remove(resource->av, &fi_addr, 1, 0);
assert_int_equal(err, 0);
- test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
-}
-
-/**
- * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then
- * remove the first-inserted peer before the second. This reproduces the bug
- * in efa_av_reverse_av_remove() where the code blindly deletes the
- * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to
- * a different (newer) conn. Removing the surviving second peer afterwards
- * then hits a NULL prv_reverse_av_entry and SEGVs.
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state)
-{
- struct efa_resource *resource = *state;
- struct efa_ep_addr raw_addr;
- size_t raw_addr_len = sizeof(struct efa_ep_addr);
- fi_addr_t fi_addr1, fi_addr2;
- struct efa_av *av;
- struct efa_rdm_ep *efa_rdm_ep;
- uint32_t ahn;
- int err;
+ assert_int_equal(av->used, 0);
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+ /* Entry should be NULL after remove */
+ entry = efa_av_addr_to_entry(av, fi_addr);
+ assert_null(entry);
- err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
- assert_int_equal(err, 0);
-
- av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
- base_ep.util_ep.ep_fid);
- ahn = efa_rdm_ep->self_ah->ahn;
-
- /* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */
- raw_addr.qpn = 100;
- raw_addr.qkey = 0xAAAA;
- err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL);
- assert_int_equal(err, 1);
- test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
- /* cur_reverse_av (ahn, 100) -> conn1 (fi_addr1) */
- assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL),
- fi_addr1);
-
- /* Insert peer2: same GID and qpn, different qkey. This pushes peer1's
- * reverse-AV entry from cur_reverse_av into prv_reverse_av. */
- raw_addr.qpn = 100;
- raw_addr.qkey = 0xBBBB;
- err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL);
- assert_int_equal(err, 1);
- assert_int_not_equal(fi_addr1, fi_addr2);
- test_av_verify_av_hash_cnt(av, 1, 1, 0, 0);
- /* cur_reverse_av (ahn, 100) now points to conn2 (fi_addr2); peer1 is
- * in prv_reverse_av keyed by its own qkey. */
- assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL),
- fi_addr2);
-
- /* Remove peer1 first. Without the fix this would incorrectly delete
- * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. */
- err = fi_av_remove(resource->av, &fi_addr1, 1, 0);
- assert_int_equal(err, 0);
- /* peer1's prv entry is gone; peer2's cur entry must still be intact. */
- test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
- assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL),
- fi_addr2);
-
- /* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry
- * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */
- err = fi_av_remove(resource->av, &fi_addr2, 1, 0);
- assert_int_equal(err, 0);
- test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
- assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL),
- FI_ADDR_NOTAVAIL);
+ /* Lookup should fail after remove */
+ err = fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len);
+ assert_int_not_equal(err, 0);
}
/**
- * @brief Generate a peer with a unique QPN and a random QKEY and insert it
- * into the implicit AV
- *
- * The QPN is drawn from a static monotonic counter so every peer minted by
- * this helper has a distinct (ahn, qpn) key in the reverse AV. Callers rely
- * on this uniqueness to exercise LRU ordering and eviction behavior without
- * tripping over the provider's QPN-collision path.
+ * @brief Test base AV (efa-direct) addr_to_entry returns NULL for invalid fi_addr
*
* @param[in] state struct efa_resource that is managed by the framework
*/
-static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource)
-{
- struct efa_ep_addr raw_addr;
- size_t raw_addr_len = sizeof(struct efa_ep_addr);
- struct efa_rdm_ep *efa_rdm_ep;
- struct efa_rdm_peer *peer;
- fi_addr_t implicit_fi_addr, test_addr;
- struct efa_av *av;
- uint32_t ahn;
- int err;
-
- av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
- assert_int_equal(err, 0);
-
- static uint16_t next_qpn = 0;
- raw_addr.qpn = next_qpn++;
- raw_addr.qkey = rand();
- ahn = efa_rdm_ep->self_ah->ahn;
-
- /* Manually insert into implicit AV */
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
-
- err = efa_av_insert_one(av, &raw_addr, &implicit_fi_addr, 0, NULL, true, true);
-
- peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr);
-
- assert_int_equal(peer->conn->implicit_fi_addr, implicit_fi_addr);
- assert_int_equal(peer->conn->fi_addr, FI_ADDR_NOTAVAIL);
- assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1);
-
- test_addr = efa_av_reverse_lookup_rdm_implicit(av, ahn, raw_addr.qpn, NULL);
- assert_int_equal(test_addr, implicit_fi_addr);
-
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
-
- return peer;
-}
-
-/**
- * @brief This test fakes a peer in the implicit AV and closes the AV with an
- * implicit peer in it
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_av_implicit(struct efa_resource **state)
+void test_av_base_addr_to_entry_invalid(struct efa_resource **state)
{
struct efa_resource *resource = *state;
-
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
- test_av_get_peer_from_implicit_av(resource);
-}
-
-/**
- * @brief This test fakes a peer in the implicit AV and verifies that the peer
- * is moved to the explicit AV when fi_av_insert is called
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_av_implicit_to_explicit(struct efa_resource **state)
-{
- struct efa_resource *resource = *state;
- struct efa_ep_addr raw_addr, raw_addr_2;
- size_t raw_addr_len = sizeof(struct efa_ep_addr);
- struct efa_rdm_ep *efa_rdm_ep;
- struct efa_rdm_peer *peer;
- fi_addr_t explicit_fi_addr, test_addr;
struct efa_av *av;
- uint32_t ahn;
- int err;
+ struct efa_av_entry *entry;
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME);
av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */
- peer = test_av_get_peer_from_implicit_av(resource);
- err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
- assert_int_equal(err, 0);
-
- /* Modify the peer and verify that the peer is moved as-is */
- peer->next_msg_id = 355;
- peer->flags |= EFA_RDM_PEER_IN_BACKOFF;
-
- /* Insert explicitly */
- raw_addr.qpn = peer->conn->ep_addr->qpn;
- raw_addr.qkey = peer->conn->ep_addr->qkey;
- err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL);
- test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
-
- err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len);
- assert_int_equal(err, 0);
- assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
+ entry = efa_av_addr_to_entry(av, FI_ADDR_NOTAVAIL);
+ assert_null(entry);
- peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr);
- assert_int_equal(peer->conn->fi_addr, explicit_fi_addr);
- assert_int_equal(peer->conn->implicit_fi_addr, FI_ADDR_NOTAVAIL);
- assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1);
-
- ahn = efa_rdm_ep->self_ah->ahn;
- test_addr = efa_av_reverse_lookup_rdm(av, ahn, raw_addr.qpn, NULL);
- assert_int_equal(test_addr, explicit_fi_addr);
-
- /* Verify the manually set peer properties above */
- assert_int_equal(peer->next_msg_id, 355);
- assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF);
-
- /* Unset the flag to make fi_av_remove easier */
- peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF;
-
- err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0);
- assert_int_equal(err, 0);
- test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
-}
-
-static void test_av_implicit_av_verify_lru_list_first_last_elements(
- struct efa_av *av, struct efa_conn *first_conn_expected,
- struct efa_conn *last_conn_expected)
-{
- struct dlist_entry *first_entry, *last_entry;
- struct efa_conn *first_conn_actual, *last_conn_actual;
-
- first_entry = av->implicit_av_lru_list.next;
- last_entry = av->implicit_av_lru_list.prev;
-
- first_conn_actual = container_of(first_entry, struct efa_conn,
- implicit_av_lru_entry);
- last_conn_actual = container_of(last_entry, struct efa_conn,
- implicit_av_lru_entry);
-
- assert_ptr_equal(first_conn_actual, first_conn_expected);
- assert_ptr_equal(last_conn_actual, last_conn_expected);
-}
-
-/**
- * @brief This test inserts three implicit peers and verifies that the last
- * inserted and/or accessed peer is at the tail of the LRU list
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_av_implicit_av_lru_insertion(struct efa_resource **state)
-{
- struct efa_resource *resource = *state;
- struct efa_rdm_ep *efa_rdm_ep;
- struct efa_rdm_peer *peer0, *peer1, *peer2;
- struct efa_av *av;
- fi_addr_t implicit_fi_addr;
- uint32_t ahn;
- int err;
-
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
- av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- /* Manually insert first address into implicit AV */
- peer0 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 1, 0);
-
- /* Expected LRU list: HEAD->peer0 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn);
-
- /* Manually insert second address into implicit AV */
- peer1 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
-
- /* Expected LRU list: HEAD->peer0->peer1 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn);
-
- /* Manually insert third address into implicit AV */
- peer2 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
-
- /* Expected LRU list: HEAD->peer0->peer1->peer2 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn);
-
-
- /* Access peer0 through the CQ read path */
- ahn = efa_rdm_ep->self_ah->ahn;
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit(
- av, ahn, peer0->conn->ep_addr->qpn, NULL);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(implicit_fi_addr, 0);
-
- /* Expected LRU list: HEAD->peer1->peer2->peer0 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn);
-
- /* Access peer2 through the CQ read path */
- ahn = efa_rdm_ep->self_ah->ahn;
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit(
- av, ahn, peer2->conn->ep_addr->qpn, NULL);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(implicit_fi_addr, 2);
-
- /* Expected LRU list: HEAD->peer1->peer0->peer2 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer2->conn);
-
-
- /* Access peer1 through repeated AV insertion path */
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- err = efa_av_insert_one(av, peer1->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(err, 0);
- assert_int_equal(implicit_fi_addr, 1);
- test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
-
- /* Expected LRU list: HEAD->peer0->peer2->peer1 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn);
-
- /* Access peer2 through repeated AV insertion path */
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- err = efa_av_insert_one(av, peer2->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(err, 0);
- assert_int_equal(implicit_fi_addr, 2);
- test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
-
- /* Expected LRU list: HEAD->peer0->peer1->peer2 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn);
-}
-
-/**
- * @brief This test sets the implicit AV size to 2 and inserts four implicit
- * peers. It verifies that the least recently used peer is evicted.
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_av_implicit_av_lru_eviction(struct efa_resource **state)
-{
- struct efa_resource *resource = *state;
- struct efa_rdm_ep *efa_rdm_ep;
- struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3;
- struct efa_ep_addr_hashable *efa_ep_addr_hashable;
- struct efa_av *av;
- fi_addr_t implicit_fi_addr;
- uint32_t ahn;
- int err;
-
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
- av = container_of(resource->av, struct efa_av, util_av.av_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- /* Modify implicit AV size */
- av->implicit_av_size = 2;
-
- /* Manually insert first address into implicit AV */
- peer0 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 1, 0);
-
- /* Expected LRU list: HEAD->peer0 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn);
-
- /* Manually insert second address into implicit AV */
- peer1 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
-
- /* Expected LRU list: HEAD->peer0->peer1 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn);
-
- /* Access peer0 through the CQ read path */
- ahn = efa_rdm_ep->self_ah->ahn;
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit(
- av, ahn, peer0->conn->ep_addr->qpn, NULL);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(implicit_fi_addr, 0);
-
- /* Expected LRU list: HEAD->peer1->peer0 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn);
-
- /* Manually insert third address into implicit AV */
- peer2 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
-
- /* Expected LRU list: HEAD->peer0->peer2 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn);
-
- /* Verify that peer1 is evicted and added to the evicted hashmap */
- assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 1);
- HASH_FIND(hh, av->evicted_peers_hashset, peer1->conn->ep_addr,
- sizeof(struct efa_ep_addr), efa_ep_addr_hashable);
- assert_non_null(efa_ep_addr_hashable);
- assert_int_equal(efa_is_same_addr(peer1->conn->ep_addr,
- &efa_ep_addr_hashable->addr),
- 1);
-
- /* Access peer0 through repeated AV insertion path */
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- err = efa_av_insert_one(av, peer0->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
- assert_int_equal(err, 0);
- assert_int_equal(implicit_fi_addr, 0);
- test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
-
- /* Expected LRU list: HEAD->peer2->peer0 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->conn, peer0->conn);
-
- /* Manually insert fourth address into implicit AV */
- peer3 = test_av_get_peer_from_implicit_av(resource);
- test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
-
- /* Verify that peer2 is evicted and added to the evicted hashmap */
- assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 2);
- HASH_FIND(hh, av->evicted_peers_hashset, peer2->conn->ep_addr,
- sizeof(struct efa_ep_addr), efa_ep_addr_hashable);
- assert_non_null(efa_ep_addr_hashable);
- assert_int_equal(efa_is_same_addr(peer2->conn->ep_addr,
- &efa_ep_addr_hashable->addr),
- 1);
-
- /* Expected LRU list: HEAD->peer0->peer3 */
- test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer3->conn);
-}
-
-/**
- * @brief This test tests the implicit_refcnt and explicit_refcnt fields of AH
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_ah_refcnt(struct efa_resource **state)
-{
- struct efa_resource *resource = *state;
- fi_addr_t fi_addr;
- struct efa_ep_addr raw_addr = {0};
- size_t raw_addr_len = sizeof(struct efa_ep_addr);
- struct efa_rdm_ep *efa_rdm_ep;
- struct efa_domain *efa_domain;
- struct efa_rdm_peer *peer;
- struct efa_av *av;
- struct efa_ah *efa_ah = NULL;
- int err;
-
- int allowed_ahs = 1;
-
- g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah;
- g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah;
- g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah;
- g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah;
-
- g_self_ah_cnt = 1;
- g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs;
- assert_int_equal(g_ibv_ah_cnt, 0);
-
- efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
- efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid);
- efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
- av = container_of(resource->av, struct efa_av, util_av.av_fid);
-
- /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */
- assert_int_equal(g_ibv_ah_cnt, 1);
-
- err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
- assert_int_equal(err, 0);
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
-
- /* Manually insert into implicit AV */
- ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
- err = efa_av_insert_one(av, &raw_addr, &fi_addr, 0, NULL, true, true);
- peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr);
- ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
-
- efa_ah = peer->conn->ah;
-
- assert_int_equal(g_ibv_ah_cnt, 2);
-
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
- assert_int_equal(efa_ah->explicit_refcnt, 0);
- assert_int_equal(efa_ah->implicit_refcnt, 1);
-
- /* Move implicit AV entry to explicit AV entry */
- err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
- assert_int_equal(err, 1);
-
- assert_int_equal(g_ibv_ah_cnt, 2);
-
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
- assert_int_equal(efa_ah->explicit_refcnt, 1);
- assert_int_equal(efa_ah->implicit_refcnt, 0);
-
- err = fi_av_remove(resource->av, &fi_addr, 1, 0);
- assert_int_equal(err, 0);
-
- assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
-
- /* Only the self AH should be left */
- assert_int_equal(g_ibv_ah_cnt, 1);
-}
-
-/**
- * @brief This test inserts one implicit AV entry and verifies that the
- * implicitly created AH is evicted when an explicit AV entry is inserted. It
- * requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_ah_lru_eviction_impl(bool explicit)
-{
- fi_addr_t fi_addr;
- struct efa_ep_addr raw_addr[2] = {0};
- size_t raw_addr_len = sizeof(struct efa_ep_addr);
- struct fid_fabric *fabric_fid[2];
- struct fid_domain *domain_fid[2];
- struct fid_ep *ep_fid[2];
- struct fid_cq *cq_fid[2];
- struct fid_av *av_fid[2];
- struct efa_domain *efa_domain[2];
- struct efa_rdm_ep *efa_rdm_ep[2];
- struct efa_rdm_peer *peer;
- struct efa_av *efa_av[2];
- struct efa_ah *efa_ah = NULL;
- int err;
- struct fi_av_attr av_attr = {0};
- struct fi_cq_attr cq_attr = {
- .format = FI_CQ_FORMAT_DATA
- };
- struct fi_info *hints, *info, *cur;
- int num_nic = 0;
-
- int allowed_ahs = 1;
-
- g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah;
- g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah;
- g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah;
- g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah;
-
- hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME);
- fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info);
- for (cur = info; cur; cur = cur->next) {
- num_nic++;
- }
-
- if (num_nic < 2) {
- fi_freeinfo(info);
- fi_freeinfo(hints);
- return;
- }
-
- g_self_ah_cnt = 2;
- g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */
- assert_int_equal(g_ibv_ah_cnt, 0);
-
- cur = info;
- for (int i = 0; i < 2; i++) {
- err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL);
- assert_int_equal(err, 0);
-
- err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL);
- assert_int_equal(err, 0);
-
- efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid);
-
- err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL);
- assert_int_equal(err, 0);
-
- efa_av[i] = container_of(av_fid[i], struct efa_av, util_av.av_fid);
-
- err = fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL);
- assert_int_equal(err, 0);
-
- err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL);
- assert_int_equal(err, 0);
-
- efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-
- fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0);
- fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV);
-
- err = fi_enable(ep_fid[i]);
- assert_int_equal(err, 0);
-
- err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len);
- assert_int_equal(err, 0);
-
- cur = cur->next;
- }
-
- assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0);
-
- /* Manually insert into implicit AV in first domain */
- ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
- err = efa_av_insert_one(efa_av[0], &raw_addr[0], &fi_addr, 0, NULL, true, true);
- peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr);
- ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
-
- assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1);
- efa_ah = peer->conn->ah;
- assert_int_equal(efa_ah->implicit_refcnt, 1);
- assert_int_equal(efa_ah->explicit_refcnt, 0);
-
- if (explicit) {
- err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL);
- assert_int_equal(err, 1);
- peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr);
- } else {
- ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
- err = efa_av_insert_one(efa_av[0], &raw_addr[1], &fi_addr, 0, NULL, true, true);
- peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr);
- ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
- }
-
- assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1);
-
- efa_ah = peer->conn->ah;
- if (explicit) {
- assert_int_equal(efa_ah->implicit_refcnt, 0);
- assert_int_equal(efa_ah->explicit_refcnt, 1);
- } else {
- assert_int_equal(efa_ah->implicit_refcnt, 1);
- assert_int_equal(efa_ah->explicit_refcnt, 0);
- }
-
- if (explicit) {
- err = fi_av_remove(av_fid[0], &fi_addr, 1, 0);
- assert_int_equal(err, 0);
- assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0);
- }
-
- for (int i = 0; i < 2; i++) {
- efa_rdm_ep[i]->self_ah = NULL;
- fi_close(&ep_fid[i]->fid);
- fi_close(&cq_fid[i]->fid);
- fi_close(&av_fid[i]->fid);
- fi_close(&domain_fid[i]->fid);
- fi_close(&fabric_fid[i]->fid);
- }
- fi_freeinfo(hints);
- fi_freeinfo(info);
-}
-
-/**
- * @brief This test inserts one implicit AV entry and verifies that the
- * implicitly created AH is evicted when an explicit AV entry is inserted. It
- * requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state)
-{
- test_ah_lru_eviction_impl(true);
-}
-
-/**
- * @brief This test inserts one implicit AV entry and verifies that the
- * implicitly created AH is evicted when another implicit AV entry is inserted.
- * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
- *
- * @param[in] state struct efa_resource that is managed by the framework
- */
-void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state)
-{
- test_ah_lru_eviction_impl(false);
+ entry = efa_av_addr_to_entry(av, FI_ADDR_UNSPEC);
+ assert_null(entry);
}
diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c
index 1972611fc01..672d4a863b9 100644
--- a/prov/efa/test/efa_unit_test_cq.c
+++ b/prov/efa/test/efa_unit_test_cq.c
@@ -1084,7 +1084,7 @@ static void test_efa_cq_read_prep(struct efa_resource *resource,
will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_imm_data_return_mock, 0x1);
will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_qp_num_return_mock, base_ep->qp->qp_num);
will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_byte_len_return_mock, 4096);
- will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_conn(base_ep->av, addr)->ah->ahn);
+ will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_entry(base_ep->av, addr)->ah->ahn);
will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_src_qp_return_mock, raw_addr.qpn);
diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c
index e532c0813a9..584b5296241 100644
--- a/prov/efa/test/efa_unit_test_ep.c
+++ b/prov/efa/test/efa_unit_test_ep.c
@@ -480,7 +480,7 @@ void test_efa_rdm_ep_rma_queue_before_handshake(struct efa_resource **state, int
peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr);
peer->flags = EFA_RDM_PEER_REQ_SENT;
/* Do not use shm in this unit test because we are testing efa rma path */
- peer->conn->shm_fi_addr = FI_ADDR_NOTAVAIL;
+ peer->av_entry->shm_fi_addr = FI_ADDR_NOTAVAIL;
assert_false(efa_rdm_ep->homogeneous_peers);
assert_true(dlist_empty(&efa_rdm_ep->txe_list));
diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c
index 9f4875d4246..144e7f2d7f8 100644
--- a/prov/efa/test/efa_unit_test_mocks.c
+++ b/prov/efa/test/efa_unit_test_mocks.c
@@ -21,7 +21,7 @@ int g_ibv_ah_limit = 1024;
int g_ibv_ah_cnt = 0;
int g_self_ah_cnt = 1;
struct ibv_ah g_dummy_ah;
-struct efa_ah g_dummy_efa_ah = {0};
+struct efa_proto_ah g_dummy_proto_ah = {0};
void efa_ibv_ah_limit_cnt_reset()
{
@@ -74,40 +74,49 @@ int efa_mock_ibv_destroy_ah_dont_create_self_ah(struct ibv_ah *ibv_ah)
}
struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av)
+ size_t alloc_size)
{
return NULL;
}
struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av)
+ size_t alloc_size)
{
/* Intercept the self AH call in efa_ah_alloc and do not call
* ibv_create_ah or modify the AH map etc */
if (g_ibv_ah_cnt < g_self_ah_cnt) {
g_ibv_ah_cnt++;
- g_dummy_efa_ah.ibv_ah = &g_dummy_ah;
- g_dummy_efa_ah.ahn = -1;
- memset(g_dummy_efa_ah.gid, 0, sizeof(g_dummy_efa_ah.gid));
- g_dummy_efa_ah.explicit_refcnt = 1;
- g_dummy_efa_ah.implicit_refcnt = 0;
- return &g_dummy_efa_ah;
+ g_dummy_proto_ah.ah.ibv_ah = &g_dummy_ah;
+ g_dummy_proto_ah.ah.ahn = -1;
+ memset(g_dummy_proto_ah.ah.gid, 0, sizeof(g_dummy_proto_ah.ah.gid));
+ g_dummy_proto_ah.ah.refcnt = 1;
+ /*
+ * Reset protocol fields so efa_proto_ah_alloc sees a fresh AH
+ * regardless of prior test state. Without this reset, stale
+ * proto refcnts or a stale lru_list_entry from a freed domain
+ * would carry forward into the current test.
+ */
+ g_dummy_proto_ah.implicit_refcnt = 0;
+ g_dummy_proto_ah.explicit_refcnt = 0;
+ memset(&g_dummy_proto_ah.lru_list_entry, 0,
+ sizeof(g_dummy_proto_ah.lru_list_entry));
+ dlist_init(&g_dummy_proto_ah.implicit_conn_list);
+ return &g_dummy_proto_ah.ah;
} else {
- return __real_efa_ah_alloc(domain, gid, insert_implicit_av);
+ return __real_efa_ah_alloc(domain, gid, alloc_size);
}
}
void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain,
- struct efa_ah *ah,
- bool release_from_implicit_av)
+ struct efa_ah *ah)
{
/* Intercept the self AH destruct call in efa_ah_release and do not call
* ibv_destroy_ah or modify the AH map etc */
if (g_ibv_ah_cnt <= g_self_ah_cnt)
g_ibv_ah_cnt--;
else
- return __real_efa_ah_release(domain, ah, release_from_implicit_av);
+ return __real_efa_ah_release(domain, ah);
}
int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibv_ctx,
@@ -516,16 +525,14 @@ int __wrap_efadv_query_device(struct ibv_context *ibv_ctx, struct efadv_device_a
}
struct efa_ah *__wrap_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av)
+ size_t alloc_size)
{
- return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, insert_implicit_av);
+ return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, alloc_size);
}
-void __wrap_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av)
+void __wrap_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah)
{
- return g_efa_unit_test_mocks.efa_ah_release(domain, ah,
- release_from_implicit_av);
+ return g_efa_unit_test_mocks.efa_ah_release(domain, ah);
}
struct ibv_cq_ex *efa_mock_create_cq_ex_return_null(struct ibv_context *context, struct ibv_cq_init_attr_ex *init_attr)
diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h
index ae68e77935f..96a618886b2 100644
--- a/prov/efa/test/efa_unit_test_mocks.h
+++ b/prov/efa/test/efa_unit_test_mocks.h
@@ -35,20 +35,18 @@ int __real_efadv_query_device(struct ibv_context *ibvctx, struct efadv_device_at
uint32_t inlen);
struct efa_ah *__real_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av);
+ size_t alloc_size);
struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av);
+ size_t alloc_size);
struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid,
- bool insert_implicit_av);
+ size_t alloc_size);
-void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av);
+void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah);
void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain,
- struct efa_ah *ah,
- bool release_from_implicit_av);
+ struct efa_ah *ah);
int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibvctx, struct efadv_device_attr *attr,
uint32_t inlen);
@@ -166,9 +164,8 @@ struct efa_unit_test_mocks
uint32_t inlen);
struct efa_ah *(*efa_ah_alloc)(struct efa_domain *domain,
const uint8_t *gid,
- bool insert_implicit_av);
- void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah,
- bool release_from_implicit_av);
+ size_t alloc_size);
+ void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah);
#if HAVE_EFADV_CQ_EX
struct ibv_cq_ex *(*efadv_create_cq)(struct ibv_context *ibvctx,
diff --git a/prov/efa/test/efa_unit_test_proto_av.c b/prov/efa/test/efa_unit_test_proto_av.c
new file mode 100644
index 00000000000..a7a898e9e01
--- /dev/null
+++ b/prov/efa/test/efa_unit_test_proto_av.c
@@ -0,0 +1,1236 @@
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+
+#include "efa_unit_tests.h"
+#include "efa_rdm_cq.h"
+#include "efa_rdm_pke_req.h"
+#include "efa_av.h"
+
+static void test_av_verify_av_hash_cnt(struct efa_av *av,
+ int explicit_cur_av_count,
+ int explicit_prv_av_count,
+ int implicit_cur_av_count,
+ int implicit_prv_av_count)
+{
+ struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av);
+
+ assert_int_equal(HASH_CNT(hh, av->util_av.hash),
+ explicit_cur_av_count + explicit_prv_av_count);
+ assert_int_equal(HASH_CNT(hh, av->cur_reverse_av),
+ explicit_cur_av_count);
+ assert_int_equal(HASH_CNT(hh, av->prv_reverse_av),
+ explicit_prv_av_count);
+
+ assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash),
+ implicit_cur_av_count + implicit_prv_av_count);
+ assert_int_equal(HASH_CNT(hh, proto_av->cur_reverse_av_implicit),
+ implicit_cur_av_count);
+ assert_int_equal(HASH_CNT(hh, proto_av->prv_reverse_av_implicit),
+ implicit_prv_av_count);
+}
+
+/**
+ * @brief This test removes a peer and inserts it again
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_reinsertion(struct efa_resource **state)
+{
+ struct efa_resource *resource = *state;
+ struct efa_rdm_peer *peer;
+ struct efa_ep_addr raw_addr, raw_addr_2;
+ size_t raw_addr_len = sizeof(struct efa_ep_addr);
+ fi_addr_t fi_addr;
+ struct efa_av *av;
+ struct efa_rdm_ep *efa_rdm_ep;
+ int err;
+
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+ assert_int_equal(err, 0);
+ raw_addr.qpn = 174;
+ raw_addr.qkey = 0x1234;
+
+ av = container_of(resource->av, struct efa_av, util_av.av_fid);
+ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+ err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+ assert_int_equal(err, 1);
+ assert_int_equal(fi_addr, 0);
+ test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+ err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len);
+ assert_int_equal(err, 0);
+ assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
+
+ peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
+ assert_int_equal(peer->av_entry->fi_addr, fi_addr);
+ assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1);
+
+ err = fi_av_remove(resource->av, &fi_addr, 1, 0);
+ assert_int_equal(err, 0);
+ test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+
+ err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+ assert_int_equal(err, 1);
+ assert_int_equal(fi_addr, 0);
+ test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+ err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len);
+ assert_int_equal(err, 0);
+ assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
+
+ peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
+ assert_int_equal(peer->av_entry->fi_addr, fi_addr);
+ assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1);
+
+ err = fi_av_remove(resource->av, &fi_addr, 1, 0);
+ assert_int_equal(err, 0);
+ test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Generate a peer with random QPN and QKEY and insert it into the implicit AV
+ *
+ * @param[in] resource pointer to the struct efa_resource that is managed by the framework
+ */
+static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource)
+{
+ struct efa_ep_addr raw_addr;
+ size_t raw_addr_len = sizeof(struct efa_ep_addr);
+ struct efa_rdm_ep *efa_rdm_ep;
+ struct efa_rdm_peer *peer;
+ fi_addr_t implicit_fi_addr, test_addr;
+ struct efa_av *av;
+ uint32_t ahn;
+ int err;
+
+ av = container_of(resource->av, struct efa_av, util_av.av_fid);
+ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+ err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+ assert_int_equal(err, 0);
+
+ raw_addr.qpn = rand();
+ raw_addr.qkey = rand();
+ ahn = efa_rdm_ep->self_ah->ahn;
+
+ /* Manually insert into implicit AV */
+ ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+ err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &implicit_fi_addr, 0, NULL, true, true);
+ assert_int_equal(err, 0);
+
+ peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr);
+
+ assert_int_equal(peer->av_entry->implicit_fi_addr, implicit_fi_addr);
+ assert_int_equal(peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL);
+ assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1);
+
+ test_addr = efa_proto_av_reverse_lookup_implicit(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL);
+ assert_int_equal(test_addr, implicit_fi_addr);
+
+ ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+ return peer;
+}
+
+/**
+ * @brief This test fakes a peer in the implicit AV and closes the AV with an
+ * implicit peer in it
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_implicit(struct efa_resource **state)
+{
+ struct efa_resource *resource = *state;
+
+ efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+ test_av_get_peer_from_implicit_av(resource);
+}
+
+/**
+ * @brief This test fakes a peer in the implicit AV and verifies that the peer
+ * is moved to the explicit AV when fi_av_insert is called
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_to_explicit(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr, raw_addr_2;
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_peer *peer;
+	fi_addr_t explicit_fi_addr, test_addr;
+	struct efa_av *av;
+	uint32_t ahn;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* Generate a peer with random QPN and QKEY and insert it into the implicit AV */
+	peer = test_av_get_peer_from_implicit_av(resource);
+
+	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+	assert_int_equal(err, 0);
+
+	/* Modify the peer and verify that the peer is moved as-is */
+	peer->next_msg_id = 355;
+	peer->flags |= EFA_RDM_PEER_IN_BACKOFF;
+
+	/* Insert explicitly; fi_av_insert returns the number of addresses inserted */
+	raw_addr.qpn = efa_proto_av_entry_ep_addr(peer->av_entry)->qpn;
+	raw_addr.qkey = efa_proto_av_entry_ep_addr(peer->av_entry)->qkey;
+	err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL);
+	assert_int_equal(err, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+	err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len);
+	assert_int_equal(err, 0);
+	assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1);
+
+	peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr);
+	assert_int_equal(peer->av_entry->fi_addr, explicit_fi_addr);
+	assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL);
+	assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1);
+
+	ahn = efa_rdm_ep->self_ah->ahn;
+	test_addr = efa_proto_av_reverse_lookup(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL);
+	assert_int_equal(test_addr, explicit_fi_addr);
+
+	/* Verify the manually set peer properties above */
+	assert_int_equal(peer->next_msg_id, 355);
+	assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF);
+
+	/* Unset the flag to make fi_av_remove easier */
+	peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF;
+
+	err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0);
+	assert_int_equal(err, 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+static void test_av_implicit_av_verify_lru_list_first_last_elements(
+	struct efa_av *av, struct efa_proto_av_entry *first_conn_expected,
+	struct efa_proto_av_entry *last_conn_expected)
+{
+	struct dlist_entry *first_entry, *last_entry;
+	struct efa_proto_av_entry *first_conn_actual, *last_conn_actual;
+
+	first_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.next; /* head of the LRU dlist */
+	last_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.prev; /* tail of the LRU dlist */
+
+	first_conn_actual = container_of(first_entry, struct efa_proto_av_entry,
+					 implicit_av_lru_entry);
+	last_conn_actual = container_of(last_entry, struct efa_proto_av_entry,
+					implicit_av_lru_entry);
+
+	assert_ptr_equal(first_conn_actual, first_conn_expected);
+	assert_ptr_equal(last_conn_actual, last_conn_expected);
+}
+
+/**
+ * @brief This test inserts three implicit peers and verifies that the last
+ * inserted and/or accessed peer is at the tail of the LRU list
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_lru_insertion(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_peer *peer0, *peer1, *peer2;
+	struct efa_av *av;
+	fi_addr_t implicit_fi_addr;
+	uint32_t ahn;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* Manually insert first address into implicit AV */
+	peer0 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 1, 0);
+
+	/* Expected LRU list: HEAD->peer0 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry);
+
+	/* Manually insert second address into implicit AV */
+	peer1 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
+
+	/* Expected LRU list: HEAD->peer0->peer1 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry);
+
+	/* Manually insert third address into implicit AV */
+	peer2 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
+
+	/* Expected LRU list: HEAD->peer0->peer1->peer2 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry);
+
+
+	/* Access peer0 through the CQ read path */
+	ahn = efa_rdm_ep->self_ah->ahn;
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	implicit_fi_addr = efa_proto_av_reverse_lookup_implicit(
+		container_of(av, struct efa_proto_av, efa_av), ahn,
+		efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(implicit_fi_addr, 0); /* peer0 was the first implicit insert */
+
+	/* Expected LRU list: HEAD->peer1->peer2->peer0 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry);
+
+	/* Access peer2 through the CQ read path */
+	ahn = efa_rdm_ep->self_ah->ahn;
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	implicit_fi_addr = efa_proto_av_reverse_lookup_implicit(
+		container_of(av, struct efa_proto_av, efa_av), ahn,
+		efa_proto_av_entry_ep_addr(peer2->av_entry)->qpn, NULL);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(implicit_fi_addr, 2); /* peer2 was the third implicit insert */
+
+	/* Expected LRU list: HEAD->peer1->peer0->peer2 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer2->av_entry);
+
+
+	/* Access peer1 through repeated AV insertion path */
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer1->av_entry), &implicit_fi_addr, 0, NULL, true, true);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(err, 0);
+	assert_int_equal(implicit_fi_addr, 1); /* re-insert of known addr returns existing fi_addr */
+	test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
+
+	/* Expected LRU list: HEAD->peer0->peer2->peer1 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry);
+
+	/* Access peer2 through repeated AV insertion path */
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer2->av_entry), &implicit_fi_addr, 0, NULL, true, true);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(err, 0);
+	assert_int_equal(implicit_fi_addr, 2);
+	test_av_verify_av_hash_cnt(av, 0, 0, 3, 0);
+
+	/* Expected LRU list: HEAD->peer0->peer1->peer2 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry);
+}
+
+/**
+ * @brief This test sets the implicit AV size to 2 and inserts four implicit
+ * peers. It verifies that the least recently used peer is evicted.
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_lru_eviction(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3;
+	struct efa_ep_addr peer1_ep_addr, peer2_ep_addr;
+	struct efa_ep_addr_hashable *efa_ep_addr_hashable;
+	struct efa_av *av;
+	fi_addr_t implicit_fi_addr;
+	uint32_t ahn;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* Modify implicit AV size */
+	container_of(av, struct efa_proto_av, efa_av)->implicit_av_size = 2; /* cap at 2 so the third insert evicts */
+
+	/* Manually insert first address into implicit AV */
+	peer0 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 1, 0);
+
+	/* Expected LRU list: HEAD->peer0 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry);
+
+	/* Manually insert second address into implicit AV */
+	peer1 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
+
+	/*
+	 * Snapshot peer1/peer2 ep_addr before they are evicted. After
+	 * eviction the enclosing peer_map_entry is returned to the bufpool
+	 * and peer1->av_entry / peer2->av_entry become stale memory.
+	 */
+	memcpy(&peer1_ep_addr, efa_proto_av_entry_ep_addr(peer1->av_entry),
+	       sizeof(struct efa_ep_addr));
+
+	/* Expected LRU list: HEAD->peer0->peer1 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry);
+
+	/* Access peer0 through the CQ read path */
+	ahn = efa_rdm_ep->self_ah->ahn;
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	implicit_fi_addr = efa_proto_av_reverse_lookup_implicit(
+		container_of(av, struct efa_proto_av, efa_av), ahn,
+		efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(implicit_fi_addr, 0); /* peer0 was the first implicit insert */
+
+	/* Expected LRU list: HEAD->peer1->peer0 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry);
+
+	/* Manually insert third address into implicit AV */
+	peer2 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); /* count stays 2: peer1 (LRU head) was evicted */
+	/* Snapshot peer2 ep_addr before it too gets evicted later. */
+	memcpy(&peer2_ep_addr, efa_proto_av_entry_ep_addr(peer2->av_entry),
+	       sizeof(struct efa_ep_addr));
+
+	/* Expected LRU list: HEAD->peer0->peer2 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry);
+
+	/* Verify that peer1 is evicted and added to the evicted hashmap */
+	assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 1);
+	HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer1_ep_addr,
+		  sizeof(struct efa_ep_addr), efa_ep_addr_hashable);
+	assert_non_null(efa_ep_addr_hashable);
+	assert_int_equal(efa_is_same_addr(&peer1_ep_addr,
+					  &efa_ep_addr_hashable->addr),
+			 1);
+
+	/* Access peer0 through repeated AV insertion path */
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer0->av_entry), &implicit_fi_addr, 0, NULL, true, true);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(err, 0);
+	assert_int_equal(implicit_fi_addr, 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 2, 0);
+
+	/* Expected LRU list: HEAD->peer2->peer0 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->av_entry, peer0->av_entry);
+
+	/* Manually insert fourth address into implicit AV */
+	peer3 = test_av_get_peer_from_implicit_av(resource);
+	test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); /* peer2 (LRU head) evicted to make room */
+
+	/* Verify that peer2 is evicted and added to the evicted hashmap */
+	assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 2);
+	HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer2_ep_addr,
+		  sizeof(struct efa_ep_addr), efa_ep_addr_hashable);
+	assert_non_null(efa_ep_addr_hashable);
+	assert_int_equal(efa_is_same_addr(&peer2_ep_addr,
+					  &efa_ep_addr_hashable->addr),
+			 1);
+
+	/* Expected LRU list: HEAD->peer0->peer3 */
+	test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer3->av_entry);
+}
+
+/**
+ * @brief This test tests the implicit_refcnt and explicit_refcnt fields of AH
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_ah_refcnt(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	fi_addr_t fi_addr;
+	struct efa_ep_addr raw_addr = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_domain *efa_domain;
+	struct efa_rdm_peer *peer;
+	struct efa_av *av;
+	struct efa_ah *efa_ah = NULL;
+	int err;
+
+	int allowed_ahs = 1;
+
+	g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah;
+
+	g_self_ah_cnt = 1;
+	g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs;
+	assert_int_equal(g_ibv_ah_cnt, 0);
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+
+	/* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */
+	assert_int_equal(g_ibv_ah_cnt, 1);
+
+	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+	assert_int_equal(err, 0);
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+
+	/* Manually insert into implicit AV */
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &fi_addr, 0, NULL, true, true);
+	peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	assert_int_equal(err, 0);
+	efa_ah = peer->av_entry->ah;
+
+	assert_int_equal(g_ibv_ah_cnt, 2);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1);
+
+	/* Move implicit AV entry to explicit AV entry */
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(err, 1);
+
+	assert_int_equal(g_ibv_ah_cnt, 2);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
+
+	err = fi_av_remove(resource->av, &fi_addr, 1, 0);
+	assert_int_equal(err, 0);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+
+	/* Only the self AH should be left */
+	assert_int_equal(g_ibv_ah_cnt, 1);
+}
+
+/**
+ * @brief Insert one implicit AV entry, then insert a second address either
+ * explicitly or implicitly, and verify the AH map and refcounts. It requires
+ * at least 2 NICs because ibv_create_ah only works for valid GIDs.
+ *
+ * @param[in] explicit	if true, insert the second address via fi_av_insert
+ */
+void test_ah_lru_eviction_impl(bool explicit)
+{
+	fi_addr_t fi_addr;
+	struct efa_ep_addr raw_addr[2] = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	struct fid_fabric *fabric_fid[2];
+	struct fid_domain *domain_fid[2];
+	struct fid_ep *ep_fid[2];
+	struct fid_cq *cq_fid[2];
+	struct fid_av *av_fid[2];
+	struct efa_domain *efa_domain[2];
+	struct efa_rdm_ep *efa_rdm_ep[2];
+	struct efa_rdm_peer *peer;
+	struct efa_av *efa_av[2];
+	struct efa_ah *efa_ah = NULL;
+	int err;
+	struct fi_av_attr av_attr = {0};
+	struct fi_cq_attr cq_attr = {
+		.format = FI_CQ_FORMAT_DATA
+	};
+	struct fi_info *hints, *info, *cur;
+	int num_nic = 0;
+
+	int allowed_ahs = 1;
+
+	g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah;
+
+	hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME);
+	err = fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info);
+	assert_int_equal(err, 0);
+	for (cur = info; cur; cur = cur->next)
+		num_nic++;
+
+	if (num_nic < 2) {
+		fi_freeinfo(info);
+		fi_freeinfo(hints);
+		return;
+	}
+
+	g_self_ah_cnt = 2;
+	g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */
+	assert_int_equal(g_ibv_ah_cnt, 0);
+
+	cur = info;
+	for (int i = 0; i < 2; i++) {
+		err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid);
+
+		err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		efa_av[i] = container_of(av_fid[i], struct efa_av, util_av.av_fid);
+
+		err = fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+		assert_int_equal(fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0), 0);
+		assert_int_equal(fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV), 0);
+
+		err = fi_enable(ep_fid[i]);
+		assert_int_equal(err, 0);
+
+		err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len);
+		assert_int_equal(err, 0);
+
+		cur = cur->next;
+	}
+
+	assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0);
+
+	/* Manually insert into implicit AV in first domain */
+	ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
+	err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[0], &fi_addr, 0, NULL, true, true);
+	peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr);
+	ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
+	assert_int_equal(err, 0);
+	assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1);
+	efa_ah = peer->av_entry->ah;
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0);
+
+	if (explicit) {
+		err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL);
+		assert_int_equal(err, 1);
+		peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr);
+	} else {
+		ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
+		err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[1], &fi_addr, 0, NULL, true, true);
+		peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr);
+		ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock);
+	}
+
+	assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1);
+
+	efa_ah = peer->av_entry->ah;
+	if (explicit) {
+		assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
+		assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1);
+	} else {
+		assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1);
+		assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0);
+	}
+
+	if (explicit) {
+		err = fi_av_remove(av_fid[0], &fi_addr, 1, 0);
+		assert_int_equal(err, 0);
+		assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0);
+	}
+
+	for (int i = 0; i < 2; i++) {
+		efa_rdm_ep[i]->self_ah = NULL; /* NOTE(review): self AH is mocked; clearing avoids teardown destroying it — confirm */
+		fi_close(&ep_fid[i]->fid);
+		fi_close(&cq_fid[i]->fid);
+		fi_close(&av_fid[i]->fid);
+		fi_close(&domain_fid[i]->fid);
+		fi_close(&fabric_fid[i]->fid);
+	}
+	fi_freeinfo(hints);
+	fi_freeinfo(info);
+}
+
+/**
+ * @brief This test inserts one implicit AV entry and verifies that the
+ * implicitly created AH is evicted when an explicit AV entry is inserted. It
+ * requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state)
+{
+	test_ah_lru_eviction_impl(true); /* second insert goes through fi_av_insert */
+}
+
+/**
+ * @brief This test inserts one implicit AV entry and verifies that the
+ * implicitly created AH is evicted when another implicit AV entry is inserted.
+ * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state)
+{
+	test_ah_lru_eviction_impl(false); /* second insert goes through the implicit AV path */
+}
+
+/**
+ * @brief Test proto AV explicit reverse lookup returns correct fi_addr
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_reverse_lookup_explicit(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr, lookup_addr;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_rdm_ep *efa_rdm_ep;
+	uint32_t ahn;
+	int num_addr;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+	ahn = efa_rdm_ep->self_ah->ahn;
+
+	/* Reverse lookup on empty AV should return NOTAVAIL */
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL);
+	assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL);
+
+	assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0);
+	raw_addr.qpn = 42;
+	raw_addr.qkey = 0x5678;
+
+	num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(num_addr, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	/* Reverse lookup should find the entry */
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL);
+	assert_int_equal(lookup_addr, fi_addr);
+
+	/* Lookup with wrong QPN should return NOTAVAIL */
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 99, NULL);
+	assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL);
+
+	/* After remove, reverse lookup should return FI_ADDR_NOTAVAIL */
+	assert_int_equal(fi_av_remove(resource->av, &fi_addr, 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL);
+	assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL);
+}
+
+/**
+ * @brief Test that proto AV addr_to_entry returns NULL after entry is removed
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_addr_to_entry_after_remove(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_proto_av_entry *entry;
+	int num_addr;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+
+	/* addr_to_entry on empty AV should return NULL */
+	entry = efa_proto_av_addr_to_entry(proto_av, 0);
+	assert_null(entry);
+
+	assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0);
+	raw_addr.qpn = 99;
+	raw_addr.qkey = 0x9999;
+
+	num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(num_addr, 1);
+
+	/* Entry should be found with correct fields */
+	entry = efa_proto_av_addr_to_entry(proto_av, fi_addr);
+	assert_non_null(entry);
+	assert_non_null(entry->ah);
+	assert_int_equal(entry->fi_addr, fi_addr);
+	assert_int_equal(entry->implicit_fi_addr, FI_ADDR_NOTAVAIL);
+	assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 99);
+	assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x9999);
+
+	/* Remove and verify entry is no longer valid */
+	assert_int_equal(fi_av_remove(resource->av, &fi_addr, 1, 0), 0);
+	entry = efa_proto_av_addr_to_entry(proto_av, fi_addr);
+	assert_null(entry);
+}
+
+/**
+ * @brief Test proto AV insert/remove with peer creation via get_peer
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_insert_remove_with_peer(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_peer *peer, *peer2;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_proto_av_entry *entry;
+	int num_addr;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0);
+	raw_addr.qpn = 55;
+	raw_addr.qkey = 0x5555;
+
+	num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(num_addr, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	/* Create peer via get_peer */
+	peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
+	assert_non_null(peer);
+	assert_non_null(peer->av_entry);
+	assert_int_equal(peer->av_entry->fi_addr, fi_addr);
+	assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL);
+	assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qpn, 55);
+	assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qkey, 0x5555);
+	assert_ptr_equal(peer->ep, efa_rdm_ep);
+
+	/* Peer map lookup should find the same peer */
+	peer2 = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr);
+	assert_ptr_equal(peer2, peer);
+
+	/* Verify peer map on the entry itself */
+	entry = efa_proto_av_addr_to_entry(proto_av, fi_addr);
+	assert_non_null(entry);
+	assert_ptr_equal(efa_proto_av_entry_ep_peer_map_lookup(entry, efa_rdm_ep), peer);
+
+	/* Remove — peer is destroyed during av_remove */
+	assert_int_equal(fi_av_remove(resource->av, &fi_addr, 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Test proto AV implicit insert followed by explicit insert of same addr
+ * verifies the peer's av_entry pointer is updated to the explicit entry
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_to_explicit_peer_updated(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_rdm_peer *implicit_peer, *explicit_peer;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	fi_addr_t implicit_fi_addr, explicit_fi_addr;
+	struct efa_ah *ah_before;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+
+	/* Insert implicit peer */
+	implicit_peer = test_av_get_peer_from_implicit_av(resource);
+	assert_non_null(implicit_peer);
+	implicit_fi_addr = implicit_peer->av_entry->implicit_fi_addr;
+	assert_int_equal(implicit_peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL);
+	assert_int_not_equal(implicit_fi_addr, FI_ADDR_NOTAVAIL);
+	test_av_verify_av_hash_cnt(av, 0, 0, 1, 0);
+
+	/* Remember the AH — it should be reused after migration */
+	ah_before = implicit_peer->av_entry->ah;
+	assert_non_null(ah_before);
+
+	/* Now insert explicitly with the same address */
+	struct efa_ep_addr raw_addr;
+	memcpy(&raw_addr, implicit_peer->av_entry->ep_addr, EFA_EP_ADDR_LEN);
+
+	err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL);
+	assert_int_equal(err, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	/* Implicit entry should be gone */
+	assert_null(efa_proto_av_addr_to_entry_implicit(proto_av, implicit_fi_addr));
+
+	/* Get peer via explicit addr — should be the same peer with updated av_entry */
+	explicit_peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr);
+	assert_non_null(explicit_peer);
+	assert_ptr_equal(explicit_peer, implicit_peer);
+	assert_int_equal(explicit_peer->av_entry->fi_addr, explicit_fi_addr);
+	assert_int_equal(explicit_peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL);
+
+	/* AH should be the same object (reused, not reallocated) */
+	assert_ptr_equal(explicit_peer->av_entry->ah, ah_before);
+
+	assert_int_equal(fi_av_remove(resource->av, &explicit_fi_addr, 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Test proto AV batch insert of multiple addresses in one fi_av_insert call
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_batch_insert(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addrs[3] = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addrs[3];
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_proto_av_entry *entry;
+	int num_addr, i;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+
+	assert_int_equal(fi_getname(&resource->ep->fid, &raw_addrs[0], &raw_addr_len), 0);
+	memcpy(&raw_addrs[1], &raw_addrs[0], sizeof(struct efa_ep_addr));
+	memcpy(&raw_addrs[2], &raw_addrs[0], sizeof(struct efa_ep_addr));
+	raw_addrs[0].qpn = 10; raw_addrs[0].qkey = 0x1000;
+	raw_addrs[1].qpn = 11; raw_addrs[1].qkey = 0x1001;
+	raw_addrs[2].qpn = 12; raw_addrs[2].qkey = 0x1002;
+
+	num_addr = fi_av_insert(resource->av, raw_addrs, 3, fi_addrs, 0, NULL);
+	assert_int_equal(num_addr, 3);
+
+	/* All three should have distinct fi_addrs */
+	assert_int_not_equal(fi_addrs[0], fi_addrs[1]);
+	assert_int_not_equal(fi_addrs[1], fi_addrs[2]);
+	assert_int_not_equal(fi_addrs[0], fi_addrs[2]);
+
+	test_av_verify_av_hash_cnt(av, 3, 0, 0, 0);
+
+	/* Verify each entry is accessible with correct QPN */
+	for (i = 0; i < 3; i++) {
+		entry = efa_proto_av_addr_to_entry(proto_av, fi_addrs[i]);
+		assert_non_null(entry);
+		assert_non_null(entry->ah);
+		assert_int_equal(entry->fi_addr, fi_addrs[i]);
+		assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 10 + i);
+		assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x1000 + i);
+	}
+
+	/* Remove one at a time and verify counts */
+	assert_int_equal(fi_av_remove(resource->av, &fi_addrs[0], 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 2, 0, 0, 0);
+	assert_null(efa_proto_av_addr_to_entry(proto_av, fi_addrs[0]));
+
+	assert_int_equal(fi_av_remove(resource->av, &fi_addrs[1], 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	assert_int_equal(fi_av_remove(resource->av, &fi_addrs[2], 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Test proto AV remove of non-existent address returns error
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_remove_nonexistent(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	fi_addr_t bad_addr = 9999;
+	fi_addr_t notavail = FI_ADDR_NOTAVAIL;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* Remove with out-of-range fi_addr */
+	err = fi_av_remove(resource->av, &bad_addr, 1, 0);
+	assert_int_not_equal(err, 0); /* exact error code is not pinned, only that it fails */
+
+	/* Remove with FI_ADDR_NOTAVAIL */
+	err = fi_av_remove(resource->av, &notavail, 1, 0);
+	assert_int_not_equal(err, 0);
+}
+
+/**
+ * @brief Test proto AV prv_reverse_av path: insert two addresses with same GID
+ * but different QPN/QKEY, remove the first, insert a new one with the same QPN
+ * as the first but different QKEY. The old entry should be in prv_reverse_av.
+ *
+ * @param[in] state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_prv_reverse_av(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr1 = {0}, raw_addr2 = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr1, fi_addr2;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_rdm_ep *efa_rdm_ep;
+	struct efa_proto_av_entry *entry1, *entry2;
+	fi_addr_t lookup_addr;
+	uint32_t ahn;
+	int num_addr;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
+	ahn = efa_rdm_ep->self_ah->ahn;
+
+	assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr1, &raw_addr_len), 0);
+	memcpy(&raw_addr2, &raw_addr1, sizeof(struct efa_ep_addr));
+
+	/* Insert first address with qpn=20, qkey=0xAAAA */
+	raw_addr1.qpn = 20;
+	raw_addr1.qkey = 0xAAAA;
+	num_addr = fi_av_insert(resource->av, &raw_addr1, 1, &fi_addr1, 0, NULL);
+	assert_int_equal(num_addr, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	/* Verify first entry */
+	entry1 = efa_proto_av_addr_to_entry(proto_av, fi_addr1);
+	assert_non_null(entry1);
+	assert_int_equal(efa_proto_av_entry_ep_addr(entry1)->qkey, 0xAAAA);
+
+	/* Reverse lookup should find first entry */
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL);
+	assert_int_equal(lookup_addr, fi_addr1);
+
+	/* Insert second address with same qpn=20 but different qkey=0xBBBB.
+	 * This simulates QPN reuse — the first entry moves to prv_reverse_av */
+	raw_addr2.qpn = 20;
+	raw_addr2.qkey = 0xBBBB;
+	num_addr = fi_av_insert(resource->av, &raw_addr2, 1, &fi_addr2, 0, NULL);
+	assert_int_equal(num_addr, 1);
+	assert_int_not_equal(fi_addr1, fi_addr2);
+
+	/* cur_reverse_av has 1 entry (the latest), prv_reverse_av has 1 (the old) */
+	test_av_verify_av_hash_cnt(av, 1, 1, 0, 0);
+
+	/* Verify second entry */
+	entry2 = efa_proto_av_addr_to_entry(proto_av, fi_addr2);
+	assert_non_null(entry2);
+	assert_int_equal(efa_proto_av_entry_ep_addr(entry2)->qkey, 0xBBBB);
+
+	/* Both entries should share the same AH (same GID) */
+	assert_ptr_equal(entry1->ah, entry2->ah); /* NOTE(review): assumes entry1 stays pointer-stable across the second insert — confirm */
+
+	/* Reverse lookup without connid should return the current (latest) entry */
+	lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL);
+	assert_int_equal(lookup_addr, fi_addr2);
+
+	/* Remove in reverse order: current entry first, then previous */
+	assert_int_equal(fi_av_remove(resource->av, &fi_addr2, 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 1, 0, 0);
+
+	assert_int_equal(fi_av_remove(resource->av, &fi_addr1, 1, 0), 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then
+ * remove the first-inserted peer before the second. This reproduces the bug
+ * in efa_av_reverse_av_remove() where the code blindly deletes the
+ * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to
+ * a different (newer) conn. Removing the surviving second peer afterwards
+ * then hits a NULL prv_reverse_av_entry and SEGVs.
+ *
+ * @param[in]	state	struct efa_resource that is managed by the framework
+ */
+void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr;
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr1, fi_addr2;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_rdm_ep *efa_rdm_ep;
+	uint32_t ahn;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+	assert_int_equal(err, 0);
+
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
+				  base_ep.util_ep.ep_fid);
+	/* Both peers below reuse the endpoint's own GID, so they share the
+	 * AHN of the endpoint's own address handle. */
+	ahn = efa_rdm_ep->self_ah->ahn;
+
+	/* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */
+	raw_addr.qpn = 100;
+	raw_addr.qkey = 0xAAAA;
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL);
+	assert_int_equal(err, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+	/* cur_reverse_av (ahn, 100) -> entry1 (fi_addr1) */
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr1);
+
+	/* Insert peer2: same GID and qpn, different qkey. This pushes peer1's
+	 * reverse-AV entry from cur_reverse_av into prv_reverse_av. */
+	raw_addr.qpn = 100;
+	raw_addr.qkey = 0xBBBB;
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL);
+	assert_int_equal(err, 1);
+	assert_int_not_equal(fi_addr1, fi_addr2);
+	test_av_verify_av_hash_cnt(av, 1, 1, 0, 0);
+	/* cur_reverse_av (ahn, 100) now points to entry2 (fi_addr2); peer1 is
+	 * in prv_reverse_av keyed by its own qkey. */
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr2);
+
+	/* Remove peer1 first. Without the fix this would incorrectly delete
+	 * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. */
+	err = fi_av_remove(resource->av, &fi_addr1, 1, 0);
+	assert_int_equal(err, 0);
+	/* peer1's prv entry is gone; peer2's cur entry must still be intact. */
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr2);
+
+	/* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry
+	 * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */
+	err = fi_av_remove(resource->av, &fi_addr2, 1, 0);
+	assert_int_equal(err, 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 FI_ADDR_NOTAVAIL);
+}
+
+/**
+ * @brief Inserting an all-zero GID into the protocol AV must be rejected.
+ *
+ * efa_av_is_valid_address() returns 0 for all-zero GIDs. fi_av_insert
+ * should skip the bad address and return 0 (no address inserted), and
+ * the output fi_addr should be FI_ADDR_NOTAVAIL.
+ *
+ * @param[in]	state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_insert_invalid_address(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr zero_addr = {0};
+	/* Pre-set to 0 so the FI_ADDR_NOTAVAIL assertion below proves the
+	 * provider actually wrote the output slot for the skipped address.
+	 * NOTE(review): assumes the insert path fills the output fi_addr
+	 * even on rejection — confirm against the implementation. */
+	fi_addr_t fi_addr = 0;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	zero_addr.qpn = 5;
+	zero_addr.qkey = 0x1234;
+	/* zero_addr.raw is left all-zero */
+
+	err = fi_av_insert(resource->av, &zero_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(err, 0);
+	assert_int_equal(fi_addr, FI_ADDR_NOTAVAIL);
+}
+
+/**
+ * @brief With implicit_av_size set to 0 (unbounded mode), the implicit AV
+ * never evicts entries.
+ *
+ * Insert several implicit peers and verify all remain in the LRU list and
+ * util_av, and evicted_peers_hashset stays empty.
+ *
+ * @param[in]	state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_unbounded(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_proto_av *proto_av;
+	struct efa_av *av;
+	const int num_peers = 10;
+	int i;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+
+	/* Disable the eviction limit; 0 is treated as "no bound", not as
+	 * "capacity zero" — the assertions below depend on that meaning. */
+	proto_av->implicit_av_size = 0;
+
+	/* Each call presumably adds one new implicit peer via the shared
+	 * test helper — see its definition for the exact mechanism. */
+	for (i = 0; i < num_peers; i++)
+		test_av_get_peer_from_implicit_av(resource);
+
+	/* All peers should still be in the implicit AV */
+	assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), num_peers);
+	/* No peer should have been evicted */
+	assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 0);
+}
+
+/**
+ * @brief efa_proto_av_open rejects attr->name and attr->flags (both unsupported)
+ *
+ * Ensures the early-return error paths in efa_proto_av_open are exercised.
+ *
+ * @param[in]	state	struct efa_resource that is managed by the framework
+ */
+void test_av_proto_open_unsupported_attrs(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct fi_av_attr av_attr = {0};
+	struct fid_av *av = NULL;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* attr->name is not supported */
+	av_attr.name = "foo";
+	err = fi_av_open(resource->domain, &av_attr, &av, NULL);
+	assert_int_equal(err, -FI_ENOSYS);
+	/* av was initialized to NULL above; this assertion holds as long as
+	 * fi_av_open leaves the output pointer untouched on error —
+	 * NOTE(review): confirm that contract. */
+	assert_null(av);
+	av_attr.name = NULL;
+
+	/* attr->flags is not supported */
+	av_attr.flags = 1;
+	err = fi_av_open(resource->domain, &av_attr, &av, NULL);
+	assert_int_equal(err, -FI_ENOSYS);
+	assert_null(av);
+}
+
+/**
+ * @brief efa_proto_av_implicit_av_lru_entry_move on a single-element list
+ *
+ * Insert exactly one implicit peer; the LRU list has exactly one node.
+ * Call efa_proto_av_implicit_av_lru_entry_move on it — this exercises the
+ * dlist_entry_in_list assertion on the smallest non-empty list.
+ *
+ * @param[in]	state	struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_lru_move_single(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_peer *peer;
+	struct efa_proto_av *proto_av;
+	struct efa_av *av;
+	struct efa_rdm_ep *efa_rdm_ep;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
+				  base_ep.util_ep.ep_fid);
+
+	peer = test_av_get_peer_from_implicit_av(resource);
+	assert_non_null(peer);
+
+	/* Hold the domain srx_lock around the move — NOTE(review): assumed
+	 * to be the lock guarding the implicit-AV LRU list; confirm. */
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock);
+	efa_proto_av_implicit_av_lru_entry_move(proto_av, peer->av_entry);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+	/* Still exactly one entry in the LRU list: the moved entry must be
+	 * both the first and the last element. */
+	test_av_implicit_av_verify_lru_list_first_last_elements(
+		av, peer->av_entry, peer->av_entry);
+}
diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c
index 9a54e522bad..01239822b6b 100644
--- a/prov/efa/test/efa_unit_test_srx.c
+++ b/prov/efa/test/efa_unit_test_srx.c
@@ -84,7 +84,7 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state)
struct efa_rdm_pke *pke;
struct efa_ep_addr raw_addr = {0};
size_t raw_addr_len = sizeof(raw_addr);
- struct efa_conn conn = {0};
+ struct efa_proto_av_entry fake_entry = {0};
struct efa_rdm_peer peer;
struct efa_unit_test_eager_rtm_pkt_attr pke_attr = {.msg_id = 0,
.connid = 0x1234};
@@ -113,8 +113,8 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state)
fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0);
raw_addr.qpn = 0;
raw_addr.qkey = 0x1234;
- conn.ep_addr = &raw_addr;
- efa_rdm_peer_construct(&peer, efa_rdm_ep, &conn);
+ memcpy(fake_entry.ep_addr, &raw_addr, EFA_EP_ADDR_LEN);
+ efa_rdm_peer_construct(&peer, efa_rdm_ep, &fake_entry);
pke->peer = &peer;
efa_unit_test_eager_msgrtm_pkt_construct(pke, &pke_attr);
diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c
index 6f6f7771361..49fd5672326 100644
--- a/prov/efa/test/efa_unit_tests.c
+++ b/prov/efa/test/efa_unit_tests.c
@@ -147,6 +147,11 @@ int main(void)
cmocka_unit_test_setup_teardown(test_efa_ah_cnt_multi_av_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_insert_remove_lookup_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_base_addr_to_entry_invalid, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ /* end efa_unit_test_av.c */
+
+ /* begin efa_unit_test_proto_av.c */
cmocka_unit_test_setup_teardown(test_av_reinsertion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_av_reverse_av_remove_qpn_collision, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_av_implicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
@@ -156,7 +161,19 @@ int main(void)
cmocka_unit_test_setup_teardown(test_ah_refcnt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ah_lru_eviction_explicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_ah_lru_eviction_implicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
- /* end efa_unit_test_av.c */
+		cmocka_unit_test_setup_teardown(test_av_insert_remove_lookup_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), /* NOTE(review): duplicate — this test is already registered above at the end of the efa_unit_test_av.c section, so it will run twice */
+ cmocka_unit_test_setup_teardown(test_av_proto_reverse_lookup_explicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_addr_to_entry_after_remove, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_insert_remove_with_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_implicit_to_explicit_peer_updated, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_batch_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_remove_nonexistent, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_prv_reverse_av, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_insert_invalid_address, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_implicit_av_unbounded, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_proto_open_unsupported_attrs, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ cmocka_unit_test_setup_teardown(test_av_implicit_av_lru_move_single, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+ /* end efa_unit_test_proto_av.c */
/* begin efa_unit_test_ep.c */
cmocka_unit_test_setup_teardown(test_efa_device_construct_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h
index 1c9b021f051..06f0405f911 100644
--- a/prov/efa/test/efa_unit_tests.h
+++ b/prov/efa/test/efa_unit_tests.h
@@ -117,6 +117,11 @@ void test_efa_ah_cnt_multi_av_efa();
void test_efa_ah_cnt_multi_av_efa_direct();
void test_av_multiple_ep_efa();
void test_av_multiple_ep_efa_direct();
+void test_av_insert_remove_lookup_efa_direct();
+void test_av_base_addr_to_entry_invalid();
+/* end efa_unit_test_av.c */
+
+/* begin efa_unit_test_proto_av.c */
void test_av_reinsertion();
void test_av_reverse_av_remove_qpn_collision();
void test_av_implicit();
@@ -126,7 +131,18 @@ void test_av_implicit_av_lru_eviction();
void test_ah_refcnt();
void test_ah_lru_eviction_explicit_av_insert();
void test_ah_lru_eviction_implicit_av_insert();
-/* end efa_unit_test_av.c */
+void test_av_proto_reverse_lookup_explicit();
+void test_av_proto_addr_to_entry_after_remove();
+void test_av_proto_insert_remove_with_peer();
+void test_av_implicit_to_explicit_peer_updated();
+void test_av_proto_batch_insert();
+void test_av_proto_remove_nonexistent();
+void test_av_proto_prv_reverse_av();
+void test_av_proto_insert_invalid_address();
+void test_av_implicit_av_unbounded();
+void test_av_proto_open_unsupported_attrs();
+void test_av_implicit_av_lru_move_single();
+/* end efa_unit_test_proto_av.c */
void test_efa_device_construct_error_handling();
void test_efa_rdm_ep_ignore_missing_host_id_file();