diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 76d1f73ba48..d59ea6e0a68 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -873,7 +873,7 @@ - + @@ -1018,7 +1018,7 @@ - + diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 3d7780e389f..a4d3197de8a 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -36,7 +36,7 @@ _efa_files = \ prov/efa/src/efa_shm.c \ prov/efa/src/efa_av.c \ prov/efa/src/efa_ah.c \ - prov/efa/src/efa_conn.c \ + prov/efa/src/rdm/efa_proto_av.c \ prov/efa/src/efa_domain.c \ prov/efa/src/efa_fabric.c \ prov/efa/src/efa_mr.c \ @@ -89,7 +89,6 @@ _efa_headers = \ prov/efa/src/efa.h \ prov/efa/src/efa_av.h \ prov/efa/src/efa_ah.h \ - prov/efa/src/efa_conn.h \ prov/efa/src/efa_mr.h \ prov/efa/src/efa_shm.h \ prov/efa/src/efa_hmem.h \ @@ -117,6 +116,7 @@ _efa_headers = \ prov/efa/src/efa_data_path_direct_internal.h \ prov/efa/src/efa_mmio.h \ prov/efa/src/rdm/efa_rdm_peer.h \ + prov/efa/src/rdm/efa_proto_av.h \ prov/efa/src/rdm/efa_rdm_cq.h \ prov/efa/src/rdm/efa_rdm_cntr.h \ prov/efa/src/rdm/efa_rdm_ep.h \ @@ -159,6 +159,7 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_domain.c \ prov/efa/test/efa_unit_test_ep.c \ prov/efa/test/efa_unit_test_av.c \ + prov/efa/test/efa_unit_test_proto_av.c \ prov/efa/test/efa_unit_test_cq.c \ prov/efa/test/efa_unit_test_cntr.c \ prov/efa/test/efa_unit_test_device.c \ diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index 7e85962cbee..b59d796cbc6 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -52,6 +52,7 @@ #include "rdm/efa_rdm_ope.h" #include "rdm/efa_rdm_pke.h" #include "rdm/efa_rdm_peer.h" +#include "rdm/efa_proto_av.h" #include "rdm/efa_rdm_util.h" #include "fi_ext_efa.h" diff --git a/prov/efa/src/efa_ah.c b/prov/efa/src/efa_ah.c index 53bf736f1fd..12d2167d835 100644 --- a/prov/efa/src/efa_ah.c +++ b/prov/efa/src/efa_ah.c @@ -5,75 +5,18 @@ #include "efa.h" #include "efa_ah.h" -#include "efa_conn.h" #include -void 
efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah); - /** - * @brief Move the AH to the end of the LRU list to indicate that it is the - * most recently used entry + * @brief Emit a detailed warning for ibv_create_ah EINVAL. * - * This function is not called in the efa_rdm_ep_get_peer so that we don't add - * extra latency to the critical path with explicit AV insertion. We use the LRU - * list to remove AH entries with only implicit AV entries, so it is OK to do - * that. + * The most common reasons for EINVAL are cross-AZ addressing, invalid + * remote GID, and invalid PD. Log both local and remote GIDs plus the + * PD pointer to help operators diagnose failures from logs alone. * - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list + * @param[in] domain efa domain (for local GID and PD) + * @param[in] gid remote GID that failed */ -void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, - struct efa_ah *ah) -{ - assert(ah->implicit_refcnt > 0 || ah->explicit_refcnt > 0); - assert(dlist_entry_in_list(&domain->ah_lru_list, - &ah->domain_lru_ah_list_entry)); - - dlist_remove(&ah->domain_lru_ah_list_entry); - dlist_insert_tail(&ah->domain_lru_ah_list_entry, - &domain->ah_lru_list); -} - -static inline int efa_ah_implicit_av_evict_ah(struct efa_domain *domain) { - struct efa_conn *conn_to_release; - struct efa_ah *ah_tmp, *ah_to_release = NULL; - struct dlist_entry *tmp; - - dlist_foreach_container (&domain->ah_lru_list, struct efa_ah, ah_tmp, - domain_lru_ah_list_entry) { - if (ah_tmp->explicit_refcnt == 0) { - ah_to_release = ah_tmp; - break; - } - } - - if (!ah_to_release) { - EFA_WARN(FI_LOG_AV, - "AH creation for implicit AV entry failed with ENOMEM " - "but no AH entries available to evict\n"); - return -FI_ENOMEM; - } - - assert(ah_to_release->implicit_refcnt > 0); - - dlist_foreach_container_safe(&ah_to_release->implicit_conn_list, - struct efa_conn, conn_to_release, - ah_implicit_conn_list_entry, 
tmp) { - - assert(conn_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL && - conn_to_release->fi_addr == FI_ADDR_NOTAVAIL); - - efa_conn_release_ah_unsafe(conn_to_release->av, conn_to_release, true); - } - - if (ah_to_release->implicit_refcnt == 0 && - ah_to_release->explicit_refcnt == 0) { - efa_ah_destroy_ah(domain, ah_to_release); - } - - return FI_SUCCESS; -} - static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t *gid) { char remote_gid_str[INET6_ADDRSTRLEN] = {0}; @@ -95,15 +38,20 @@ static void efa_ah_warn_create_einval(struct efa_domain *domain, const uint8_t * } /** - * @brief allocate an ibv_ah object from GID. - * This function use a hash map to store GID to ibv_ah map, - * and re-use ibv_ah for same GID + * @brief allocate an ibv_ah from GID, reusing existing AH if possible + * + * Uses a hash map to store GID to ibv_ah mapping and reuses ibv_ah for + * the same GID. If ibv_create_ah fails, returns NULL with errno set. + * The caller is responsible for handling ENOMEM (e.g. by evicting AH + * entries and retrying). * - * @param[in] domain efa_domain - * @param[in] gid GID + * @param[in] domain efa domain + * @param[in] gid GID + * @param[in] alloc_size size to allocate (sizeof(efa_ah) or larger for protocol wrapper) + * @return pointer to efa_ah on success, NULL on failure (errno set) */ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { struct ibv_pd *ibv_pd = domain->ibv_pd; struct efa_ah *efa_ah; @@ -111,21 +59,23 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, struct efadv_ah_attr efa_ah_attr = { 0 }; int err; + assert(alloc_size >= sizeof(struct efa_ah)); + efa_ah = NULL; ofi_genlock_lock(&domain->util_domain.lock); HASH_FIND(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah); if (efa_ah) { - insert_implicit_av ? 
efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++; - efa_ah_implicit_av_lru_ah_move(domain, efa_ah); + efa_ah->refcnt++; ofi_genlock_unlock(&domain->util_domain.lock); return efa_ah; } - efa_ah = malloc(sizeof(struct efa_ah)); + efa_ah = calloc(1, alloc_size); if (!efa_ah) { errno = FI_ENOMEM; EFA_WARN(FI_LOG_AV, "cannot allocate memory for efa_ah\n"); + ofi_genlock_unlock(&domain->util_domain.lock); return NULL; } @@ -134,39 +84,13 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, memcpy(ibv_ah_attr.grh.dgid.raw, gid, EFA_GID_LEN); efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr); if (!efa_ah->ibv_ah) { - /* If the failure is because we have too many AH entries, try to - * evict an AH entry with no explicit AV entries and try AH - * creation again */ - if (errno == FI_ENOMEM) { - EFA_INFO( - FI_LOG_AV, - "ibv_create_ah failed with ENOMEM for implicit " - "AV insertion. Attempting to evict AH entry\n"); - - err = efa_ah_implicit_av_evict_ah(domain); - if (err) - goto err_free_efa_ah; - - efa_ah->ibv_ah = ibv_create_ah(ibv_pd, &ibv_ah_attr); - if (!efa_ah->ibv_ah) { - if (errno == EINVAL) { - efa_ah_warn_create_einval(domain, gid); - } else { - EFA_WARN(FI_LOG_AV, - "ibv_create_ah failed for implicit AV " - "insertion! errno: %d\n", - errno); - } - goto err_free_efa_ah; - } - } else if (errno == EINVAL) { + if (errno == EINVAL) { efa_ah_warn_create_einval(domain, gid); - goto err_free_efa_ah; } else { EFA_WARN(FI_LOG_AV, - "ibv_create_ah failed! errno: %s\n", strerror(errno)); - goto err_free_efa_ah; + "ibv_create_ah failed! 
errno: %d\n", errno); } + goto err_free; } err = efadv_query_ah(efa_ah->ibv_ah, &efa_ah_attr, sizeof(efa_ah_attr)); @@ -176,11 +100,7 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, goto err_destroy_ibv_ah; } - dlist_init(&efa_ah->implicit_conn_list); - dlist_insert_tail(&efa_ah->domain_lru_ah_list_entry, &domain->ah_lru_list); - efa_ah->implicit_refcnt = 0; - efa_ah->explicit_refcnt = 0; - insert_implicit_av ? efa_ah->implicit_refcnt++ : efa_ah->explicit_refcnt++; + efa_ah->refcnt = 1; efa_ah->ahn = efa_ah_attr.ahn; memcpy(efa_ah->gid, gid, EFA_GID_LEN); HASH_ADD(hh, domain->ah_map, gid, EFA_GID_LEN, efa_ah); @@ -189,21 +109,27 @@ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, err_destroy_ibv_ah: ibv_destroy_ah(efa_ah->ibv_ah); -err_free_efa_ah: +err_free: free(efa_ah); ofi_genlock_unlock(&domain->util_domain.lock); return NULL; } -void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) +/** + * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free) + * + * Caller must hold util_domain.lock. 
+ * + * @param[in] domain efa domain + * @param[in] ah efa_ah to destroy + */ +void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah) { int err; - assert(ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0); - assert(dlist_empty(&ah->implicit_conn_list)); + assert(ah->refcnt == 0); EFA_INFO(FI_LOG_AV, "Destroying AH for ahn %d\n", ah->ahn); - dlist_remove(&ah->domain_lru_ah_list_entry); HASH_DEL(domain->ah_map, ah); err = ibv_destroy_ah(ah->ibv_ah); @@ -213,29 +139,20 @@ void efa_ah_destroy_ah(struct efa_domain *domain, struct efa_ah *ah) } /** - * @brief release an efa_ah object after acquiring the util domain lock + * @brief release an efa_ah, destroying it when refcount reaches zero * - * @param[in] domain efa_domain - * @param[in] ah efa_ah object pointer + * @param[in] domain efa domain + * @param[in] ah efa_ah to release */ -void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av) +void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah) { ofi_genlock_lock(&domain->util_domain.lock); -#if ENABLE_DEBUG - struct efa_ah *tmp; - HASH_FIND(hh, domain->ah_map, ah->gid, EFA_GID_LEN, tmp); - assert(tmp == ah); -#endif - assert((release_from_implicit_av && ah->implicit_refcnt > 0) || - (!release_from_implicit_av && ah->explicit_refcnt > 0)); + assert(ah->refcnt > 0); + ah->refcnt--; - release_from_implicit_av ? ah->implicit_refcnt-- : - ah->explicit_refcnt--; + if (ah->refcnt == 0) + efa_ah_destroy(domain, ah); - if (ah->implicit_refcnt == 0 && ah->explicit_refcnt == 0) { - efa_ah_destroy_ah(domain, ah); - } ofi_genlock_unlock(&domain->util_domain.lock); } diff --git a/prov/efa/src/efa_ah.h b/prov/efa/src/efa_ah.h index b04b53a0114..25a81ffac1a 100644 --- a/prov/efa/src/efa_ah.h +++ b/prov/efa/src/efa_ah.h @@ -9,31 +9,55 @@ #define EFA_GID_LEN 16 +/** + * @brief Base address handle — shared by efa-direct and protocol paths + * + * Contains only the ibv_ah, GID, AHN, refcount, and hash handle. 
+ * Protocol-specific fields (implicit_refcnt, implicit_conn_list, + * LRU list entry) are in efa_proto_ah. + * + * pahole: size: 88, cachelines: 2 (2-byte hole after ahn) + * + * TX hot path: ibv_ah (off=16) is passed to ibv post_send/read/write + * on every send. Both ibv_ah and ahn are in cacheline 0. + * All other fields are control path only (AH alloc/release/hash lookup). + */ struct efa_ah { - uint8_t gid[EFA_GID_LEN]; /* efa device GID */ - struct ibv_ah *ibv_ah; /* created by ibv_create_ah() using GID */ - uint16_t ahn; /* adress handle number */ - /* Number of explicit AV entries associated with this AH */ - int explicit_refcnt; - /* Number of implicit AV entries associated with this AH */ - int implicit_refcnt; - /* dlist of all implicit AV entries associated with this AH entry */ - struct dlist_entry implicit_conn_list; - /* dlist entry in domain's LRU AH list */ - struct dlist_entry domain_lru_ah_list_entry; - UT_hash_handle hh; /* hash map handle, link all efa_ah with efa_ep->ah_map */ + uint8_t gid[EFA_GID_LEN]; /* 0 16 */ + struct ibv_ah *ibv_ah; /* 16 8 */ + uint16_t ahn; /* 24 2 */ + /* 2-byte hole */ + int refcnt; /* 28 4 */ + UT_hash_handle hh; /* 32 56 */ }; -void efa_ah_implicit_av_lru_ah_move(struct efa_domain *domain, - struct efa_ah *ah); - +/** + * @brief allocate an ibv_ah from GID, reusing existing AH if possible + * + * @param[in] domain efa domain + * @param[in] gid GID + * @param[in] alloc_size size to allocate (sizeof(efa_ah) or sizeof(efa_proto_ah)) + * @return pointer to efa_ah on success, NULL on failure (errno set) + */ struct efa_ah *efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); - -void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); - -void efa_ah_release_unsafe(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); - -#endif \ No newline at end of file + size_t alloc_size); + +/** + * @brief release an efa_ah, 
destroying it when refcount reaches zero + * + * @param[in] domain efa domain + * @param[in] ah efa_ah to release + */ +void efa_ah_release(struct efa_domain *domain, struct efa_ah *ah); + +/** + * @brief destroy an efa_ah (remove from hash, destroy ibv_ah, free) + * + * Caller must hold util_domain.lock. + * + * @param[in] domain efa domain + * @param[in] ah efa_ah to destroy + */ +void efa_ah_destroy(struct efa_domain *domain, struct efa_ah *ah); + +#endif diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 4bb8e5b680d..a4bd828b926 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -11,59 +11,39 @@ #include "efa.h" #include "efa_av.h" -#include "rdm/efa_rdm_pke_utils.h" -static inline struct efa_conn *efa_av_addr_to_conn_impl(struct util_av *util_av, - fi_addr_t fi_addr) +/** + * @brief find efa_av_entry using fi_addr in the explicit AV + * + * @param[in] av efa av + * @param[in] fi_addr libfabric address + * @return if address is valid, return pointer to efa_av_entry + * otherwise, return NULL + */ +struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr) { struct util_av_entry *util_av_entry; - struct efa_av_entry *efa_av_entry; + struct efa_av_entry *av_entry; if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL)) return NULL; - if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr))) - util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); + if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(av->util_av.av_entry_pool, fi_addr))) + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); else return NULL; - efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - return efa_av_entry->conn.ep_addr ? 
&efa_av_entry->conn : NULL; -} - -/** - * @brief find efa_conn struct using fi_addr in the explicit AV - * - * @param[in] av efa av - * @param[in] addr fi_addr - * @return if address is valid, return pointer to efa_conn struct - * otherwise, return NULL - */ -struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr) -{ - return efa_av_addr_to_conn_impl(&av->util_av, fi_addr); + av_entry = (struct efa_av_entry *)util_av_entry->data; + return av_entry->ah ? av_entry : NULL; } /** - * @brief find efa_conn struct using fi_addr in the implicit AV - * - * @param[in] av efa av - * @param[in] addr fi_addr - * @return if address is valid, return pointer to efa_conn struct - * otherwise, return NULL - */ -struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, fi_addr_t fi_addr) -{ - return efa_av_addr_to_conn_impl(&av->util_av_implicit, fi_addr); -} - -/** - * @brief find fi_addr for efa endpoint + * @brief find fi_addr for efa endpoint (base, AHN+QPN only) * * @param[in] av address vector * @param[in] ahn address handle number * @param[in] qpn QP number - * @return On success, return fi_addr to the peer who send the packet + * @return On success, return fi_addr to the peer * If no such peer exist, return FI_ADDR_NOTAVAIL */ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) @@ -76,170 +56,30 @@ fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn) cur_key.qpn = qpn; HASH_FIND(hh, av->cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); - return (OFI_LIKELY(!!cur_entry)) ? 
cur_entry->conn->fi_addr : FI_ADDR_NOTAVAIL; -} - -static inline struct efa_conn * -efa_av_reverse_lookup_rdm_conn(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - uint16_t ahn, uint16_t qpn, - struct efa_rdm_pke *pkt_entry) -{ - uint32_t *connid; - struct efa_cur_reverse_av *cur_entry; - struct efa_prv_reverse_av *prv_entry; - struct efa_cur_reverse_av_key cur_key; - struct efa_prv_reverse_av_key prv_key; - - cur_key.ahn = ahn; - cur_key.qpn = qpn; - - HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); - - if (OFI_UNLIKELY(!cur_entry)) - return NULL; - - if (!pkt_entry) { - /** - * There is no packet entry to extract connid from when we get - * an IBV_WC_RECV_RDMA_WITH_IMM completion from rdma-core. Or - * the pkt_entry is allocated from a buffer user posted that - * doesn't expect any pkt hdr. - */ - return cur_entry->conn; - } - - connid = efa_rdm_pke_connid_ptr(pkt_entry); - if (!connid) { - EFA_WARN_ONCE(FI_LOG_EP_CTRL, - "An incoming packet does NOT have connection ID " - "in its header.\n" - "This means the peer is using an older version " - "of libfabric.\n" - "The communication can continue but it is " - "encouraged to use\n" - "a newer version of libfabric\n"); - return cur_entry->conn; - } - - if (OFI_LIKELY(*connid == cur_entry->conn->ep_addr->qkey)) - return cur_entry->conn; - - /* the packet is from a previous peer, look for its address from the - * prv_reverse_av */ - prv_key.ahn = ahn; - prv_key.qpn = qpn; - prv_key.connid = *connid; - HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry); - - return OFI_LIKELY(!!prv_entry) ? 
prv_entry->conn : NULL; -}; - -/** - * @brief find fi_addr for rdm endpoint in the explicit AV - * - * @param[in] av address vector - * @param[in] ahn address handle number - * @param[in] qpn QP number - * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid - * @return On success, return fi_addr to the peer who send the packet - * If no such peer exist, return FI_ADDR_NOTAVAIL - */ -fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, - uint16_t qpn, struct efa_rdm_pke *pkt_entry) -{ - struct efa_conn *conn; - - conn = efa_av_reverse_lookup_rdm_conn( - &av->cur_reverse_av, &av->prv_reverse_av, ahn, qpn, pkt_entry); - - if (OFI_LIKELY(!!conn)) - return conn->fi_addr; - - return FI_ADDR_NOTAVAIL; -} - -/** - * @brief find fi_addr for rdm endpoint in the implicit AV - * - * @param[in] av address vector - * @param[in] ahn address handle number - * @param[in] qpn QP number - * @param[in] pkt_entry NULL or rdm packet entry, used to extract connid - * @return On success, return fi_addr to the peer who send the packet - * If no such peer exist, return FI_ADDR_NOTAVAIL - */ -fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, - uint16_t qpn, - struct efa_rdm_pke *pkt_entry) -{ - struct efa_conn *conn; - - assert(ofi_genlock_held(&av->domain->srx_lock)); - - conn = efa_av_reverse_lookup_rdm_conn(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, ahn, - qpn, pkt_entry); - - if (OFI_LIKELY(!!conn)) { - efa_av_implicit_av_lru_conn_move(av, conn); - return conn->implicit_fi_addr; - } - - return FI_ADDR_NOTAVAIL; -} - -static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) -{ - struct efa_ep_addr all_zeros = { 0 }; - - return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); -} - -/** - * @brief Move the conn to the front of the LRU list to indicate that it is the - * most recently used entry - * - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list - */ 
-void efa_av_implicit_av_lru_conn_move(struct efa_av *av, - struct efa_conn *conn) -{ - assert(av->implicit_av_size == 0 || - HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size); - assert(dlist_entry_in_list(&av->implicit_av_lru_list, - &conn->implicit_av_lru_entry)); - - dlist_remove(&conn->implicit_av_lru_entry); - dlist_insert_tail(&conn->implicit_av_lru_entry, - &av->implicit_av_lru_list); - - efa_ah_implicit_av_lru_ah_move(av->domain, conn->ah); + return (OFI_LIKELY(!!cur_entry)) ? cur_entry->av_entry->fi_addr : FI_ADDR_NOTAVAIL; } /* - * @brief Add newly insert address to the reverse AVs + * @brief Add newly inserted address to the reverse AVs * * @param[in] av EFA AV object - * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key - * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] conn efa_conn object - * @return On success, return 0. - * Otherwise, return a negative libfabric error code + * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key + * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key + * @param[in] av_entry AV entry to add + * @return 0 on success, negative libfabric error code on failure */ int efa_av_reverse_av_add(struct efa_av *av, - struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn) + struct efa_cur_reverse_av **cur_reverse_av, + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_entry; struct efa_prv_reverse_av *prv_entry; struct efa_cur_reverse_av_key cur_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = conn->ah->ahn; - cur_key.qpn = conn->ep_addr->qpn; + cur_key.ahn = av_entry->ah->ahn; + cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; cur_entry = NULL; HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); @@ -252,16 +92,13 @@ int efa_av_reverse_av_add(struct efa_av *av, cur_entry->key.ahn = 
cur_key.ahn; cur_entry->key.qpn = cur_key.qpn; - cur_entry->conn = conn; + cur_entry->av_entry = av_entry; HASH_ADD(hh, *cur_reverse_av, key, sizeof(cur_key), cur_entry); return 0; } - /* We used a static connid for all dgram endpoints, therefore cur_entry should always be NULL, - * and only RDM endpoint can reach here. hence the following assertion - */ - assert(av->domain->info_type == EFA_INFO_RDM); + /* Only RDM endpoint can reach here (dgram uses static connid) */ prv_entry = malloc(sizeof(*prv_entry)); if (!prv_entry) { EFA_WARN(FI_LOG_AV, "Cannot allocate memory for prv_reverse_av entry\n"); @@ -270,11 +107,11 @@ int efa_av_reverse_av_add(struct efa_av *av, prv_entry->key.ahn = cur_key.ahn; prv_entry->key.qpn = cur_key.qpn; - prv_entry->key.connid = cur_entry->conn->ep_addr->qkey; - prv_entry->conn = cur_entry->conn; + prv_entry->key.connid = efa_av_entry_ep_addr(cur_entry->av_entry)->qkey; + prv_entry->av_entry = cur_entry->av_entry; HASH_ADD(hh, *prv_reverse_av, key, sizeof(prv_entry->key), prv_entry); - cur_entry->conn = conn; + cur_entry->av_entry = av_entry; return 0; } @@ -285,16 +122,13 @@ int efa_av_reverse_av_add(struct efa_av *av, * cur_reverse_av. Keeping the address in prv_reverse_av helps avoid QPN * collisions. * - * @param[in] av EFA AV object - * @param[in,out] cur_reverse_av Reverse AV with AHN and QPN as key - * @param[in,out] prv_reverse_av Reverse AV with AHN, QPN and QKEY as key - * @param[in] conn efa_conn object - * @return On success, return 0. 
- * Otherwise, return a negative libfabric error code + * @param[in,out] cur_reverse_av reverse AV with AHN and QPN as key + * @param[in,out] prv_reverse_av reverse AV with AHN, QPN and QKEY as key + * @param[in] av_entry AV entry to remove */ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn) + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry) { struct efa_cur_reverse_av *cur_reverse_av_entry; struct efa_prv_reverse_av *prv_reverse_av_entry; @@ -302,182 +136,137 @@ void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av_key prv_key; memset(&cur_key, 0, sizeof(cur_key)); - cur_key.ahn = conn->ah->ahn; - cur_key.qpn = conn->ep_addr->qpn; + cur_key.ahn = av_entry->ah->ahn; + cur_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_reverse_av_entry); - if (cur_reverse_av_entry && cur_reverse_av_entry->conn == conn) { + if (cur_reverse_av_entry && cur_reverse_av_entry->av_entry == av_entry) { HASH_DEL(*cur_reverse_av, cur_reverse_av_entry); free(cur_reverse_av_entry); } else { memset(&prv_key, 0, sizeof(prv_key)); - prv_key.ahn = conn->ah->ahn; - prv_key.qpn = conn->ep_addr->qpn; - prv_key.connid = conn->ep_addr->qkey; + prv_key.ahn = av_entry->ah->ahn; + prv_key.qpn = efa_av_entry_ep_addr(av_entry)->qpn; + prv_key.connid = efa_av_entry_ep_addr(av_entry)->qkey; HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_reverse_av_entry); assert(prv_reverse_av_entry && - prv_reverse_av_entry->conn == conn); + prv_reverse_av_entry->av_entry == av_entry); HASH_DEL(*prv_reverse_av, prv_reverse_av_entry); free(prv_reverse_av_entry); } } - -static fi_addr_t -efa_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry) -{ - struct efa_rdm_pke *pke; - - pke = (struct efa_rdm_pke *) rx_entry->peer_context; - - return pke->peer->conn->fi_addr; -} - 
-static int efa_conn_implicit_to_explicit(struct efa_av *av, - struct efa_ep_addr *raw_addr, - fi_addr_t implicit_fi_addr, - fi_addr_t *fi_addr) +/** + * @brief Initialize an efa_av_entry (base path) + * + * Caller must hold util_av.lock. + * + * @param[in] av address vector + * @param[in] raw_addr raw efa address + * @param[in] flags flags from fi_av_insert + * @param[in] context context from fi_av_insert + * @return pointer to initialized entry on success, NULL on failure + */ +static struct efa_av_entry *efa_av_entry_init(struct efa_av *av, + struct efa_ep_addr *raw_addr, + uint64_t flags, void *context) { + struct util_av_entry *util_av_entry = NULL; + struct efa_av_entry *av_entry = NULL; + fi_addr_t fi_addr; int err; - struct efa_ah *ah; - struct efa_conn *implicit_conn, *explicit_conn; - struct efa_rdm_ep *ep; - struct dlist_entry *entry; - struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry; - struct efa_conn_ep_peer_map_entry *map_entry, *tmp; - struct efa_av_entry *implicit_av_entry, *explicit_av_entry; - struct fid_peer_srx *peer_srx; - - EFA_INFO(FI_LOG_AV, - "Moving peer with implicit fi_addr %" PRIu64 - " to explicit AV\n", - implicit_fi_addr); assert(ofi_genlock_held(&av->util_av.lock)); - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - - /* Get implicit util AV entry and conn */ - implicit_util_av_entry = - ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr); - implicit_av_entry = (struct efa_av_entry *) implicit_util_av_entry->data; + if (flags & FI_SYNC_ERR) + memset(context, 0, sizeof(int)); - assert(implicit_av_entry); - assert(efa_is_same_addr( - raw_addr, (struct efa_ep_addr *) implicit_av_entry->ep_addr)); - - implicit_conn = &implicit_av_entry->conn; - assert(implicit_conn->fi_addr == FI_ADDR_NOTAVAIL && - implicit_conn->implicit_fi_addr == implicit_fi_addr); - - ah = implicit_conn->ah; - - /* Create explicit util AV entry and conn */ - err = ofi_av_insert_addr(&av->util_av, raw_addr, 
fi_addr); + err = ofi_av_insert_addr(&av->util_av, raw_addr, &fi_addr); if (err) { - EFA_WARN(FI_LOG_AV, - "ofi_av_insert_addr into explicit AV failed! Error " - "message: %s\n", + EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n", fi_strerror(err)); - return err; + return NULL; } - explicit_util_av_entry = - ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, *fi_addr); - explicit_av_entry = (struct efa_av_entry *) explicit_util_av_entry->data; - assert(efa_is_same_addr( - raw_addr, (struct efa_ep_addr *) explicit_av_entry->ep_addr)); + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, fi_addr); + av_entry = (struct efa_av_entry *)util_av_entry->data; + assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)av_entry->ep_addr)); - /* Copy information from implicit conn to explicit conn */ - explicit_conn = &explicit_av_entry->conn; - memset(explicit_conn, 0, sizeof(*explicit_conn)); - explicit_conn->ep_addr = (struct efa_ep_addr *) explicit_av_entry->ep_addr; + av_entry->fi_addr = fi_addr; assert(av->type == FI_AV_TABLE); - explicit_conn->ah = implicit_conn->ah; - explicit_conn->fi_addr = *fi_addr; - explicit_conn->shm_fi_addr = implicit_conn->shm_fi_addr; - explicit_conn->implicit_fi_addr = FI_ADDR_NOTAVAIL; - HASH_ITER(hh, implicit_conn->ep_peer_map, map_entry, tmp) { - HASH_DELETE(hh, implicit_conn->ep_peer_map, map_entry); - HASH_ADD_PTR(explicit_conn->ep_peer_map, ep_ptr, map_entry); - map_entry->peer.conn = explicit_conn; - } - assert(HASH_CNT(hh, implicit_conn->ep_peer_map) == 0); - - /* Handle reverse AV and AV ref counts */ - efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, implicit_conn); - dlist_remove(&implicit_av_entry->conn.implicit_av_lru_entry); + av_entry->ah = efa_ah_alloc(av->domain, raw_addr->raw, sizeof(struct efa_ah)); + if (!av_entry->ah) + goto err_release; - err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, - 
"ofi_av_remove_addr from implicit AV failed! Error " - "message: %s\n", - fi_strerror(err)); - return err; - } + err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av, + av_entry); + if (err) + goto err_release_ah; - av->used_implicit--; + av->used++; + return av_entry; - err = efa_av_reverse_av_add(av, &av->cur_reverse_av, &av->prv_reverse_av, - explicit_conn); +err_release_ah: + efa_ah_release(av->domain, av_entry->ah); +err_release: + av_entry->ah = NULL; + memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); + err = ofi_av_remove_addr(&av->util_av, fi_addr); if (err) - return err; + EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", + err); + return NULL; +} - av->used_explicit++; +/** + * @brief Release an efa_av_entry (base path) + * + * Caller must hold util_av.lock. + * + * @param[in] av address vector + * @param[in] av_entry entry to release + */ +static void efa_av_entry_release(struct efa_av *av, struct efa_av_entry *av_entry) +{ + char gidstr[INET6_ADDRSTRLEN]; + int err; - /* Handle AH LRU list and refcnt */ - assert(!dlist_empty(&ah->implicit_conn_list)); - dlist_remove(&implicit_conn->ah_implicit_conn_list_entry); - efa_ah_implicit_av_lru_ah_move(av->domain, ah); - ah->implicit_refcnt--; - ah->explicit_refcnt++; + assert(ofi_genlock_held(&av->util_av.lock)); - EFA_INFO(FI_LOG_AV, - "Peer with implicit fi_addr %" PRIu64 - " moved to explicit AV. 
Explicit fi_addr: %" PRIu64 "\n", - implicit_fi_addr, *fi_addr); - - /* Call foreach_unspec_addr to move unexpected messages - * from the unspecified queue to the specified queues - * - * util_ep is bound to the explicit util_av, so the explicit util_av's - * ep_list contains all of the endpoints bound to this AV */ - ofi_genlock_lock(&av->util_av.ep_list_lock); - dlist_foreach(&av->util_av.ep_list, entry) { - ep = container_of(entry, struct efa_rdm_ep, base_ep.util_ep.av_entry); - peer_srx = util_get_peer_srx(ep->peer_srx_ep); - peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_av_get_addr_from_peer_rx_entry); - } - ofi_genlock_unlock(&av->util_av.ep_list_lock); + efa_av_reverse_av_remove(&av->cur_reverse_av, &av->prv_reverse_av, av_entry); + efa_ah_release(av->domain, av_entry->ah); + + inet_ntop(AF_INET6, efa_av_entry_ep_addr(av_entry)->raw, gidstr, INET6_ADDRSTRLEN); + EFA_INFO(FI_LOG_AV, "efa_av_entry released! entry[%p] GID[%s] QP[%u]\n", + av_entry, gidstr, efa_av_entry_ep_addr(av_entry)->qpn); + + err = ofi_av_remove_addr(&av->util_av, av_entry->fi_addr); + if (err) + EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); - return FI_SUCCESS; + av_entry->ah = NULL; + memset(av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); + av->used--; } /** - * @brief insert one address into address vector (AV) + * @brief insert one address into AV (base, efa-direct path) * * @param[in] av address vector * @param[in] addr raw address, in the format of gid:qpn:qkey - * @param[out] fi_addr pointer to the output fi address. This address is used by fi_send - * @param[in] flags flags user passed to fi_av_insert. 
+ * @param[out] fi_addr pointer to the output fi address + * @param[in] flags flags user passed to fi_av_insert * @param[in] context context user passed to fi_av_insert - * @param[in] insert_shm_av whether insert address to shm av - * @param[in] insert_implicit_av whether insert address to implicit AV * @return 0 on success, a negative error code on failure */ -int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, - fi_addr_t *fi_addr, uint64_t flags, void *context, - bool insert_shm_av, bool insert_implicit_av) +static int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context) { - struct efa_conn *conn; + struct efa_av_entry *av_entry; char raw_gid_str[INET6_ADDRSTRLEN]; fi_addr_t efa_fiaddr; - fi_addr_t implicit_fi_addr; - int ret = 0; if (!efa_av_is_valid_address(addr)) { EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n"); @@ -488,94 +277,57 @@ int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, if (av->domain->info_type == EFA_INFO_DGRAM) addr->qkey = EFA_DGRAM_CONNID; - if (av->domain->info_type == EFA_INFO_RDM) - assert(ofi_genlock_held(&av->domain->srx_lock)); - ofi_genlock_lock(&av->util_av_implicit.lock); ofi_genlock_lock(&av->util_av.lock); memset(raw_gid_str, 0, sizeof(raw_gid_str)); if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) { EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno); - ret = -FI_EINVAL; *fi_addr = FI_ADDR_NOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return -FI_EINVAL; } EFA_INFO(FI_LOG_AV, - "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n", - raw_gid_str, addr->qpn, addr->qkey, - insert_implicit_av ? "implicit" : "explicit"); + "Inserting address GID[%s] QP[%u] QKEY[%u] to explicit AV ....\n", + raw_gid_str, addr->qpn, addr->qkey); - /* - * Check if this address already has been inserted, if so set *fi_addr - * to existing address, and return 0 for success. 
- */ + /* Check if already inserted */ efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->util_av, addr); if (efa_fiaddr != FI_ADDR_NOTAVAIL) { - /* We should never try to insert into the implicit AV an address - * that's already in the explicit AV */ - assert(!insert_implicit_av); - EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %ld\n", efa_fiaddr); *fi_addr = efa_fiaddr; - ret = 0; - goto out; - } - - implicit_fi_addr = - ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr); - if (implicit_fi_addr != FI_ADDR_NOTAVAIL) { - EFA_INFO(FI_LOG_AV, - "Found implicit AV entry id %ld for the same " - "address\n", - implicit_fi_addr); - - if (insert_implicit_av) { - /* Move to the end of the LRU list */ - conn = efa_av_addr_to_conn_implicit(av, - implicit_fi_addr); - efa_av_implicit_av_lru_conn_move(av, conn); - - *fi_addr = implicit_fi_addr; - goto out; - } - - ret = efa_conn_implicit_to_explicit(av, addr, implicit_fi_addr, - fi_addr); - if (ret) - *fi_addr = FI_ADDR_NOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return 0; } - conn = efa_conn_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av); - if (!conn) { + av_entry = efa_av_entry_init(av, addr, flags, context); + if (!av_entry) { *fi_addr = FI_ADDR_NOTAVAIL; - ret = -FI_EADDRNOTAVAIL; - goto out; + ofi_genlock_unlock(&av->util_av.lock); + return -FI_EADDRNOTAVAIL; } - if (insert_implicit_av) { - *fi_addr = conn->implicit_fi_addr; - EFA_INFO(FI_LOG_AV, - "Successfully inserted address GID[%s] QP[%u] " - "QKEY[%u] to implicit AV. fi_addr: %ld\n", - raw_gid_str, addr->qpn, addr->qkey, *fi_addr); - } else { - *fi_addr = conn->fi_addr; - EFA_INFO(FI_LOG_AV, - "Successfully inserted address GID[%s] QP[%u] " - "QKEY[%u] to explicit AV. fi_addr: %ld\n", - raw_gid_str, addr->qpn, addr->qkey, *fi_addr); - } - ret = 0; + *fi_addr = av_entry->fi_addr; + EFA_INFO(FI_LOG_AV, + "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. 
fi_addr: %ld\n", + raw_gid_str, addr->qpn, addr->qkey, *fi_addr); -out: ofi_genlock_unlock(&av->util_av.lock); - ofi_genlock_unlock(&av->util_av_implicit.lock); - return ret; + return 0; } -int efa_av_insert(struct fid_av *av_fid, const void *addr, +/** + * @brief insert addresses into AV (fi_av_insert implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr buffer containing one or more addresses to insert + * @param[in] count number of addresses to insert + * @param[out] fi_addr array where returned fabric addresses will be written + * @param[in] flags operation flags + * @param[in] context user context + * @return number of addresses successfully inserted + */ +static int efa_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { @@ -591,25 +343,16 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT))) return -FI_EINVAL; - /* - * Providers are allowed to ignore FI_MORE. - */ flags &= ~FI_MORE; if (flags) return -FI_ENOSYS; - /* The order in which the util AV and SRX locks are acquired must match - * in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); - for (i = 0; i < count; i++) { addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN); - ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false); + ret = efa_av_insert_one(av, addr_i, &fi_addr_res, flags, context); if (ret) { - EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! ret=%d\n", - ret); + EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! 
ret=%d\n", ret); break; } @@ -618,9 +361,6 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, success_cnt++; } - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_unlock(&av->domain->srx_lock); - /* cancel remaining request and log to event queue */ for (; i < count ; i++) { if (fi_addr) @@ -630,11 +370,20 @@ int efa_av_insert(struct fid_av *av_fid, const void *addr, return success_cnt; } +/** + * @brief retrieve an address stored in the AV (fi_av_lookup implementation) + * + * @param[in] av_fid fid of AV + * @param[in] fi_addr fabric address to look up + * @param[out] addr buffer to store the returned address + * @param[in,out] addrlen on input, size of addr buffer; on output, bytes written + * @return 0 on success, negative libfabric error code on failure + */ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, void *addr, size_t *addrlen) { struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); - struct efa_conn *conn = NULL; + struct efa_av_entry *av_entry = NULL; if (av->type != FI_AV_TABLE) return -FI_EINVAL; @@ -643,13 +392,13 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, return -FI_EINVAL; ofi_genlock_lock(&av->util_av.lock); - conn = efa_av_addr_to_conn(av, fi_addr); - if (!conn) { + av_entry = efa_av_addr_to_entry(av, fi_addr); + if (!av_entry) { ofi_genlock_unlock(&av->util_av.lock); return -FI_EINVAL; } - memcpy(addr, (void *)conn->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen)); + memcpy(addr, (void *)av_entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen)); ofi_genlock_unlock(&av->util_av.lock); if (*addrlen > EFA_EP_ADDR_LEN) *addrlen = EFA_EP_ADDR_LEN; @@ -670,16 +419,16 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, * was set to FI_ADDR_NOTAVAIL. The TX completion handler will * ignore TX packet whose address is FI_ADDR_NOTAVAIL. * - * Meanwhile, lower provider will set a packet's address to - * FI_ADDR_NOTAVAIL from it is from a removed address. 
RX completion + * Meanwhile, lower provider will set a packet's address to + * FI_ADDR_NOTAVAIL if it is from a removed address. RX completion * handler will ignore such packets. * * @param[in] av_fid fid of AV (address vector) - * @param[in] fi_addr pointer to an array of libfabric addresses - * @param[in] counter number of libfabric addresses in the array + * @param[in] fi_addr pointer to an array of libfabric addresses + * @param[in] count number of libfabric addresses in the array * @param[in] flags flags * @return 0 if all addresses have been removed successfully, - * negative libfabric error code if error was encoutnered. + * negative libfabric error code if error was encountered. */ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count, uint64_t flags) @@ -687,7 +436,7 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, int err = 0; size_t i; struct efa_av *av; - struct efa_conn *conn; + struct efa_av_entry *av_entry; if (!fi_addr) return -FI_EINVAL; @@ -696,19 +445,15 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, if (av->type != FI_AV_TABLE) return -FI_EINVAL; - /* The order in which the util AV and SRX locks are acquired must match - in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); ofi_genlock_lock(&av->util_av.lock); for (i = 0; i < count; i++) { - conn = efa_av_addr_to_conn(av, fi_addr[i]); - if (!conn) { + av_entry = efa_av_addr_to_entry(av, fi_addr[i]); + if (!av_entry) { err = -FI_EINVAL; break; } - efa_conn_release(av, conn, false); + efa_av_entry_release(av, av_entry); } if (i < count) { @@ -717,11 +462,18 @@ static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, } ofi_genlock_unlock(&av->util_av.lock); - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_unlock(&av->domain->srx_lock); return err; } +/** + * @brief convert an address into a printable 
string (fi_av_straddr implementation) + * + * @param[in] av_fid fid of AV + * @param[in] addr address to convert + * @param[out] buf buffer to store the string + * @param[in,out] len on input, size of buf; on output, bytes written + * @return pointer to buf + */ static const char *efa_av_straddr(struct fid_av *av_fid, const void *addr, char *buf, size_t *len) { @@ -738,81 +490,37 @@ static struct fi_ops_av efa_av_ops = { .straddr = efa_av_straddr }; -static void efa_av_close_reverse_av(struct efa_av *av) +/** + * @brief close an AV and release all resources (fi_close implementation) + * + * @param[in] fid fid of AV + * @return 0 on success, negative libfabric error code on failure + */ +static int efa_av_close(struct fid *fid) { + struct efa_av *av; struct efa_cur_reverse_av *cur_entry, *curtmp; struct efa_prv_reverse_av *prv_entry, *prvtmp; + int err = 0; - /* The order in which the util AV and SRX locks are acquired must match - in the AV insertion, removal and CQ read paths to prevent deadlocks */ - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_lock(&av->domain->srx_lock); + av = container_of(fid, struct efa_av, util_av.av_fid.fid); ofi_genlock_lock(&av->util_av.lock); HASH_ITER(hh, av->cur_reverse_av, cur_entry, curtmp) { - efa_conn_release(av, cur_entry->conn, false); + efa_av_entry_release(av, cur_entry->av_entry); } HASH_ITER(hh, av->prv_reverse_av, prv_entry, prvtmp) { - efa_conn_release(av, prv_entry->conn, false); + efa_av_entry_release(av, prv_entry->av_entry); } ofi_genlock_unlock(&av->util_av.lock); - ofi_genlock_lock(&av->util_av_implicit.lock); - - HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) { - efa_conn_release(av, cur_entry->conn, true); - } - - HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) { - efa_conn_release(av, prv_entry->conn, true); - } - - ofi_genlock_unlock(&av->util_av_implicit.lock); - - if (av->domain->info_type == EFA_INFO_RDM) - ofi_genlock_unlock(&av->domain->srx_lock); -} - -static int 
efa_av_close(struct fid *fid) -{ - struct efa_av *av; - int err = 0; - struct efa_ep_addr_hashable *ep_addr_hashable, *tmp; - - av = container_of(fid, struct efa_av, util_av.av_fid.fid); - - efa_av_close_reverse_av(av); - err = ofi_av_close(&av->util_av); - if (OFI_UNLIKELY(err)) { + if (OFI_UNLIKELY(err)) EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n", fi_strerror(err)); - } - - err = ofi_av_close(&av->util_av_implicit); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n", - fi_strerror(err)); - } - - if (av->domain->info_type == EFA_INFO_RDM) { - if (av->shm_rdm_av) { - err = fi_close(&av->shm_rdm_av->fid); - if (OFI_UNLIKELY(err)) { - EFA_WARN(FI_LOG_AV, - "Failed to close shm av: %s\n", - fi_strerror(err)); - } - } - } - - HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) { - HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable); - free(ep_addr_hashable); - } free(av); return err; @@ -827,37 +535,47 @@ static struct fi_ops efa_av_fi_ops = { }; /** - * @brief initialize the util_av field in efa_av + * @brief initialize a util_av * - * @param[in] util_domain util_domain which is part of efa_domain_base + * @param[in] efa_domain efa domain * @param[in] attr AV attr application passed to fi_av_open - * @param[out] util_av util_av field in efa_av - * @param[in] context contexted application passed to fi_av_open + * @param[out] util_av util_av to initialize + * @param[in] context context application passed to fi_av_open + * @param[in] context_len size of provider-specific context per AV entry * @return On success, return 0. * On failure, return a negative libfabric error code. 
*/ int efa_av_init_util_av(struct efa_domain *efa_domain, struct fi_av_attr *attr, struct util_av *util_av, - void *context) + void *context, + size_t context_len) { struct util_av_attr util_attr; util_attr.addrlen = EFA_EP_ADDR_LEN; - util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN; + util_attr.context_len = context_len; util_attr.flags = 0; return ofi_av_init(&efa_domain->util_domain, attr, &util_attr, util_av, context); } +/** + * @brief open an address vector (fi_av_open implementation for efa-direct/dgram) + * + * @param[in] domain_fid fid of domain + * @param[in] attr AV attributes + * @param[out] av_fid pointer to store the opened AV fid + * @param[in] context user context + * @return 0 on success, negative libfabric error code on failure + */ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context) { struct efa_domain *efa_domain; struct efa_av *av; - struct fi_av_attr av_attr = { 0 }; - int ret, retv; size_t universe_size; + int ret; if (!attr) return -FI_EINVAL; @@ -894,47 +612,16 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, &universe_size) == FI_SUCCESS) attr->count = MAX(attr->count, universe_size); - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context); + ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context, + sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN); if (ret) goto err; - ret = efa_av_init_util_av(efa_domain, attr, &av->util_av, context); - if (ret) - goto err_close_util_av_implicit; - - if (efa_domain->info_type == EFA_INFO_RDM && efa_domain->fabric && - efa_domain->fabric->shm_fabric) { - /* - * shm av supports maximum 256 entries - * Reset the count to 128 to reduce memory footprint and satisfy - * the need of the instances with more CPUs. 
- */ - av_attr = *attr; - if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) { - ret = -FI_ENOSYS; - EFA_WARN(FI_LOG_AV, - "The requested av size is beyond" - " shm supported maximum av size: %s\n", - fi_strerror(-ret)); - goto err_close_util_av; - } - av_attr.count = efa_env.shm_av_size; - assert(av_attr.type == FI_AV_TABLE); - ret = fi_av_open(efa_domain->shm_domain, &av_attr, - &av->shm_rdm_av, context); - if (ret) - goto err_close_util_av; - } - - EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", - attr->flags); + EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags); av->domain = efa_domain; av->type = attr->type; - av->implicit_av_size = efa_env.implicit_av_size; - av->used_implicit = 0; - av->used_explicit = 0; - av->shm_used = 0; + av->used = 0; *av_fid = &av->util_av.av_fid; (*av_fid)->fid.fclass = FI_CLASS_AV; @@ -942,22 +629,8 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, (*av_fid)->fid.ops = &efa_av_fi_ops; (*av_fid)->ops = &efa_av_ops; - dlist_init(&av->implicit_av_lru_list); - return 0; -err_close_util_av: - retv = ofi_av_close(&av->util_av); - if (retv) - EFA_WARN(FI_LOG_AV, - "Unable to close util_av: %s\n", fi_strerror(-retv)); - -err_close_util_av_implicit: - retv = ofi_av_close(&av->util_av_implicit); - if (retv) - EFA_WARN(FI_LOG_AV, - "Unable to close util_av_implicit: %s\n", fi_strerror(-retv)); - err: free(av); return ret; diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index 6cbe7b506ea..b92eebb91e6 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -5,10 +5,7 @@ #define EFA_AV_H #include -#include "rdm/efa_rdm_protocol.h" -#include "rdm/efa_rdm_peer.h" #include "efa_ah.h" -#include "efa_conn.h" #define EFA_MIN_AV_SIZE (16384) #define EFA_SHM_MAX_AV_COUNT (256) @@ -28,93 +25,128 @@ struct efa_ep_addr_hashable { #define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) -/* util_av implementation requires the first element of efa_av_entry to be - * ep_addr */ +/** + * @brief Base AV entry 
(efa-direct) + * + * pahole: + * size: 48, cachelines: 1, members: 3 + * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20) + * ah* off=32 — TX hot + * fi_addr off=40 — RX hot + */ struct efa_av_entry { - uint8_t ep_addr[EFA_EP_ADDR_LEN]; - struct efa_conn conn; + uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */ + struct efa_ah *ah; /* 32 8 */ + fi_addr_t fi_addr; /* 40 8 */ }; +/* pahole: size: 4, no holes */ struct efa_cur_reverse_av_key { uint16_t ahn; uint16_t qpn; }; +/** + * @brief Reverse AV entry keyed by (AHN, QPN) — points to current peer + * + * pahole: size: 72, cachelines: 2 (4-byte hole after key) + */ struct efa_cur_reverse_av { - struct efa_cur_reverse_av_key key; - struct efa_conn *conn; - UT_hash_handle hh; + struct efa_cur_reverse_av_key key; /* 0 4 */ + /* 4-byte hole */ + struct efa_av_entry *av_entry; /* 8 8 */ + UT_hash_handle hh; /* 16 56 */ }; +/* pahole: size: 8, no holes */ struct efa_prv_reverse_av_key { uint16_t ahn; uint16_t qpn; uint32_t connid; }; +/** + * @brief Reverse AV entry keyed by (AHN, QPN, connid) — points to previous peer + * + * pahole: size: 72, cachelines: 2 + */ struct efa_prv_reverse_av { - struct efa_prv_reverse_av_key key; - struct efa_conn *conn; - UT_hash_handle hh; + struct efa_prv_reverse_av_key key; /* 0 8 */ + struct efa_av_entry *av_entry; /* 8 8 */ + UT_hash_handle hh; /* 16 56 */ }; +/** + * @brief Base AV — contains only what efa-direct needs + * + * pahole: + * size: 320, cachelines: 5 + * domain* off=0 — cacheline 0 + * used off=8 + * type off=16 + * (4-byte hole) off=20 + * cur_reverse_av* off=24 — RX hot: reverse lookup hash head + * prv_reverse_av* off=32 — RX hot: QPN reuse fallback hash head + * util_av off=40 — 280 bytes (contains bufpool, locks, ep_list) + */ struct efa_av { - struct fid_av *shm_rdm_av; - struct efa_domain *domain; - size_t used_explicit; - size_t used_implicit; - size_t shm_used; - enum fi_av_type type; - /* cur_reverse_av is a map from (ahn + qpn) to current 
(latest) efa_conn. - * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_conns. - * cur_reverse_av is faster to search because its key size is smaller + struct efa_domain *domain; /* 0 8 */ + size_t used; /* 8 8 */ + enum fi_av_type type; /* 16 4 */ + /* 4-byte hole */ + /* cur_reverse_av is a map from (ahn + qpn) to current (latest) efa_av_entry. + * prv_reverse_av is a map from (ahn + qpn + connid) to all previous efa_av_entries. + * cur_reverse_av is faster to search because its key size is smaller. */ - struct efa_cur_reverse_av *cur_reverse_av; - struct efa_prv_reverse_av *prv_reverse_av; - struct util_av util_av; - - /* implicit AV is used when receiving messages from peers not explicity - * inserted by the application - */ - struct util_av util_av_implicit; - struct efa_cur_reverse_av *cur_reverse_av_implicit; - struct efa_prv_reverse_av *prv_reverse_av_implicit; - - size_t implicit_av_size; - struct dlist_entry implicit_av_lru_list; - struct efa_ep_addr_hashable *evicted_peers_hashset; + struct efa_cur_reverse_av *cur_reverse_av; /* 24 8 */ + struct efa_prv_reverse_av *prv_reverse_av; /* 32 8 */ + struct util_av util_av; /* 40 280 */ }; int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context); -int efa_av_insert_one(struct efa_av *av, struct efa_ep_addr *addr, - fi_addr_t *fi_addr, uint64_t flags, void *context, - bool insert_shm_av, bool insert_implicit_av); - -struct efa_conn *efa_av_addr_to_conn(struct efa_av *av, fi_addr_t fi_addr); -struct efa_conn *efa_av_addr_to_conn_implicit(struct efa_av *av, - fi_addr_t fi_addr); +int efa_av_init_util_av(struct efa_domain *efa_domain, + struct fi_av_attr *attr, + struct util_av *util_av, + void *context, + size_t context_len); -fi_addr_t efa_av_reverse_lookup_rdm(struct efa_av *av, uint16_t ahn, - uint16_t qpn, struct efa_rdm_pke *pkt_entry); - -fi_addr_t efa_av_reverse_lookup_rdm_implicit(struct efa_av *av, uint16_t ahn, - uint16_t qpn, - 
struct efa_rdm_pke *pkt_entry); +struct efa_av_entry *efa_av_addr_to_entry(struct efa_av *av, fi_addr_t fi_addr); fi_addr_t efa_av_reverse_lookup(struct efa_av *av, uint16_t ahn, uint16_t qpn); int efa_av_reverse_av_add(struct efa_av *av, struct efa_cur_reverse_av **cur_reverse_av, struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn); + struct efa_av_entry *av_entry); void efa_av_reverse_av_remove(struct efa_cur_reverse_av **cur_reverse_av, - struct efa_prv_reverse_av **prv_reverse_av, - struct efa_conn *conn); - -void efa_av_implicit_av_lru_conn_move(struct efa_av *av, - struct efa_conn *conn); - -#endif \ No newline at end of file + struct efa_prv_reverse_av **prv_reverse_av, + struct efa_av_entry *av_entry); + +/** + * @brief typed accessor for the ep_addr field of an AV entry + * + * @param[in] entry AV entry + * @return pointer to the efa_ep_addr embedded in the entry + */ +static inline struct efa_ep_addr *efa_av_entry_ep_addr(struct efa_av_entry *entry) +{ + return (struct efa_ep_addr *)entry->ep_addr; +} + +/** + * @brief check if an efa_ep_addr has a non-zero GID + * + * @param[in] addr address to check + * @return non-zero if valid, 0 if all-zeros + */ +static inline int efa_av_is_valid_address(struct efa_ep_addr *addr) +{ + struct efa_ep_addr all_zeros = { 0 }; + + return memcmp(addr->raw, all_zeros.raw, sizeof(addr->raw)); +} + +#endif diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index a429be8e26c..1b18879c72b 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -693,11 +693,11 @@ const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, siz struct efa_ep_addr *efa_base_ep_get_peer_raw_addr(struct efa_base_ep *base_ep, fi_addr_t addr) { struct efa_av *efa_av; - struct efa_conn *efa_conn; + struct efa_av_entry *av_entry; efa_av = base_ep->av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? 
efa_conn->ep_addr : NULL; + av_entry = efa_av_addr_to_entry(efa_av, addr); + return av_entry ? efa_av_entry_ep_addr(av_entry) : NULL; } /** diff --git a/prov/efa/src/efa_conn.c b/prov/efa/src/efa_conn.c deleted file mode 100644 index a58f1f6e333..00000000000 --- a/prov/efa/src/efa_conn.c +++ /dev/null @@ -1,478 +0,0 @@ - -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#include - -#include "efa.h" - -/* - * Local/remote peer detection by comparing peer GID with stored local GIDs - */ -static bool efa_is_local_peer(struct efa_av *av, const void *addr) -{ - int i; - uint8_t *raw_gid = ((struct efa_ep_addr *)addr)->raw; - -#if ENABLE_DEBUG - char raw_gid_str[INET6_ADDRSTRLEN] = { 0 }; - - if (!inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) { - EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno); - return 0; - } - EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str); -#endif - for (i = 0; i < g_efa_ibv_gid_cnt; ++i) { - if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) { - EFA_INFO(FI_LOG_AV, "The peer is local.\n"); - return 1; - } - } - - return 0; -} - -/** - * @brief Add the conn to the LRU list. 
If the list is full, evict the least - * recently used entry at the front of the LRU list and add the latest one - * - * @param[in] av efa address vector - * @param[in] conn efa conn to be added to the LRU list - */ -static inline int efa_av_implicit_av_lru_insert(struct efa_av *av, - struct efa_conn *conn) -{ - size_t cur_size; - struct efa_ep_addr_hashable *ep_addr_hashable; - struct efa_conn *conn_to_release; - - /* Implicit AV size of 0 means we allow the implicit AV to grow without - * bound */ - if (av->implicit_av_size == 0) - goto out; - - cur_size = HASH_CNT(hh, av->util_av_implicit.hash); - if (cur_size <= av->implicit_av_size) - goto out; - - assert(ofi_genlock_held(&av->domain->srx_lock)); - - dlist_pop_front(&av->implicit_av_lru_list, struct efa_conn, - conn_to_release, implicit_av_lru_entry); - EFA_INFO(FI_LOG_AV, - "Evicting AV entry for peer implicit fi_addr %" PRIu64 - " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from " - "implicit AV\n", - conn_to_release->implicit_fi_addr, conn_to_release->ah->ahn, - conn_to_release->ep_addr->qpn, conn_to_release->ep_addr->qkey); - - /* Add to hashset with list of evicted peers */ - ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable)); - if (!ep_addr_hashable) { - EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n"); - return FI_ENOMEM; - } - memcpy(ep_addr_hashable, conn->ep_addr, sizeof(struct efa_ep_addr)); - HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable); - - assert(ofi_genlock_held(&av->domain->srx_lock)); - efa_conn_release(av, conn_to_release, true); - - assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size); - -out: - dlist_insert_tail(&conn->implicit_av_lru_entry, - &av->implicit_av_lru_list); - return FI_SUCCESS; -} - -/** - * @brief Insert the address into SHM provider's AV for RDM endpoints - * - * If shm transfer is enabled and the addr comes from local peer, - * 1. 
convert addr to format 'gid_qpn', which will be set as shm's ep name later. - * 2. insert gid_qpn into shm's av - * 3. store returned fi_addr from shm into the hash table - * - * @param[in] av address vector - * @param[in] conn efa_conn object - * @return On success return 0, otherwise return a negative error code - */ -int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn) -{ - int err, ret; - char smr_name[EFA_SHM_NAME_MAX]; - size_t smr_name_len; - - - assert(av->domain->info_type == EFA_INFO_RDM); - assert(conn->ep_addr); - - if (efa_is_local_peer(av, conn->ep_addr) && av->shm_rdm_av) { - if (av->shm_used >= efa_env.shm_av_size) { - EFA_WARN(FI_LOG_AV, - "Max number of shm AV entry (%d) has been reached.\n", - efa_env.shm_av_size); - return -FI_ENOMEM; - } - - smr_name_len = EFA_SHM_NAME_MAX; - err = efa_shm_ep_name_construct(smr_name, &smr_name_len, conn->ep_addr); - if (err != FI_SUCCESS) { - EFA_WARN(FI_LOG_AV, - "efa_rdm_ep_efa_addr_to_str() failed! err=%d\n", err); - return err; - } - - /* - * The shm provider supports FI_AV_USER_ID flag. This flag - * associates a user-assigned identifier with each av entry that is - * returned with any completion entry in place of the AV's address. - * In the fi_av_insert call below, the &conn->shm_fi_addr is both an input - * and an output. conn->shm_fi_addr is passed in the function with value as - * conn->fi_addr, which is the address of peer in efa provider's av. shm - * records this value as user id in its internal hashmap for the use of cq - * write, and then overwrite conn->shm_fi_addr as the actual fi_addr in shm's - * av. The efa provider should still use conn->shm_fi_addr for transmissions - * through shm ep. 
- */ - conn->shm_fi_addr = conn->fi_addr; - ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &conn->shm_fi_addr, FI_AV_USER_ID, NULL); - if (OFI_UNLIKELY(ret != 1)) { - EFA_WARN(FI_LOG_AV, - "Failed to insert address to shm provider's av: %s\n", - fi_strerror(-ret)); - return ret; - } - - EFA_INFO(FI_LOG_AV, - "Successfully inserted %s to shm provider's av. efa_fiaddr: %ld shm_fiaddr = %ld\n", - smr_name, conn->fi_addr, conn->shm_fi_addr); - - assert(conn->shm_fi_addr < efa_env.shm_av_size); - av->shm_used++; - } - - return 0; -} - -/** - * @brief release the rdm related resources of an efa_conn object. This function - * requires the caller to take the SRX lock because this function modifies the - * peer map and destroys peers which are accessed and modified in the CQ read - * path. - * - * this function release the shm av entry and rdm peer; - * - * @param[in] av address vector - * @param[in] conn efa_conn object - * peer - */ -void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn) -{ - int err; - struct efa_conn_ep_peer_map_entry *peer_map_entry, *tmp; - - assert(av->domain->info_type == EFA_INFO_RDM); - - assert((conn->fi_addr != FI_ADDR_NOTAVAIL && - conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) || - (conn->implicit_fi_addr != FI_ADDR_NOTAVAIL && - conn->fi_addr == FI_ADDR_NOTAVAIL)); - - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) { - err = fi_av_remove(av->shm_rdm_av, &conn->shm_fi_addr, 1, 0); - if (err) { - EFA_WARN(FI_LOG_AV, - "remove address from shm av failed! 
err=%d\n", - err); - } else { - av->shm_used--; - assert(conn->shm_fi_addr < efa_env.shm_av_size); - } - } - - assert(ofi_genlock_held(&av->domain->srx_lock)); - HASH_ITER(hh, conn->ep_peer_map, peer_map_entry, tmp) { - dlist_remove(&peer_map_entry->peer.ep_peer_list_entry); - efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr); - HASH_DEL(conn->ep_peer_map, peer_map_entry); - ofi_buf_free(peer_map_entry); - } - assert(HASH_CNT(hh, conn->ep_peer_map) == 0); -} - -/** - * @brief allocate an efa_conn object - * caller of this function must obtain av->util_av.lock or av->util_av_implicit.lock - * - * @param[in] av efa address vector - * @param[in] raw_addr raw efa address - * @param[in] flags flags application passed to fi_av_insert - * @param[in] context context application passed to fi_av_insert - * @param[in] insert_shm_av whether insert address to shm av - * @param[in] insert_implicit_av whether insert address to implicit AV - * @return on success, return a pointer to an efa_conn object - * otherwise, return NULL. errno will be set to a positive error code. 
- */ -struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, - uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av) -{ - struct util_av *util_av; - struct efa_cur_reverse_av **cur_reverse_av; - struct efa_prv_reverse_av **prv_reverse_av; - struct util_av_entry *util_av_entry = NULL; - struct efa_av_entry *efa_av_entry = NULL; - struct efa_conn *conn; - fi_addr_t fi_addr; - int err; - - if (flags & FI_SYNC_ERR) - memset(context, 0, sizeof(int)); - - if (insert_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - util_av = &av->util_av_implicit; - cur_reverse_av = &av->cur_reverse_av_implicit; - prv_reverse_av = &av->prv_reverse_av_implicit; - } else { - assert(ofi_genlock_held(&av->util_av.lock)); - util_av = &av->util_av; - cur_reverse_av = &av->cur_reverse_av; - prv_reverse_av = &av->prv_reverse_av; - } - - err = ofi_av_insert_addr(util_av, raw_addr, &fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! 
Error message: %s\n", - fi_strerror(err)); - return NULL; - } - - util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, - fi_addr); - efa_av_entry = (struct efa_av_entry *)util_av_entry->data; - assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)efa_av_entry->ep_addr)); - - conn = &efa_av_entry->conn; - memset(conn, 0, sizeof(*conn)); - conn->ep_addr = (struct efa_ep_addr *)efa_av_entry->ep_addr; - assert(av->type == FI_AV_TABLE); - - conn->av = av; - - if (insert_implicit_av) { - conn->fi_addr = FI_ADDR_NOTAVAIL; - conn->implicit_fi_addr = fi_addr; - err = efa_av_implicit_av_lru_insert(av, conn); - if (err) - return NULL; - } else { - conn->fi_addr = fi_addr; - conn->implicit_fi_addr = FI_ADDR_NOTAVAIL; - } - - conn->ah = efa_ah_alloc(av->domain, raw_addr->raw, insert_implicit_av); - if (!conn->ah) - goto err_release; - - if (insert_implicit_av) - dlist_insert_tail(&conn->ah_implicit_conn_list_entry, - &conn->ah->implicit_conn_list); - - conn->shm_fi_addr = FI_ADDR_NOTAVAIL; - /* - * The efa_conn_alloc() call can be made in two situations: - * 1. application calls fi_av_insert API - * 2. efa progress engine get a message from unknown peer through efa device, - * which means peer is not local or shm is disabled for transmission. - * For situation 1, the shm av insertion should happen when the peer is local (insert_shm_av=1) - * For situation 2, the shm av insertion shouldn't happen anyway (insert_shm_av=0). 
- */ - if (av->domain->info_type == EFA_INFO_RDM && insert_shm_av) { - err = efa_conn_rdm_insert_shm_av(av, conn); - if (err) { - errno = -err; - goto err_release; - } - } - - err = efa_av_reverse_av_add(av, cur_reverse_av, prv_reverse_av, conn); - if (err) { - if (av->domain->info_type == EFA_INFO_RDM) { - /* insert_implicit_av is only true for the CQ read path - * which already has the SRX lock */ - if (insert_implicit_av) - ofi_genlock_lock(&av->domain->srx_lock); - efa_conn_rdm_deinit(av, conn); - if (insert_implicit_av) - ofi_genlock_unlock(&av->domain->srx_lock); - } - goto err_release; - } - - insert_implicit_av ? av->used_implicit++ : av->used_explicit++; - - return conn; - -err_release: - if (conn->ah) - efa_ah_release(av->domain, conn->ah, insert_implicit_av); - - conn->ep_addr = NULL; - err = ofi_av_remove_addr(util_av, fi_addr); - if (err) - EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n", - err); - - return NULL; -} - -void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - if (release_from_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, - &av->prv_reverse_av_implicit, conn); - } else { - assert(ofi_genlock_held(&av->util_av.lock)); - efa_av_reverse_av_remove(&av->cur_reverse_av, - &av->prv_reverse_av, conn); - } -} - -void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - struct util_av *util_av; - struct util_av_entry *util_av_entry; - struct efa_av_entry *efa_av_entry; - char gidstr[INET6_ADDRSTRLEN]; - fi_addr_t fi_addr; - int err; - - if (release_from_implicit_av) { - assert(ofi_genlock_held(&av->util_av_implicit.lock)); - util_av = &av->util_av_implicit; - fi_addr = conn->implicit_fi_addr; - } else { - assert(ofi_genlock_held(&av->util_av.lock)); - util_av = &av->util_av; - fi_addr = conn->fi_addr; - } - - 
util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); - assert(util_av_entry); - efa_av_entry = (struct efa_av_entry *) util_av_entry->data; - - err = ofi_av_remove_addr(util_av, fi_addr); - if (err) { - EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); - } - - inet_ntop(AF_INET6, conn->ep_addr->raw, gidstr, INET6_ADDRSTRLEN); - EFA_INFO(FI_LOG_AV, "efa_conn released! conn[%p] GID[%s] QP[%u]\n", - conn, gidstr, conn->ep_addr->qpn); - - conn->ep_addr = NULL; - memset(efa_av_entry->ep_addr, 0, EFA_EP_ADDR_LEN); -} - -/** - * @brief release an efa conn object - * Caller of this function must obtain av->util_av.lock or - * av->util_av_implicit.lock. This function obtains the SRX lock and is called - * from the AV removal path. - * - * @param[in] av address vector - * @param[in] conn efa_conn object pointer - * @param[in] release_from_implicit_av whether to release conn - * from implicit AV - * @param[in] grab_srx_lock whether to get the SRX lock before - * destroying the peer struct - */ -void efa_conn_release(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - assert(av->domain->info_type != EFA_INFO_RDM || - ofi_genlock_held(&av->domain->srx_lock)); - - efa_conn_release_reverse_av(av, conn, release_from_implicit_av); - if (av->domain->info_type == EFA_INFO_RDM) - efa_conn_rdm_deinit(av, conn); - - if (release_from_implicit_av) - dlist_remove(&conn->ah_implicit_conn_list_entry); - - efa_ah_release(av->domain, conn->ah, release_from_implicit_av); - - efa_conn_release_util_av(av, conn, release_from_implicit_av); - - release_from_implicit_av ? av->used_implicit-- : av->used_explicit--; -} - -/** - * @brief release an efa conn object - * Caller of this function must obtain av->util_av.lock or - * av->util_av_implicit.lock and the SRX lock. It also calls - * efa_ah_release_unsafe which does not acquire the util_domain lock the - * protects the AH map. 
This function is called when evicting an AH entry in the - * CQ read path which already has the SRX lock and the util_domain lock. - * - * @param[in] av address vector - * @param[in] conn efa_conn object pointer - * @param[in] release_from_implicit_av whether to release conn - * from implicit AV - * @param[in] grab_srx_lock whether to get the SRX lock before - * destroying the peer struct - */ -void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av) -{ - assert(av->domain->info_type != EFA_INFO_RDM || - ofi_genlock_held(&av->domain->srx_lock)); - - assert(ofi_genlock_held(&av->domain->util_domain.lock)); - - efa_conn_release_reverse_av(av, conn, release_from_implicit_av); - if (av->domain->info_type == EFA_INFO_RDM) - efa_conn_rdm_deinit(av, conn); - - if (release_from_implicit_av) - dlist_remove(&conn->ah_implicit_conn_list_entry); - - efa_conn_release_util_av(av, conn, release_from_implicit_av); - - release_from_implicit_av ? conn->ah->implicit_refcnt-- : - conn->ah->explicit_refcnt--; - release_from_implicit_av ? av->used_implicit-- : av->used_explicit--; -} - -void efa_conn_ep_peer_map_insert(struct efa_conn *conn, struct efa_conn_ep_peer_map_entry *map_entry) -{ - HASH_ADD_PTR(conn->ep_peer_map, ep_ptr, map_entry); -} - -struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn, - struct efa_rdm_ep *ep) -{ - struct efa_conn_ep_peer_map_entry *map_entry; - - HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry); - - return map_entry ? 
&map_entry->peer : NULL; -} - -void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep) -{ - struct efa_conn_ep_peer_map_entry *map_entry; - - HASH_FIND_PTR(conn->ep_peer_map, &ep, map_entry); - assert(map_entry); - HASH_DELETE(hh, conn->ep_peer_map, map_entry); - ofi_buf_free(map_entry); -} diff --git a/prov/efa/src/efa_conn.h b/prov/efa/src/efa_conn.h deleted file mode 100644 index bafa293da5f..00000000000 --- a/prov/efa/src/efa_conn.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ -/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ - -#ifndef EFA_CONN_H -#define EFA_CONN_H - -#include "ofi_util.h" -#include "rdm/efa_rdm_peer.h" - -struct efa_conn { - struct efa_ah *ah; - struct efa_ep_addr *ep_addr; - struct efa_av *av; - fi_addr_t implicit_fi_addr; - fi_addr_t fi_addr; - fi_addr_t shm_fi_addr; - struct dlist_entry implicit_av_lru_entry; - struct dlist_entry ah_implicit_conn_list_entry; - struct efa_conn_ep_peer_map_entry *ep_peer_map; -}; - -struct efa_conn_ep_peer_map_entry { - struct efa_rdm_ep *ep_ptr; - struct efa_rdm_peer peer; - UT_hash_handle hh; -}; - -void efa_conn_ep_peer_map_insert(struct efa_conn *conn, - struct efa_conn_ep_peer_map_entry *map_entry); - -struct efa_rdm_peer *efa_conn_ep_peer_map_lookup(struct efa_conn *conn, - struct efa_rdm_ep *ep); - -void efa_conn_ep_peer_map_remove(struct efa_conn *conn, struct efa_rdm_ep *ep); - -int efa_conn_rdm_insert_shm_av(struct efa_av *av, struct efa_conn *conn); - -void efa_conn_rdm_deinit(struct efa_av *av, struct efa_conn *conn); - -struct efa_conn *efa_conn_alloc(struct efa_av *av, struct efa_ep_addr *raw_addr, - uint64_t flags, void *context, bool insert_shm_av, bool insert_implicit_av); - -void efa_conn_release_reverse_av(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -void efa_conn_release_util_av(struct efa_av *av, struct efa_conn *conn, - bool 
release_from_implicit_av); - -void efa_conn_release(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -void efa_conn_release_ah_unsafe(struct efa_av *av, struct efa_conn *conn, - bool release_from_implicit_av); - -#endif \ No newline at end of file diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 3eab28231d4..b734b199c1c 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -8,6 +8,7 @@ #include "config.h" #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_cntr.h" #include "rdm/efa_rdm_cntr.h" #include "rdm/efa_rdm_cq.h" @@ -46,7 +47,7 @@ static struct fi_ops_domain efa_domain_ops = { static struct fi_ops_domain efa_domain_ops_rdm = { .size = sizeof(struct fi_ops_domain), - .av_open = efa_av_open, + .av_open = efa_proto_av_open, .cq_open = efa_rdm_cq_open, .endpoint = efa_rdm_ep_open, .scalable_ep = fi_no_scalable_ep, @@ -496,14 +497,14 @@ static int efa_domain_query_addr(struct fid_ep *ep_fid, fi_addr_t addr, uint32_t *remote_qkey) { struct efa_base_ep *base_ep = container_of(ep_fid, struct efa_base_ep, util_ep.ep_fid); - struct efa_conn *conn = efa_av_addr_to_conn(base_ep->av, addr); - if (!conn || !conn->ah || !conn->ep_addr) { + struct efa_av_entry *av_entry = efa_av_addr_to_entry(base_ep->av, addr); + if (!av_entry || !av_entry->ah || !efa_av_entry_ep_addr(av_entry)) { EFA_WARN(FI_LOG_EP_CTRL, "Failed to find connection for addr %lu\n", addr); return -FI_EINVAL; } - *ahn = conn->ah->ahn; - *remote_qpn = conn->ep_addr->qpn; - *remote_qkey = conn->ep_addr->qkey; + *ahn = av_entry->ah->ahn; + *remote_qpn = efa_av_entry_ep_addr(av_entry)->qpn; + *remote_qkey = efa_av_entry_ep_addr(av_entry)->qkey; return FI_SUCCESS; } @@ -824,8 +825,8 @@ void efa_domain_progress_rdm_peers_and_queues(struct efa_domain *domain) EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer fi_addr: " "%ld implicit fi_addr: %ld. 
%s\n", - peer->conn->fi_addr, - peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, + peer->av_entry->implicit_fi_addr, fi_strerror(-ret)); efa_base_ep_write_eq_error(&peer->ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE); continue; diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index b6a7e83b864..ab3935ab6f6 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -206,7 +206,7 @@ static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi_msg *msg, uint64_t flags) { struct efa_qp *qp = base_ep->qp; - struct efa_conn *conn; + struct efa_av_entry *av_entry; struct ibv_sge sg_list[2]; /* efa device support up to 2 iov */ struct ibv_data_buf inline_data_list[2]; struct efa_context *efa_ctx; @@ -227,8 +227,8 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi dump_msg(msg, "send"); - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); assert(msg->iov_count <= base_ep->info->tx_attr->iov_limit); @@ -330,7 +330,7 @@ static inline ssize_t efa_post_send(struct efa_base_ep *base_ep, const struct fi /* Use consolidated send function */ ret = efa_qp_post_send(qp, sg_list, inline_data_list, iov_count, use_inline, wr_id, msg->data, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(ret)) ret = (ret == ENOMEM) ? 
-FI_EAGAIN : -ret; diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c index cf136e623b5..cb0f3283697 100644 --- a/prov/efa/src/efa_rma.c +++ b/prov/efa/src/efa_rma.c @@ -38,7 +38,7 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, { struct efa_domain *domain = base_ep->domain; struct efa_mr *efa_mr; - struct efa_conn *conn; + struct efa_av_entry *av_entry; size_t iov_count = msg->iov_count; struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */ uintptr_t wr_id; @@ -102,15 +102,15 @@ static inline ssize_t efa_rma_post_read(struct efa_base_ep *base_ep, } } - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); /* Use consolidated RDMA read function */ /* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */ err = efa_qp_post_read(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; @@ -197,7 +197,7 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, uint64_t flags) { struct efa_domain *domain = base_ep->domain; - struct efa_conn *conn; + struct efa_av_entry *av_entry; size_t iov_count = msg->iov_count; struct ibv_sge sge_list[2]; /* efa device support up to 2 iov */ uintptr_t wr_id; @@ -258,14 +258,14 @@ static inline ssize_t efa_rma_post_write(struct efa_base_ep *base_ep, } } - conn = efa_av_addr_to_conn(base_ep->av, msg->addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, msg->addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); /* Use consolidated RDMA write function */ err = efa_qp_post_write(base_ep->qp, sge_list, iov_count, msg->rma_iov[0].key, msg->rma_iov[0].addr, wr_id, msg->data, flags, - conn->ah, conn->ep_addr->qpn, conn->ep_addr->qkey); + av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; @@ -365,7 +365,7 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, struct efa_base_ep *base_ep; struct efa_domain *domain; struct ibv_sge sge; - struct efa_conn *conn; + struct efa_av_entry *av_entry; uintptr_t wr_id; int err; @@ -387,12 +387,12 @@ ssize_t efa_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, sge.length = 0; sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, 0, 0, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); + wr_id, 0, 0, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, + efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? 
-FI_EAGAIN : -err; @@ -406,7 +406,7 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size { struct efa_base_ep *base_ep; struct efa_domain *domain; - struct efa_conn *conn; + struct efa_av_entry *av_entry; struct ibv_sge sge; uintptr_t wr_id; int err; @@ -429,12 +429,12 @@ static ssize_t efa_rma_inject_writedata(struct fid_ep *ep, const void *buf, size sge.length = 0; sge.lkey = domain->zero_byte_bounce_buf_mr->ibv_mr->lkey; - conn = efa_av_addr_to_conn(base_ep->av, dest_addr); - assert(conn && conn->ep_addr); + av_entry = efa_av_addr_to_entry(base_ep->av, dest_addr); + assert(av_entry && efa_av_entry_ep_addr(av_entry)); err = efa_qp_post_write(base_ep->qp, &sge, 1, key, addr, - wr_id, data, IBV_SEND_INLINE, conn->ah, conn->ep_addr->qpn, - conn->ep_addr->qkey); + wr_id, data, IBV_SEND_INLINE, av_entry->ah, efa_av_entry_ep_addr(av_entry)->qpn, + efa_av_entry_ep_addr(av_entry)->qkey); if (OFI_UNLIKELY(err)) err = (err == ENOMEM) ? -FI_EAGAIN : -err; diff --git a/prov/efa/src/rdm/efa_proto_av.c b/prov/efa/src/rdm/efa_proto_av.c new file mode 100644 index 00000000000..8b84f315bff --- /dev/null +++ b/prov/efa/src/rdm/efa_proto_av.c @@ -0,0 +1,1590 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + +#include +#include + +#include "efa.h" +#include "efa_av.h" +#include "rdm/efa_proto_av.h" +#include "rdm/efa_rdm_pke_utils.h" + +/* + * efa_av_entry and efa_proto_av_entry share the same cache-line-0 layout + * (ep_addr, ah) so reverse_av entries and util_av contexts work across + * both. Break loudly if anyone ever reorders either struct. 
+ */
+_Static_assert(offsetof(struct efa_proto_av_entry, ep_addr) ==
+	       offsetof(struct efa_av_entry, ep_addr),
+	       "efa_av_entry and efa_proto_av_entry must share ep_addr offset");
+_Static_assert(offsetof(struct efa_proto_av_entry, ah) ==
+	       offsetof(struct efa_av_entry, ah),
+	       "efa_av_entry and efa_proto_av_entry must share ah offset");
+
+/**
+ * @brief Local/remote peer detection by comparing peer GID with stored local GIDs
+ *
+ * @param[in]	av	efa AV
+ * @param[in]	addr	peer address to check
+ * @return true if local, false otherwise
+ */
+static bool efa_is_local_peer(struct efa_av *av, const void *addr)
+{
+	int i;
+	const uint8_t *raw_gid = ((const struct efa_ep_addr *)addr)->raw;
+
+#if ENABLE_DEBUG /* logging only: must not change the result vs. non-debug builds */
+	char raw_gid_str[INET6_ADDRSTRLEN] = { 0 };
+
+	if (inet_ntop(AF_INET6, raw_gid, raw_gid_str, INET6_ADDRSTRLEN)) {
+		EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", raw_gid_str);
+	} else {
+		EFA_WARN(FI_LOG_AV, "Failed to convert peer's GID to string, errno: %d\n", errno);
+	}
+#endif
+	for (i = 0; i < g_efa_ibv_gid_cnt; ++i) {
+		if (!memcmp(raw_gid, g_efa_ibv_gid_list[i].raw, EFA_GID_LEN)) {
+			EFA_INFO(FI_LOG_AV, "The peer is local.\n");
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* Forward declaration for static helper defined after entry release */
+static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah);
+
+/* ---- Address lookup ---- */
+
+/**
+ * @brief find proto AV entry using fi_addr in the given util_av
+ *
+ * @param[in]	util_av	util AV to search
+ * @param[in]	fi_addr	fabric address to look up
+ * @return pointer to entry if valid, NULL otherwise
+ */
+static inline struct efa_proto_av_entry *
+efa_proto_av_addr_to_entry_impl(struct util_av *util_av, fi_addr_t fi_addr)
+{
+	struct util_av_entry *util_av_entry;
+	struct efa_proto_av_entry *entry;
+
+	if (OFI_UNLIKELY(fi_addr == FI_ADDR_UNSPEC || fi_addr == FI_ADDR_NOTAVAIL))
+		return NULL;
+
+	if (OFI_LIKELY(ofi_bufpool_ibuf_is_valid(util_av->av_entry_pool, fi_addr)))
+		util_av_entry = 
ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr); + else + return NULL; + + entry = (struct efa_proto_av_entry *)util_av_entry->data; + return entry->ah ? entry : NULL; +} + +/** + * @brief find proto AV entry using fi_addr in the explicit AV + * + * @param[in] av protocol AV + * @param[in] fi_addr fabric address + * @return pointer to entry if valid, NULL otherwise + */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av, + fi_addr_t fi_addr) +{ + return efa_proto_av_addr_to_entry_impl(&av->efa_av.util_av, fi_addr); +} + +/** + * @brief find proto AV entry using fi_addr in the implicit AV + * + * @param[in] av protocol AV + * @param[in] fi_addr fabric address + * @return pointer to entry if valid, NULL otherwise + */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit( + struct efa_proto_av *av, fi_addr_t fi_addr) +{ + return efa_proto_av_addr_to_entry_impl(&av->util_av_implicit, fi_addr); +} + +/* ---- Peer map operations ---- */ + +/** + * @brief insert an entry into the peer map for a given AV entry + * + * @param[in] entry proto AV entry + * @param[in] map_entry peer map entry to insert + */ +void efa_proto_av_entry_ep_peer_map_insert( + struct efa_proto_av_entry *entry, + struct efa_proto_av_entry_ep_peer_map_entry *map_entry) +{ + HASH_ADD_PTR(entry->ep_peer_map, ep_ptr, map_entry); +} + +/** + * @brief look up a peer in the peer map for a given AV entry and endpoint + * + * @param[in] entry proto AV entry + * @param[in] ep RDM endpoint + * @return pointer to peer if found, NULL otherwise + */ +struct efa_rdm_peer *efa_proto_av_entry_ep_peer_map_lookup( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep) +{ + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; + + HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry); + return map_entry ? 
&map_entry->peer : NULL;
+}
+
+/**
+ * @brief remove an endpoint's peer from the peer map for a given AV entry
+ *
+ * @param[in]	entry	proto AV entry
+ * @param[in]	ep	RDM endpoint whose peer to remove
+ */
+void efa_proto_av_entry_ep_peer_map_remove(
+	struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep)
+{
+	struct efa_proto_av_entry_ep_peer_map_entry *map_entry;
+
+	HASH_FIND_PTR(entry->ep_peer_map, &ep, map_entry);
+	assert(map_entry);
+	HASH_DELETE(hh, entry->ep_peer_map, map_entry);
+	ofi_buf_free(map_entry);
+}
+
+/* ---- SHM AV operations ---- */
+
+/**
+ * @brief Insert the address into SHM provider's AV
+ *
+ * If shm transfer is enabled and the addr comes from local peer,
+ * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later.
+ * 2. insert gid_qpn into shm's av
+ * 3. store the fi_addr returned by shm in entry->shm_fi_addr
+ *
+ * @param[in]	av	protocol address vector
+ * @param[in]	entry	proto av entry
+ * @return 0 on success (also when shm insertion is skipped), negative error code otherwise
+ */
+int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av,
+				     struct efa_proto_av_entry *entry)
+{
+	int err, ret;
+	char smr_name[EFA_SHM_NAME_MAX];
+	size_t smr_name_len;
+	struct efa_ep_addr *ep_addr = efa_proto_av_entry_ep_addr(entry);
+
+	assert(ep_addr);
+
+	if (efa_is_local_peer(&av->efa_av, ep_addr) && av->shm_rdm_av) {
+		if (av->shm_used >= efa_env.shm_av_size) {
+			EFA_WARN(FI_LOG_AV,
+				 "Max number of shm AV entry (%d) has been reached.\n",
+				 efa_env.shm_av_size);
+			return -FI_ENOMEM;
+		}
+
+		smr_name_len = EFA_SHM_NAME_MAX;
+		err = efa_shm_ep_name_construct(smr_name, &smr_name_len, ep_addr);
+		if (err != FI_SUCCESS) {
+			EFA_WARN(FI_LOG_AV,
+				 "efa_shm_ep_name_construct() failed! err=%d\n", err);
+			return err;
+		}
+
+		/*
+		 * The shm provider supports FI_AV_USER_ID flag. This flag
+		 * associates a user-assigned identifier with each av entry that
+		 * is returned with any completion entry in place of the AV's
+		 * address. Below, &entry->shm_fi_addr is both input and output.
+		 * It is passed in with value entry->fi_addr (the efa provider's
+		 * fi_addr). shm records this as user id for cq write, then
+		 * overwrites shm_fi_addr with the actual fi_addr in shm's av.
+		 * The efa provider uses shm_fi_addr for transmissions through
+		 * the shm ep.
+		 */
+		entry->shm_fi_addr = entry->fi_addr;
+		ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &entry->shm_fi_addr, FI_AV_USER_ID, NULL);
+		if (OFI_UNLIKELY(ret != 1)) {
+			EFA_WARN(FI_LOG_AV,
+				 "Failed to insert address to shm provider's av: %s\n",
+				 fi_strerror(ret < 0 ? -ret : FI_EADDRNOTAVAIL));
+			entry->shm_fi_addr = FI_ADDR_NOTAVAIL;
+			return ret < 0 ? ret : -FI_EADDRNOTAVAIL; /* 0 inserted is a failure too */
+		}
+
+		EFA_INFO(FI_LOG_AV,
+			 "Successfully inserted %s to shm provider's av. efa_fiaddr: %" PRIu64 " shm_fiaddr = %" PRIu64 "\n",
+			 smr_name, entry->fi_addr, entry->shm_fi_addr);
+
+		assert(entry->shm_fi_addr < efa_env.shm_av_size);
+		av->shm_used++;
+	}
+
+	return 0;
+}
+
+/**
+ * @brief Release the protocol-specific resources of an AV entry.
+ *
+ * Releases the shm av entry and destroys rdm peers. Caller must hold
+ * the SRX lock because this function modifies the peer map and destroys
+ * peers which are accessed and modified in the CQ read path.
+ *
+ * @param[in]	av	protocol address vector
+ * @param[in]	entry	proto av entry
+ */
+void efa_proto_av_entry_deinit(struct efa_proto_av *av,
+			       struct efa_proto_av_entry *entry)
+{
+	int err;
+	struct efa_proto_av_entry_ep_peer_map_entry *peer_map_entry, *tmp;
+
+	assert((entry->fi_addr != FI_ADDR_NOTAVAIL &&
+		entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) ||
+	       (entry->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
+		entry->fi_addr == FI_ADDR_NOTAVAIL));
+
+	if (entry->shm_fi_addr != FI_ADDR_NOTAVAIL && av->shm_rdm_av) {
+		err = fi_av_remove(av->shm_rdm_av, &entry->shm_fi_addr, 1, 0);
+		if (err) {
+			EFA_WARN(FI_LOG_AV,
+				 "remove address from shm av failed! 
err=%d\n", + err); + } else { + av->shm_used--; + assert(entry->shm_fi_addr < efa_env.shm_av_size); + } + } + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + HASH_ITER(hh, entry->ep_peer_map, peer_map_entry, tmp) { + dlist_remove(&peer_map_entry->peer.ep_peer_list_entry); + efa_rdm_peer_destruct(&peer_map_entry->peer, peer_map_entry->ep_ptr); + HASH_DEL(entry->ep_peer_map, peer_map_entry); + ofi_buf_free(peer_map_entry); + } + assert(HASH_CNT(hh, entry->ep_peer_map) == 0); +} + +/* ---- Implicit AV LRU ---- */ + +/** + * @brief Add entry to the LRU list. If the list is full, evict the least + * recently used entry at the front of the LRU list and add the latest one. + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to be added to the LRU list + */ +static inline int efa_proto_av_implicit_av_lru_insert(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + size_t cur_size; + struct efa_ep_addr_hashable *ep_addr_hashable; + struct efa_proto_av_entry *entry_to_release; + + if (av->implicit_av_size == 0) + goto out; + + cur_size = HASH_CNT(hh, av->util_av_implicit.hash); + if (cur_size <= av->implicit_av_size) + goto out; + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + + dlist_pop_front(&av->implicit_av_lru_list, struct efa_proto_av_entry, + entry_to_release, implicit_av_lru_entry); + /* + * dlist_pop_front leaves entry_to_release's dlist_entry pointing at its + * old neighbors. Re-init so that efa_proto_av_entry_release's call to + * dlist_remove is a no-op on the already-popped node and does not + * disturb the surrounding list. 
+ */ + dlist_init(&entry_to_release->implicit_av_lru_entry); + + EFA_INFO(FI_LOG_AV, + "Evicting AV entry for peer implicit fi_addr %" PRIu64 + " AHN %" PRIu16 " QPN %" PRIu16 " QKEY %" PRIu32 " from " + "implicit AV\n", + entry_to_release->implicit_fi_addr, entry_to_release->ah->ahn, + efa_proto_av_entry_ep_addr(entry_to_release)->qpn, + efa_proto_av_entry_ep_addr(entry_to_release)->qkey); + + ep_addr_hashable = malloc(sizeof(struct efa_ep_addr_hashable)); + if (!ep_addr_hashable) { + EFA_WARN(FI_LOG_AV, "Could not allocate memory for LRU AV entry hashset entry\n"); + /* Re-insert the victim at the head so it remains tracked in the LRU. */ + dlist_insert_head(&entry_to_release->implicit_av_lru_entry, + &av->implicit_av_lru_list); + return -FI_ENOMEM; + } + memcpy(ep_addr_hashable, entry_to_release->ep_addr, sizeof(struct efa_ep_addr)); + HASH_ADD(hh, av->evicted_peers_hashset, addr, sizeof(struct efa_ep_addr), ep_addr_hashable); + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + efa_proto_av_entry_release(av, entry_to_release, true); + + assert(HASH_CNT(hh, av->util_av_implicit.hash) == av->implicit_av_size); + +out: + dlist_insert_tail(&entry->implicit_av_lru_entry, + &av->implicit_av_lru_list); + return FI_SUCCESS; +} + +/** + * @brief Move entry to the end of the LRU list (most recently used) + * + * @param[in] av protocol address vector + * @param[in] entry proto av entry to move + */ +void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av, + struct efa_proto_av_entry *entry) +{ + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + assert(av->implicit_av_size == 0 || + HASH_CNT(hh, av->util_av_implicit.hash) <= av->implicit_av_size); + assert(dlist_entry_in_list(&av->implicit_av_lru_list, + &entry->implicit_av_lru_entry)); + + dlist_remove(&entry->implicit_av_lru_entry); + dlist_insert_tail(&entry->implicit_av_lru_entry, + &av->implicit_av_lru_list); + + efa_proto_ah_lru_move(av->efa_av.domain, entry->ah); +} + +/* ---- Reverse 
lookup (protocol, connid-aware) ---- */ + +/** + * @brief reverse lookup a proto AV entry by AHN, QPN, and optional connid + * + * @param[in] cur_reverse_av current reverse AV hash table + * @param[in] prv_reverse_av previous reverse AV hash table + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or packet entry to extract connid from + * @return pointer to entry if found, NULL otherwise + */ +static inline struct efa_proto_av_entry * +efa_proto_av_reverse_lookup_entry(struct efa_cur_reverse_av **cur_reverse_av, + struct efa_prv_reverse_av **prv_reverse_av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + uint32_t *connid; + struct efa_cur_reverse_av *cur_entry; + struct efa_prv_reverse_av *prv_entry; + struct efa_cur_reverse_av_key cur_key; + struct efa_prv_reverse_av_key prv_key; + + cur_key.ahn = ahn; + cur_key.qpn = qpn; + + HASH_FIND(hh, *cur_reverse_av, &cur_key, sizeof(cur_key), cur_entry); + + if (OFI_UNLIKELY(!cur_entry)) + return NULL; + + /* + * Cast is safe: in protocol path, av_entry points to the ep_addr field + * of a efa_proto_av_entry which has the same layout prefix. + */ + if (!pkt_entry) { + return (struct efa_proto_av_entry *)cur_entry->av_entry; + } + + connid = efa_rdm_pke_connid_ptr(pkt_entry); + if (!connid) { + EFA_WARN_ONCE(FI_LOG_EP_CTRL, + "An incoming packet does NOT have connection ID " + "in its header.\n" + "This means the peer is using an older version " + "of libfabric.\n" + "The communication can continue but it is " + "encouraged to use\n" + "a newer version of libfabric\n"); + return (struct efa_proto_av_entry *)cur_entry->av_entry; + } + + if (OFI_LIKELY(*connid == efa_av_entry_ep_addr(cur_entry->av_entry)->qkey)) + return (struct efa_proto_av_entry *)cur_entry->av_entry; + + prv_key.ahn = ahn; + prv_key.qpn = qpn; + prv_key.connid = *connid; + HASH_FIND(hh, *prv_reverse_av, &prv_key, sizeof(prv_key), prv_entry); + + return OFI_LIKELY(!!prv_entry) ? 
(struct efa_proto_av_entry *)prv_entry->av_entry : NULL; +} + +/** + * @brief find fi_addr for RDM endpoint in the explicit AV (connid-aware) + * + * @param[in] av protocol AV + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid + * @return fi_addr on success, FI_ADDR_NOTAVAIL if not found + */ +fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + struct efa_proto_av_entry *entry; + + entry = efa_proto_av_reverse_lookup_entry( + &av->efa_av.cur_reverse_av, &av->efa_av.prv_reverse_av, + ahn, qpn, pkt_entry); + + if (OFI_LIKELY(!!entry)) + return entry->fi_addr; + + return FI_ADDR_NOTAVAIL; +} + +/** + * @brief find fi_addr for RDM endpoint in the implicit AV (connid-aware) + * + * Caller must hold srx_lock. Updates LRU list on hit. + * + * @param[in] av protocol AV + * @param[in] ahn address handle number + * @param[in] qpn QP number + * @param[in] pkt_entry NULL or RDM packet entry, used to extract connid + * @return implicit fi_addr on success, FI_ADDR_NOTAVAIL if not found + */ +fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry) +{ + struct efa_proto_av_entry *entry; + + assert(ofi_genlock_held(&av->efa_av.domain->srx_lock)); + + entry = efa_proto_av_reverse_lookup_entry( + &av->cur_reverse_av_implicit, &av->prv_reverse_av_implicit, + ahn, qpn, pkt_entry); + + if (OFI_LIKELY(!!entry)) { + efa_proto_av_implicit_av_lru_entry_move(av, entry); + return entry->implicit_fi_addr; + } + + return FI_ADDR_NOTAVAIL; +} + +/* ---- Entry release helpers ---- */ + +/** + * @brief remove entry from the appropriate reverse AV hash tables + * + * @param[in] av protocol AV + * @param[in] entry entry to remove + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +static void 
efa_proto_av_entry_release_reverse_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + if (release_from_implicit_av) { + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + efa_av_reverse_av_remove(&av->cur_reverse_av_implicit, + &av->prv_reverse_av_implicit, + (struct efa_av_entry *)entry); + } else { + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + efa_av_reverse_av_remove(&av->efa_av.cur_reverse_av, + &av->efa_av.prv_reverse_av, + (struct efa_av_entry *)entry); + } +} + +/** + * @brief remove entry from the appropriate util_av and clear its fields + * + * @param[in] av protocol AV + * @param[in] entry entry to remove + * @param[in] release_from_implicit_av whether entry is in implicit AV + */ +static void efa_proto_av_entry_release_util_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av) +{ + struct util_av *util_av; + char gidstr[INET6_ADDRSTRLEN]; + fi_addr_t fi_addr; + int err; + + if (release_from_implicit_av) { + assert(ofi_genlock_held(&av->util_av_implicit.lock)); + util_av = &av->util_av_implicit; + fi_addr = entry->implicit_fi_addr; + } else { + assert(ofi_genlock_held(&av->efa_av.util_av.lock)); + util_av = &av->efa_av.util_av; + fi_addr = entry->fi_addr; + } + + err = ofi_av_remove_addr(util_av, fi_addr); + if (err) + EFA_WARN(FI_LOG_AV, "ofi_av_remove_addr failed! err=%d\n", err); + + inet_ntop(AF_INET6, efa_proto_av_entry_ep_addr(entry)->raw, gidstr, INET6_ADDRSTRLEN); + EFA_INFO(FI_LOG_AV, "efa_proto_av_entry released! entry[%p] GID[%s] QP[%u]\n", + entry, gidstr, efa_proto_av_entry_ep_addr(entry)->qpn); + + entry->ah = NULL; + memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN); +} + +/** + * @brief Release a proto AV entry. + * + * Caller must hold srx_lock. Acquires util_domain.lock internally + * via efa_ah_release. Called from the AV removal path. 
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to release
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+void efa_proto_av_entry_release(struct efa_proto_av *av,
+				struct efa_proto_av_entry *entry,
+				bool release_from_implicit_av)
+{
+	assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+
+	/* Drop the reverse-AV mapping first, then deinit the entry. */
+	efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av);
+	efa_proto_av_entry_deinit(av, entry);
+
+	/* Implicit entries also sit on the AH connection list and the AV LRU. */
+	if (release_from_implicit_av) {
+		dlist_remove(&entry->ah_implicit_conn_list_entry);
+		dlist_remove(&entry->implicit_av_lru_entry);
+	}
+
+	/* The AH reference is dropped before release_util_av clears entry->ah. */
+	efa_proto_ah_release(av->efa_av.domain, entry->ah, release_from_implicit_av);
+	efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av);
+
+	if (release_from_implicit_av)
+		av->used_implicit--;
+	else
+		av->efa_av.used--;
+}
+
+/**
+ * @brief Release a proto AV entry when util_domain.lock is already held.
+ *
+ * Variant of efa_proto_av_entry_release for the AH eviction path: the
+ * AH refcounts are decremented in place instead of going through
+ * efa_proto_ah_release, so util_domain.lock is not acquired here.
+ * Caller must hold srx_lock AND util_domain.lock (both asserted).
+ *
+ * @param[in] av protocol address vector
+ * @param[in] entry proto av entry to release
+ * @param[in] release_from_implicit_av whether entry is in implicit AV
+ */
+void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av,
+					  struct efa_proto_av_entry *entry,
+					  bool release_from_implicit_av)
+{
+	struct efa_proto_ah *proto_ah;
+
+	assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+	assert(ofi_genlock_held(&av->efa_av.domain->util_domain.lock));
+
+	efa_proto_av_entry_release_reverse_av(av, entry, release_from_implicit_av);
+	efa_proto_av_entry_deinit(av, entry);
+
+	if (release_from_implicit_av) {
+		dlist_remove(&entry->ah_implicit_conn_list_entry);
+		dlist_remove(&entry->implicit_av_lru_entry);
+	}
+
+	/* Adjust the refcnts while entry->ah is still set; release_util_av NULLs it. */
+	proto_ah = efa_proto_ah_from_ah(entry->ah);
+	if (release_from_implicit_av)
+		proto_ah->implicit_refcnt--;
+	else
+		proto_ah->explicit_refcnt--;
+	entry->ah->refcnt--;
+
+	efa_proto_av_entry_release_util_av(av, entry, release_from_implicit_av);
+
+	if (release_from_implicit_av)
+		av->used_implicit--;
+	else
+		av->efa_av.used--;
+}
+
+/* ---- Protocol AH helpers ---- */
+
+/**
+ * @brief Mark an AH as most recently used.
+ *
+ * Unlinks the AH from the domain LRU list and re-inserts it at the tail.
+ * The AH must already be on the list and carry at least one protocol
+ * reference (both asserted).
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah base AH (must be embedded in efa_proto_ah)
+ */
+static void efa_proto_ah_lru_move(struct efa_domain *domain, struct efa_ah *ah)
+{
+	struct efa_proto_ah *proto_ah = efa_proto_ah_from_ah(ah);
+	struct dlist_entry *node = &proto_ah->lru_list_entry;
+
+	assert(proto_ah->implicit_refcnt > 0 || proto_ah->explicit_refcnt > 0);
+	assert(dlist_entry_in_list(&domain->ah_lru_list, node));
+
+	dlist_remove(node);
+	dlist_insert_tail(node, &domain->ah_lru_list);
+}
+
+/**
+ * @brief Evict the least recently used AH that has no explicit AV entries.
+ *
+ * Finds the LRU AH with only implicit references, releases all its
+ * implicit AV entries, and destroys the AH. Called when ibv_create_ah
+ * fails with ENOMEM.
+ *
+ * Caller must hold srx_lock. This function acquires util_domain.lock.
+ *
+ * @param[in] domain efa domain
+ * @return 0 on success, -FI_ENOMEM if no AH is available to evict
+ *
+ * The victim is the first AH on domain->ah_lru_list whose explicit_refcnt
+ * is zero. Every entry on its implicit_conn_list is released through
+ * efa_proto_av_entry_release_ah_unsafe, and the AH is destroyed once both
+ * protocol refcounts have dropped to zero.
+ *
+ * NOTE(review): each released entry is handed to the release helpers with
+ * entry->av, which may differ from the AV the caller is working on. Those
+ * helpers assert that AV's util_av_implicit.lock is held, and this function
+ * does not take it. Confirm the locking story when several AVs share a
+ * domain.
+ */
+static int efa_proto_ah_evict(struct efa_domain *domain)
+{
+	struct efa_proto_av_entry *entry_to_release;
+	struct efa_proto_ah *proto_ah_tmp, *proto_ah_to_release = NULL;
+	struct dlist_entry *tmp;
+
+	assert(ofi_genlock_held(&domain->srx_lock));
+
+	ofi_genlock_lock(&domain->util_domain.lock); /* released on both return paths below */
+
+	dlist_foreach_container(&domain->ah_lru_list, struct efa_proto_ah,
+				proto_ah_tmp, lru_list_entry) { /* efa_proto_ah_lru_move re-inserts at the tail, so the head is the LRU end */
+		if (proto_ah_tmp->explicit_refcnt == 0) {
+			proto_ah_to_release = proto_ah_tmp;
+			break;
+		}
+	}
+
+	if (!proto_ah_to_release) {
+		ofi_genlock_unlock(&domain->util_domain.lock);
+		EFA_WARN(FI_LOG_AV,
+			 "AH creation for implicit AV entry failed with ENOMEM "
+			 "but no AH entries available to evict\n");
+		return -FI_ENOMEM;
+	}
+
+	assert(proto_ah_to_release->implicit_refcnt > 0);
+
+	dlist_foreach_container_safe(&proto_ah_to_release->implicit_conn_list,
+				     struct efa_proto_av_entry, entry_to_release,
+				     ah_implicit_conn_list_entry, tmp) { /* _safe form: the release below unlinks the current node */
+
+		assert(entry_to_release->implicit_fi_addr != FI_ADDR_NOTAVAIL &&
+		       entry_to_release->fi_addr == FI_ADDR_NOTAVAIL);
+
+		efa_proto_av_entry_release_ah_unsafe(entry_to_release->av,
+						     entry_to_release, true); /* decrements implicit_refcnt and ah.refcnt in place */
+	}
+
+	if (proto_ah_to_release->implicit_refcnt == 0 &&
+	    proto_ah_to_release->explicit_refcnt == 0) {
+		dlist_remove(&proto_ah_to_release->lru_list_entry);
+		assert(dlist_empty(&proto_ah_to_release->implicit_conn_list));
+		assert(proto_ah_to_release->ah.refcnt == 0);
+		efa_ah_destroy(domain, &proto_ah_to_release->ah);
+	}
+
+	ofi_genlock_unlock(&domain->util_domain.lock);
+
+	return FI_SUCCESS;
+}
+
+/**
+ * @brief Allocate a protocol AH with eviction retry.
+ *
+ * Calls efa_ah_alloc with sizeof(efa_proto_ah) to allocate the
+ * protocol wrapper. Initializes implicit_refcnt, explicit_refcnt,
+ * implicit_conn_list, and inserts into the domain LRU list.
+ * On ENOMEM, evicts an AH with only implicit references and retries.
+ *
+ * Protocol refcnts and the LRU list are shared across all AVs sharing
+ * the same PD (domain), but per-AV call sites only hold their own
+ * util_av lock. This function takes util_domain.lock around the proto
+ * field mutations to serialize against concurrent efa_proto_ah_alloc
+ * / efa_proto_ah_release on a different AV.
+ *
+ * The eviction path (efa_proto_ah_evict) asserts that srx_lock is held,
+ * so a caller that can hit ENOMEM must hold srx_lock.
+ *
+ * @param[in] domain efa domain
+ * @param[in] gid GID
+ * @param[in] insert_implicit_av whether this is for an implicit AV entry
+ * @return pointer to base efa_ah on success, NULL on failure. The ENOMEM
+ *         test reads errno right after efa_ah_alloc returns NULL.
+ */
+struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain,
+				  const uint8_t *gid,
+				  bool insert_implicit_av)
+{
+	struct efa_ah *ah;
+	struct efa_proto_ah *proto_ah;
+	int err;
+	bool first_proto_user;
+
+	ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah));
+	if (!ah) {
+		if (errno != FI_ENOMEM) /* only an ENOMEM failure is retried after eviction */
+			return NULL;
+
+		EFA_INFO(FI_LOG_AV,
+			 "ibv_create_ah failed with ENOMEM. "
+			 "Attempting to evict AH entry\n");
+
+		err = efa_proto_ah_evict(domain);
+		if (err)
+			return NULL;
+
+		ah = efa_ah_alloc(domain, gid, sizeof(struct efa_proto_ah)); /* single retry; a second failure is final */
+		if (!ah)
+			return NULL;
+	}
+
+	/*
+	 * efa_ah_alloc released util_domain.lock on return. Reacquire it
+	 * before touching the protocol-specific fields (refcnts, LRU list,
+	 * implicit_conn_list) so concurrent allocators on a different AV's
+	 * lock don't race on a shared AH.
+	 *
+	 * Between efa_ah_alloc returning and reacquiring the lock, a
+	 * concurrent efa_proto_ah_release could have dropped both proto
+	 * refcnts to zero and removed the AH from the LRU list, even though
+	 * the base ah->refcnt stayed > 0. Detect "first proto user" by
+	 * checking the proto refcnts directly rather than ah->refcnt.
+	 */
+	ofi_genlock_lock(&domain->util_domain.lock);
+
+	proto_ah = efa_proto_ah_from_ah(ah);
+
+	/*
+	 * first_proto_user is true when both proto refcnts are zero — either
+	 * this is a brand-new AH (refcnt just incremented from 0 to 1 inside
+	 * efa_ah_alloc) or an AH where the last proto user released (and
+	 * removed it from the LRU list) but the base layer kept it alive.
+	 * Either way we need to (re)init the proto fields and (re)insert
+	 * into the LRU list.
+	 */
+	first_proto_user = (proto_ah->implicit_refcnt == 0 &&
+			    proto_ah->explicit_refcnt == 0);
+	if (first_proto_user) {
+		dlist_init(&proto_ah->implicit_conn_list);
+		dlist_insert_tail(&proto_ah->lru_list_entry,
+				  &domain->ah_lru_list); /* tail of the list is the most recently used end */
+	}
+
+	insert_implicit_av ? proto_ah->implicit_refcnt++ :
+			     proto_ah->explicit_refcnt++;
+
+	if (!first_proto_user)
+		efa_proto_ah_lru_move(domain, ah); /* runs after the increment: lru_move asserts a non-zero proto refcnt */
+
+	ofi_genlock_unlock(&domain->util_domain.lock);
+
+	return ah;
+}
+
+/**
+ * @brief Release a protocol AH reference.
+ *
+ * Decrements the appropriate protocol refcount. When both protocol
+ * refcounts reach zero, removes the AH from the LRU list. efa_ah_release
+ * is then called on every release (not only the last one) to decrement
+ * the base refcount.
+ *
+ * Protocol refcnts and the LRU list are shared across all AVs sharing
+ * the same PD (domain), but per-AV call sites only hold their own
+ * util_av lock. This function takes util_domain.lock around the proto
+ * field mutations to serialize against concurrent efa_proto_ah_alloc
+ * / efa_proto_ah_release on a different AV.
+ *
+ * @param[in] domain efa domain
+ * @param[in] ah base AH
+ * @param[in] release_from_implicit_av whether releasing implicit ref
+ */
+void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah,
+			  bool release_from_implicit_av)
+{
+	struct efa_proto_ah *pah = efa_proto_ah_from_ah(ah);
+
+	/*
+	 * The protocol refcnts and the LRU list belong to the domain and are
+	 * shared by every AV on it, so they are only touched under
+	 * util_domain.lock — the same lock efa_ah_release acquires.
+	 */
+	ofi_genlock_lock(&domain->util_domain.lock);
+
+	if (release_from_implicit_av) {
+		assert(pah->implicit_refcnt > 0);
+		pah->implicit_refcnt--;
+	} else {
+		assert(pah->explicit_refcnt > 0);
+		pah->explicit_refcnt--;
+	}
+
+	/* Last protocol user gone: take the AH off the LRU list. */
+	if (!pah->implicit_refcnt && !pah->explicit_refcnt) {
+		dlist_remove(&pah->lru_list_entry);
+		assert(dlist_empty(&pah->implicit_conn_list));
+	}
+
+	ofi_genlock_unlock(&domain->util_domain.lock);
+
+	/* Called only after util_domain.lock has been dropped above. */
+	efa_ah_release(domain, ah);
+}
+
+/* ---- Entry alloc ---- */
+
+/**
+ * @brief Allocate and initialize a proto AV entry.
+ *
+ * Caller must hold util_av.lock (explicit) or util_av_implicit.lock (implicit),
+ * and must hold srx_lock. srx_lock is required because this function calls
+ * efa_proto_av_entry_deinit on the error path, which walks the per-entry
+ * ep_peer_map and destructs peers under srx_lock.
+ *
+ * On failure every step already taken is undone: the entry is unlinked
+ * from the implicit AV LRU list and from the AH's implicit_conn_list (if
+ * it was linked), the AH reference is released, and the address is removed
+ * from the util_av again.
+ *
+ * @param[in] av protocol address vector
+ * @param[in] raw_addr raw efa address
+ * @param[in] flags flags application passed to fi_av_insert
+ * @param[in] context context application passed to fi_av_insert
+ * @param[in] insert_shm_av whether to insert address into shm av
+ * @param[in] insert_implicit_av whether to insert into implicit AV
+ * @return on success, return a pointer to the entry; otherwise NULL
+ */
+struct efa_proto_av_entry *efa_proto_av_entry_alloc(
+	struct efa_proto_av *av, struct efa_ep_addr *raw_addr,
+	uint64_t flags, void *context, bool insert_shm_av,
+	bool insert_implicit_av)
+{
+	struct util_av *util_av;
+	struct efa_cur_reverse_av **cur_reverse_av;
+	struct efa_prv_reverse_av **prv_reverse_av;
+	struct util_av_entry *util_av_entry = NULL;
+	struct efa_proto_av_entry *entry;
+	fi_addr_t fi_addr;
+	int err;
+	bool on_lru_list = false;
+	bool on_ah_conn_list = false;
+
+	if (flags & FI_SYNC_ERR)
+		memset(context, 0, sizeof(int));
+
+	if (insert_implicit_av) {
+		assert(ofi_genlock_held(&av->util_av_implicit.lock));
+		util_av = &av->util_av_implicit;
+		cur_reverse_av = &av->cur_reverse_av_implicit;
+		prv_reverse_av = &av->prv_reverse_av_implicit;
+	} else {
+		assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+		util_av = &av->efa_av.util_av;
+		cur_reverse_av = &av->efa_av.cur_reverse_av;
+		prv_reverse_av = &av->efa_av.prv_reverse_av;
+	}
+
+	err = ofi_av_insert_addr(util_av, raw_addr, &fi_addr);
+	if (err) {
+		EFA_WARN(FI_LOG_AV, "ofi_av_insert_addr failed! Error message: %s\n",
+			 fi_strerror(err));
+		return NULL;
+	}
+
+	util_av_entry = ofi_bufpool_get_ibuf(util_av->av_entry_pool, fi_addr);
+	entry = (struct efa_proto_av_entry *)util_av_entry->data;
+	assert(efa_is_same_addr(raw_addr, (struct efa_ep_addr *)entry->ep_addr));
+
+	/* Zero everything after ep_addr; in particular entry->ah starts out NULL. */
+	memset((char *)entry + EFA_EP_ADDR_LEN, 0,
+	       sizeof(*entry) - EFA_EP_ADDR_LEN);
+	assert(av->efa_av.type == FI_AV_TABLE);
+
+	entry->av = av;
+
+	if (insert_implicit_av) {
+		entry->fi_addr = FI_ADDR_NOTAVAIL;
+		entry->implicit_fi_addr = fi_addr;
+		err = efa_proto_av_implicit_av_lru_insert(av, entry);
+		if (err)
+			goto err_release;
+		on_lru_list = true;
+	} else {
+		entry->fi_addr = fi_addr;
+		entry->implicit_fi_addr = FI_ADDR_NOTAVAIL;
+	}
+
+	entry->ah = efa_proto_ah_alloc(av->efa_av.domain, raw_addr->raw, insert_implicit_av);
+	if (!entry->ah)
+		goto err_release;
+
+	if (insert_implicit_av) {
+		dlist_insert_tail(&entry->ah_implicit_conn_list_entry,
+				  &efa_proto_ah_from_ah(entry->ah)->implicit_conn_list);
+		on_ah_conn_list = true;
+	}
+
+	entry->shm_fi_addr = FI_ADDR_NOTAVAIL;
+
+	/*
+	 * This function is called in two situations:
+	 * 1. application calls fi_av_insert API
+	 * 2. efa progress engine gets a message from unknown peer through
+	 *    efa device, meaning peer is not local or shm is disabled.
+	 * For situation 1, shm av insertion should happen when peer is local
+	 * (insert_shm_av=1). For situation 2, it shouldn't (insert_shm_av=0).
+	 */
+	if (insert_shm_av) {
+		err = efa_proto_av_entry_insert_shm_av(av, entry);
+		if (err) {
+			errno = -err;
+			goto err_release;
+		}
+	}
+
+	err = efa_av_reverse_av_add(&av->efa_av, cur_reverse_av, prv_reverse_av,
+				    (struct efa_av_entry *)entry);
+	if (err) {
+		efa_proto_av_entry_deinit(av, entry);
+		goto err_release;
+	}
+
+	if (insert_implicit_av)
+		av->used_implicit++;
+	else
+		av->efa_av.used++;
+
+	return entry;
+
+err_release:
+	if (insert_implicit_av && on_lru_list)
+		dlist_remove(&entry->implicit_av_lru_entry);
+
+	/*
+	 * Unlink from the AH's implicit_conn_list before dropping the AH
+	 * reference, as efa_proto_av_entry_release does. Otherwise the list
+	 * keeps a node that lives inside an entry about to be handed back to
+	 * the util_av, and efa_proto_ah_release's dlist_empty() assertion
+	 * fires when this was the last reference.
+	 */
+	if (on_ah_conn_list)
+		dlist_remove(&entry->ah_implicit_conn_list_entry);
+
+	if (entry->ah)
+		efa_proto_ah_release(av->efa_av.domain, entry->ah, insert_implicit_av);
+
+	entry->ah = NULL;
+	memset(entry->ep_addr, 0, EFA_EP_ADDR_LEN);
+	err = ofi_av_remove_addr(util_av, fi_addr);
+	if (err)
+		EFA_WARN(FI_LOG_AV, "While processing previous failure, ofi_av_remove_addr failed! err=%d\n",
+			 err);
+
+	return NULL;
+}
+
+/* ---- Implicit to explicit migration ---- */
+
+/**
+ * @brief get the fi_addr from a peer rx entry's packet context
+ *
+ * Used as a callback for foreach_unspec_addr during implicit-to-explicit
+ * migration.
+ *
+ * @param[in] rx_entry peer rx entry
+ * @return fi_addr of the peer
+ */
+static fi_addr_t
+efa_proto_av_get_addr_from_peer_rx_entry(struct fi_peer_rx_entry *rx_entry)
+{
+	struct efa_rdm_pke *pke;
+
+	pke = (struct efa_rdm_pke *) rx_entry->peer_context;
+
+	return pke->peer->av_entry->fi_addr;
+}
+
+/**
+ * @brief migrate an implicit AV entry to the explicit AV
+ *
+ * Moves the entry, its peer map, AH, and SHM fi_addr from the implicit
+ * AV to the explicit AV. Updates reverse AVs and notifies the SRX to
+ * move unexpected messages from the unspecified queue.
+ *
+ * Caller must hold util_av.lock and util_av_implicit.lock.
+ *
+ * @param[in] av protocol AV
+ * @param[in] raw_addr raw efa address
+ * @param[in] implicit_fi_addr fi_addr in the implicit AV
+ * @param[out] fi_addr fi_addr assigned in the explicit AV
+ * @return 0 on success, negative error code on failure
+ *
+ * The AH itself is not re-allocated: the explicit entry reuses
+ * implicit_entry->ah and one implicit reference is converted into an
+ * explicit one at the end.
+ *
+ * NOTE(review): the failure returns that follow a successful
+ * ofi_av_insert_addr (ofi_av_remove_addr, efa_av_reverse_av_add) do not
+ * undo the steps already taken — the explicit util_av entry stays
+ * inserted and the peer map has already been moved. Confirm callers
+ * tolerate that partial state.
+ *
+ * NOTE(review): the AH refcnt/LRU updates at the end run without this
+ * function taking util_domain.lock, while efa_proto_ah_alloc and
+ * efa_proto_ah_release take it for the same fields. Confirm whether
+ * srx_lock is considered sufficient here.
+ */
+int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av,
+					    struct efa_ep_addr *raw_addr,
+					    fi_addr_t implicit_fi_addr,
+					    fi_addr_t *fi_addr)
+{
+	int err;
+	struct efa_ah *ah;
+	struct efa_proto_av_entry *implicit_entry, *explicit_entry;
+	struct efa_rdm_ep *ep;
+	struct dlist_entry *list_entry;
+	struct util_av_entry *implicit_util_av_entry, *explicit_util_av_entry;
+	struct efa_proto_av_entry_ep_peer_map_entry *map_entry, *tmp;
+	struct fid_peer_srx *peer_srx;
+
+	EFA_INFO(FI_LOG_AV,
+		 "Moving peer with implicit fi_addr %" PRIu64
+		 " to explicit AV\n",
+		 implicit_fi_addr);
+
+	assert(ofi_genlock_held(&av->efa_av.util_av.lock));
+	assert(ofi_genlock_held(&av->util_av_implicit.lock));
+
+	implicit_util_av_entry =
+		ofi_bufpool_get_ibuf(av->util_av_implicit.av_entry_pool, implicit_fi_addr);
+	implicit_entry = (struct efa_proto_av_entry *) implicit_util_av_entry->data;
+
+	assert(implicit_entry);
+	assert(efa_is_same_addr(
+		raw_addr, (struct efa_ep_addr *) implicit_entry->ep_addr));
+	assert(implicit_entry->fi_addr == FI_ADDR_NOTAVAIL &&
+	       implicit_entry->implicit_fi_addr == implicit_fi_addr);
+
+	ah = implicit_entry->ah; /* saved now; used for the AH bookkeeping at the end */
+
+	/* Create explicit util AV entry */
+	err = ofi_av_insert_addr(&av->efa_av.util_av, raw_addr, fi_addr);
+	if (err) {
+		EFA_WARN(FI_LOG_AV,
+			 "ofi_av_insert_addr into explicit AV failed! Error "
+			 "message: %s\n",
+			 fi_strerror(err));
+		return err;
+	}
+
+	explicit_util_av_entry =
+		ofi_bufpool_get_ibuf(av->efa_av.util_av.av_entry_pool, *fi_addr);
+	explicit_entry = (struct efa_proto_av_entry *) explicit_util_av_entry->data;
+	assert(efa_is_same_addr(
+		raw_addr, (struct efa_ep_addr *) explicit_entry->ep_addr));
+
+	/* Copy information from implicit to explicit */
+	memset((char *)explicit_entry + EFA_EP_ADDR_LEN, 0,
+	       sizeof(*explicit_entry) - EFA_EP_ADDR_LEN);
+	assert(av->efa_av.type == FI_AV_TABLE);
+	explicit_entry->av = av;
+	explicit_entry->ah = implicit_entry->ah;
+	explicit_entry->fi_addr = *fi_addr;
+	explicit_entry->shm_fi_addr = implicit_entry->shm_fi_addr;
+	explicit_entry->implicit_fi_addr = FI_ADDR_NOTAVAIL;
+	HASH_ITER(hh, implicit_entry->ep_peer_map, map_entry, tmp) {
+		HASH_DELETE(hh, implicit_entry->ep_peer_map, map_entry);
+		HASH_ADD_PTR(explicit_entry->ep_peer_map, ep_ptr, map_entry);
+		map_entry->peer.av_entry = explicit_entry; /* re-point the embedded peer at its new owner */
+	}
+	assert(HASH_CNT(hh, implicit_entry->ep_peer_map) == 0);
+
+	/* Handle reverse AV and AV ref counts */
+	efa_av_reverse_av_remove(&av->cur_reverse_av_implicit,
+				 &av->prv_reverse_av_implicit,
+				 (struct efa_av_entry *)implicit_entry);
+
+	dlist_remove(&implicit_entry->implicit_av_lru_entry);
+
+	err = ofi_av_remove_addr(&av->util_av_implicit, implicit_fi_addr);
+	if (err) {
+		EFA_WARN(FI_LOG_AV,
+			 "ofi_av_remove_addr from implicit AV failed! Error "
+			 "message: %s\n",
+			 fi_strerror(err));
+		return err;
+	}
+
+	av->used_implicit--;
+
+	err = efa_av_reverse_av_add(&av->efa_av, &av->efa_av.cur_reverse_av,
+				    &av->efa_av.prv_reverse_av,
+				    (struct efa_av_entry *)explicit_entry);
+	if (err)
+		return err;
+
+	av->efa_av.used++;
+
+	/* Handle AH LRU list and refcnt */
+	assert(!dlist_empty(&efa_proto_ah_from_ah(ah)->implicit_conn_list));
+	dlist_remove(&implicit_entry->ah_implicit_conn_list_entry); /* NOTE(review): implicit_entry was already passed to ofi_av_remove_addr above; confirm its storage is still valid here */
+	efa_proto_ah_lru_move(av->efa_av.domain, ah);
+	efa_proto_ah_from_ah(ah)->implicit_refcnt--;
+	efa_proto_ah_from_ah(ah)->explicit_refcnt++;
+
+	EFA_INFO(FI_LOG_AV,
+		 "Peer with implicit fi_addr %" PRIu64
+		 " moved to explicit AV. Explicit fi_addr: %" PRIu64 "\n",
+		 implicit_fi_addr, *fi_addr);
+
+	ofi_genlock_lock(&av->efa_av.util_av.ep_list_lock);
+	dlist_foreach(&av->efa_av.util_av.ep_list, list_entry) {
+		ep = container_of(list_entry, struct efa_rdm_ep, base_ep.util_ep.av_entry);
+		peer_srx = util_get_peer_srx(ep->peer_srx_ep);
+		peer_srx->owner_ops->foreach_unspec_addr(peer_srx, &efa_proto_av_get_addr_from_peer_rx_entry); /* invoked once per endpoint on this AV's ep_list */
+	}
+	ofi_genlock_unlock(&av->efa_av.util_av.ep_list_lock);
+
+	return FI_SUCCESS;
+}
+
+/* ---- Protocol AV insert_one ---- */
+
+/**
+ * @brief insert one address into the protocol AV
+ *
+ * Checks explicit and implicit AVs for duplicates. Handles
+ * implicit-to-explicit migration when an implicit entry exists.
+ *
+ * Caller must hold srx_lock.
+ *
+ * Takes util_av_implicit.lock and then util_av.lock for the duration of
+ * the call. Outcomes:
+ *  - address already in the explicit AV: its fi_addr is returned;
+ *  - address in the implicit AV: refreshed in the LRU list (implicit
+ *    insert) or migrated to the explicit AV (explicit insert);
+ *  - otherwise a new entry is allocated.
+ * On any failure *fi_addr is set to FI_ADDR_NOTAVAIL.
+ *
+ * @param[in] av protocol AV
+ * @param[in] addr raw address (gid:qpn:qkey)
+ * @param[out] fi_addr output fi_addr
+ * @param[in] flags flags from fi_av_insert
+ * @param[in] context context from fi_av_insert
+ * @param[in] insert_shm_av whether to insert into SHM AV
+ * @param[in] insert_implicit_av whether to insert into implicit AV
+ * @return 0 on success, negative error code on failure
+ */
+int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr,
+			    fi_addr_t *fi_addr, uint64_t flags, void *context,
+			    bool insert_shm_av, bool insert_implicit_av)
+{
+	struct efa_proto_av_entry *entry;
+	char raw_gid_str[INET6_ADDRSTRLEN];
+	fi_addr_t efa_fiaddr;
+	fi_addr_t implicit_fi_addr;
+	int ret = 0;
+
+	if (!efa_av_is_valid_address(addr)) {
+		EFA_WARN(FI_LOG_AV, "Failed to insert bad addr\n");
+		*fi_addr = FI_ADDR_NOTAVAIL;
+		return -FI_EADDRNOTAVAIL;
+	}
+
+	assert(ofi_genlock_held(&av->efa_av.domain->srx_lock));
+	ofi_genlock_lock(&av->util_av_implicit.lock);
+	ofi_genlock_lock(&av->efa_av.util_av.lock);
+
+	memset(raw_gid_str, 0, sizeof(raw_gid_str));
+	if (!inet_ntop(AF_INET6, addr->raw, raw_gid_str, INET6_ADDRSTRLEN)) {
+		EFA_WARN(FI_LOG_AV, "cannot convert address to string. errno: %d\n", errno);
+		ret = -FI_EINVAL;
+		*fi_addr = FI_ADDR_NOTAVAIL;
+		goto out;
+	}
+
+	EFA_INFO(FI_LOG_AV,
+		 "Inserting address GID[%s] QP[%u] QKEY[%u] to %s AV ....\n",
+		 raw_gid_str, addr->qpn, addr->qkey,
+		 insert_implicit_av ? "implicit" : "explicit");
+
+	/*
+	 * fi_addr_t is a 64-bit unsigned type, so it is printed with PRIu64
+	 * (as elsewhere in this file) rather than %ld, which is only 32 bits
+	 * wide on LLP64 targets.
+	 */
+
+	/* Check explicit AV */
+	efa_fiaddr = ofi_av_lookup_fi_addr_unsafe(&av->efa_av.util_av, addr);
+	if (efa_fiaddr != FI_ADDR_NOTAVAIL) {
+		assert(!insert_implicit_av);
+		EFA_INFO(FI_LOG_AV, "Found existing AV entry pointing to this address! fi_addr: %" PRIu64 "\n", efa_fiaddr);
+		*fi_addr = efa_fiaddr;
+		ret = 0;
+		goto out;
+	}
+
+	/* Check implicit AV */
+	implicit_fi_addr =
+		ofi_av_lookup_fi_addr_unsafe(&av->util_av_implicit, addr);
+	if (implicit_fi_addr != FI_ADDR_NOTAVAIL) {
+		EFA_INFO(FI_LOG_AV,
+			 "Found implicit AV entry id %" PRIu64 " for the same address\n",
+			 implicit_fi_addr);
+
+		if (insert_implicit_av) {
+			/* Already known implicitly: only refresh its LRU position. */
+			entry = efa_proto_av_addr_to_entry_implicit(av, implicit_fi_addr);
+			efa_proto_av_implicit_av_lru_entry_move(av, entry);
+			*fi_addr = implicit_fi_addr;
+			goto out;
+		}
+
+		ret = efa_proto_av_entry_implicit_to_explicit(av, addr, implicit_fi_addr, fi_addr);
+		if (ret)
+			*fi_addr = FI_ADDR_NOTAVAIL;
+		goto out;
+	}
+
+	entry = efa_proto_av_entry_alloc(av, addr, flags, context, insert_shm_av, insert_implicit_av);
+	if (!entry) {
+		*fi_addr = FI_ADDR_NOTAVAIL;
+		ret = -FI_EADDRNOTAVAIL;
+		goto out;
+	}
+
+	if (insert_implicit_av) {
+		*fi_addr = entry->implicit_fi_addr;
+		EFA_INFO(FI_LOG_AV,
+			 "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to implicit AV. fi_addr: %" PRIu64 "\n",
+			 raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
+	} else {
+		*fi_addr = entry->fi_addr;
+		EFA_INFO(FI_LOG_AV,
+			 "Successfully inserted address GID[%s] QP[%u] QKEY[%u] to explicit AV. fi_addr: %" PRIu64 "\n",
+			 raw_gid_str, addr->qpn, addr->qkey, *fi_addr);
+	}
+	ret = 0;
+
+out:
+	ofi_genlock_unlock(&av->efa_av.util_av.lock);
+	ofi_genlock_unlock(&av->util_av_implicit.lock);
+	return ret;
+}
+
+/* ---- Protocol AV fi_ops ---- */
+
+/**
+ * @brief insert addresses into protocol AV (fi_av_insert implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr buffer containing addresses to insert
+ * @param[in] count number of addresses
+ * @param[out] fi_addr array for returned fabric addresses
+ * @param[in] flags operation flags
+ * @param[in] context user context
+ * @return number of addresses successfully inserted
+ */
+static int efa_proto_av_insert(struct fid_av *av_fid, const void *addr,
+			       size_t count, fi_addr_t *fi_addr,
+			       uint64_t flags, void *context)
+{
+	struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+	struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av);
+	int ret = 0, success_cnt = 0;
+	size_t i = 0;
+	struct efa_ep_addr *addr_i;
+	fi_addr_t fi_addr_res;
+
+	if (av->efa_av.util_av.flags & FI_EVENT)
+		return -FI_ENOEQ;
+
+	if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT)))
+		return -FI_EINVAL;
+
+	flags &= ~FI_MORE;
+	if (flags)
+		return -FI_ENOSYS;
+
+	ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+
+	for (i = 0; i < count; i++) {
+		addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN);
+
+		ret = efa_proto_av_insert_one(av, addr_i, &fi_addr_res, flags, context, true, false);
+		if (ret) {
+			EFA_WARN(FI_LOG_AV, "insert raw_addr to av failed! 
ret=%d\n", ret);
+			break;
+		}
+
+		if (fi_addr)
+			fi_addr[i] = fi_addr_res;
+		success_cnt++;
+	}
+
+	ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+
+	for (; i < count ; i++) {
+		if (fi_addr)
+			fi_addr[i] = FI_ADDR_NOTAVAIL;
+	}
+
+	return success_cnt;
+}
+
+/**
+ * @brief retrieve an address from the protocol AV (fi_av_lookup implementation)
+ *
+ * Copies at most *addrlen bytes of the stored EFA address into addr, so a
+ * short buffer receives a truncated address. On return *addrlen is always
+ * set to EFA_EP_ADDR_LEN, the size needed to hold the full address, so a
+ * caller that passed a smaller buffer can detect the truncation.
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] fi_addr fabric address to look up
+ * @param[out] addr buffer to store the returned address
+ * @param[in,out] addrlen on input, size of addr buffer; on output, size of
+ *                        the full address (EFA_EP_ADDR_LEN)
+ * @return 0 on success, -FI_EINVAL if addr or addrlen is NULL, the AV is
+ *         not FI_AV_TABLE, or fi_addr does not name an entry
+ */
+static int efa_proto_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr,
+			       void *addr, size_t *addrlen)
+{
+	struct efa_av *base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+	struct efa_proto_av *av = container_of(base_av, struct efa_proto_av, efa_av);
+	struct efa_proto_av_entry *entry = NULL;
+
+	if (!addr || !addrlen)
+		return -FI_EINVAL;
+
+	if (av->efa_av.type != FI_AV_TABLE)
+		return -FI_EINVAL;
+
+	if (fi_addr == FI_ADDR_NOTAVAIL)
+		return -FI_EINVAL;
+
+	ofi_genlock_lock(&av->efa_av.util_av.lock);
+	entry = efa_proto_av_addr_to_entry(av, fi_addr);
+	if (!entry) {
+		ofi_genlock_unlock(&av->efa_av.util_av.lock);
+		return -FI_EINVAL;
+	}
+
+	memcpy(addr, (void *)entry->ep_addr, MIN(EFA_EP_ADDR_LEN, *addrlen));
+	ofi_genlock_unlock(&av->efa_av.util_av.lock);
+
+	/* Report the required size even when the copy above was truncated. */
+	*addrlen = EFA_EP_ADDR_LEN;
+	return 0;
+}
+
+/**
+ * @brief remove addresses from the protocol AV (fi_av_remove implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] fi_addr array of fabric addresses to remove
+ * @param[in] count number of addresses
+ * @param[in] flags operation flags
+ * @return 0 on success, negative error code on failure
+ */
+static int efa_proto_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr,
+			       size_t count, uint64_t flags)
+{
+	int err = 0;
+	size_t i;
+	struct efa_av *base_av;
+	struct efa_proto_av *av;
+	struct efa_proto_av_entry *entry;
+
+	if (!fi_addr)
+		return -FI_EINVAL;
+
+	base_av = container_of(av_fid, struct efa_av, util_av.av_fid);
+	av = container_of(base_av, struct efa_proto_av, efa_av);
+	if (av->efa_av.type != FI_AV_TABLE)
+		return -FI_EINVAL;
+
+	ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+	ofi_genlock_lock(&av->efa_av.util_av.lock);
+	for (i = 0; i < count; i++) {
+		entry = efa_proto_av_addr_to_entry(av, fi_addr[i]);
+		if (!entry) {
+			err = -FI_EINVAL;
+			break;
+		}
+
+		efa_proto_av_entry_release(av, entry, false);
+	}
+
+	if (i < count)
+		assert(err);
+
+	ofi_genlock_unlock(&av->efa_av.util_av.lock);
+	ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+	return err;
+}
+
+/**
+ * @brief convert an address to a printable string (fi_av_straddr implementation)
+ *
+ * @param[in] av_fid fid of AV
+ * @param[in] addr address to convert
+ * @param[out] buf buffer to store the string
+ * @param[in,out] len on input, size of buf; on output, bytes written
+ * @return pointer to buf
+ */
+static const char *efa_proto_av_straddr(struct fid_av *av_fid, const void *addr,
+					char *buf, size_t *len)
+{
+	return ofi_straddr(buf, len, FI_ADDR_EFA, addr);
+}
+
+static struct fi_ops_av efa_proto_av_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = efa_proto_av_insert,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.remove = efa_proto_av_remove,
+	.lookup = efa_proto_av_lookup,
+	.straddr = efa_proto_av_straddr
+};
+
+/**
+ * @brief release all entries in the explicit and implicit reverse AVs
+ *
+ * @param[in] av protocol AV
+ */
+static void efa_proto_av_close_reverse_av(struct efa_proto_av *av)
+{
+	struct efa_cur_reverse_av *cur_entry, *curtmp;
+	struct efa_prv_reverse_av *prv_entry, *prvtmp;
+
+	ofi_genlock_lock(&av->efa_av.domain->srx_lock);
+
+	ofi_genlock_lock(&av->efa_av.util_av.lock);
+
+	HASH_ITER(hh, av->efa_av.cur_reverse_av, cur_entry, curtmp) {
+		efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, false);
+	}
+
+	
HASH_ITER(hh, av->efa_av.prv_reverse_av, prv_entry, prvtmp) {
+		efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, false);
+	}
+
+	ofi_genlock_unlock(&av->efa_av.util_av.lock);
+
+	ofi_genlock_lock(&av->util_av_implicit.lock);
+
+	HASH_ITER(hh, av->cur_reverse_av_implicit, cur_entry, curtmp) {
+		efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)cur_entry->av_entry, true);
+	}
+
+	HASH_ITER(hh, av->prv_reverse_av_implicit, prv_entry, prvtmp) {
+		efa_proto_av_entry_release(av, (struct efa_proto_av_entry *)prv_entry->av_entry, true);
+	}
+
+	ofi_genlock_unlock(&av->util_av_implicit.lock);
+
+	ofi_genlock_unlock(&av->efa_av.domain->srx_lock);
+}
+
+/**
+ * @brief close the protocol AV and release all resources (fi_close implementation)
+ *
+ * Releases every reverse-AV entry, closes the explicit and implicit
+ * util_avs and the shm AV (if one was opened), frees the evicted-peers
+ * hash set and finally the AV itself. Teardown is best effort: a failing
+ * step is logged and the remaining steps still run.
+ *
+ * @param[in] fid fid of AV
+ * @return 0 on success; otherwise the error code of the first step that
+ *         failed (later failures are logged only)
+ */
+static int efa_proto_av_close(struct fid *fid)
+{
+	struct efa_av *base_av;
+	struct efa_proto_av *av;
+	int err = 0;
+	int ret = 0;
+	struct efa_ep_addr_hashable *ep_addr_hashable, *tmp;
+
+	base_av = container_of(fid, struct efa_av, util_av.av_fid.fid);
+	av = container_of(base_av, struct efa_proto_av, efa_av);
+
+	efa_proto_av_close_reverse_av(av);
+
+	err = ofi_av_close(&av->efa_av.util_av);
+	if (OFI_UNLIKELY(err)) {
+		EFA_WARN(FI_LOG_AV, "Failed to close util av: %s\n",
+			 fi_strerror(err));
+		ret = err;
+	}
+
+	err = ofi_av_close(&av->util_av_implicit);
+	if (OFI_UNLIKELY(err)) {
+		EFA_WARN(FI_LOG_AV, "Failed to close implicit util av: %s\n",
+			 fi_strerror(err));
+		/* Keep the first failure; do not let a later step overwrite it. */
+		if (!ret)
+			ret = err;
+	}
+
+	if (av->shm_rdm_av) {
+		err = fi_close(&av->shm_rdm_av->fid);
+		if (OFI_UNLIKELY(err)) {
+			EFA_WARN(FI_LOG_AV,
+				 "Failed to close shm av: %s\n",
+				 fi_strerror(err));
+			if (!ret)
+				ret = err;
+		}
+	}
+
+	HASH_ITER(hh, av->evicted_peers_hashset, ep_addr_hashable, tmp) {
+		HASH_DEL(av->evicted_peers_hashset, ep_addr_hashable);
+		free(ep_addr_hashable);
+	}
+
+	free(av);
+	return ret;
+}
+
+static struct fi_ops efa_proto_av_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = efa_proto_av_close,
+	.bind
= fi_no_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+/**
+ * @brief open a protocol AV (fi_av_open implementation for RDM)
+ *
+ * Creates two util_avs (implicit and explicit) whose per-entry context
+ * holds an efa_proto_av_entry minus its leading ep_addr, and, when the
+ * domain's fabric has an shm fabric, an shm AV sized by
+ * efa_env.shm_av_size.
+ *
+ * The caller's attr is modified in place: count is raised to at least
+ * EFA_MIN_AV_SIZE (and to the "universe_size" parameter when that is
+ * set and larger), and type is forced to FI_AV_TABLE.
+ *
+ * @param[in] domain_fid fid of domain
+ * @param[in] attr AV attributes; named AVs and any flags are rejected
+ *                 with -FI_ENOSYS
+ * @param[out] av_fid pointer to store the opened AV fid
+ * @param[in] context user context
+ * @return 0 on success, negative error code on failure
+ */
+int efa_proto_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
+		      struct fid_av **av_fid, void *context)
+{
+	struct efa_domain *efa_domain;
+	struct efa_proto_av *av;
+	struct fi_av_attr av_attr = { 0 };
+	size_t context_len;
+	size_t universe_size;
+	int ret, retv;
+
+	if (!attr)
+		return -FI_EINVAL;
+
+	if (attr->name)
+		return -FI_ENOSYS;
+
+	if (attr->flags)
+		return -FI_ENOSYS;
+
+	if (!attr->count)
+		attr->count = EFA_MIN_AV_SIZE;
+	else
+		attr->count = MAX(attr->count, EFA_MIN_AV_SIZE);
+
+	av = calloc(1, sizeof(*av));
+	if (!av)
+		return -FI_ENOMEM;
+
+	if (attr->type == FI_AV_MAP) {
+		EFA_INFO(FI_LOG_AV, "FI_AV_MAP is deprecated in Libfabric 2.x. Please use FI_AV_TABLE. "
+			"EFA provider will now switch to using FI_AV_TABLE.\n");
+	}
+	attr->type = FI_AV_TABLE; /* forced regardless of the requested type */
+
+	efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid);
+
+	if (fi_param_get_size_t(NULL, "universe_size",
+				&universe_size) == FI_SUCCESS)
+		attr->count = MAX(attr->count, universe_size);
+
+	context_len = sizeof(struct efa_proto_av_entry) - EFA_EP_ADDR_LEN; /* the entry's leading ep_addr bytes are excluded */
+
+	ret = efa_av_init_util_av(efa_domain, attr, &av->util_av_implicit, context,
+				  context_len);
+	if (ret)
+		goto err;
+
+	ret = efa_av_init_util_av(efa_domain, attr, &av->efa_av.util_av, context,
+				  context_len);
+	if (ret)
+		goto err_close_util_av_implicit;
+
+	if (efa_domain->fabric && efa_domain->fabric->shm_fabric) {
+		av_attr = *attr;
+		if (efa_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) {
+			ret = -FI_ENOSYS;
+			EFA_WARN(FI_LOG_AV,
+				 "The requested av size is beyond"
+				 " shm supported maximum av size: %s\n",
+				 fi_strerror(-ret));
+			goto err_close_util_av;
+		}
+		av_attr.count = efa_env.shm_av_size;
+		assert(av_attr.type == FI_AV_TABLE);
+		ret = fi_av_open(efa_domain->shm_domain, &av_attr,
+				 &av->shm_rdm_av, context);
+		if (ret)
+			goto err_close_util_av;
+	}
+
+	EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", attr->flags); /* always logs 0: non-zero flags were rejected above */
+
+	av->efa_av.domain = efa_domain;
+	av->efa_av.type = attr->type;
+	av->efa_av.used = 0;
+	av->implicit_av_size = efa_env.implicit_av_size;
+	av->used_implicit = 0;
+	av->shm_used = 0;
+
+	*av_fid = &av->efa_av.util_av.av_fid;
+	(*av_fid)->fid.fclass = FI_CLASS_AV;
+	(*av_fid)->fid.context = context;
+	(*av_fid)->fid.ops = &efa_proto_av_fi_ops;
+	(*av_fid)->ops = &efa_proto_av_ops;
+
+	dlist_init(&av->implicit_av_lru_list);
+
+	return 0;
+
+err_close_util_av: /* labels unwind in reverse order of initialization and fall through */
+	retv = ofi_av_close(&av->efa_av.util_av);
+	if (retv)
+		EFA_WARN(FI_LOG_AV,
+			 "Unable to close util_av: %s\n", fi_strerror(-retv));
+
+err_close_util_av_implicit:
+	retv = ofi_av_close(&av->util_av_implicit);
+	if (retv)
+		EFA_WARN(FI_LOG_AV,
+			 "Unable to close util_av_implicit: %s\n", fi_strerror(-retv));
+
+err:
+ free(av); + return ret; +} diff --git a/prov/efa/src/rdm/efa_proto_av.h b/prov/efa/src/rdm/efa_proto_av.h new file mode 100644 index 00000000000..6cf8f330383 --- /dev/null +++ b/prov/efa/src/rdm/efa_proto_av.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + +#ifndef EFA_PROTO_AV_H +#define EFA_PROTO_AV_H + +#include "efa_av.h" + +struct efa_rdm_ep; +struct efa_rdm_peer; + +/** + * @brief Protocol AH — wraps base efa_ah with implicit refcount and LRU + * + * The base efa_ah has a single refcount and no LRU knowledge. + * efa_proto_ah adds the implicit/explicit refcount split, the + * implicit_conn_list (entries using this AH), and the LRU list + * entry for AH eviction. + * + * pahole: size: 128, cachelines: 2 + * + * All efa_proto_ah fields are control path only (AV insert/remove/eviction). + * The TX hot fields (ibv_ah, ahn) are in the embedded efa_ah at cacheline 0. + * The protocol extension fields start at offset 88 (cacheline 1), so + * accessing them on the eviction path does not pollute the TX cache line. 
+ */ +struct efa_proto_ah { + struct efa_ah ah; /* 0 88 must be first (castable) */ + /* --- cacheline 1 boundary (64 bytes) was 24 bytes ago --- */ + int implicit_refcnt; /* 88 4 */ + int explicit_refcnt; /* 92 4 */ + struct dlist_entry implicit_conn_list; /* 96 16 */ + struct dlist_entry lru_list_entry; /* 112 16 */ +}; + +/** + * @brief Protocol AV entry — flat layout with same field prefix as efa_av_entry + * + * pahole: + * size: 112, cachelines: 2, members: 9 + * + * Cache line 0 (64 bytes): data-path hot fields + * ep_addr[32] off=0 — TX hot (qpn@+16, qkey@+20) + * ah* off=32 — TX hot (EFA send path) + * fi_addr off=40 — RX hot (explicit peer lookup, CQ poll) + * implicit_fi_addr off=48 — RX hot (implicit peer lookup, CQ poll) + * ep_peer_map* off=56 — TX+RX hot (peer lookup on every op) + * + * Cache line 1 (48 bytes): SHM-only TX / control-path fields + * shm_fi_addr off=64 — SHM TX only + * implicit_av_lru_entry off=72 — implicit RX LRU bookkeeping + * ah_implicit_conn_list_entry off=88 — implicit AV insert/release + * av* off=104 — back-pointer for AH eviction + */ +struct efa_proto_av_entry { + uint8_t ep_addr[EFA_EP_ADDR_LEN]; /* 0 32 must be first (util_av) */ + struct efa_ah *ah; /* 32 8 */ + fi_addr_t fi_addr; /* 40 8 */ + fi_addr_t implicit_fi_addr; /* 48 8 */ + struct efa_proto_av_entry_ep_peer_map_entry *ep_peer_map; /* 56 8 */ + /* --- cacheline 1 boundary (64 bytes) --- */ + fi_addr_t shm_fi_addr; /* 64 8 */ + struct dlist_entry implicit_av_lru_entry; /* 72 16 */ + struct dlist_entry ah_implicit_conn_list_entry; /* 88 16 */ + struct efa_proto_av *av; /* 104 8 */ +}; + +/** + * @brief Peer map entry — maps (ep_ptr) to efa_rdm_peer for a given AV entry + * + * pahole: size: 328, cachelines: 6 + */ +struct efa_proto_av_entry_ep_peer_map_entry { + struct efa_rdm_ep *ep_ptr; /* 0 8 */ + struct efa_rdm_peer peer; /* 8 264 */ + UT_hash_handle hh; /* 272 56 */ +}; + +/** + * @brief Protocol AV — embeds efa_av as first member (castable) + * + * pahole: 
+ * size: 672, cachelines: 11, members: 10 + * + * efa_av off=0 size=320 (cachelines 0-4) + * domain* off=0 — cacheline 0 + * cur_reverse_av* off=24 — RX hot: explicit peer reverse lookup + * prv_reverse_av* off=32 — RX hot: QPN reuse fallback + * util_av off=40 size=280 + * --- cacheline 5 boundary (320 bytes) --- + * shm_rdm_av* off=320 — control path only + * util_av_implicit off=328 size=280 + * --- cacheline 9 boundary (576 bytes) + 32 --- + * cur_reverse_av_implicit* off=608 — RX hot (implicit peers only) + * prv_reverse_av_implicit* off=616 — RX hot (implicit peers only) + * implicit_av_lru_list off=624 — implicit RX: LRU reorder + * --- cacheline 10 boundary (640 bytes) --- + * used_implicit off=640 + * shm_used off=648 + * implicit_av_size off=656 + * evicted_peers_hashset* off=664 + * + * RX hot path (every RX completion): + * efa_av.cur_reverse_av (off=24) — HASH_FIND for explicit peer reverse lookup + * efa_av.prv_reverse_av (off=32) — HASH_FIND fallback for QPN reuse (connid mismatch) + * These are in cacheline 0 — explicit peer reverse lookup stays in one line. + * + * RX hot path for implicit (unknown) peers: + * cur_reverse_av_implicit (off=608) — HASH_FIND for implicit peer reverse lookup + * prv_reverse_av_implicit (off=616) — HASH_FIND fallback + * implicit_av_lru_list (off=624) — LRU reorder on every implicit RX + * All three are in cacheline 9 — implicit peer reverse lookup + LRU + * update stays in one cache line. 
+ * + * Control path only (AV insert/remove/close): + * shm_rdm_av, util_av_implicit, used_implicit, shm_used, + * implicit_av_size, evicted_peers_hashset + */ +struct efa_proto_av { + struct efa_av efa_av; /* 0 320 */ + /* --- cacheline 5 boundary (320 bytes) --- */ + struct fid_av *shm_rdm_av; /* 320 8 */ + /* implicit AV is used when receiving messages from peers not + * explicitly inserted by the application */ + struct util_av util_av_implicit; /* 328 280 */ + struct efa_cur_reverse_av *cur_reverse_av_implicit; /* 608 8 */ + struct efa_prv_reverse_av *prv_reverse_av_implicit; /* 616 8 */ + struct dlist_entry implicit_av_lru_list; /* 624 16 */ + /* --- cacheline 10 boundary (640 bytes) --- */ + size_t used_implicit; /* 640 8 */ + size_t shm_used; /* 648 8 */ + size_t implicit_av_size; /* 656 8 */ + struct efa_ep_addr_hashable *evicted_peers_hashset; /* 664 8 */ +}; + +/** + * @brief get the protocol AH wrapper from a base AH pointer + * + * @param[in] ah base AH (must be embedded in efa_proto_ah) + * @return pointer to the containing efa_proto_ah + */ +static inline struct efa_proto_ah *efa_proto_ah_from_ah(struct efa_ah *ah) +{ + return container_of(ah, struct efa_proto_ah, ah); +} + +/** + * @brief typed accessor for the ep_addr field of a proto AV entry + * + * @param[in] entry proto AV entry + * @return pointer to the efa_ep_addr embedded in the entry + */ +static inline struct efa_ep_addr * +efa_proto_av_entry_ep_addr(struct efa_proto_av_entry *entry) +{ + return (struct efa_ep_addr *)entry->ep_addr; +} + +/* Address lookup */ +struct efa_proto_av_entry *efa_proto_av_addr_to_entry(struct efa_proto_av *av, + fi_addr_t fi_addr); + +struct efa_proto_av_entry *efa_proto_av_addr_to_entry_implicit( + struct efa_proto_av *av, fi_addr_t fi_addr); + +/* Peer map operations */ +void efa_proto_av_entry_ep_peer_map_insert( + struct efa_proto_av_entry *entry, + struct efa_proto_av_entry_ep_peer_map_entry *map_entry); + +struct efa_rdm_peer 
*efa_proto_av_entry_ep_peer_map_lookup( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep); + +void efa_proto_av_entry_ep_peer_map_remove( + struct efa_proto_av_entry *entry, struct efa_rdm_ep *ep); + +/* Protocol AH allocation / release (shared base AH + proto wrapper) */ +struct efa_ah *efa_proto_ah_alloc(struct efa_domain *domain, + const uint8_t *gid, + bool insert_implicit_av); + +void efa_proto_ah_release(struct efa_domain *domain, struct efa_ah *ah, + bool release_from_implicit_av); + +/* SHM AV operations */ +int efa_proto_av_entry_insert_shm_av(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +/* Entry deinit (tears down peers on the entry) */ +void efa_proto_av_entry_deinit(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +/* Implicit AV LRU */ +void efa_proto_av_implicit_av_lru_entry_move(struct efa_proto_av *av, + struct efa_proto_av_entry *entry); + +/* Reverse lookup for protocol path */ +fi_addr_t efa_proto_av_reverse_lookup(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry); + +fi_addr_t efa_proto_av_reverse_lookup_implicit(struct efa_proto_av *av, + uint16_t ahn, uint16_t qpn, + struct efa_rdm_pke *pkt_entry); + +/* Entry alloc/release */ +struct efa_proto_av_entry *efa_proto_av_entry_alloc( + struct efa_proto_av *av, struct efa_ep_addr *raw_addr, + uint64_t flags, void *context, bool insert_shm_av, + bool insert_implicit_av); + +void efa_proto_av_entry_release(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av); + +void efa_proto_av_entry_release_ah_unsafe(struct efa_proto_av *av, + struct efa_proto_av_entry *entry, + bool release_from_implicit_av); + +/* Implicit to explicit migration */ +int efa_proto_av_entry_implicit_to_explicit(struct efa_proto_av *av, + struct efa_ep_addr *raw_addr, + fi_addr_t implicit_fi_addr, + fi_addr_t *fi_addr); + +/* AV open/close/insert/remove for protocol path */ +int efa_proto_av_open(struct 
fid_domain *domain_fid, struct fi_av_attr *attr, + struct fid_av **av_fid, void *context); + +int efa_proto_av_insert_one(struct efa_proto_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context, + bool insert_shm_av, bool insert_implicit_av); + +#endif diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 1999e540520..1170861498e 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -6,6 +6,7 @@ #include "efa_data_path_ops.h" #include "ofi_util.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_cntr.h" #include "efa_rdm_pke_cmd.h" #include "efa_rdm_pke_utils.h" @@ -190,17 +191,15 @@ static void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( struct util_cq *target_cq; int ret; fi_addr_t src_addr; - struct efa_av *efa_av; uint32_t imm_data = efa_ibv_cq_wc_read_imm_data(ibv_cq); uint32_t len = efa_ibv_cq_wc_read_byte_len(ibv_cq); target_cq = ep->base_ep.util_ep.rx_cq; - efa_av = ep->base_ep.av; if (ep->base_ep.util_ep.caps & FI_SOURCE) { /* Only check the explicit AV when writing completions */ - src_addr = efa_av_reverse_lookup_rdm(efa_av, + src_addr = efa_proto_av_reverse_lookup(ep->proto_av, efa_ibv_cq_wc_read_slid(ibv_cq), efa_ibv_cq_wc_read_src_qp(ibv_cq), NULL); @@ -361,7 +360,7 @@ efa_rdm_cq_lookup_raw_addr(struct efa_rdm_pke *pke, } /* Next check implicit AV */ - addr = ofi_av_lookup_fi_addr(&ep->base_ep.av->util_av_implicit, + addr = ofi_av_lookup_fi_addr(&ep->proto_av->util_av_implicit, (void *) efa_ep_addr); if (addr != FI_ADDR_NOTAVAIL) { implicit = true; @@ -401,7 +400,6 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, struct efa_ibv_cq *efa_ibv_cq, struct efa_rdm_pke *pkt_entry) { - struct efa_av *efa_av = ep->base_ep.av; fi_addr_t explicit_fi_addr, implicit_fi_addr; struct efa_ep_addr efa_ep_addr = {0}; struct efa_ep_addr_hashable *efa_ep_addr_hashable = NULL; @@ -433,7 +431,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * 
behavior is fixed */ explicit_fi_addr = - efa_av_reverse_lookup_rdm(efa_av, gid, qpn, pkt_entry); + efa_proto_av_reverse_lookup(ep->proto_av, gid, qpn, pkt_entry); if (explicit_fi_addr != FI_ADDR_NOTAVAIL) { EFA_DBG(FI_LOG_CQ, @@ -445,7 +443,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, } implicit_fi_addr = - efa_av_reverse_lookup_rdm_implicit(efa_av, gid, qpn, pkt_entry); + efa_proto_av_reverse_lookup_implicit(ep->proto_av, gid, qpn, pkt_entry); if (implicit_fi_addr != FI_ADDR_NOTAVAIL) { EFA_DBG(FI_LOG_CQ, @@ -473,7 +471,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * TODO: continue communication with peer by saving the previous state * and restoring it */ - HASH_FIND(hh, ep->base_ep.av->evicted_peers_hashset, &efa_ep_addr, + HASH_FIND(hh, ep->proto_av->evicted_peers_hashset, &efa_ep_addr, sizeof(struct efa_ep_addr), efa_ep_addr_hashable); if (OFI_UNLIKELY(!!efa_ep_addr_hashable)) { EFA_WARN(FI_LOG_CQ, "Received packet from peer already evicted " @@ -494,7 +492,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, * not local or shm is disabled for transmission. We shouldn't insert * in to shm av in this case. 
*/ - ret = efa_av_insert_one(ep->base_ep.av, &efa_ep_addr, &implicit_fi_addr, + ret = efa_proto_av_insert_one(ep->proto_av, &efa_ep_addr, &implicit_fi_addr, 0, NULL, false, true); if (OFI_UNLIKELY(ret != 0)) { efa_base_ep_write_eq_error(&ep->base_ep, ret, @@ -506,10 +504,10 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, out: assert(peer); - assert((peer->conn->fi_addr != FI_ADDR_NOTAVAIL && - peer->conn->implicit_fi_addr == FI_ADDR_NOTAVAIL) || - (peer->conn->implicit_fi_addr != FI_ADDR_NOTAVAIL && - peer->conn->fi_addr == FI_ADDR_NOTAVAIL)); + assert((peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL && + peer->av_entry->implicit_fi_addr == FI_ADDR_NOTAVAIL) || + (peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL && + peer->av_entry->fi_addr == FI_ADDR_NOTAVAIL)); return peer; } @@ -584,8 +582,8 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct EFA_WARN(FI_LOG_CQ, "Peer fi_addr: %ld implicit fi_addr %ld is requesting " "feature %d, which this EP does not support.\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr, + pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr, base_hdr->type); assert(0 && "invalid REQ packet type"); @@ -690,7 +688,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc_closing_ep(struct efa_ibv_cq *cq, struc efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, (size_t) pkt_entry->ope->cq_entry.op_context, pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, - pkt_entry->ope->peer ? pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL, + pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL, efa_rdm_pkt_type_of_pke(pkt_entry)); #endif @@ -756,7 +754,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc(struct efa_ibv_cq *cq, struct efa_rdm_e efa_rdm_tracepoint(poll_cq_ope, pkt_entry->ope->msg_id, (size_t) pkt_entry->ope->cq_entry.op_context, pkt_entry->ope->total_len, pkt_entry->ope->cq_entry.tag, - pkt_entry->ope->peer ? 
pkt_entry->ope->peer->conn->fi_addr : FI_ADDR_NOTAVAIL, + pkt_entry->ope->peer ? pkt_entry->ope->peer->av_entry->fi_addr : FI_ADDR_NOTAVAIL, efa_rdm_pkt_type_of_pke(pkt_entry)); #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 8684bdf7305..2e05fcad221 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -54,6 +54,7 @@ struct efa_rdm_ep_queued_copy { struct efa_rdm_ep { struct efa_base_ep base_ep; + struct efa_proto_av *proto_av; /* set during fi_ep_bind, avoids container_of on hot path */ /* self_ah necessary for local reads when application does not insert * its own address into the AV */ @@ -561,15 +562,7 @@ void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep); char ep_addr_str[OFI_ADDRSTRLEN] = {0}; \ efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &(size_t){sizeof ep_addr_str}); -static inline -fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr) -{ - struct efa_conn *conn; - - assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn(ep->base_ep.av, addr); - return conn ? 
conn->shm_fi_addr : FI_ADDR_NOTAVAIL; -} +fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr); static inline size_t efa_rdm_ep_get_available_tx_pkts(struct efa_rdm_ep *ep) { diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 5020c487bc6..64e8b4c83d4 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -3,6 +3,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_ep.h" #include "efa_rdm_cq.h" #include "efa_rdm_srx.h" @@ -259,7 +260,7 @@ int efa_rdm_ep_create_buffer_pools(struct efa_rdm_ep *ep) goto err_free; ret = ofi_bufpool_create(&ep->peer_map_entry_pool, - sizeof(struct efa_conn_ep_peer_map_entry), + sizeof(struct efa_proto_av_entry_ep_peer_map_entry), EFA_RDM_BUFPOOL_ALIGNMENT, 0, /* no limit to max_cnt */ EFA_RDM_EP_MIN_PEER_POOL_SIZE, @@ -660,10 +661,12 @@ static int efa_rdm_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) if (ret) return ret; + efa_rdm_ep->proto_av = container_of(av, struct efa_proto_av, efa_av); + /* Bind shm provider endpoint & shm av */ if (efa_rdm_ep->shm_ep) { - assert(av->shm_rdm_av); - ret = fi_ep_bind(efa_rdm_ep->shm_ep, &av->shm_rdm_av->fid, flags); + assert(efa_rdm_ep->proto_av->shm_rdm_av); + ret = fi_ep_bind(efa_rdm_ep->shm_ep, &efa_rdm_ep->proto_av->shm_rdm_av->fid, flags); if (ret) return ret; } @@ -722,9 +725,9 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) struct efa_rdm_ope *rxe; struct efa_rdm_ope *txe; struct efa_rdm_peer *peer; - struct util_av_entry *util_av_entry; - struct efa_av_entry *av_entry; - struct efa_conn_ep_peer_map_entry *peer_map_entry; + struct efa_proto_av_entry *proto_entry; + struct efa_proto_av_entry_ep_peer_map_entry *pm_entry; + /* * Destruct peers first so overflow packets are properly @@ -735,28 +738,24 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) struct efa_rdm_peer, peer, 
ep_peer_list_entry, tmp) { - if (peer->conn->fi_addr != FI_ADDR_UNSPEC) { - util_av_entry = ofi_bufpool_get_ibuf( - efa_rdm_ep->base_ep.av->util_av.av_entry_pool, - peer->conn->fi_addr); + if (peer->av_entry->fi_addr != FI_ADDR_NOTAVAIL) { + proto_entry = efa_proto_av_addr_to_entry( + efa_rdm_ep->proto_av, peer->av_entry->fi_addr); } else { - assert(peer->conn->implicit_fi_addr != FI_ADDR_UNSPEC); + assert(peer->av_entry->implicit_fi_addr != FI_ADDR_NOTAVAIL); - util_av_entry = ofi_bufpool_get_ibuf( - efa_rdm_ep->base_ep.av->util_av_implicit.av_entry_pool, - peer->conn->implicit_fi_addr); + proto_entry = efa_proto_av_addr_to_entry_implicit( + efa_rdm_ep->proto_av, peer->av_entry->implicit_fi_addr); } dlist_remove(&peer->ep_peer_list_entry); efa_rdm_peer_destruct(peer, efa_rdm_ep); - peer_map_entry = container_of( - peer, struct efa_conn_ep_peer_map_entry, peer); - - av_entry = (struct efa_av_entry *) util_av_entry->data; - HASH_DEL(av_entry->conn.ep_peer_map, peer_map_entry); - ofi_buf_free(peer_map_entry); + pm_entry = container_of( + peer, struct efa_proto_av_entry_ep_peer_map_entry, peer); + HASH_DEL(proto_entry->ep_peer_map, pm_entry); + ofi_buf_free(pm_entry); } #if ENABLE_DEBUG @@ -803,7 +802,6 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) efa_rdm_txe_release(txe); } - if (efa_rdm_ep->ope_pool) ofi_bufpool_destroy(efa_rdm_ep->ope_pool); @@ -1082,7 +1080,7 @@ static int efa_rdm_ep_close(struct fid *fid) efa_rdm_ep_remove_cntr_ibv_cq_poll_list(&efa_rdm_ep->base_ep); if (efa_rdm_ep->self_ah) - efa_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false); + efa_proto_ah_release(efa_rdm_ep->base_ep.domain, efa_rdm_ep->self_ah, false); efa_rdm_ep_deregister_ibv_cqs(efa_rdm_ep); @@ -1184,7 +1182,6 @@ int efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) { int ret, retv = 0; struct efa_domain *efa_domain; - struct efa_av *efa_av; struct efa_rdm_cq *efa_rdm_cq; @@ -1194,14 +1191,13 @@ int 
efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep) retv = ret; } - efa_av = efa_rdm_ep->base_ep.av; - if (efa_av->shm_rdm_av) { - ret = fi_close(&efa_av->shm_rdm_av->fid); + if (efa_rdm_ep->proto_av->shm_rdm_av) { + ret = fi_close(&efa_rdm_ep->proto_av->shm_rdm_av->fid); if (ret) { EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm av: %s\n", fi_strerror(-ret)); retv = ret; } - efa_av->shm_rdm_av = NULL; + efa_rdm_ep->proto_av->shm_rdm_av = NULL; } efa_rdm_cq = container_of(efa_rdm_ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, efa_cq.util_cq); @@ -1354,7 +1350,7 @@ static inline int efa_rdm_ep_create_self_ah(struct efa_rdm_ep *rdm_ep) { - rdm_ep->self_ah = efa_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false); + rdm_ep->self_ah = efa_proto_ah_alloc(rdm_ep->base_ep.domain, rdm_ep->base_ep.src_addr.raw, false); return rdm_ep->self_ah ? 0 : -FI_EINVAL; } diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 834519802bd..5e6e76da1c0 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -9,6 +9,7 @@ #include #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_msg.h" #include "efa_rdm_rma.h" #include "efa_rdm_atomic.h" @@ -37,12 +38,10 @@ struct efa_ep_addr *efa_rdm_ep_raw_addr(struct efa_rdm_ep *ep) */ int32_t efa_rdm_ep_get_peer_ahn(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_av *efa_av; - struct efa_conn *efa_conn; + struct efa_proto_av_entry *entry; - efa_av = ep->base_ep.av; - efa_conn = efa_av_addr_to_conn(efa_av, addr); - return efa_conn ? efa_conn->ah->ahn : -1; + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + return entry ? 
entry->ah->ahn : -1; } @@ -74,18 +73,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer(struct efa_rdm_ep *ep, fi_addr_t addr) */ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_conn *conn; - struct efa_conn_ep_peer_map_entry *map_entry; + struct efa_proto_av_entry *entry; + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; struct efa_rdm_peer *peer; assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn(ep->base_ep.av, addr); - - if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + if (!entry) return NULL; - peer = efa_conn_ep_peer_map_lookup(conn, ep); + peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep); if (peer) return peer; @@ -100,9 +98,9 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr memset(map_entry, 0, sizeof(*map_entry)); map_entry->ep_ptr = ep; - efa_rdm_peer_construct(&map_entry->peer, ep, conn); + efa_rdm_peer_construct(&map_entry->peer, ep, entry); - efa_conn_ep_peer_map_insert(conn, map_entry); + efa_proto_av_entry_ep_peer_map_insert(entry, map_entry); dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list); @@ -119,18 +117,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_explicit(struct efa_rdm_ep *ep, fi_addr */ struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr_t addr) { - struct efa_conn *conn; + struct efa_proto_av_entry *entry; struct efa_rdm_peer *peer; - struct efa_conn_ep_peer_map_entry *map_entry; + struct efa_proto_av_entry_ep_peer_map_entry *map_entry; assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); - conn = efa_av_addr_to_conn_implicit(ep->base_ep.av, addr); - - if (OFI_UNLIKELY(addr == FI_ADDR_NOTAVAIL)) + entry = efa_proto_av_addr_to_entry_implicit(ep->proto_av, addr); + if (!entry) return NULL; - peer = efa_conn_ep_peer_map_lookup(conn, ep); + peer = efa_proto_av_entry_ep_peer_map_lookup(entry, ep); 
if (peer) goto out; @@ -145,17 +142,17 @@ struct efa_rdm_peer *efa_rdm_ep_get_peer_implicit(struct efa_rdm_ep *ep, fi_addr memset(map_entry, 0, sizeof(*map_entry)); map_entry->ep_ptr = ep; - efa_rdm_peer_construct(&map_entry->peer, ep, conn); + efa_rdm_peer_construct(&map_entry->peer, ep, entry); peer = &map_entry->peer; - efa_conn_ep_peer_map_insert(conn, map_entry); + efa_proto_av_entry_ep_peer_map_insert(entry, map_entry); dlist_insert_tail(&map_entry->peer.ep_peer_list_entry, &ep->ep_peer_list); out: assert(peer); /* Move to the front of the LRU list */ - efa_av_implicit_av_lru_conn_move(ep->base_ep.av, peer->conn); + efa_proto_av_implicit_av_lru_entry_move(ep->proto_av, peer->av_entry); return peer; } @@ -532,7 +529,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent "initializing backoff timeout for peer fi_addr: " "%" PRIu64 " implicit fi_addr: %" PRIu64 " timeout: %ld rnr_queued_pkts: %d\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt); } else { peer->rnr_backoff_wait_time = MIN(peer->rnr_backoff_wait_time * 2, @@ -541,7 +538,7 @@ void efa_rdm_ep_queue_rnr_pkt(struct efa_rdm_ep *ep, struct efa_rdm_pke *pkt_ent "increasing backoff timeout for peer fi_addr: %" PRIu64 " implicit fi_addr %" PRIu64 " to %ld rnr_queued_pkts: %d\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, peer->rnr_backoff_wait_time, peer->rnr_queued_pkt_cnt); } } @@ -575,7 +572,7 @@ static ssize_t efa_rdm_ep_handshake_common(struct efa_rdm_ep *ep, struct efa_rdm (peer->flags & EFA_RDM_PEER_REQ_SENT))) return 0; - msg.addr = peer->conn->fi_addr; + msg.addr = peer->av_entry->fi_addr; txe = ofi_buf_alloc(ep->ope_pool); if (OFI_UNLIKELY(!txe)) { @@ -712,7 +709,7 @@ void efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe if (OFI_UNLIKELY(err)) { 
EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer fi_addr: %ld implicit fi_addr %ld. %s\n", - peer->conn->fi_addr, peer->conn->implicit_fi_addr, fi_strerror(-err)); + peer->av_entry->fi_addr, peer->av_entry->implicit_fi_addr, fi_strerror(-err)); efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE); return; } @@ -1008,3 +1005,12 @@ int efa_rdm_ep_enforce_handshake_for_txe(struct efa_rdm_ep *ep, struct efa_rdm_o } return FI_SUCCESS; } + +fi_addr_t efa_rdm_ep_get_explicit_shm_fi_addr(struct efa_rdm_ep *ep, fi_addr_t addr) +{ + struct efa_proto_av_entry *entry; + + assert(ofi_genlock_held(&ep->base_ep.domain->srx_lock)); + entry = efa_proto_av_addr_to_entry(ep->proto_av, addr); + return entry ? entry->shm_fi_addr : FI_ADDR_NOTAVAIL; +} diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index ab5e0fb8f63..25aa40efd2b 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -209,7 +209,7 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, const struct fi_msg *msg EFA_DBG(FI_LOG_EP_DATA, "peer: %" PRIu64 ": size %lu tag: %lx op: %x flags: %lx msg_id: %" PRIu32 "\n", - peer->conn->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id); + peer->av_entry->fi_addr, txe->total_len, tag, op, fi_flags, txe->msg_id); efa_rdm_tracepoint(send_begin, txe->msg_id, (size_t) txe->cq_entry.op_context, txe->total_len); @@ -794,7 +794,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, peer_srx = util_get_peer_srx(ep->peer_srx_ep); peer = (*pkt_entry_ptr)->peer; - attr.addr = peer->conn->fi_addr; + attr.addr = peer->av_entry->fi_addr; attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr); attr.tag = 0; ret = peer_srx->owner_ops->get_msg(peer_srx, &attr, &peer_rxe); @@ -832,7 +832,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, efa_rdm_tracepoint(msg_recv_unexpected_nontagged, (uint64_t) orig_pke_ptr, (*pkt_entry_ptr)->pkt_size, rxe->msg_id, (size_t) 
rxe->cq_entry.op_context, - rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr); #endif } else { /* Unexpected errors */ @@ -882,7 +882,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, peer = (*pkt_entry_ptr)->peer; peer_srx = util_get_peer_srx(ep->peer_srx_ep); - attr.addr = peer->conn->fi_addr; + attr.addr = peer->av_entry->fi_addr; attr.msg_size = efa_rdm_pke_get_rtm_msg_length(*pkt_entry_ptr); attr.tag = efa_rdm_pke_get_rtm_tag(*pkt_entry_ptr); @@ -927,7 +927,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, efa_rdm_tracepoint(msg_recv_unexpected_tagged, (uint64_t) orig_pke_ptr, (*pkt_entry_ptr)->pkt_size, rxe->msg_id, (size_t) rxe->cq_entry.op_context, - rxe->total_len, rxe->tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->tag, rxe->peer->av_entry->fi_addr); #endif } else { /* Unexpected errors */ diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index feed792c600..a378446c1b2 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -876,7 +876,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) " implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %" PRIu64 " incoming message size: %" PRIu64 " receiving buffer size: %zu\n", - rxe->peer->conn->fi_addr, rxe->peer->conn->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag, + rxe->peer->av_entry->fi_addr, rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id, rxe->msg_id, rxe->cq_entry.tag, rxe->total_len, rxe->cq_entry.len); ret = ofi_cq_write_error_trunc(ep->base_ep.util_ep.rx_cq, @@ -909,13 +909,13 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) " implicit fi_addr: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx total_len: %" PRIu64 "\n", - rxe->peer->conn->fi_addr, - rxe->peer->conn->implicit_fi_addr, rxe->rx_id, + rxe->peer->av_entry->fi_addr, + rxe->peer->av_entry->implicit_fi_addr, rxe->rx_id, 
rxe->msg_id, rxe->cq_entry.tag, rxe->total_len); efa_rdm_tracepoint(recv_end, rxe->msg_id, (size_t) rxe->cq_entry.op_context, - rxe->total_len, rxe->cq_entry.tag, rxe->peer->conn->fi_addr); + rxe->total_len, rxe->cq_entry.tag, rxe->peer->av_entry->fi_addr); if (ep->base_ep.util_ep.caps & FI_SOURCE) @@ -926,7 +926,7 @@ void efa_rdm_rxe_report_completion(struct efa_rdm_ope *rxe) rxe->cq_entry.buf, rxe->cq_entry.data, rxe->cq_entry.tag, - rxe->peer->conn->fi_addr); + rxe->peer->av_entry->fi_addr); else ret = ofi_cq_write(rx_cq, rxe->cq_entry.op_context, @@ -1010,13 +1010,13 @@ void efa_rdm_txe_report_completion(struct efa_rdm_ope *txe) "Writing send completion for txe to peer: %" PRIu64 " tx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx len: %" PRIu64 "\n", - txe->peer->conn->fi_addr, txe->tx_id, txe->msg_id, + txe->peer->av_entry->fi_addr, txe->tx_id, txe->msg_id, txe->cq_entry.tag, txe->total_len); efa_rdm_tracepoint(send_end, txe->msg_id, (size_t) txe->cq_entry.op_context, - txe->total_len, txe->cq_entry.tag, txe->peer->conn->fi_addr); + txe->total_len, txe->cq_entry.tag, txe->peer->av_entry->fi_addr); /* TX completions should not send peer address to util_cq */ if (txe->ep->base_ep.util_ep.caps & FI_SOURCE) diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 9188f5b96ec..4809bed5f75 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -3,6 +3,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_rdm_pkt_type.h" #include "efa_rdm_pke_rtm.h" #include "efa_rdm_pke_utils.h" @@ -17,14 +18,14 @@ * @param[in] conn efa conn object * @relates efa_rdm_peer */ -void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn) +void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry) { int ret; memset(peer, 0, sizeof(struct efa_rdm_peer)); peer->ep = ep; - peer->conn = conn; - 
peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, conn->ep_addr); + peer->av_entry = av_entry; + peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, efa_proto_av_entry_ep_addr(av_entry)); peer->host_id = peer->is_self ? ep->host_id : 0; /* Peer host id is exchanged via handshake */ peer->num_runt_bytes_in_flight = 0; /* allocate the robuf circular queue from the pre-allocated buffer pool */ @@ -39,7 +40,7 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st dlist_init(&peer->rxe_list); dlist_init(&peer->overflow_pke_list); - if (conn->shm_fi_addr != FI_ADDR_NOTAVAIL) { + if (av_entry->shm_fi_addr != FI_ADDR_NOTAVAIL) { peer->is_local = 1; } diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index caf804111be..ac68d58f54d 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -9,6 +9,8 @@ #include "efa_rdm_protocol.h" #include "efa_rdm_rxe_map.h" +struct efa_proto_av_entry; + #define EFA_RDM_PEER_DEFAULT_REORDER_BUFFER_SIZE (16) #define EFA_RDM_PEER_REQ_SENT BIT_ULL(0) /**< A REQ packet has been sent to the peer (peer should send a handshake back) */ @@ -90,7 +92,7 @@ struct efa_rdm_peer { bool is_self; /**< flag indicating whether the peer is the endpoint itself */ bool is_local; /**< flag indicating wehther the peer is local (on the same instance) */ uint32_t device_version; /**< EFA device version */ - struct efa_conn *conn; /**< pointer to efa_conn struct in the av entry */ + struct efa_proto_av_entry *av_entry; /**< pointer to efa_proto_av_entry in the av entry */ uint64_t host_id; /* Optional peer host id. 
Default 0 */ /** * @brief reorder buffer @@ -239,9 +241,9 @@ bool efa_rdm_peer_need_connid(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_REQUEST_CONNID_HEADER); } -struct efa_conn; +struct efa_proto_av_entry; -void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_conn *conn); +void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, struct efa_proto_av_entry *av_entry); void efa_rdm_peer_destruct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep); @@ -260,6 +262,6 @@ int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, struct efa_rdm_e /* Macro for getting peer address string */ #define EFA_RDM_GET_PEER_ADDR_STR(ep, peer, peer_addr_str) \ char peer_addr_str[OFI_ADDRSTRLEN] = {0}; \ - efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str}); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &(size_t){sizeof peer_addr_str}); #endif /* EFA_RDM_PEER_H */ diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index e45456e2cce..4855bcd5e63 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -10,6 +10,7 @@ #include "efa.h" #include "efa_av.h" +#include "rdm/efa_proto_av.h" #include "efa_data_path_ops.h" #include "efa_tp.h" @@ -189,8 +190,8 @@ void efa_rdm_pke_release_tx(struct efa_rdm_pke *pkt_entry) EFA_DBG(FI_LOG_EP_DATA, "reset backoff timer for peer fi_addr: %" PRIu64 " implicit fi_addr: %" PRIu64 "\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr); + pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr); } efa_rdm_pke_release(pkt_entry); @@ -454,7 +455,7 @@ static inline uint64_t efa_rdm_pke_get_wr_id(struct efa_rdm_pke *pkt_entry) ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, int pkt_entry_cnt, uint64_t flags) { - struct efa_conn *conn; + struct 
efa_proto_av_entry *av_entry; struct efa_rdm_ep *ep; struct efa_rdm_pke *pkt_entry; struct efa_rdm_peer *peer; @@ -477,8 +478,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, if (peer->flags & EFA_RDM_PEER_IN_BACKOFF) return -FI_EAGAIN; - conn = pkt_entry_vec[0]->peer->conn; - assert(conn && conn->ep_addr); + av_entry = pkt_entry_vec[0]->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); for (pkt_idx = 0; pkt_idx < pkt_entry_cnt; ++pkt_idx) { pkt_entry = pkt_entry_vec[pkt_idx]; @@ -521,8 +522,8 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, qpn = peer->user_recv_qp.qpn; qkey = peer->user_recv_qp.qkey; } else { - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } /* This will make efa_qp_post_send not ring the doorbell until the last itertion of the loop */ @@ -533,7 +534,7 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, ret = efa_qp_post_send(ep->base_ep.qp, sg_list, inline_data_list, iov_cnt, use_inline, - wr_id, cq_data, flags_in_loop, conn->ah, + wr_id, cq_data, flags_in_loop, av_entry->ah, qpn, qkey); if (OFI_UNLIKELY(ret)) @@ -580,7 +581,7 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, { struct efa_rdm_ep *ep; struct efa_qp *qp; - struct efa_conn *conn; + struct efa_proto_av_entry *av_entry; struct ibv_sge sge; struct efa_rdm_ope *txe; int err = 0; @@ -599,11 +600,11 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, qpn = qp->qp_num; qkey = qp->qkey; } else { - conn = pkt_entry->peer->conn; - assert(conn && conn->ep_addr); - ah = conn->ah; - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + av_entry = pkt_entry->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); + ah = av_entry->ah; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } sge.addr = (uint64_t)local_buf; @@ -652,7 
+653,7 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_ep *ep; struct efa_qp *qp; - struct efa_conn *conn; + struct efa_proto_av_entry *av_entry; struct ibv_sge sge; struct efa_rdm_rma_context_pkt *rma_context_pkt; struct efa_rdm_ope *txe; @@ -689,11 +690,11 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) qpn = qp->qp_num; qkey = qp->qkey; } else { - conn = pkt_entry->peer->conn; - assert(conn && conn->ep_addr); - ah = conn->ah; - qpn = conn->ep_addr->qpn; - qkey = conn->ep_addr->qkey; + av_entry = pkt_entry->peer->av_entry; + assert(av_entry && efa_proto_av_entry_ep_addr(av_entry)); + ah = av_entry->ah; + qpn = efa_proto_av_entry_ep_addr(av_entry)->qpn; + qkey = efa_proto_av_entry_ep_addr(av_entry)->qkey; } wr_id = efa_rdm_pke_get_wr_id(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index 2ed75f38a00..452d09e5c2e 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -93,8 +93,8 @@ void efa_rdm_pke_handle_handshake_recv(struct efa_rdm_pke *pkt_entry) EFA_INFO(FI_LOG_CQ, "HANDSHAKE received from peer with explicit fi_addr %" PRIu64 " implicit fi_addr %" PRIu64 "\n", - pkt_entry->peer->conn->fi_addr, - pkt_entry->peer->conn->implicit_fi_addr); + pkt_entry->peer->av_entry->fi_addr, + pkt_entry->peer->av_entry->implicit_fi_addr); handshake_pkt = (struct efa_rdm_handshake_hdr *)pkt_entry->wiredata; diff --git a/prov/efa/src/rdm/efa_rdm_pke_print.c b/prov/efa/src/rdm/efa_rdm_pke_print.c index 529fddfe0f3..37b80505355 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_print.c +++ b/prov/efa/src/rdm/efa_rdm_pke_print.c @@ -154,7 +154,7 @@ static void efa_rdm_pke_print_eager_tag_rtm(char *prefix, tag_rtm_hdr = (struct efa_rdm_eager_tagrtm_hdr *) pkt_entry->wiredata; if (pkt_entry->peer) - fi_addr = pkt_entry->peer->conn->fi_addr; + fi_addr = pkt_entry->peer->av_entry->fi_addr; EFA_DBG(FI_LOG_EP_DATA, "%s EFA RDM RTM packet - type: %" PRIu32 " version: 
%" PRIu8 @@ -195,7 +195,7 @@ static void efa_rdm_pke_print_longread_rtw(char *prefix, " msg_length: %" PRIu64 " send_id: %" PRIu32 " read_iov_count: %" PRIu32 "\n", prefix, base_hdr->type, base_hdr->version, base_hdr->flags, - pkt_entry->peer->conn->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count, + pkt_entry->peer->av_entry->fi_addr, base_hdr->msg_id, rtw_hdr->rma_iov_count, rtw_hdr->msg_length, rtw_hdr->send_id, rtw_hdr->read_iov_count); efa_rdm_pke_print_fi_rma_iov("rma_iov", rtw_hdr->rma_iov_count, diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index de0f3d4c478..894a38b745b 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -119,7 +119,7 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, len = sizeof(ep_addr_str); efa_base_ep_raw_addr_str(&ep->base_ep, ep_addr_str, &len); len = sizeof(peer_addr_str); - efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->conn->fi_addr, peer_addr_str, &len); + efa_base_ep_get_peer_raw_addr_str(&ep->base_ep, peer->av_entry->fi_addr, peer_addr_str, &len); if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { strcpy(local_host_id_str, "N/A"); diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index fdbc2bc71e5..927f130d552 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -2,8 +2,6 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" -#include "efa_rdm_cq.h" -#include "efa_rdm_pke_req.h" #include "efa_av.h" /** @@ -78,7 +76,7 @@ void test_av_insert_duplicate_gid(struct efa_resource **state) assert_int_not_equal(addr1, addr2); } -static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, bool multi_av) +static void efa_ah_cnt_av_efa_impl(struct efa_resource **state, bool multi_av) { struct efa_resource *resource = *state; struct efa_ep_addr raw_addr = {0}; @@ -90,23 +88,19 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo struct fi_av_attr av_attr = {0}; struct fid_av *av1 = NULL, *av2 = NULL; - efa_unit_test_resource_construct(resource, FI_EP_RDM, efa_fabric ? EFA_FABRIC_NAME : EFA_DIRECT_FABRIC_NAME); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(err, 0); - /* So far we should only have 1 ah from ep self ah, and its refcnt is 1 for efa fabric */ - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 1 : 0); + /* So far we should only have 1 ah from ep self ah, and its refcnt is 1 */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); - if (efa_fabric) { - assert_non_null(efa_ah); - assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 
1 : 0); - assert_int_equal(efa_ah->implicit_refcnt, 0); - } else { - assert_null(efa_ah); - } + assert_non_null(efa_ah); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); if (multi_av) { /* We open 2 avs with the same domain (PD) so they should share same AH given the same GID */ @@ -132,15 +126,10 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo assert_int_not_equal(addr1, addr2); } - if (!efa_fabric) { - HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); - assert_non_null(efa_ah); - } - - /* So far we should still have 1 ah, and its refcnt is 3 for efa fabric (including self AH) and 2 for efa-direct fabric) */ + /* So far we should still have 1 ah, and its refcnt is 3 (including self AH) */ assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, efa_fabric ? 3 : 2); - assert_int_equal(efa_ah->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 3); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); if (multi_av) { /* ah refcnt should be decremented to 1 after av close */ @@ -152,15 +141,87 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0); } - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), efa_fabric ? 
1 : 0); - if (efa_fabric) { - /* efa_ah is still alive because self-AH holds a reference */ - assert_int_equal(efa_ah->explicit_refcnt, 1); - assert_int_equal(efa_ah->implicit_refcnt, 0); + /* efa_ah is still alive because self-AH holds a reference */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + + /* ah map should be empty now after closing ep which destroys the self ah */ + assert_int_equal(fi_close(&resource->ep->fid), 0); + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + /* Reset to NULL to avoid test reaper closing again */ + resource->ep = NULL; +} + +static void efa_ah_cnt_av_efa_direct_impl(struct efa_resource **state, bool multi_av) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t addr1, addr2; + int err, num_addr; + struct efa_domain *efa_domain; + struct efa_ah *efa_ah = NULL; + struct fi_av_attr av_attr = {0}; + struct fid_av *av1 = NULL, *av2 = NULL; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); + + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + /* efa-direct does not create a self AH, so ah_map should be empty */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); + assert_null(efa_ah); + + if (multi_av) { + /* We open 2 avs with the same domain (PD) so they should share same AH given the same GID */ + assert_int_equal(fi_av_open(resource->domain, &av_attr, &av1, NULL), 0); + assert_int_equal(fi_av_open(resource->domain, &av_attr, &av2, NULL), 0); } - /* else: efa_ah has been freed, do not dereference */ - /* ah map should be empty now after 
closing ep which destroys the self ah for efa fabric */ + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + + num_addr = fi_av_insert(multi_av ? av1 : resource->av, &raw_addr, 1, &addr1, 0, NULL); + assert_int_equal(num_addr, 1); + + raw_addr.qpn = 2; + raw_addr.qkey = 0x5678; + num_addr = fi_av_insert(multi_av ? av2 : resource->av, &raw_addr, 1, &addr2, 0, NULL); + assert_int_equal(num_addr, 1); + + if (multi_av) { + /* They should be equal as 0 since they are in different avs */ + assert_int_equal(addr1, addr2); + } else { + assert_int_not_equal(addr1, addr2); + } + + HASH_FIND(hh, efa_domain->ah_map, raw_addr.raw, EFA_GID_LEN, efa_ah); + assert_non_null(efa_ah); + + /* So far we should still have 1 ah, and its refcnt is 2 */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); + assert_int_equal(efa_ah->refcnt, 2); + + if (multi_av) { + /* ah refcnt should be decremented to 0 after av close */ + assert_int_equal(fi_close(&av1->fid), 0); + assert_int_equal(fi_close(&av2->fid), 0); + } else { + /* ah refcnt should be decremented to 0 after av entry removals */ + assert_int_equal(fi_av_remove(resource->av, &addr1, 1, 0), 0); + assert_int_equal(fi_av_remove(resource->av, &addr2, 1, 0), 0); + } + + /* efa_ah has been freed (no self AH holding a reference on efa-direct) */ + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + + /* ah map should still be empty after closing ep */ assert_int_equal(fi_close(&resource->ep->fid), 0); assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); /* Reset to NULL to avoid test reaper closing again */ @@ -169,22 +230,22 @@ static void efa_ah_cnt_av_impl(struct efa_resource **state, bool efa_fabric, boo void test_efa_ah_cnt_one_av_efa(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, true, false); + efa_ah_cnt_av_efa_impl(state, false); } void test_efa_ah_cnt_one_av_efa_direct(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, false, false); + efa_ah_cnt_av_efa_direct_impl(state, false); } void 
test_efa_ah_cnt_multi_av_efa(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, true, true); + efa_ah_cnt_av_efa_impl(state, true); } void test_efa_ah_cnt_multi_av_efa_direct(struct efa_resource **state) { - efa_ah_cnt_av_impl(state, false, true); + efa_ah_cnt_av_efa_direct_impl(state, true); } /** @@ -223,7 +284,6 @@ void test_av_multiple_ep_impl(struct efa_resource **state, char *fabric_name) fi_close(&ep2->fid); } - /** * @brief This test verifies that multiple endpoints can bind to the same AV * for the efa fabric @@ -246,724 +306,81 @@ void test_av_multiple_ep_efa_direct(struct efa_resource **state) return test_av_multiple_ep_impl(state, EFA_DIRECT_FABRIC_NAME); } -static void test_av_verify_av_hash_cnt(struct efa_av *av, - int explicit_cur_av_count, - int explicit_prv_av_count, - int implicit_cur_av_count, - int implicit_prv_av_count) -{ - assert_int_equal(HASH_CNT(hh, av->util_av.hash), - explicit_cur_av_count + explicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, av->cur_reverse_av), - explicit_cur_av_count); - assert_int_equal(HASH_CNT(hh, av->prv_reverse_av), - explicit_prv_av_count); - - assert_int_equal(HASH_CNT(hh, av->util_av_implicit.hash), - implicit_cur_av_count + implicit_prv_av_count); - assert_int_equal(HASH_CNT(hh, av->cur_reverse_av_implicit), - implicit_cur_av_count); - assert_int_equal(HASH_CNT(hh, av->prv_reverse_av_implicit), - implicit_prv_av_count); -} - /** - * @brief This test removes a peer and inserts it again + * @brief Test base AV (efa-direct) insert, lookup, remove cycle * * @param[in] state struct efa_resource that is managed by the framework */ -void test_av_reinsertion(struct efa_resource **state) +void test_av_insert_remove_lookup_efa_direct(struct efa_resource **state) { struct efa_resource *resource = *state; - struct efa_rdm_peer *peer; - struct efa_ep_addr raw_addr, raw_addr_2; + struct efa_ep_addr raw_addr = {0}, raw_addr_out = {0}; size_t raw_addr_len = sizeof(struct efa_ep_addr); fi_addr_t fi_addr; struct 
efa_av *av; - struct efa_rdm_ep *efa_rdm_ep; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 174; - raw_addr.qkey = 0x1234; + struct efa_av_entry *entry; + int err, num_addr; + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); - peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->conn->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); - - err = fi_av_remove(resource->av, &fi_addr, 1, 0); + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + raw_addr.qpn = 7; + raw_addr.qkey = 0xABCD; - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); assert_int_equal(fi_addr, 0); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); + assert_int_equal(av->used, 1); + + /* Verify entry is accessible and fields are correct */ + entry = efa_av_addr_to_entry(av, fi_addr); + assert_non_null(entry); + assert_non_null(entry->ah); + assert_int_equal(entry->fi_addr, fi_addr); + assert_int_equal(efa_av_entry_ep_addr(entry)->qpn, 7); + 
assert_int_equal(efa_av_entry_ep_addr(entry)->qkey, 0xABCD); + + /* Lookup should return the same address */ + raw_addr_len = sizeof(raw_addr_out); + err = fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len); assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); - - peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); - assert_int_equal(peer->conn->fi_addr, fi_addr); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); + assert_int_equal(raw_addr_out.qpn, 7); + assert_int_equal(raw_addr_out.qkey, 0xABCD); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_out), 1); + /* Remove and verify */ err = fi_av_remove(resource->av, &fi_addr, 1, 0); assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); -} - -/** - * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then - * remove the first-inserted peer before the second. This reproduces the bug - * in efa_av_reverse_av_remove() where the code blindly deletes the - * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to - * a different (newer) conn. Removing the surviving second peer afterwards - * then hits a NULL prv_reverse_av_entry and SEGVs. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t fi_addr1, fi_addr2; - struct efa_av *av; - struct efa_rdm_ep *efa_rdm_ep; - uint32_t ahn; - int err; + assert_int_equal(av->used, 0); - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + /* Entry should be NULL after remove */ + entry = efa_av_addr_to_entry(av, fi_addr); + assert_null(entry); - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, - base_ep.util_ep.ep_fid); - ahn = efa_rdm_ep->self_ah->ahn; - - /* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */ - raw_addr.qpn = 100; - raw_addr.qkey = 0xAAAA; - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL); - assert_int_equal(err, 1); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); - /* cur_reverse_av (ahn, 100) -> conn1 (fi_addr1) */ - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), - fi_addr1); - - /* Insert peer2: same GID and qpn, different qkey. This pushes peer1's - * reverse-AV entry from cur_reverse_av into prv_reverse_av. */ - raw_addr.qpn = 100; - raw_addr.qkey = 0xBBBB; - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL); - assert_int_equal(err, 1); - assert_int_not_equal(fi_addr1, fi_addr2); - test_av_verify_av_hash_cnt(av, 1, 1, 0, 0); - /* cur_reverse_av (ahn, 100) now points to conn2 (fi_addr2); peer1 is - * in prv_reverse_av keyed by its own qkey. */ - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), - fi_addr2); - - /* Remove peer1 first. 
Without the fix this would incorrectly delete - * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. */ - err = fi_av_remove(resource->av, &fi_addr1, 1, 0); - assert_int_equal(err, 0); - /* peer1's prv entry is gone; peer2's cur entry must still be intact. */ - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), - fi_addr2); - - /* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry - * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */ - err = fi_av_remove(resource->av, &fi_addr2, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); - assert_int_equal(efa_av_reverse_lookup_rdm(av, ahn, 100, NULL), - FI_ADDR_NOTAVAIL); + /* Lookup should fail after remove */ + err = fi_av_lookup(resource->av, fi_addr, &raw_addr_out, &raw_addr_len); + assert_int_not_equal(err, 0); } /** - * @brief Generate a peer with a unique QPN and a random QKEY and insert it - * into the implicit AV - * - * The QPN is drawn from a static monotonic counter so every peer minted by - * this helper has a distinct (ahn, qpn) key in the reverse AV. Callers rely - * on this uniqueness to exercise LRU ordering and eviction behavior without - * tripping over the provider's QPN-collision path. 
+ * @brief Test base AV (efa-direct) addr_to_entry returns NULL for invalid fi_addr * * @param[in] state struct efa_resource that is managed by the framework */ -static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource) -{ - struct efa_ep_addr raw_addr; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - fi_addr_t implicit_fi_addr, test_addr; - struct efa_av *av; - uint32_t ahn; - int err; - - av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - static uint16_t next_qpn = 0; - raw_addr.qpn = next_qpn++; - raw_addr.qkey = rand(); - ahn = efa_rdm_ep->self_ah->ahn; - - /* Manually insert into implicit AV */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - - err = efa_av_insert_one(av, &raw_addr, &implicit_fi_addr, 0, NULL, true, true); - - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); - - assert_int_equal(peer->conn->implicit_fi_addr, implicit_fi_addr); - assert_int_equal(peer->conn->fi_addr, FI_ADDR_NOTAVAIL); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); - - test_addr = efa_av_reverse_lookup_rdm_implicit(av, ahn, raw_addr.qpn, NULL); - assert_int_equal(test_addr, implicit_fi_addr); - - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - - return peer; -} - -/** - * @brief This test fakes a peer in the implicit AV and closes the AV with an - * implicit peer in it - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit(struct efa_resource **state) +void test_av_base_addr_to_entry_invalid(struct efa_resource **state) { struct efa_resource *resource = *state; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - 
test_av_get_peer_from_implicit_av(resource); -} - -/** - * @brief This test fakes a peer in the implicit AV and verifies that the peer - * is moved to the explicit AV when fi_av_insert is called - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_to_explicit(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_ep_addr raw_addr, raw_addr_2; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer; - fi_addr_t explicit_fi_addr, test_addr; struct efa_av *av; - uint32_t ahn; - int err; + struct efa_av_entry *entry; - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_DIRECT_FABRIC_NAME); av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */ - peer = test_av_get_peer_from_implicit_av(resource); - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - - /* Modify the peer and verify that the peer is moved as-is */ - peer->next_msg_id = 355; - peer->flags |= EFA_RDM_PEER_IN_BACKOFF; - - /* Insert explicitly */ - raw_addr.qpn = peer->conn->ep_addr->qpn; - raw_addr.qkey = peer->conn->ep_addr->qkey; - err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); - test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); - - err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + entry = efa_av_addr_to_entry(av, FI_ADDR_NOTAVAIL); + assert_null(entry); - peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); - assert_int_equal(peer->conn->fi_addr, explicit_fi_addr); - assert_int_equal(peer->conn->implicit_fi_addr, 
FI_ADDR_NOTAVAIL); - assert_int_equal(efa_is_same_addr(&raw_addr, peer->conn->ep_addr), 1); - - ahn = efa_rdm_ep->self_ah->ahn; - test_addr = efa_av_reverse_lookup_rdm(av, ahn, raw_addr.qpn, NULL); - assert_int_equal(test_addr, explicit_fi_addr); - - /* Verify the manually set peer properties above */ - assert_int_equal(peer->next_msg_id, 355); - assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF); - - /* Unset the flag to make fi_av_remove easier */ - peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF; - - err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); - assert_int_equal(err, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); -} - -static void test_av_implicit_av_verify_lru_list_first_last_elements( - struct efa_av *av, struct efa_conn *first_conn_expected, - struct efa_conn *last_conn_expected) -{ - struct dlist_entry *first_entry, *last_entry; - struct efa_conn *first_conn_actual, *last_conn_actual; - - first_entry = av->implicit_av_lru_list.next; - last_entry = av->implicit_av_lru_list.prev; - - first_conn_actual = container_of(first_entry, struct efa_conn, - implicit_av_lru_entry); - last_conn_actual = container_of(last_entry, struct efa_conn, - implicit_av_lru_entry); - - assert_ptr_equal(first_conn_actual, first_conn_expected); - assert_ptr_equal(last_conn_actual, last_conn_expected); -} - -/** - * @brief This test inserts three implicit peers and verifies that the last - * inserted and/or accessed peer is at the tail of the LRU list - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_av_lru_insertion(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer0, *peer1, *peer2; - struct efa_av *av; - fi_addr_t implicit_fi_addr; - uint32_t ahn; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = 
container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Manually insert first address into implicit AV */ - peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); - - /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn); - - /* Manually insert second address into implicit AV */ - peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); - - /* Manually insert third address into implicit AV */ - peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); - - - /* Access peer0 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer0->conn->ep_addr->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 0); - - /* Expected LRU list: HEAD->peer1->peer2->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn); - - /* Access peer2 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer2->conn->ep_addr->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 2); - - /* Expected LRU list: HEAD->peer1->peer0->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer2->conn); - - - /* Access peer1 through repeated AV insertion path */ - 
ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer1->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 1); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer2->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); - - /* Access peer2 through repeated AV insertion path */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer2->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 2); - test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); - - /* Expected LRU list: HEAD->peer0->peer1->peer2 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); -} - -/** - * @brief This test sets the implicit AV size to 2 and inserts four implicit - * peers. It verifies that the least recently used peer is evicted. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_av_implicit_av_lru_eviction(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - struct efa_rdm_ep *efa_rdm_ep; - struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3; - struct efa_ep_addr_hashable *efa_ep_addr_hashable; - struct efa_av *av; - fi_addr_t implicit_fi_addr; - uint32_t ahn; - int err; - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - /* Modify implicit AV size */ - av->implicit_av_size = 2; - - /* Manually insert first address into implicit AV */ - peer0 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); - - /* Expected LRU list: HEAD->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer0->conn); - - /* Manually insert second address into implicit AV */ - peer1 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer0->peer1 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer1->conn); - - /* Access peer0 through the CQ read path */ - ahn = efa_rdm_ep->self_ah->ahn; - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - implicit_fi_addr = efa_av_reverse_lookup_rdm_implicit( - av, ahn, peer0->conn->ep_addr->qpn, NULL); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(implicit_fi_addr, 0); - - /* Expected LRU list: HEAD->peer1->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->conn, peer0->conn); - - /* Manually insert third address into implicit AV */ - peer2 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer0->peer2 */ - 
test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer2->conn); - - /* Verify that peer1 is evicted and added to the evicted hashmap */ - assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 1); - HASH_FIND(hh, av->evicted_peers_hashset, peer1->conn->ep_addr, - sizeof(struct efa_ep_addr), efa_ep_addr_hashable); - assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(peer1->conn->ep_addr, - &efa_ep_addr_hashable->addr), - 1); - - /* Access peer0 through repeated AV insertion path */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, peer0->conn->ep_addr, &implicit_fi_addr, 0, NULL, true, true); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - assert_int_equal(err, 0); - assert_int_equal(implicit_fi_addr, 0); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); - - /* Expected LRU list: HEAD->peer2->peer0 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->conn, peer0->conn); - - /* Manually insert fourth address into implicit AV */ - peer3 = test_av_get_peer_from_implicit_av(resource); - test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); - - /* Verify that peer2 is evicted and added to the evicted hashmap */ - assert_int_equal(HASH_CNT(hh, av->evicted_peers_hashset), 2); - HASH_FIND(hh, av->evicted_peers_hashset, peer2->conn->ep_addr, - sizeof(struct efa_ep_addr), efa_ep_addr_hashable); - assert_non_null(efa_ep_addr_hashable); - assert_int_equal(efa_is_same_addr(peer2->conn->ep_addr, - &efa_ep_addr_hashable->addr), - 1); - - /* Expected LRU list: HEAD->peer0->peer3 */ - test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->conn, peer3->conn); -} - -/** - * @brief This test tests the implicit_refcnt and explicit_refcnt fields of AH - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_refcnt(struct efa_resource **state) -{ - struct efa_resource *resource = *state; - fi_addr_t fi_addr; - struct efa_ep_addr 
raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct efa_rdm_ep *efa_rdm_ep; - struct efa_domain *efa_domain; - struct efa_rdm_peer *peer; - struct efa_av *av; - struct efa_ah *efa_ah = NULL; - int err; - - int allowed_ahs = 1; - - g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; - g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; - - g_self_ah_cnt = 1; - g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; - assert_int_equal(g_ibv_ah_cnt, 0); - - efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); - efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - av = container_of(resource->av, struct efa_av, util_av.av_fid); - - /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */ - assert_int_equal(g_ibv_ah_cnt, 1); - - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); - - /* Manually insert into implicit AV */ - ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); - err = efa_av_insert_one(av, &raw_addr, &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr); - ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); - - efa_ah = peer->conn->ah; - - assert_int_equal(g_ibv_ah_cnt, 2); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, 0); - assert_int_equal(efa_ah->implicit_refcnt, 1); - - /* Move implicit AV entry to explicit AV entry */ - err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - - 
assert_int_equal(g_ibv_ah_cnt, 2); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1); - assert_int_equal(efa_ah->explicit_refcnt, 1); - assert_int_equal(efa_ah->implicit_refcnt, 0); - - err = fi_av_remove(resource->av, &fi_addr, 1, 0); - assert_int_equal(err, 0); - - assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); - - /* Only the self AH should be left */ - assert_int_equal(g_ibv_ah_cnt, 1); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when an explicit AV entry is inserted. It - * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_impl(bool explicit) -{ - fi_addr_t fi_addr; - struct efa_ep_addr raw_addr[2] = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - struct fid_fabric *fabric_fid[2]; - struct fid_domain *domain_fid[2]; - struct fid_ep *ep_fid[2]; - struct fid_cq *cq_fid[2]; - struct fid_av *av_fid[2]; - struct efa_domain *efa_domain[2]; - struct efa_rdm_ep *efa_rdm_ep[2]; - struct efa_rdm_peer *peer; - struct efa_av *efa_av[2]; - struct efa_ah *efa_ah = NULL; - int err; - struct fi_av_attr av_attr = {0}; - struct fi_cq_attr cq_attr = { - .format = FI_CQ_FORMAT_DATA - }; - struct fi_info *hints, *info, *cur; - int num_nic = 0; - - int allowed_ahs = 1; - - g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; - g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; - g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; - - hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME); - fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info); - for (cur = info; cur; cur = cur->next) { - num_nic++; - } - - if (num_nic < 2) { - fi_freeinfo(info); - 
fi_freeinfo(hints); - return; - } - - g_self_ah_cnt = 2; - g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */ - assert_int_equal(g_ibv_ah_cnt, 0); - - cur = info; - for (int i = 0; i < 2; i++) { - err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL); - assert_int_equal(err, 0); - - err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL); - assert_int_equal(err, 0); - - efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid); - - err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL); - assert_int_equal(err, 0); - - efa_av[i] = container_of(av_fid[i], struct efa_av, util_av.av_fid); - - err = fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL); - assert_int_equal(err, 0); - - err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL); - assert_int_equal(err, 0); - - efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid); - - fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0); - fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV); - - err = fi_enable(ep_fid[i]); - assert_int_equal(err, 0); - - err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len); - assert_int_equal(err, 0); - - cur = cur->next; - } - - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); - - /* Manually insert into implicit AV in first domain */ - ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_av_insert_one(efa_av[0], &raw_addr[0], &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); - ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - efa_ah = peer->conn->ah; - assert_int_equal(efa_ah->implicit_refcnt, 1); - assert_int_equal(efa_ah->explicit_refcnt, 0); - - if (explicit) { - err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL); - assert_int_equal(err, 1); - peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr); - } else { - 
ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - err = efa_av_insert_one(efa_av[0], &raw_addr[1], &fi_addr, 0, NULL, true, true); - peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); - ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); - } - - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); - - efa_ah = peer->conn->ah; - if (explicit) { - assert_int_equal(efa_ah->implicit_refcnt, 0); - assert_int_equal(efa_ah->explicit_refcnt, 1); - } else { - assert_int_equal(efa_ah->implicit_refcnt, 1); - assert_int_equal(efa_ah->explicit_refcnt, 0); - } - - if (explicit) { - err = fi_av_remove(av_fid[0], &fi_addr, 1, 0); - assert_int_equal(err, 0); - assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); - } - - for (int i = 0; i < 2; i++) { - efa_rdm_ep[i]->self_ah = NULL; - fi_close(&ep_fid[i]->fid); - fi_close(&cq_fid[i]->fid); - fi_close(&av_fid[i]->fid); - fi_close(&domain_fid[i]->fid); - fi_close(&fabric_fid[i]->fid); - } - fi_freeinfo(hints); - fi_freeinfo(info); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when an explicit AV entry is inserted. It - * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state) -{ - test_ah_lru_eviction_impl(true); -} - -/** - * @brief This test inserts one implicit AV entry and verifies that the - * implicitly created AH is evicted when another implicit AV entry is inserted. - * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs. 
- * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state) -{ - test_ah_lru_eviction_impl(false); + entry = efa_av_addr_to_entry(av, FI_ADDR_UNSPEC); + assert_null(entry); } diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 1972611fc01..672d4a863b9 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -1084,7 +1084,7 @@ static void test_efa_cq_read_prep(struct efa_resource *resource, will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_imm_data_return_mock, 0x1); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_qp_num_return_mock, base_ep->qp->qp_num); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_byte_len_return_mock, 4096); - will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_conn(base_ep->av, addr)->ah->ahn); + will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_slid_return_mock, efa_av_addr_to_entry(base_ep->av, addr)->ah->ahn); will_return_uint_maybe(efa_mock_efa_ibv_cq_wc_read_src_qp_return_mock, raw_addr.qpn); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index e532c0813a9..584b5296241 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -480,7 +480,7 @@ void test_efa_rdm_ep_rma_queue_before_handshake(struct efa_resource **state, int peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); peer->flags = EFA_RDM_PEER_REQ_SENT; /* Do not use shm in this unit test because we are testing efa rma path */ - peer->conn->shm_fi_addr = FI_ADDR_NOTAVAIL; + peer->av_entry->shm_fi_addr = FI_ADDR_NOTAVAIL; assert_false(efa_rdm_ep->homogeneous_peers); assert_true(dlist_empty(&efa_rdm_ep->txe_list)); diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 9f4875d4246..144e7f2d7f8 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c 
@@ -21,7 +21,7 @@ int g_ibv_ah_limit = 1024; int g_ibv_ah_cnt = 0; int g_self_ah_cnt = 1; struct ibv_ah g_dummy_ah; -struct efa_ah g_dummy_efa_ah = {0}; +struct efa_proto_ah g_dummy_proto_ah = {0}; void efa_ibv_ah_limit_cnt_reset() { @@ -74,40 +74,49 @@ int efa_mock_ibv_destroy_ah_dont_create_self_ah(struct ibv_ah *ibv_ah) } struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { return NULL; } struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { /* Intercept the self AH call in efa_ah_alloc and do not call * ibv_create_ah or modify the AH map etc */ if (g_ibv_ah_cnt < g_self_ah_cnt) { g_ibv_ah_cnt++; - g_dummy_efa_ah.ibv_ah = &g_dummy_ah; - g_dummy_efa_ah.ahn = -1; - memset(g_dummy_efa_ah.gid, 0, sizeof(g_dummy_efa_ah.gid)); - g_dummy_efa_ah.explicit_refcnt = 1; - g_dummy_efa_ah.implicit_refcnt = 0; - return &g_dummy_efa_ah; + g_dummy_proto_ah.ah.ibv_ah = &g_dummy_ah; + g_dummy_proto_ah.ah.ahn = -1; + memset(g_dummy_proto_ah.ah.gid, 0, sizeof(g_dummy_proto_ah.ah.gid)); + g_dummy_proto_ah.ah.refcnt = 1; + /* + * Reset protocol fields so efa_proto_ah_alloc sees a fresh AH + * regardless of prior test state. Without this reset, stale + * proto refcnts or a stale lru_list_entry from a freed domain + * would carry forward into the current test. 
+ */ + g_dummy_proto_ah.implicit_refcnt = 0; + g_dummy_proto_ah.explicit_refcnt = 0; + memset(&g_dummy_proto_ah.lru_list_entry, 0, + sizeof(g_dummy_proto_ah.lru_list_entry)); + dlist_init(&g_dummy_proto_ah.implicit_conn_list); + return &g_dummy_proto_ah.ah; } else { - return __real_efa_ah_alloc(domain, gid, insert_implicit_av); + return __real_efa_ah_alloc(domain, gid, alloc_size); } } void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain, - struct efa_ah *ah, - bool release_from_implicit_av) + struct efa_ah *ah) { /* Intercept the self AH destruct call in efa_ah_release and do not call * ibv_destroy_ah or modify the AH map etc */ if (g_ibv_ah_cnt <= g_self_ah_cnt) g_ibv_ah_cnt--; else - return __real_efa_ah_release(domain, ah, release_from_implicit_av); + return __real_efa_ah_release(domain, ah); } int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibv_ctx, @@ -516,16 +525,14 @@ int __wrap_efadv_query_device(struct ibv_context *ibv_ctx, struct efadv_device_a } struct efa_ah *__wrap_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av) + size_t alloc_size) { - return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, insert_implicit_av); + return g_efa_unit_test_mocks.efa_ah_alloc(domain, gid, alloc_size); } -void __wrap_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av) +void __wrap_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah) { - return g_efa_unit_test_mocks.efa_ah_release(domain, ah, - release_from_implicit_av); + return g_efa_unit_test_mocks.efa_ah_release(domain, ah); } struct ibv_cq_ex *efa_mock_create_cq_ex_return_null(struct ibv_context *context, struct ibv_cq_init_attr_ex *init_attr) diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index ae68e77935f..96a618886b2 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -35,20 +35,18 @@ int 
__real_efadv_query_device(struct ibv_context *ibvctx, struct efadv_device_at uint32_t inlen); struct efa_ah *__real_efa_ah_alloc(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); struct efa_ah *efa_mock_efa_ah_alloc_return_null(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); struct efa_ah *efa_mock_efa_ah_alloc_dont_create_self_ah(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); + size_t alloc_size); -void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); +void __real_efa_ah_release(struct efa_domain *domain, struct efa_ah *ah); void efa_mock_efa_ah_release_dont_create_self_ah(struct efa_domain *domain, - struct efa_ah *ah, - bool release_from_implicit_av); + struct efa_ah *ah); int efa_mock_efadv_query_device_return_mock(struct ibv_context *ibvctx, struct efadv_device_attr *attr, uint32_t inlen); @@ -166,9 +164,8 @@ struct efa_unit_test_mocks uint32_t inlen); struct efa_ah *(*efa_ah_alloc)(struct efa_domain *domain, const uint8_t *gid, - bool insert_implicit_av); - void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah, - bool release_from_implicit_av); + size_t alloc_size); + void (*efa_ah_release)(struct efa_domain *domain, struct efa_ah *ah); #if HAVE_EFADV_CQ_EX struct ibv_cq_ex *(*efadv_create_cq)(struct ibv_context *ibvctx, diff --git a/prov/efa/test/efa_unit_test_proto_av.c b/prov/efa/test/efa_unit_test_proto_av.c new file mode 100644 index 00000000000..a7a898e9e01 --- /dev/null +++ b/prov/efa/test/efa_unit_test_proto_av.c @@ -0,0 +1,1236 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + +#include "efa_unit_tests.h" +#include "efa_rdm_cq.h" +#include "efa_rdm_pke_req.h" +#include "efa_av.h" + +static void test_av_verify_av_hash_cnt(struct efa_av *av, + int explicit_cur_av_count, + int explicit_prv_av_count, + int implicit_cur_av_count, + int implicit_prv_av_count) +{ + struct efa_proto_av *proto_av = container_of(av, struct efa_proto_av, efa_av); + + assert_int_equal(HASH_CNT(hh, av->util_av.hash), + explicit_cur_av_count + explicit_prv_av_count); + assert_int_equal(HASH_CNT(hh, av->cur_reverse_av), + explicit_cur_av_count); + assert_int_equal(HASH_CNT(hh, av->prv_reverse_av), + explicit_prv_av_count); + + assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), + implicit_cur_av_count + implicit_prv_av_count); + assert_int_equal(HASH_CNT(hh, proto_av->cur_reverse_av_implicit), + implicit_cur_av_count); + assert_int_equal(HASH_CNT(hh, proto_av->prv_reverse_av_implicit), + implicit_prv_av_count); +} + +/** + * @brief This test removes a peer and inserts it again + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_reinsertion(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_peer *peer; + struct efa_ep_addr raw_addr, raw_addr_2; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_av *av; + struct efa_rdm_ep *efa_rdm_ep; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 174; + raw_addr.qkey = 0x1234; + + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + assert_int_equal(fi_addr, 0); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, fi_addr, 
&raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); + + err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + assert_int_equal(fi_addr, 0); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, fi_addr, &raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + err = fi_av_remove(resource->av, &fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Generate a peer with random QPN and QKEY and insert it into the implicit AV + * + * @param[in] resource struct efa_resource that is managed by the framework + */ +static struct efa_rdm_peer *test_av_get_peer_from_implicit_av(struct efa_resource *resource) +{ + struct efa_ep_addr raw_addr; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer; + fi_addr_t implicit_fi_addr, test_addr; + struct efa_av *av; + uint32_t ahn; + int err; + + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + raw_addr.qpn = rand(); + raw_addr.qkey = rand(); + ahn = efa_rdm_ep->self_ah->ahn; + + /* Manually insert
into implicit AV */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &implicit_fi_addr, 0, NULL, true, true); + assert_int_equal(err, 0); + + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, implicit_fi_addr); + + assert_int_equal(peer->av_entry->implicit_fi_addr, implicit_fi_addr); + assert_int_equal(peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + test_addr = efa_proto_av_reverse_lookup_implicit(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL); + assert_int_equal(test_addr, implicit_fi_addr); + + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + + return peer; +} + +/** + * @brief This test fakes a peer in the implicit AV and closes the AV with an + * implicit peer in it + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + test_av_get_peer_from_implicit_av(resource); +} + +/** + * @brief This test fakes a peer in the implicit AV and verifies that the peer + * is moved to the explicit AV when fi_av_insert is called + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_to_explicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr, raw_addr_2; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer; + fi_addr_t explicit_fi_addr, test_addr; + struct efa_av *av; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct 
efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Generate a peer with random QPN and QKEY and insert it into the implicit AV */ + peer = test_av_get_peer_from_implicit_av(resource); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + + /* Modify the peer and verify that the peer is moved as-is */ + peer->next_msg_id = 355; + peer->flags |= EFA_RDM_PEER_IN_BACKOFF; + + /* Insert explicitly */ + raw_addr.qpn = efa_proto_av_entry_ep_addr(peer->av_entry)->qpn; + raw_addr.qkey = efa_proto_av_entry_ep_addr(peer->av_entry)->qkey; + err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + err = fi_av_lookup(resource->av, explicit_fi_addr, &raw_addr_2, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(efa_is_same_addr(&raw_addr, &raw_addr_2), 1); + + peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); + assert_int_equal(peer->av_entry->fi_addr, explicit_fi_addr); + assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_is_same_addr(&raw_addr, efa_proto_av_entry_ep_addr(peer->av_entry)), 1); + + ahn = efa_rdm_ep->self_ah->ahn; + test_addr = efa_proto_av_reverse_lookup(container_of(av, struct efa_proto_av, efa_av), ahn, raw_addr.qpn, NULL); + assert_int_equal(test_addr, explicit_fi_addr); + + /* Verify the manually set peer properties above */ + assert_int_equal(peer->next_msg_id, 355); + assert_true(peer->flags & EFA_RDM_PEER_IN_BACKOFF); + + /* Unset the flag to make fi_av_remove easier */ + peer->flags &= ~EFA_RDM_PEER_IN_BACKOFF; + + err = fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); + assert_int_equal(err, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +static void test_av_implicit_av_verify_lru_list_first_last_elements( + struct efa_av *av, struct efa_proto_av_entry *first_conn_expected, + struct efa_proto_av_entry *last_conn_expected) +{ + struct dlist_entry *first_entry, *last_entry; + 
struct efa_proto_av_entry *first_conn_actual, *last_conn_actual; + + first_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.next; + last_entry = container_of(av, struct efa_proto_av, efa_av)->implicit_av_lru_list.prev; + + first_conn_actual = container_of(first_entry, struct efa_proto_av_entry, + implicit_av_lru_entry); + last_conn_actual = container_of(last_entry, struct efa_proto_av_entry, + implicit_av_lru_entry); + + assert_ptr_equal(first_conn_actual, first_conn_expected); + assert_ptr_equal(last_conn_actual, last_conn_expected); +} + +/** + * @brief This test inserts three implicit peers and verifies that the last + * inserted and/or accessed peer is at the tail of the LRU list + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_av_lru_insertion(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer0, *peer1, *peer2; + struct efa_av *av; + fi_addr_t implicit_fi_addr; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Manually insert first address into implicit AV */ + peer0 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Expected LRU list: HEAD->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry); + + /* Manually insert second address into implicit AV */ + peer1 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Expected LRU list: HEAD->peer0->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* Manually insert third address into implicit AV */ + peer2 = 
test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer1->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); + + + /* Access peer0 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 0); + + /* Expected LRU list: HEAD->peer1->peer2->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry); + + /* Access peer2 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer2->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 2); + + /* Expected LRU list: HEAD->peer1->peer0->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer2->av_entry); + + + /* Access peer1 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer1->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 1); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer2->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* 
Access peer2 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer2->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 2); + test_av_verify_av_hash_cnt(av, 0, 0, 3, 0); + + /* Expected LRU list: HEAD->peer0->peer1->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); +} + +/** + * @brief This test sets the implicit AV size to 2 and inserts four implicit + * peers. It verifies that the least recently used peer is evicted. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_av_lru_eviction(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer0, *peer1, *peer2, *peer3; + struct efa_ep_addr peer1_ep_addr, peer2_ep_addr; + struct efa_ep_addr_hashable *efa_ep_addr_hashable; + struct efa_av *av; + fi_addr_t implicit_fi_addr; + uint32_t ahn; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Modify implicit AV size */ + container_of(av, struct efa_proto_av, efa_av)->implicit_av_size = 2; + + /* Manually insert first address into implicit AV */ + peer0 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Expected LRU list: HEAD->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer0->av_entry); + + /* Manually insert second address into implicit AV */ + peer1 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 
0, 0, 2, 0); + + /* + * Snapshot peer1/peer2 ep_addr before they are evicted. After + * eviction the enclosing peer_map_entry is returned to the bufpool + * and peer1->av_entry / peer2->av_entry become stale memory. + */ + memcpy(&peer1_ep_addr, efa_proto_av_entry_ep_addr(peer1->av_entry), + sizeof(struct efa_ep_addr)); + + /* Expected LRU list: HEAD->peer0->peer1 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer1->av_entry); + + /* Access peer0 through the CQ read path */ + ahn = efa_rdm_ep->self_ah->ahn; + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + implicit_fi_addr = efa_proto_av_reverse_lookup_implicit( + container_of(av, struct efa_proto_av, efa_av), ahn, + efa_proto_av_entry_ep_addr(peer0->av_entry)->qpn, NULL); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(implicit_fi_addr, 0); + + /* Expected LRU list: HEAD->peer1->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer1->av_entry, peer0->av_entry); + + /* Manually insert third address into implicit AV */ + peer2 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + /* Snapshot peer2 ep_addr before it too gets evicted later. 
*/ + memcpy(&peer2_ep_addr, efa_proto_av_entry_ep_addr(peer2->av_entry), + sizeof(struct efa_ep_addr)); + + /* Expected LRU list: HEAD->peer0->peer2 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer2->av_entry); + + /* Verify that peer1 is evicted and added to the evicted hashmap */ + assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 1); + HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer1_ep_addr, + sizeof(struct efa_ep_addr), efa_ep_addr_hashable); + assert_non_null(efa_ep_addr_hashable); + assert_int_equal(efa_is_same_addr(&peer1_ep_addr, + &efa_ep_addr_hashable->addr), + 1); + + /* Access peer0 through repeated AV insertion path */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), efa_proto_av_entry_ep_addr(peer0->av_entry), &implicit_fi_addr, 0, NULL, true, true); + ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock); + assert_int_equal(err, 0); + assert_int_equal(implicit_fi_addr, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Expected LRU list: HEAD->peer2->peer0 */ + test_av_implicit_av_verify_lru_list_first_last_elements(av, peer2->av_entry, peer0->av_entry); + + /* Manually insert fourth address into implicit AV */ + peer3 = test_av_get_peer_from_implicit_av(resource); + test_av_verify_av_hash_cnt(av, 0, 0, 2, 0); + + /* Verify that peer2 is evicted and added to the evicted hashmap */ + assert_int_equal(HASH_CNT(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset), 2); + HASH_FIND(hh, container_of(av, struct efa_proto_av, efa_av)->evicted_peers_hashset, &peer2_ep_addr, + sizeof(struct efa_ep_addr), efa_ep_addr_hashable); + assert_non_null(efa_ep_addr_hashable); + assert_int_equal(efa_is_same_addr(&peer2_ep_addr, + &efa_ep_addr_hashable->addr), + 1); + + /* Expected LRU list: HEAD->peer0->peer3 */ + 
test_av_implicit_av_verify_lru_list_first_last_elements(av, peer0->av_entry, peer3->av_entry); +} + +/** + * @brief This test tests the implicit_refcnt and explicit_refcnt fields of AH + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_refcnt(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + fi_addr_t fi_addr; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + struct efa_rdm_ep *efa_rdm_ep; + struct efa_domain *efa_domain; + struct efa_rdm_peer *peer; + struct efa_av *av; + struct efa_ah *efa_ah = NULL; + int err; + + int allowed_ahs = 1; + + g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah; + g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah; + g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah; + + g_self_ah_cnt = 1; + g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; + assert_int_equal(g_ibv_ah_cnt, 0); + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + + /* Self AH creation will update g_ibv_ah_cnt but will not actually create AH */ + assert_int_equal(g_ibv_ah_cnt, 1); + + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0); + + /* Manually insert into implicit AV */ + ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(av, struct efa_proto_av, efa_av), &raw_addr, &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep, fi_addr); + 
ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+	/*
+	 * Check the implicit insert before touching the peer, so a failed
+	 * insert is reported as a test failure rather than a NULL dereference.
+	 */
+	assert_int_equal(err, 0);
+	assert_non_null(peer);
+
+	efa_ah = peer->av_entry->ah;
+
+	assert_int_equal(g_ibv_ah_cnt, 2);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1);
+
+	/* Move implicit AV entry to explicit AV entry */
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(err, 1);
+
+	assert_int_equal(g_ibv_ah_cnt, 2);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1);
+	assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0);
+
+	err = fi_av_remove(resource->av, &fi_addr, 1, 0);
+	assert_int_equal(err, 0);
+
+	assert_int_equal(HASH_CNT(hh, efa_domain->ah_map), 0);
+
+	/* Only the self AH should be left */
+	assert_int_equal(g_ibv_ah_cnt, 1);
+}
+
+/**
+ * @brief This test inserts one implicit AV entry and verifies that the
+ * implicitly created AH is evicted when an explicit AV entry is inserted. It
+ * requires at least 2 NICs because ibv_create_ah only works for valid GIDs.
+ *
+ * @param[in] explicit if true, the second address is inserted with
+ * fi_av_insert (explicit AV); otherwise it is inserted into the implicit AV
+ */
+void test_ah_lru_eviction_impl(bool explicit)
+{
+	fi_addr_t fi_addr;
+	struct efa_ep_addr raw_addr[2] = {0};
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	struct fid_fabric *fabric_fid[2];
+	struct fid_domain *domain_fid[2];
+	struct fid_ep *ep_fid[2];
+	struct fid_cq *cq_fid[2];
+	struct fid_av *av_fid[2];
+	struct efa_domain *efa_domain[2];
+	struct efa_rdm_ep *efa_rdm_ep[2];
+	struct efa_rdm_peer *peer;
+	struct efa_av *efa_av[2];
+	struct efa_ah *efa_ah = NULL;
+	int err;
+	struct fi_av_attr av_attr = {0};
+	struct fi_cq_attr cq_attr = {
+		.format = FI_CQ_FORMAT_DATA
+	};
+	/* info starts as NULL so it is safe to walk and free if fi_getinfo fails */
+	struct fi_info *hints, *info = NULL, *cur;
+	int num_nic = 0;
+
+	int allowed_ahs = 1;
+
+	g_efa_unit_test_mocks.ibv_create_ah = &efa_mock_ibv_create_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.ibv_destroy_ah = &efa_mock_ibv_destroy_ah_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_alloc = &efa_mock_efa_ah_alloc_dont_create_self_ah;
+	g_efa_unit_test_mocks.efa_ah_release = &efa_mock_efa_ah_release_dont_create_self_ah;
+
+	/* Count the available NICs: one fi_info entry is returned per NIC */
+	hints = efa_unit_test_alloc_hints(FI_EP_RDM, EFA_FABRIC_NAME);
+	err = fi_getinfo(FI_VERSION(2, 0), NULL, NULL, 0, hints, &info);
+	for (cur = info; cur; cur = cur->next) {
+		num_nic++;
+	}
+
+	/* Skip the test when the query failed or fewer than 2 NICs exist */
+	if (err || num_nic < 2) {
+		fi_freeinfo(info);
+		fi_freeinfo(hints);
+		return;
+	}
+
+	g_self_ah_cnt = 2;
+	g_ibv_ah_limit = g_self_ah_cnt + allowed_ahs; /* 2 self AH */
+	assert_int_equal(g_ibv_ah_cnt, 0);
+
+	/* Open one fabric/domain/AV/CQ/endpoint stack on each of the first two NICs */
+	cur = info;
+	for (int i = 0; i < 2; i++) {
+		err = fi_fabric(cur->fabric_attr, &fabric_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		err = fi_domain(fabric_fid[i], cur, &domain_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		efa_domain[i] = container_of(domain_fid[i], struct efa_domain, util_domain.domain_fid);
+
+		err = fi_av_open(domain_fid[i], &av_attr, &av_fid[i], NULL);
+		assert_int_equal(err, 0);
+
+		efa_av[i] = container_of(av_fid[i], struct efa_av, util_av.av_fid);
+
+		err =
fi_cq_open(domain_fid[i], &cq_attr, &cq_fid[i], NULL); + assert_int_equal(err, 0); + + err = fi_endpoint(domain_fid[i], cur, &ep_fid[i], NULL); + assert_int_equal(err, 0); + + efa_rdm_ep[i] = container_of(ep_fid[i], struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + fi_ep_bind(ep_fid[i], &av_fid[i]->fid, 0); + fi_ep_bind(ep_fid[i], &cq_fid[i]->fid, FI_SEND | FI_RECV); + + err = fi_enable(ep_fid[i]); + assert_int_equal(err, 0); + + err = fi_getname(&ep_fid[i]->fid, &raw_addr[i], &raw_addr_len); + assert_int_equal(err, 0); + + cur = cur->next; + } + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); + + /* Manually insert into implicit AV in first domain */ + ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[0], &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); + ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); + efa_ah = peer->av_entry->ah; + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + + if (explicit) { + err = fi_av_insert(av_fid[0], &raw_addr[1], 1, &fi_addr, 0, NULL); + assert_int_equal(err, 1); + peer = efa_rdm_ep_get_peer(efa_rdm_ep[0], fi_addr); + } else { + ofi_genlock_lock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + err = efa_proto_av_insert_one(container_of(efa_av[0], struct efa_proto_av, efa_av), &raw_addr[1], &fi_addr, 0, NULL, true, true); + peer = efa_rdm_ep_get_peer_implicit(efa_rdm_ep[0], fi_addr); + ofi_genlock_unlock(&efa_rdm_ep[0]->base_ep.domain->srx_lock); + } + + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 1); + + efa_ah = peer->av_entry->ah; + if (explicit) { + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 0); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 1); + } else { + 
assert_int_equal(efa_proto_ah_from_ah(efa_ah)->implicit_refcnt, 1); + assert_int_equal(efa_proto_ah_from_ah(efa_ah)->explicit_refcnt, 0); + } + + if (explicit) { + err = fi_av_remove(av_fid[0], &fi_addr, 1, 0); + assert_int_equal(err, 0); + assert_int_equal(HASH_CNT(hh, efa_domain[0]->ah_map), 0); + } + + for (int i = 0; i < 2; i++) { + efa_rdm_ep[i]->self_ah = NULL; + fi_close(&ep_fid[i]->fid); + fi_close(&cq_fid[i]->fid); + fi_close(&av_fid[i]->fid); + fi_close(&domain_fid[i]->fid); + fi_close(&fabric_fid[i]->fid); + } + fi_freeinfo(hints); + fi_freeinfo(info); +} + +/** + * @brief This test inserts one implicit AV entry and verifies that the + * implicitly created AH is evicted when an explicit AV entry is inserted. It + * requires at least 2 NICs because ibv_create_ah only works for valid GIDs. + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_lru_eviction_explicit_av_insert(struct efa_resource **state) +{ + test_ah_lru_eviction_impl(true); +} + +/** + * @brief This test inserts one implicit AV entry and verifies that the + * implicitly created AH is evicted when another implicit AV entry is inserted. + * It requires at least 2 NICs because ibv_create_ah only works for valid GIDs. 
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_ah_lru_eviction_implicit_av_insert(struct efa_resource **state) +{ + test_ah_lru_eviction_impl(false); +} + +/** + * @brief Test proto AV explicit reverse lookup returns correct fi_addr + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_reverse_lookup_explicit(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr, lookup_addr; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_rdm_ep *efa_rdm_ep; + uint32_t ahn; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + ahn = efa_rdm_ep->self_ah->ahn; + + /* Reverse lookup on empty AV should return NOTAVAIL */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 42; + raw_addr.qkey = 0x5678; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Reverse lookup should find the entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, fi_addr); + + /* Lookup with wrong QPN should return NOTAVAIL */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 99, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); + + /* After remove, reverse lookup should return FI_ADDR_NOTAVAIL */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 
0, 0, 0); + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 42, NULL); + assert_int_equal(lookup_addr, FI_ADDR_NOTAVAIL); +} + +/** + * @brief Test that proto AV addr_to_entry returns NULL after entry is removed + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_addr_to_entry_after_remove(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + + /* addr_to_entry on empty AV should return NULL */ + entry = efa_proto_av_addr_to_entry(proto_av, 0); + assert_null(entry); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 99; + raw_addr.qkey = 0x9999; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + + /* Entry should be found with correct fields */ + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_non_null(entry); + assert_non_null(entry->ah); + assert_int_equal(entry->fi_addr, fi_addr); + assert_int_equal(entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 99); + assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x9999); + + /* Remove and verify entry is no longer valid */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_null(entry); +} + +/** + * @brief Test proto AV insert/remove with peer creation via get_peer + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void 
test_av_proto_insert_remove_with_peer(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *peer, *peer2; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); + raw_addr.qpn = 55; + raw_addr.qkey = 0x5555; + + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Create peer via get_peer */ + peer = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_non_null(peer); + assert_non_null(peer->av_entry); + assert_int_equal(peer->av_entry->fi_addr, fi_addr); + assert_int_equal(peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qpn, 55); + assert_int_equal(efa_proto_av_entry_ep_addr(peer->av_entry)->qkey, 0x5555); + assert_ptr_equal(peer->ep, efa_rdm_ep); + + /* Peer map lookup should find the same peer */ + peer2 = efa_rdm_ep_get_peer(efa_rdm_ep, fi_addr); + assert_ptr_equal(peer2, peer); + + /* Verify peer map on the entry itself */ + entry = efa_proto_av_addr_to_entry(proto_av, fi_addr); + assert_non_null(entry); + assert_ptr_equal(efa_proto_av_entry_ep_peer_map_lookup(entry, efa_rdm_ep), peer); + + /* Remove — peer is destroyed during av_remove */ + fi_av_remove(resource->av, &fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Test proto AV implicit insert followed by 
explicit insert of same addr + * verifies the peer's av_entry pointer is updated to the explicit entry + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_implicit_to_explicit_peer_updated(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_peer *implicit_peer, *explicit_peer; + struct efa_av *av; + struct efa_proto_av *proto_av; + fi_addr_t implicit_fi_addr, explicit_fi_addr; + struct efa_ah *ah_before; + int err; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Insert implicit peer */ + implicit_peer = test_av_get_peer_from_implicit_av(resource); + assert_non_null(implicit_peer); + implicit_fi_addr = implicit_peer->av_entry->implicit_fi_addr; + assert_int_equal(implicit_peer->av_entry->fi_addr, FI_ADDR_NOTAVAIL); + assert_int_not_equal(implicit_fi_addr, FI_ADDR_NOTAVAIL); + test_av_verify_av_hash_cnt(av, 0, 0, 1, 0); + + /* Remember the AH — it should be reused after migration */ + ah_before = implicit_peer->av_entry->ah; + assert_non_null(ah_before); + + /* Now insert explicitly with the same address */ + struct efa_ep_addr raw_addr; + memcpy(&raw_addr, implicit_peer->av_entry->ep_addr, EFA_EP_ADDR_LEN); + + err = fi_av_insert(resource->av, &raw_addr, 1, &explicit_fi_addr, 0, NULL); + assert_int_equal(err, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Implicit entry should be gone */ + assert_null(efa_proto_av_addr_to_entry_implicit(proto_av, implicit_fi_addr)); + + /* Get peer via explicit addr — should be the same peer with updated av_entry */ + explicit_peer = efa_rdm_ep_get_peer(efa_rdm_ep, explicit_fi_addr); + assert_non_null(explicit_peer); + assert_ptr_equal(explicit_peer, implicit_peer); 
+ assert_int_equal(explicit_peer->av_entry->fi_addr, explicit_fi_addr); + assert_int_equal(explicit_peer->av_entry->implicit_fi_addr, FI_ADDR_NOTAVAIL); + + /* AH should be the same object (reused, not reallocated) */ + assert_ptr_equal(explicit_peer->av_entry->ah, ah_before); + + fi_av_remove(resource->av, &explicit_fi_addr, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Test proto AV batch insert of multiple addresses in one fi_av_insert call + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_batch_insert(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addrs[3] = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addrs[3]; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_proto_av_entry *entry; + int num_addr, i; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addrs[0], &raw_addr_len), 0); + memcpy(&raw_addrs[1], &raw_addrs[0], sizeof(struct efa_ep_addr)); + memcpy(&raw_addrs[2], &raw_addrs[0], sizeof(struct efa_ep_addr)); + raw_addrs[0].qpn = 10; raw_addrs[0].qkey = 0x1000; + raw_addrs[1].qpn = 11; raw_addrs[1].qkey = 0x1001; + raw_addrs[2].qpn = 12; raw_addrs[2].qkey = 0x1002; + + num_addr = fi_av_insert(resource->av, raw_addrs, 3, fi_addrs, 0, NULL); + assert_int_equal(num_addr, 3); + + /* All three should have distinct fi_addrs */ + assert_int_not_equal(fi_addrs[0], fi_addrs[1]); + assert_int_not_equal(fi_addrs[1], fi_addrs[2]); + assert_int_not_equal(fi_addrs[0], fi_addrs[2]); + + test_av_verify_av_hash_cnt(av, 3, 0, 0, 0); + + /* Verify each entry is accessible with correct QPN */ + for (i = 0; i < 3; i++) { + entry = efa_proto_av_addr_to_entry(proto_av, fi_addrs[i]); + 
assert_non_null(entry);
+		assert_non_null(entry->ah);
+		assert_int_equal(entry->fi_addr, fi_addrs[i]);
+		assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qpn, 10 + i);
+		assert_int_equal(efa_proto_av_entry_ep_addr(entry)->qkey, 0x1000 + i);
+	}
+
+	/* Remove one at a time and verify counts */
+	fi_av_remove(resource->av, &fi_addrs[0], 1, 0);
+	test_av_verify_av_hash_cnt(av, 2, 0, 0, 0);
+	assert_null(efa_proto_av_addr_to_entry(proto_av, fi_addrs[0]));
+
+	fi_av_remove(resource->av, &fi_addrs[1], 1, 0);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+
+	fi_av_remove(resource->av, &fi_addrs[2], 1, 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+}
+
+/**
+ * @brief Test proto AV remove of non-existent address returns error
+ *
+ * Nothing is inserted into the AV; both removals below are expected to
+ * return a non-zero value.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_proto_remove_nonexistent(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	fi_addr_t bad_addr = 9999;
+	fi_addr_t notavail = FI_ADDR_NOTAVAIL;
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* Remove with out-of-range fi_addr */
+	err = fi_av_remove(resource->av, &bad_addr, 1, 0);
+	assert_int_not_equal(err, 0);
+
+	/* Remove with FI_ADDR_NOTAVAIL */
+	err = fi_av_remove(resource->av, &notavail, 1, 0);
+	assert_int_not_equal(err, 0);
+}
+
+/**
+ * @brief Test proto AV prv_reverse_av path: insert two addresses with the same
+ * GID and QPN but different QKEYs. The second insert takes over cur_reverse_av
+ * and the first (old) entry should be in prv_reverse_av.
+ * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_av_proto_prv_reverse_av(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_ep_addr raw_addr1 = {0}, raw_addr2 = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t fi_addr1, fi_addr2; + struct efa_av *av; + struct efa_proto_av *proto_av; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_proto_av_entry *entry1, *entry2; + fi_addr_t lookup_addr; + uint32_t ahn; + int num_addr; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + av = container_of(resource->av, struct efa_av, util_av.av_fid); + proto_av = container_of(av, struct efa_proto_av, efa_av); + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + ahn = efa_rdm_ep->self_ah->ahn; + + assert_int_equal(fi_getname(&resource->ep->fid, &raw_addr1, &raw_addr_len), 0); + memcpy(&raw_addr2, &raw_addr1, sizeof(struct efa_ep_addr)); + + /* Insert first address with qpn=20, qkey=0xAAAA */ + raw_addr1.qpn = 20; + raw_addr1.qkey = 0xAAAA; + num_addr = fi_av_insert(resource->av, &raw_addr1, 1, &fi_addr1, 0, NULL); + assert_int_equal(num_addr, 1); + test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); + + /* Verify first entry */ + entry1 = efa_proto_av_addr_to_entry(proto_av, fi_addr1); + assert_non_null(entry1); + assert_int_equal(efa_proto_av_entry_ep_addr(entry1)->qkey, 0xAAAA); + + /* Reverse lookup should find first entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL); + assert_int_equal(lookup_addr, fi_addr1); + + /* Insert second address with same qpn=20 but different qkey=0xBBBB. 
+ * This simulates QPN reuse — the first entry moves to prv_reverse_av */ + raw_addr2.qpn = 20; + raw_addr2.qkey = 0xBBBB; + num_addr = fi_av_insert(resource->av, &raw_addr2, 1, &fi_addr2, 0, NULL); + assert_int_equal(num_addr, 1); + assert_int_not_equal(fi_addr1, fi_addr2); + + /* cur_reverse_av has 1 entry (the latest), prv_reverse_av has 1 (the old) */ + test_av_verify_av_hash_cnt(av, 1, 1, 0, 0); + + /* Verify second entry */ + entry2 = efa_proto_av_addr_to_entry(proto_av, fi_addr2); + assert_non_null(entry2); + assert_int_equal(efa_proto_av_entry_ep_addr(entry2)->qkey, 0xBBBB); + + /* Both entries should share the same AH (same GID) */ + assert_ptr_equal(entry1->ah, entry2->ah); + + /* Reverse lookup without connid should return the current (latest) entry */ + lookup_addr = efa_proto_av_reverse_lookup(proto_av, ahn, 20, NULL); + assert_int_equal(lookup_addr, fi_addr2); + + /* Remove in reverse order: current entry first, then previous */ + fi_av_remove(resource->av, &fi_addr2, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 1, 0, 0); + + fi_av_remove(resource->av, &fi_addr1, 1, 0); + test_av_verify_av_hash_cnt(av, 0, 0, 0, 0); +} + +/** + * @brief Insert two peers that collide on (AHN, QPN) but differ in QKEY, then + * remove the first-inserted peer before the second. This reproduces the bug + * in efa_av_reverse_av_remove() where the code blindly deletes the + * cur_reverse_av entry matching (ahn, qpn) even though that entry belongs to + * a different (newer) conn. Removing the surviving second peer afterwards + * then hits a NULL prv_reverse_av_entry and SEGVs. 
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_reverse_av_remove_qpn_collision(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr raw_addr; /* filled by fi_getname(); qpn/qkey are overwritten per peer */
+	size_t raw_addr_len = sizeof(struct efa_ep_addr);
+	fi_addr_t fi_addr1, fi_addr2;
+	struct efa_av *av;
+	struct efa_proto_av *proto_av;
+	struct efa_rdm_ep *efa_rdm_ep;
+	uint32_t ahn;
+	int err; /* return value of each fi_* call; for fi_av_insert() it is compared against 1 */
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
+	assert_int_equal(err, 0);
+
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
+				  base_ep.util_ep.ep_fid);
+	ahn = efa_rdm_ep->self_ah->ahn; /* AH number of the EP's own address; used as reverse-lookup key */
+
+	/* Insert peer1: same GID as self, qpn=100, qkey=0xAAAA */
+	raw_addr.qpn = 100;
+	raw_addr.qkey = 0xAAAA;
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr1, 0, NULL);
+	assert_int_equal(err, 1);
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0); /* NOTE(review): args 2-3 look like cur/prv reverse-AV counts; confirm against the helper */
+	/* cur_reverse_av (ahn, 100) -> entry1 (fi_addr1) */
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr1);
+
+	/* Insert peer2: same GID and qpn, different qkey. This pushes peer1's
+	 * reverse-AV entry from cur_reverse_av into prv_reverse_av. */
+	raw_addr.qpn = 100;
+	raw_addr.qkey = 0xBBBB;
+	err = fi_av_insert(resource->av, &raw_addr, 1, &fi_addr2, 0, NULL);
+	assert_int_equal(err, 1);
+	assert_int_not_equal(fi_addr1, fi_addr2);
+	test_av_verify_av_hash_cnt(av, 1, 1, 0, 0);
+	/* cur_reverse_av (ahn, 100) now points to entry2 (fi_addr2); peer1 is
+	 * in prv_reverse_av keyed by its own qkey. */
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr2);
+
+	/* Remove peer1 first. Without the fix this would incorrectly delete
+	 * peer2's cur_reverse_av entry and leave peer1's prv entry orphaned. */
+	err = fi_av_remove(resource->av, &fi_addr1, 1, 0);
+	assert_int_equal(err, 0);
+	/* peer1's prv entry is gone; peer2's cur entry must still be intact. */
+	test_av_verify_av_hash_cnt(av, 1, 0, 0, 0);
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 fi_addr2);
+
+	/* Remove peer2. Without the fix this hits a NULL prv_reverse_av_entry
+	 * in efa_av_reverse_av_remove() -> SEGV / assertion failure. */
+	err = fi_av_remove(resource->av, &fi_addr2, 1, 0);
+	assert_int_equal(err, 0);
+	test_av_verify_av_hash_cnt(av, 0, 0, 0, 0);
+	assert_int_equal(efa_proto_av_reverse_lookup(proto_av, ahn, 100, NULL),
+			 FI_ADDR_NOTAVAIL); /* nothing is left for (ahn, 100) after both removals */
+}
+
+/**
+ * @brief Inserting an all-zero GID into the protocol AV must be rejected.
+ *
+ * efa_av_is_valid_address() returns 0 for all-zero GIDs. fi_av_insert
+ * should skip the bad address and return 0 (no address inserted), and
+ * the output fi_addr should be FI_ADDR_NOTAVAIL.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_proto_insert_invalid_address(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_ep_addr zero_addr = {0};
+	fi_addr_t fi_addr = 0; /* initial value; the insert is expected to replace it with FI_ADDR_NOTAVAIL */
+	int err; /* fi_av_insert() return value: number of addresses inserted */
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	zero_addr.qpn = 5;
+	zero_addr.qkey = 0x1234;
+	/* zero_addr.raw is left all-zero */
+
+	err = fi_av_insert(resource->av, &zero_addr, 1, &fi_addr, 0, NULL);
+	assert_int_equal(err, 0);
+	assert_int_equal(fi_addr, FI_ADDR_NOTAVAIL);
+}
+
+/**
+ * @brief With implicit_av_size set to 0 (unbounded mode), the implicit AV
+ * never evicts entries.
+ *
+ * Insert several implicit peers and verify all remain in the LRU list and
+ * util_av, and evicted_peers_hashset stays empty.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_unbounded(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_proto_av *proto_av;
+	struct efa_av *av;
+	const int num_peers = 10; /* number of implicit peers requested below */
+	int i;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+
+	/* Disable the eviction limit: a size of 0 selects unbounded mode */
+	proto_av->implicit_av_size = 0;
+
+	for (i = 0; i < num_peers; i++) /* NOTE(review): assumes each call adds one distinct implicit peer; helper is not visible here */
+		test_av_get_peer_from_implicit_av(resource);
+
+	/* All num_peers entries should still be in the implicit util_av hash */
+	assert_int_equal(HASH_CNT(hh, proto_av->util_av_implicit.hash), num_peers);
+	/* No peer should have been evicted */
+	assert_int_equal(HASH_CNT(hh, proto_av->evicted_peers_hashset), 0);
+}
+
+/**
+ * @brief efa_proto_av_open rejects attr->name and attr->flags (both unsupported)
+ *
+ * Ensures the early-return error paths in efa_proto_av_open are exercised.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_proto_open_unsupported_attrs(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct fi_av_attr av_attr = {0};
+	struct fid_av *av = NULL; /* expected to stay NULL: both opens below must fail */
+	int err;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+
+	/* attr->name is not supported: the open must fail with -FI_ENOSYS */
+	av_attr.name = "foo";
+	err = fi_av_open(resource->domain, &av_attr, &av, NULL);
+	assert_int_equal(err, -FI_ENOSYS);
+	assert_null(av);
+	av_attr.name = NULL; /* clear name so the next open is rejected on flags alone */
+
+	/* attr->flags is not supported: the open must fail with -FI_ENOSYS */
+	av_attr.flags = 1;
+	err = fi_av_open(resource->domain, &av_attr, &av, NULL);
+	assert_int_equal(err, -FI_ENOSYS);
+	assert_null(av);
+}
+
+/**
+ * @brief efa_proto_av_implicit_av_lru_entry_move on a single-element list
+ *
+ * Insert exactly one implicit peer; the LRU list has exactly one node.
+ * Call efa_proto_av_implicit_av_lru_entry_move on it — this exercises the
+ * dlist_entry_in_list assertion on the smallest non-empty list.
+ *
+ * @param[in] state struct efa_resource that is managed by the framework
+ */
+void test_av_implicit_av_lru_move_single(struct efa_resource **state)
+{
+	struct efa_resource *resource = *state;
+	struct efa_rdm_peer *peer;
+	struct efa_proto_av *proto_av;
+	struct efa_av *av;
+	struct efa_rdm_ep *efa_rdm_ep;
+
+	efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME);
+	av = container_of(resource->av, struct efa_av, util_av.av_fid);
+	proto_av = container_of(av, struct efa_proto_av, efa_av);
+	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep,
+				  base_ep.util_ep.ep_fid);
+
+	peer = test_av_get_peer_from_implicit_av(resource); /* the only implicit peer this test creates */
+	assert_non_null(peer);
+
+	ofi_genlock_lock(&efa_rdm_ep->base_ep.domain->srx_lock); /* the move runs under the domain's srx_lock */
+	efa_proto_av_implicit_av_lru_entry_move(proto_av, peer->av_entry);
+	ofi_genlock_unlock(&efa_rdm_ep->base_ep.domain->srx_lock);
+
+	/* Still exactly one entry in the LRU list: it is passed as both first and last */
+	test_av_implicit_av_verify_lru_list_first_last_elements(
+		av, peer->av_entry, peer->av_entry);
+}
diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index 9a54e522bad..01239822b6b 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -84,7 +84,7 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state) struct efa_rdm_pke *pke; struct efa_ep_addr raw_addr = {0}; size_t raw_addr_len = sizeof(raw_addr); - struct efa_conn conn = {0}; + struct efa_proto_av_entry fake_entry = {0}; struct efa_rdm_peer peer; struct efa_unit_test_eager_rtm_pkt_attr pke_attr = {.msg_id = 0, .connid = 0x1234}; @@ -113,8 +113,8 @@ void test_efa_srx_unexp_pkt(struct efa_resource **state) fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len), 0); raw_addr.qpn = 0; raw_addr.qkey = 0x1234; - conn.ep_addr = &raw_addr; - efa_rdm_peer_construct(&peer, efa_rdm_ep, &conn); +
memcpy(fake_entry.ep_addr, &raw_addr, EFA_EP_ADDR_LEN); + efa_rdm_peer_construct(&peer, efa_rdm_ep, &fake_entry); pke->peer = &peer; efa_unit_test_eager_msgrtm_pkt_construct(pke, &pke_attr);
diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c
index 6f6f7771361..49fd5672326 100644
--- a/prov/efa/test/efa_unit_tests.c
+++ b/prov/efa/test/efa_unit_tests.c
@@ -147,6 +147,11 @@ int main(void)
 		cmocka_unit_test_setup_teardown(test_efa_ah_cnt_multi_av_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_multiple_ep_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_insert_remove_lookup_efa_direct, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_base_addr_to_entry_invalid, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		/* end efa_unit_test_av.c */
+
+		/* begin efa_unit_test_proto_av.c */
 		cmocka_unit_test_setup_teardown(test_av_reinsertion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_reverse_av_remove_qpn_collision, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_av_implicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
@@ -156,7 +161,18 @@ int main(void)
 		cmocka_unit_test_setup_teardown(test_ah_refcnt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_ah_lru_eviction_explicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
 		cmocka_unit_test_setup_teardown(test_ah_lru_eviction_implicit_av_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
-		/* end efa_unit_test_av.c */
+		cmocka_unit_test_setup_teardown(test_av_proto_reverse_lookup_explicit, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_addr_to_entry_after_remove, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_insert_remove_with_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_to_explicit_peer_updated, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_batch_insert, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_remove_nonexistent, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_prv_reverse_av, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_insert_invalid_address, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_av_unbounded, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_proto_open_unsupported_attrs, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		cmocka_unit_test_setup_teardown(test_av_implicit_av_lru_move_single, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
+		/* end efa_unit_test_proto_av.c */
 
 		/* begin efa_unit_test_ep.c */
 		cmocka_unit_test_setup_teardown(test_efa_device_construct_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h
index 1c9b021f051..06f0405f911 100644
--- a/prov/efa/test/efa_unit_tests.h
+++ b/prov/efa/test/efa_unit_tests.h
@@ -117,6 +117,11 @@ void test_efa_ah_cnt_multi_av_efa();
 void test_efa_ah_cnt_multi_av_efa_direct();
 void test_av_multiple_ep_efa();
 void test_av_multiple_ep_efa_direct();
+void test_av_insert_remove_lookup_efa_direct();
+void test_av_base_addr_to_entry_invalid();
+/* end efa_unit_test_av.c */
+
+/* begin efa_unit_test_proto_av.c */
 void test_av_reinsertion();
 void test_av_reverse_av_remove_qpn_collision();
 void test_av_implicit();
@@ -126,7 +131,18 @@ void test_av_implicit_av_lru_eviction();
 void test_ah_refcnt();
 void test_ah_lru_eviction_explicit_av_insert();
 void test_ah_lru_eviction_implicit_av_insert();
-/* end efa_unit_test_av.c */
+void test_av_proto_reverse_lookup_explicit();
+void test_av_proto_addr_to_entry_after_remove();
+void test_av_proto_insert_remove_with_peer();
+void test_av_implicit_to_explicit_peer_updated();
+void test_av_proto_batch_insert();
+void test_av_proto_remove_nonexistent();
+void test_av_proto_prv_reverse_av();
+void test_av_proto_insert_invalid_address();
+void test_av_implicit_av_unbounded();
+void test_av_proto_open_unsupported_attrs();
+void test_av_implicit_av_lru_move_single();
+/* end efa_unit_test_proto_av.c */
 
 void test_efa_device_construct_error_handling();
 void test_efa_rdm_ep_ignore_missing_host_id_file();