Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ option(USE_AVX2 "Enable AVX2 (FMA, F16C)" OFF)
option(USE_SVE2 "Enable SVE2 (INT8/16, FP16)" OFF)
option(USE_NEON "Enable NEON (FP16, DotProd)" OFF)
option(NDD_INV_IDX_STORE_FLOATS "Store raw float 32 values in sparse index (no quantization)" OFF)
set(NDD_RUNTIME_X86_DISPATCH OFF)

# Check if any SIMD option is selected
if(NOT USE_AVX512 AND NOT USE_AVX2 AND NOT USE_SVE2 AND NOT USE_NEON)
Expand All @@ -108,10 +109,9 @@ if(NOT USE_AVX512 AND NOT USE_AVX2 AND NOT USE_SVE2 AND NOT USE_NEON)
" -DUSE_SVE2=ON : For ARMv9/SVE2 capable processors (requires SVE2, FP16)\n"
" -DUSE_NEON=ON : For standard ARMv8/NEON processors (requires FP16, DotProd)")
else()
message(FATAL_ERROR "x86 architecture detected but no SIMD option selected.\n"
"Please specify one of the following flags:\n"
" -DUSE_AVX512=ON : For processors with AVX512F, BW, VNNI, FP16\n"
" -DUSE_AVX2=ON : For processors with AVX2, FMA, F16C")
set(NDD_RUNTIME_X86_DISPATCH ON)
message(STATUS "x86 architecture detected with no explicit SIMD mode; enabling runtime x86 "
"dispatch with an AVX2 baseline and optional AVX512 variants")
endif()
endif()

Expand Down Expand Up @@ -253,6 +253,7 @@ message(STATUS "Binary name: ${NDD_BINARY_NAME}")
# Add new src/*.cpp files here when they should be compiled into ndd.
set(NDD_CORE_SOURCES
src/sparse/inverted_index.cpp
src/utils/cpu_compat_check/cpu_runtime_dispatch.cpp
src/utils/system_sanity/system_sanity.cpp
)

Expand Down Expand Up @@ -341,6 +342,12 @@ elseif(USE_NEON)
endif()
target_compile_definitions(ndd_core PRIVATE USE_NEON)
target_compile_definitions(${NDD_BINARY_NAME} PRIVATE USE_NEON)
elseif(NDD_RUNTIME_X86_DISPATCH)
message(STATUS "SIMD: Runtime x86 dispatch enabled (AVX2 baseline + optional AVX512 variants)")
target_compile_options(ndd_core PRIVATE -mavx2 -mfma -mf16c)
target_compile_definitions(ndd_core PRIVATE USE_AVX2 NDD_RUNTIME_X86_DISPATCH NDD_COMPILE_AVX512_VARIANTS)
target_compile_options(${NDD_BINARY_NAME} PRIVATE -mavx2 -mfma -mf16c)
target_compile_definitions(${NDD_BINARY_NAME} PRIVATE USE_AVX2 NDD_RUNTIME_X86_DISPATCH NDD_COMPILE_AVX512_VARIANTS)
endif()

if(NDD_INV_IDX_STORE_FLOATS)
Expand Down Expand Up @@ -393,15 +400,20 @@ elseif(USE_SVE2)
message(STATUS "SIMD Mode: SVE2")
elseif(USE_NEON)
message(STATUS "SIMD Mode: NEON")
elseif(NDD_RUNTIME_X86_DISPATCH)
message(STATUS "SIMD Mode: Runtime x86 dispatch")
endif()
message(STATUS "ASIO include dir: ${ASIO_INCLUDE_DIR}")
message(STATUS "LMDB include dir: ${LMDB_INCLUDE_DIR}")
message(STATUS "OpenSSL include dir: ${OPENSSL_INCLUDE_DIR}")

# Create a symbolic link named 'ndd' pointing to the architecture-specific binary
add_custom_command(TARGET ${NDD_BINARY_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E create_symlink
$<TARGET_FILE_NAME:${NDD_BINARY_NAME}>
${CMAKE_CURRENT_BINARY_DIR}/ndd
COMMENT "Creating softlink 'ndd' -> ${NDD_BINARY_NAME}"
)
# (skipped when binary is already named 'ndd' to avoid a self-referential symlink)
if(NOT NDD_BINARY_NAME STREQUAL "ndd")
add_custom_command(TARGET ${NDD_BINARY_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E create_symlink
$<TARGET_FILE_NAME:${NDD_BINARY_NAME}>
${CMAKE_CURRENT_BINARY_DIR}/ndd
COMMENT "Creating softlink 'ndd' -> ${NDD_BINARY_NAME}"
)
endif()
8 changes: 7 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "core/ndd.hpp"
#include "auth.hpp"
#include "quant/common.hpp"
#include "utils/cpu_compat_check/cpu_runtime_dispatch.hpp"
#include "system_sanity/system_sanity.hpp"

using ndd::quant::quantLevelToString;
Expand Down Expand Up @@ -257,9 +258,14 @@ int main(int argc, char** argv) {
// Health check endpoint (no auth required)
// CROW_ROUTE(app, "/api/v1/health").methods("GET"_method)([](const crow::request& req) {
CROW_ROUTE(app, "/api/v1/health").methods("GET"_method)([]() {
crow::json::wvalue::list cpu_flags;
for(const auto& flag : ndd::cpu::get_active_cpu_flags()) {
cpu_flags.emplace_back(flag);
}
crow::json::wvalue response(
{{"status", "ok"},
{"timestamp", (std::int64_t)std::chrono::system_clock::now().time_since_epoch().count()}});
{"timestamp", (std::int64_t)std::chrono::system_clock::now().time_since_epoch().count()},
{"cpu_flags", cpu_flags}});
PRINT_LOG_TIME();
ndd::printSparseSearchDebugStats();
ndd::printSparseUpdateDebugStats();
Expand Down
72 changes: 65 additions & 7 deletions src/quant/binary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ namespace ndd {
return 1.0f;
}

#if defined(USE_AVX512)
inline std::vector<uint8_t> quantize_avx512(const std::vector<float>& input) {
#if NDD_HAS_AVX512_VARIANTS
NDD_TARGET_AVX512F inline std::vector<uint8_t>
quantize_avx512(const std::vector<float>& input) {
if(input.empty()) {
return std::vector<uint8_t>();
}
Expand Down Expand Up @@ -182,7 +183,12 @@ namespace ndd {

// Quantize FP32 vector to Binary (packed bits)
inline std::vector<uint8_t> quantize(const std::vector<float>& input) {
#if defined(USE_AVX512)
#if defined(NDD_RUNTIME_X86_DISPATCH) && (defined(__x86_64__) || defined(_M_X64))
if(ndd::cpu::use_avx512f()) {
return quantize_avx512(input);
}
return quantize_avx2(input);
#elif defined(USE_AVX512)
return quantize_avx512(input);
#elif defined(USE_AVX2)
return quantize_avx2(input);
Expand Down Expand Up @@ -213,8 +219,9 @@ namespace ndd {
#endif
}

#if defined(USE_AVX512)
inline std::vector<float> dequantize_avx512(const uint8_t* buffer, size_t dimension) {
#if NDD_HAS_AVX512_VARIANTS
NDD_TARGET_AVX512F inline std::vector<float> dequantize_avx512(const uint8_t* buffer,
size_t dimension) {
std::vector<float> output(dimension);

size_t i = 0;
Expand Down Expand Up @@ -370,7 +377,12 @@ namespace ndd {

// Dequantize Binary to FP32
inline std::vector<float> dequantize(const uint8_t* buffer, size_t dimension) {
#if defined(USE_AVX512)
#if defined(NDD_RUNTIME_X86_DISPATCH) && (defined(__x86_64__) || defined(_M_X64))
if(ndd::cpu::use_avx512f()) {
return dequantize_avx512(buffer, dimension);
}
return dequantize_avx2(buffer, dimension);
#elif defined(USE_AVX512)
return dequantize_avx512(buffer, dimension);
#elif defined(USE_AVX2)
return dequantize_avx2(buffer, dimension);
Expand All @@ -397,7 +409,44 @@ namespace ndd {
}

// Hamming distance implementation
inline float Hamming(const void* v1, const void* v2, const void* params) {
#if NDD_HAS_AVX512_VARIANTS
NDD_TARGET_AVX512VPOPCNTDQ inline float HammingAVX512(const void* v1,
const void* v2,
const void* params) {
const size_t dim = *static_cast<const size_t*>(params);
const uint64_t* p1 = static_cast<const uint64_t*>(v1);
const uint64_t* p2 = static_cast<const uint64_t*>(v2);

size_t num_uint64 = (dim + 63) / 64;
float dist = 0;
size_t i = 0;

__m512i acc = _mm512_setzero_si512();

for(; i + 8 <= num_uint64; i += 8) {
__m512i d1 = _mm512_loadu_si512((const __m512i*)&p1[i]);
__m512i d2 = _mm512_loadu_si512((const __m512i*)&p2[i]);
__m512i x = _mm512_xor_si512(d1, d2);
__m512i p = _mm512_popcnt_epi64(x);
acc = _mm512_add_epi64(acc, p);
}

if(i < num_uint64) {
__mmask8 mask = (__mmask8)((1 << (num_uint64 - i)) - 1);
__m512i d1 = _mm512_maskz_loadu_epi64(mask, &p1[i]);
__m512i d2 = _mm512_maskz_loadu_epi64(mask, &p2[i]);
__m512i x = _mm512_xor_si512(d1, d2);
__m512i p = _mm512_popcnt_epi64(x);
acc = _mm512_add_epi64(acc, p);
i = num_uint64;
}

dist += _mm512_reduce_add_epi64(acc);
return dist;
}
#endif

inline float HammingBaseline(const void* v1, const void* v2, const void* params) {
// params is expected to be a pointer to a struct where the first member is size_t
// dim e.g. hnswlib::DistParams
const size_t dim = *static_cast<const size_t*>(params);
Expand Down Expand Up @@ -612,6 +661,15 @@ namespace ndd {
return dist;
}

inline float Hamming(const void* v1, const void* v2, const void* params) {
#if defined(NDD_RUNTIME_X86_DISPATCH) && (defined(__x86_64__) || defined(_M_X64))
if(ndd::cpu::use_avx512vpopcntdq()) {
return HammingAVX512(v1, v2, params);
}
#endif
return HammingBaseline(v1, v2, params);
}

// Wrappers
inline float L2Sqr(const void* v1, const void* v2, const void* params) {
return Hamming(v1, v2, params);
Expand Down
15 changes: 11 additions & 4 deletions src/quant/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <vector>
#include <mutex>

#include "../utils/cpu_compat_check/cpu_runtime_dispatch.hpp"

#if defined(USE_AVX512) || defined(USE_AVX2)
# include <immintrin.h>
#endif
Expand Down Expand Up @@ -191,7 +193,7 @@ namespace ndd {

// Forward declarations for SIMD implementations
inline float find_abs_max_scalar(const float* data, size_t size);
#if defined(USE_AVX512)
#if NDD_HAS_AVX512_VARIANTS
inline float find_abs_max_avx512(const float* data, size_t size);
#endif
#if defined(USE_AVX2)
Expand All @@ -206,7 +208,12 @@ namespace ndd {

// Find absolute maximum value in a vector (for scaling)
inline float find_abs_max(const float* data, size_t size) {
#if defined(USE_AVX512)
#if defined(NDD_RUNTIME_X86_DISPATCH) && (defined(__x86_64__) || defined(_M_X64))
if(ndd::cpu::use_avx512f()) {
return find_abs_max_avx512(data, size);
}
return find_abs_max_avx2(data, size);
#elif defined(USE_AVX512)
return find_abs_max_avx512(data, size);
#elif defined(USE_SVE2)
return find_abs_max_sve(data, size);
Expand All @@ -228,9 +235,9 @@ namespace ndd {
return abs_max;
}

#if defined(USE_AVX512)
#if NDD_HAS_AVX512_VARIANTS
// AVX512 optimized absolute maximum finding - MAXIMUM register utilization
inline float find_abs_max_avx512(const float* data, size_t size) {
NDD_TARGET_AVX512F inline float find_abs_max_avx512(const float* data, size_t size) {
if(size == 0) {
return 0.0f;
}
Expand Down
Loading
Loading