From 2e4f041f9fe1a32deb69fb47e7d4edf90a8bda0a Mon Sep 17 00:00:00 2001 From: Ali Hassan Date: Mon, 6 Apr 2026 14:02:47 +0500 Subject: [PATCH 1/3] optimize percent_encode, percent_encode_index, and percent_decode Signed-off-by: Ali Hassan --- benchmarks/percent_encode.cpp | 92 +++++++- include/ada/unicode-inl.h | 41 ---- include/ada/unicode.h | 4 +- src/unicode.cpp | 393 +++++++++++++++++++++++++++++----- tests/CMakeLists.txt | 2 + tests/unicode_tests.cpp | 91 ++++++++ 6 files changed, 524 insertions(+), 99 deletions(-) create mode 100644 tests/unicode_tests.cpp diff --git a/benchmarks/percent_encode.cpp b/benchmarks/percent_encode.cpp index c88e50655..1504e9dcb 100644 --- a/benchmarks/percent_encode.cpp +++ b/benchmarks/percent_encode.cpp @@ -13,6 +13,26 @@ std::string examples[] = {"\xE1|", "other:9818274x1!!", "ref=web-twc-ao-gbl-adsinfo&utm_source=twc&utm_", "connect_timeout=10&application_name=myapp"}; +std::string long_examples[] = { + "connect timeout=10 application name=myapp server=db host internal " + "database=production analytics read preference=secondary preferred " + "ssl=true retry writes=true w=majority max pool size=50", + "ref=web twc ao gbl adsinfo utm source=twc utm medium=cpc " + "utm campaign=brand awareness q4 2024 utm content=banner 300x250 " + "utm term=weather forecast today gclid=Cj0KCQiA3Y ABhCnARIsAK", +}; + +std::string decode_examples[] = { + "%E4%BD%A0%E5%A5%BD%E4%B8%96%E7%95%8C%20%21%22%23%24%25%26%27", + "connect_timeout%3D10%26application_name%3Dmyapp%26server%3Ddb.host", + "%68%65%6C%6C%6F%20%77%6F%72%6C%64%20%74%68%69%73%20%69%73%20" + "%61%20%70%65%72%63%65%6E%74%20%68%65%61%76%79%20%73%74%72%69" + "%6E%67", + "%2Fapi%2Fv1%2Fusers%2F12345%2Fposts%3Fpage%3D1%26limit%3D50%26" + "sort%3Dcreated%26order%3Ddesc%26fields%3Did%2Ctitle%2Cbody%26" + "filter%3Dstatus%253Dpublished", +}; + void init_data() {} double examples_bytes = []() -> double { @@ -115,7 +135,7 @@ static void SpecialQuery(benchmark::State& state) { for (auto _ : state) { for (std::string& url_string : examples) { benchmark::DoNotOptimize(ada::unicode::percent_encode( - url_string, ada::character_sets::FRAGMENT_PERCENT_ENCODE)); + url_string, ada::character_sets::SPECIAL_QUERY_PERCENT_ENCODE)); } } if (collector.has_events()) { @@ -243,6 +263,76 @@ static void C0Control(benchmark::State& state) { } BENCHMARK(C0Control); +double long_examples_bytes = []() -> double { + size_t bytes{0}; + for (std::string& s : long_examples) { + bytes += s.size(); + } + return double(bytes); +}(); + +static void LongFragment(benchmark::State& state) { + for (auto _ : state) { + for (std::string& s : long_examples) { + benchmark::DoNotOptimize(ada::unicode::percent_encode( + s, ada::character_sets::FRAGMENT_PERCENT_ENCODE)); + } + } + state.counters["speed"] = benchmark::Counter( + long_examples_bytes, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(LongFragment); + +static void LongQuery(benchmark::State& state) { + for (auto _ : state) { + for (std::string& s : long_examples) { + benchmark::DoNotOptimize(ada::unicode::percent_encode( + s, ada::character_sets::QUERY_PERCENT_ENCODE)); + } + } + state.counters["speed"] = benchmark::Counter( + long_examples_bytes, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(LongQuery); + +double decode_examples_bytes = []() -> double { + size_t bytes{0}; + for (std::string& s : decode_examples) { + bytes += s.size(); + } + return double(bytes); +}(); + +const size_t decode_first_pct[] = { + decode_examples[0].find('%'), + decode_examples[1].find('%'), + decode_examples[2].find('%'), + decode_examples[3].find('%'), +}; + +static void Decode(benchmark::State& state) { + for (auto _ : state) { + for (size_t i = 0; i < std::size(decode_examples); i++) { + benchmark::DoNotOptimize(ada::unicode::percent_decode( + decode_examples[i], decode_first_pct[i])); + } + } + state.counters["speed"] = benchmark::Counter( + decode_examples_bytes, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(Decode); + +static void DecodeClean(benchmark::State& state) { + std::string clean(200, 'a'); + for (auto _ : state) { + benchmark::DoNotOptimize( + ada::unicode::percent_decode(clean, std::string_view::npos)); + } + state.counters["speed"] = + benchmark::Counter(200.0, benchmark::Counter::kIsIterationInvariantRate); +} +BENCHMARK(DecodeClean); + int main(int argc, char** argv) { #if defined(ADA_RUST_VERSION) benchmark::AddCustomContext("rust version ", ADA_RUST_VERSION); diff --git a/include/ada/unicode-inl.h b/include/ada/unicode-inl.h index 0cae6d7b9..8805738dd 100644 --- a/include/ada/unicode-inl.h +++ b/include/ada/unicode-inl.h @@ -5,46 +5,5 @@ #ifndef ADA_UNICODE_INL_H #define ADA_UNICODE_INL_H #include "ada/unicode.h" -#include "ada/character_sets.h" - -/** - * Unicode operations. These functions are not part of our public API and may - * change at any time. - * - * private - * @namespace ada::unicode - * @brief Includes the declarations for unicode operations - */ -namespace ada::unicode { -ada_really_inline size_t percent_encode_index(const std::string_view input, - const uint8_t character_set[]) { - const char* data = input.data(); - const size_t size = input.size(); - - // Process 8 bytes at a time using unrolled loop - size_t i = 0; - for (; i + 8 <= size; i += 8) { - unsigned char chunk[8]; - std::memcpy(&chunk, data + i, - 8); // entices compiler to unconditionally process 8 characters - - // Check 8 characters at once - for (size_t j = 0; j < 8; j++) { - if (character_sets::bit_at(character_set, chunk[j])) { - return i + j; - } - } - } - - // Handle remaining bytes - for (; i < size; i++) { - if (character_sets::bit_at(character_set, data[i])) { - return i; - } - } - - return size; -} -} // namespace ada::unicode #endif // ADA_UNICODE_INL_H diff --git a/include/ada/unicode.h b/include/ada/unicode.h index 27e458d51..f768afb07 100644 --- a/include/ada/unicode.h +++ b/include/ada/unicode.h @@ -228,8 +228,8 @@ bool percent_encode(std::string_view input, const uint8_t character_set[], * Returns the index at which percent encoding should start, or (equivalently), * the length of the prefix that does not require percent encoding. */ -ada_really_inline size_t percent_encode_index(std::string_view input, - const uint8_t character_set[]); +size_t percent_encode_index(std::string_view input, + const uint8_t character_set[]); /** * @private * Lowers the string in-place, assuming that the content is ASCII. diff --git a/src/unicode.cpp b/src/unicode.cpp index 138ff109c..f28912dcc 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -22,8 +22,6 @@ ADA_POP_DISABLE_WARNINGS #include #endif -#include - namespace ada::unicode { constexpr bool is_tabs_or_newline(char c) noexcept { @@ -449,6 +447,123 @@ unsigned constexpr convert_hex_to_binary(const char c) noexcept { return hex_to_binary_table[c - '0']; } +#if ADA_SSSE3 +size_t percent_encode_index(const std::string_view input, + const uint8_t character_set[]) { + const char* data = input.data(); + const size_t size = input.size(); + if (size < 16) { + for (size_t i = 0; i < size; i++) { + if (character_sets::bit_at(character_set, data[i])) return i; + } + return size; + } + // Nibble decomposition: for byte v = (hi << 4) | lo (v < 128), + // lo_lut[lo] = bitmask of which hi nibbles (0-7) need encoding + // hi_lut[hi] = (1 << hi) for hi < 8, else 0 + // Bytes >= 128 always need encoding -- caught by sign bit check. + uint8_t lo_lut_data[16] = {0}; + uint8_t hi_lut_data[16] = {0}; + for (int h = 0; h < 8; h++) { + hi_lut_data[h] = uint8_t(1) << h; + for (int l = 0; l < 16; l++) { + if (character_sets::bit_at(character_set, (h << 4) | l)) { + lo_lut_data[l] |= uint8_t(1) << h; + } + } + } + __m128i lo_lut = _mm_loadu_si128((const __m128i*)lo_lut_data); + __m128i hi_lut = _mm_loadu_si128((const __m128i*)hi_lut_data); + __m128i mask_0f = _mm_set1_epi8(0x0F); + + size_t i = 0; + for (; i + 15 < size; i += 16) { + __m128i word = _mm_loadu_si128((const __m128i*)(data + i)); + int high_mask = _mm_movemask_epi8(word); + __m128i lo_nibbles = _mm_and_si128(word, mask_0f); + __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), mask_0f); + __m128i matches = _mm_and_si128(_mm_shuffle_epi8(lo_lut, lo_nibbles), + _mm_shuffle_epi8(hi_lut, hi_nibbles)); + int match_mask = _mm_movemask_epi8(matches) | high_mask; + if (match_mask != 0) { + return i + __builtin_ctz(match_mask); + } + } + for (; i < size; i++) { + if (character_sets::bit_at(character_set, data[i])) return i; + } + return size; +} +#elif ADA_NEON +size_t percent_encode_index(const std::string_view input, + const uint8_t character_set[]) { + const char* data = input.data(); + const size_t size = input.size(); + if (size < 16) { + for (size_t i = 0; i < size; i++) { + if (character_sets::bit_at(character_set, data[i])) return i; + } + return size; + } + uint8x16x2_t cs_table; + cs_table.val[0] = vld1q_u8(character_set); + cs_table.val[1] = vld1q_u8(character_set + 16); + const uint8x16_t mask7 = vdupq_n_u8(7); + const uint8x16_t one = vdupq_n_u8(1); + + size_t i = 0; + for (; i + 15 < size; i += 16) { + uint8x16_t word = vld1q_u8((const uint8_t*)(data + i)); + uint8x16_t byte_idx = vshrq_n_u8(word, 3); + uint8x16_t cs_bytes = vqtbl2q_u8(cs_table, byte_idx); + uint8x16_t bit_idx = vandq_u8(word, mask7); + uint8x16_t bit_mask = vshlq_u8(one, vreinterpretq_s8_u8(bit_idx)); + uint8x16_t result = vandq_u8(cs_bytes, bit_mask); + if (vmaxvq_u32(vreinterpretq_u32_u8(result)) != 0) { + for (size_t j = 0; j < 16; j++) { + if (character_sets::bit_at(character_set, data[i + j])) return i + j; + } + } + } + for (; i < size; i++) { + if (character_sets::bit_at(character_set, data[i])) return i; + } + return size; +} +#else +size_t percent_encode_index(const std::string_view input, + const uint8_t character_set[]) { + const char* data = input.data(); + const size_t size = input.size(); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + unsigned char chunk[8]; + std::memcpy(&chunk, data + i, 8); + for (size_t j = 0; j < 8; j++) { + if (character_sets::bit_at(character_set, chunk[j])) { + return i + j; + } + } + } + for (; i < size; i++) { + if (character_sets::bit_at(character_set, data[i])) { + return i; + } + } + return size; +} +#endif + +ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept { +#ifdef ADA_REGULAR_VISUAL_STUDIO + unsigned long ret; + _BitScanForward(&ret, input_num); + return (int)ret; +#else + return __builtin_ctzl(input_num); +#endif +} + std::string percent_decode(const std::string_view input, size_t first_percent) { // next line is for safety only, we expect users to avoid calling // percent_decode when first_percent is outside the range. @@ -460,8 +575,123 @@ std::string percent_decode(const std::string_view input, size_t first_percent) { dest.append(input.substr(0, first_percent)); const char* pointer = input.data() + first_percent; const char* end = input.data() + input.size(); - // Optimization opportunity: if the following code gets - // called often, it can be optimized quite a bit. + + // SIMD fast path: scan 16 bytes at a time for '%'. + // When no '%' is found in a chunk, bulk-append all 16 bytes. +#if ADA_SSSE3 || ADA_SSE2 + const __m128i pct = _mm_set1_epi8('%'); + while (pointer + 15 < end) { + __m128i word = _mm_loadu_si128((const __m128i*)pointer); + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(word, pct)); + if (mask == 0) { + dest.append(pointer, 16); + pointer += 16; + continue; + } + int skip = trailing_zeroes(mask); + if (skip > 0) { + dest.append(pointer, skip); + pointer += skip; + } + size_t remaining = end - pointer - 1; + if (remaining >= 2 && is_ascii_hex_digit(pointer[1]) && + is_ascii_hex_digit(pointer[2])) { + unsigned a = convert_hex_to_binary(pointer[1]); + unsigned b = convert_hex_to_binary(pointer[2]); + dest += static_cast(a * 16 + b); + pointer += 3; + } else { + dest += pointer[0]; + pointer++; + } + } +#elif ADA_NEON + const uint8x16_t pct_vec = vdupq_n_u8('%'); + while (pointer + 15 < end) { + uint8x16_t word = vld1q_u8((const uint8_t*)pointer); + uint8x16_t cmp = vceqq_u8(word, pct_vec); + if (vmaxvq_u32(vreinterpretq_u32_u8(cmp)) == 0) { + dest.append(pointer, 16); + pointer += 16; + continue; + } + size_t skip = 0; + while (skip < 16 && pointer[skip] != '%') skip++; + if (skip > 0) { + dest.append(pointer, skip); + pointer += skip; + } + size_t remaining = end - pointer - 1; + if (remaining >= 2 && is_ascii_hex_digit(pointer[1]) && + is_ascii_hex_digit(pointer[2])) { + unsigned a = convert_hex_to_binary(pointer[1]); + unsigned b = convert_hex_to_binary(pointer[2]); + dest += static_cast(a * 16 + b); + pointer += 3; + } else { + dest += pointer[0]; + pointer++; + } + } +#elif ADA_LSX + const __m128i pct = __lsx_vrepli_b('%'); + while (pointer + 15 < end) { + __m128i word = __lsx_vld((const __m128i*)pointer, 0); + __m128i cmp = __lsx_vseq_b(word, pct); + if (__lsx_bz_v(cmp)) { + dest.append(pointer, 16); + pointer += 16; + continue; + } + int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(cmp), 0); + int skip = trailing_zeroes(mask); + if (skip > 0) { + dest.append(pointer, skip); + pointer += skip; + } + size_t remaining = end - pointer - 1; + if (remaining >= 2 && is_ascii_hex_digit(pointer[1]) && + is_ascii_hex_digit(pointer[2])) { + unsigned a = convert_hex_to_binary(pointer[1]); + unsigned b = convert_hex_to_binary(pointer[2]); + dest += static_cast(a * 16 + b); + pointer += 3; + } else { + dest += pointer[0]; + pointer++; + } + } +#elif ADA_RVV + while (pointer < end) { + size_t n = end - pointer; + size_t vl = __riscv_vsetvl_e8m1(n); + vuint8m1_t v = __riscv_vle8_v_u8m1((const uint8_t*)pointer, vl); + vbool8_t m = __riscv_vmseq(v, '%', vl); + long idx = __riscv_vfirst(m, vl); + if (idx < 0) { + dest.append(pointer, vl); + pointer += vl; + continue; + } + if (idx > 0) { + dest.append(pointer, idx); + pointer += idx; + } + size_t remaining = end - pointer - 1; + if (remaining >= 2 && is_ascii_hex_digit(pointer[1]) && + is_ascii_hex_digit(pointer[2])) { + unsigned a = convert_hex_to_binary(pointer[1]); + unsigned b = convert_hex_to_binary(pointer[2]); + dest += static_cast(a * 16 + b); + pointer += 3; + } else { + dest += pointer[0]; + pointer++; + } + } +#endif + + // Scalar tail (also the complete path when no SIMD is available). while (pointer < end) { const char ch = pointer[0]; size_t remaining = end - pointer - 1; @@ -474,37 +704,109 @@ std::string percent_decode(const std::string_view input, size_t first_percent) { } else { unsigned a = convert_hex_to_binary(pointer[1]); unsigned b = convert_hex_to_binary(pointer[2]); - char c = static_cast(a * 16 + b); - dest += c; + dest += static_cast(a * 16 + b); pointer += 3; } } return dest; } -std::string percent_encode(const std::string_view input, - const uint8_t character_set[]) { - auto pointer = std::ranges::find_if(input, [character_set](const char c) { - return character_sets::bit_at(character_set, c); - }); - // Optimization: Don't iterate if percent encode is not required - if (pointer == input.end()) { - return std::string(input); +// SIMD-accelerated encoding loop shared by all percent_encode overloads. +// Scans [p, pend) for bytes matching character_set, encoding matches as %XX +// and bulk-appending clean runs. +static ada_really_inline void percent_encode_to(const char* p, const char* pend, + const uint8_t character_set[], + std::string& out) { +#if ADA_SSSE3 + // Nibble decomposition LUTs (same algorithm as percent_encode_index). + uint8_t lo_lut_data[16] = {0}; + uint8_t hi_lut_data[16] = {0}; + for (int h = 0; h < 8; h++) { + hi_lut_data[h] = uint8_t(1) << h; + for (int l = 0; l < 16; l++) { + if (character_sets::bit_at(character_set, (h << 4) | l)) { + lo_lut_data[l] |= uint8_t(1) << h; + } + } } - - std::string result; - result.reserve(input.length()); // in the worst case, percent encoding might - // produce 3 characters. - result.append(input.substr(0, std::distance(input.begin(), pointer))); - - for (; pointer != input.end(); pointer++) { - if (character_sets::bit_at(character_set, *pointer)) { - result.append(character_sets::hex + uint8_t(*pointer) * 4, 3); + __m128i lo_lut = _mm_loadu_si128((const __m128i*)lo_lut_data); + __m128i hi_lut = _mm_loadu_si128((const __m128i*)hi_lut_data); + __m128i mask_0f = _mm_set1_epi8(0x0F); + + while (p + 15 < pend) { + __m128i word = _mm_loadu_si128((const __m128i*)p); + int high_mask = _mm_movemask_epi8(word); + __m128i lo_nibbles = _mm_and_si128(word, mask_0f); + __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), mask_0f); + __m128i matches = _mm_and_si128(_mm_shuffle_epi8(lo_lut, lo_nibbles), + _mm_shuffle_epi8(hi_lut, hi_nibbles)); + int match_mask = _mm_movemask_epi8(matches) | high_mask; + if (match_mask == 0) { + out.append(p, 16); + p += 16; + continue; + } + int clean = trailing_zeroes(match_mask); + if (clean > 0) { + out.append(p, clean); + p += clean; + } + out.append(character_sets::hex + uint8_t(*p) * 4, 3); + p++; + } +#elif ADA_NEON + uint8x16x2_t cs_table; + cs_table.val[0] = vld1q_u8(character_set); + cs_table.val[1] = vld1q_u8(character_set + 16); + const uint8x16_t mask7 = vdupq_n_u8(7); + const uint8x16_t one = vdupq_n_u8(1); + + while (p + 15 < pend) { + uint8x16_t word = vld1q_u8((const uint8_t*)p); + uint8x16_t byte_idx = vshrq_n_u8(word, 3); + uint8x16_t cs_bytes = vqtbl2q_u8(cs_table, byte_idx); + uint8x16_t bit_idx = vandq_u8(word, mask7); + uint8x16_t bit_mask = vshlq_u8(one, vreinterpretq_s8_u8(bit_idx)); + uint8x16_t hits = vandq_u8(cs_bytes, bit_mask); + if (vmaxvq_u32(vreinterpretq_u32_u8(hits)) == 0) { + out.append(p, 16); + p += 16; + continue; + } + size_t clean = 0; + while (clean < 16 && !character_sets::bit_at(character_set, p[clean])) { + clean++; + } + if (clean > 0) { + out.append(p, clean); + p += clean; + } + out.append(character_sets::hex + uint8_t(*p) * 4, 3); + p++; + } +#endif + // Scalar tail for remaining < 16 bytes. + while (p < pend) { + if (character_sets::bit_at(character_set, *p)) { + out.append(character_sets::hex + uint8_t(*p) * 4, 3); } else { - result += *pointer; + out += *p; } + p++; } +} +std::string percent_encode(const std::string_view input, + const uint8_t character_set[]) { + size_t first_idx = percent_encode_index(input, character_set); + if (first_idx == input.size()) { + return std::string(input); + } + std::string result; + result.reserve(input.length()); + result.append(input.substr(0, first_idx)); + percent_encode_to(input.data() + first_idx, input.data() + input.size(), + character_set, result); return result; } @@ -513,33 +815,21 @@ bool percent_encode(const std::string_view input, const uint8_t character_set[], std::string& out) { ada_log("percent_encode ", input, " to output string while ", append ? "appending" : "overwriting"); - auto pointer = std::ranges::find_if(input, [character_set](const char c) { - return character_sets::bit_at(character_set, c); - }); - ada_log("percent_encode done checking, moved to ", - std::distance(input.begin(), pointer)); - - // Optimization: Don't iterate if percent encode is not required - if (pointer == input.end()) { + size_t first_idx = percent_encode_index(input, character_set); + ada_log("percent_encode done checking, moved to ", first_idx); + + if (first_idx == input.size()) { ada_log("percent_encode encoding not needed."); return false; } if constexpr (!append) { out.clear(); } - ada_log("percent_encode appending ", std::distance(input.begin(), pointer), - " bytes"); - // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) - out.append(input.data(), std::distance(input.begin(), pointer)); - ada_log("percent_encode processing ", std::distance(pointer, input.end()), - " bytes"); - for (; pointer != input.end(); pointer++) { - if (character_sets::bit_at(character_set, *pointer)) { - out.append(character_sets::hex + uint8_t(*pointer) * 4, 3); - } else { - out += *pointer; - } - } + ada_log("percent_encode appending ", first_idx, " bytes"); + out.append(input.substr(0, first_idx)); + ada_log("percent_encode processing ", input.size() - first_idx, " bytes"); + percent_encode_to(input.data() + first_idx, input.data() + input.size(), + character_set, out); return true; } @@ -564,16 +854,9 @@ bool to_ascii(std::optional& out, const std::string_view plain, std::string percent_encode(const std::string_view input, const uint8_t character_set[], size_t index) { std::string out; - // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage) - out.append(input.data(), index); - auto pointer = input.begin() + index; - for (; pointer != input.end(); pointer++) { - if (character_sets::bit_at(character_set, *pointer)) { - out.append(character_sets::hex + uint8_t(*pointer) * 4, 3); - } else { - out += *pointer; - } - } + out.append(input.substr(0, index)); + percent_encode_to(input.data() + index, input.data() + input.size(), + character_set, out); return out; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 416563a06..a2bdc8fc6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -75,6 +75,7 @@ else() if(ADA_INCLUDE_URL_PATTERN) add_gtest_test(wpt_urlpattern_tests wpt_urlpattern_tests.cpp) endif() + add_gtest_test(unicode_tests unicode_tests.cpp) add_gtest_test(url_components url_components.cpp) add_gtest_test(basic_tests basic_tests.cpp) add_gtest_test(from_file_tests from_file_tests.cpp) @@ -93,6 +94,7 @@ else() if(MSVC OR MINGW) target_compile_definitions(wpt_url_tests PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(wpt_urlpattern_tests PRIVATE _CRT_SECURE_NO_WARNINGS) + target_compile_definitions(unicode_tests PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(url_components PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(basic_fuzzer PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(from_file_tests PRIVATE _CRT_SECURE_NO_WARNINGS) diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp new file mode 100644 index 000000000..b63103b35 --- /dev/null +++ b/tests/unicode_tests.cpp @@ -0,0 +1,91 @@ +#include "ada.h" +#include "ada/character_sets.h" +#include "ada/unicode.h" +#include "gtest/gtest.h" + +#include + +using Types = testing::Types; +template +struct unicode_setter_tests : testing::Test {}; +TYPED_TEST_SUITE(unicode_setter_tests, Types); + +TEST(unicode_tests, percent_encode_index_boundaries) { + const uint8_t* userinfo = ada::character_sets::USERINFO_PERCENT_ENCODE; + const uint8_t* query = ada::character_sets::QUERY_PERCENT_ENCODE; + + std::string at_15 = std::string(15, 'a') + "|" + std::string(16, 'b'); + std::string at_16 = std::string(16, 'a') + "|" + std::string(15, 'b'); + std::string at_17 = std::string(17, 'a') + "|" + std::string(14, 'b'); + std::string clean(32, 'a'); + std::string non_ascii = std::string(16, 'a') + std::string(1, char(0xE1)); + + EXPECT_EQ(ada::unicode::percent_encode_index(at_15, userinfo), 15u); + EXPECT_EQ(ada::unicode::percent_encode_index(at_16, userinfo), 16u); + EXPECT_EQ(ada::unicode::percent_encode_index(at_17, userinfo), 17u); + EXPECT_EQ(ada::unicode::percent_encode_index(clean, userinfo), clean.size()); + EXPECT_EQ(ada::unicode::percent_encode_index(non_ascii, query), 16u); +} + +TEST(unicode_tests, percent_encode_with_index_matches_full_encode) { + const uint8_t* userinfo = ada::character_sets::USERINFO_PERCENT_ENCODE; + const uint8_t* query = ada::character_sets::QUERY_PERCENT_ENCODE; + + std::string needs_encoding = std::string(16, 'a') + "|" + std::string(16, 'b'); + size_t first_idx = + ada::unicode::percent_encode_index(needs_encoding, userinfo); + + EXPECT_EQ(first_idx, 16u); + EXPECT_EQ(ada::unicode::percent_encode(needs_encoding, userinfo), + ada::unicode::percent_encode(needs_encoding, userinfo, first_idx)); + + std::string non_ascii = std::string(16, 'a') + std::string(1, char(0xE1)); + EXPECT_EQ(ada::unicode::percent_encode(non_ascii, query), + std::string(16, 'a') + "%E1"); +} + +TEST(unicode_tests, percent_decode_boundaries_and_invalid_sequences) { + std::string valid = std::string(15, 'a') + "%41" + std::string(16, 'b') + "%2F"; + EXPECT_EQ(ada::unicode::percent_decode(valid, valid.find('%')), + std::string(15, 'a') + "A" + std::string(16, 'b') + "/"); + + std::string valid_at_16 = std::string(16, 'a') + "%20" + std::string(15, 'b'); + EXPECT_EQ(ada::unicode::percent_decode(valid_at_16, valid_at_16.find('%')), + std::string(16, 'a') + " " + std::string(15, 'b')); + + std::string invalid = std::string(15, 'a') + "%G1" + std::string(16, 'b') + "%"; + EXPECT_EQ(ada::unicode::percent_decode(invalid, invalid.find('%')), invalid); + + std::string truncated = std::string(16, 'a') + "%4"; + EXPECT_EQ(ada::unicode::percent_decode(truncated, truncated.find('%')), + truncated); + + EXPECT_EQ(ada::unicode::percent_decode("plain-text", + std::string_view::npos), + "plain-text"); +} + +TYPED_TEST(unicode_setter_tests, set_search_and_hash_encode_boundary_spaces) { + auto url = ada::parse("https://example.com/"); + ASSERT_TRUE(url); + + std::string search_input = std::string(15, 'a') + " " + std::string(16, 'b'); + url->set_search(search_input); + EXPECT_EQ(url->get_search(), + "?" + std::string(15, 'a') + "%20" + std::string(16, 'b')); + + std::string hash_input = std::string(16, 'c') + " " + std::string(15, 'd'); + url->set_hash(hash_input); + EXPECT_EQ(url->get_hash(), + "#" + std::string(16, 'c') + "%20" + std::string(15, 'd')); +} + +TYPED_TEST(unicode_setter_tests, set_pathname_encodes_boundary_space) { + auto url = ada::parse("https://example.com/"); + ASSERT_TRUE(url); + + std::string pathname = "/" + std::string(15, 'a') + " " + std::string(16, 'b'); + url->set_pathname(pathname); + EXPECT_EQ(url->get_pathname(), + "/" + std::string(15, 'a') + "%20" + std::string(16, 'b')); +} From 4fd8d3e66c78cf65d6218a773214746e2856f848 Mon Sep 17 00:00:00 2001 From: Ali Hassan Date: Mon, 6 Apr 2026 21:58:59 +0500 Subject: [PATCH 2/3] fix lint errors --- src/helpers.cpp | 8 ++++---- tests/unicode_tests.cpp | 15 +++++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/helpers.cpp b/src/helpers.cpp index defb7778c..38fe2d395 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -294,7 +294,7 @@ ada_really_inline size_t find_next_host_delimiter_special( uint8x16_t classify = vandq_u8(lowpart, highpart); if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); - uint16_t is_non_zero = ~to_bitmask(is_zero); + uint16_t is_non_zero = static_cast(~to_bitmask(is_zero)); return i + trailing_zeroes(is_non_zero); } } @@ -307,7 +307,7 @@ ada_really_inline size_t find_next_host_delimiter_special( uint8x16_t classify = vandq_u8(lowpart, highpart); if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); - uint16_t is_non_zero = ~to_bitmask(is_zero); + uint16_t is_non_zero = static_cast(~to_bitmask(is_zero)); return view.length() - 16 + trailing_zeroes(is_non_zero); } } @@ -583,7 +583,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, uint8x16_t classify = vandq_u8(lowpart, highpart); if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); - uint16_t is_non_zero = ~to_bitmask(is_zero); + uint16_t is_non_zero = static_cast(~to_bitmask(is_zero)); return i + trailing_zeroes(is_non_zero); } } @@ -596,7 +596,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, uint8x16_t classify = vandq_u8(lowpart, highpart); if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) { uint8x16_t is_zero = vceqq_u8(classify, zero); - uint16_t is_non_zero = ~to_bitmask(is_zero); + uint16_t is_non_zero = static_cast(~to_bitmask(is_zero)); return view.length() - 16 + trailing_zeroes(is_non_zero); } } diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp index b63103b35..134d977a7 100644 --- a/tests/unicode_tests.cpp +++ b/tests/unicode_tests.cpp @@ -31,7 +31,8 @@ TEST(unicode_tests, percent_encode_with_index_matches_full_encode) { const uint8_t* userinfo = ada::character_sets::USERINFO_PERCENT_ENCODE; const uint8_t* query = ada::character_sets::QUERY_PERCENT_ENCODE; - std::string needs_encoding = std::string(16, 'a') + "|" + std::string(16, 'b'); + std::string needs_encoding = + std::string(16, 'a') + "|" + std::string(16, 'b'); size_t first_idx = ada::unicode::percent_encode_index(needs_encoding, userinfo); @@ -45,7 +46,8 @@ TEST(unicode_tests, percent_encode_with_index_matches_full_encode) { } TEST(unicode_tests, percent_decode_boundaries_and_invalid_sequences) { - std::string valid = std::string(15, 'a') + "%41" + std::string(16, 'b') + "%2F"; + std::string valid = + std::string(15, 'a') + "%41" + std::string(16, 'b') + "%2F"; EXPECT_EQ(ada::unicode::percent_decode(valid, valid.find('%')), std::string(15, 'a') + "A" + std::string(16, 'b') + "/"); @@ -53,15 +55,15 @@ TEST(unicode_tests, percent_decode_boundaries_and_invalid_sequences) { EXPECT_EQ(ada::unicode::percent_decode(valid_at_16, valid_at_16.find('%')), std::string(16, 'a') + " " + std::string(15, 'b')); - std::string invalid = std::string(15, 'a') + "%G1" + std::string(16, 'b') + "%"; + std::string invalid = + std::string(15, 'a') + "%G1" + std::string(16, 'b') + "%"; EXPECT_EQ(ada::unicode::percent_decode(invalid, invalid.find('%')), invalid); std::string truncated = std::string(16, 'a') + "%4"; EXPECT_EQ(ada::unicode::percent_decode(truncated, truncated.find('%')), truncated); - EXPECT_EQ(ada::unicode::percent_decode("plain-text", - std::string_view::npos), + EXPECT_EQ(ada::unicode::percent_decode("plain-text", std::string_view::npos), "plain-text"); } @@ -84,7 +86,8 @@ TYPED_TEST(unicode_setter_tests, set_pathname_encodes_boundary_space) { auto url = ada::parse("https://example.com/"); ASSERT_TRUE(url); - std::string pathname = "/" + std::string(15, 'a') + " " + std::string(16, 'b'); + std::string pathname = + "/" + std::string(15, 'a') + " " + std::string(16, 'b'); url->set_pathname(pathname); EXPECT_EQ(url->get_pathname(), "/" + std::string(15, 'a') + "%20" + std::string(16, 'b')); From 63b475f27b7e8654ee204561a3d2fe93dc5b57af Mon Sep 17 00:00:00 2001 From: Ali Hassan Date: Mon, 6 Apr 2026 22:39:09 +0500 Subject: [PATCH 3/3] improve SSSE3 path Signed-off-by: Ali Hassan --- src/unicode.cpp | 164 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 111 insertions(+), 53 deletions(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index f28912dcc..3be544e91 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -448,42 +448,47 @@ unsigned constexpr convert_hex_to_binary(const char c) noexcept { } #if ADA_SSSE3 -size_t percent_encode_index(const std::string_view input, - const uint8_t character_set[]) { - const char* data = input.data(); - const size_t size = input.size(); - if (size < 16) { - for (size_t i = 0; i < size; i++) { - if (character_sets::bit_at(character_set, data[i])) return i; - } - return size; - } - // Nibble decomposition: for byte v = (hi << 4) | lo (v < 128), - // lo_lut[lo] = bitmask of which hi nibbles (0-7) need encoding - // hi_lut[hi] = (1 << hi) for hi < 8, else 0 - // Bytes >= 128 always need encoding -- caught by sign bit check. - uint8_t lo_lut_data[16] = {0}; - uint8_t hi_lut_data[16] = {0}; - for (int h = 0; h < 8; h++) { - hi_lut_data[h] = uint8_t(1) << h; - for (int l = 0; l < 16; l++) { - if (character_sets::bit_at(character_set, (h << 4) | l)) { - lo_lut_data[l] |= uint8_t(1) << h; +// Prebuilt nibble-decomposition LUTs for SSSE3 percent-encode. +// Constructed once from character_set, then shared across index + encode +// to avoid the 128-iteration build cost being paid twice per call. +struct ssse3_encode_luts { + __m128i lo_lut; + __m128i hi_lut; + __m128i mask_0f; + explicit ssse3_encode_luts(const uint8_t character_set[]) noexcept { + // For byte v = (hi << 4) | lo (v < 128): + // lo_lut[lo] = bitmask of which hi nibbles (0-7) need encoding + // hi_lut[hi] = (1 << hi) for hi < 8, else 0 + // Bytes >= 128 always need encoding -- caught by sign bit check. + uint8_t lo_data[16] = {0}; + uint8_t hi_data[16] = {0}; + for (int h = 0; h < 8; h++) { + hi_data[h] = uint8_t(1) << h; + for (int l = 0; l < 16; l++) { + if (character_sets::bit_at(character_set, (h << 4) | l)) { + lo_data[l] |= uint8_t(1) << h; + } } } + lo_lut = _mm_loadu_si128((const __m128i*)lo_data); + hi_lut = _mm_loadu_si128((const __m128i*)hi_data); + mask_0f = _mm_set1_epi8(0x0F); } - __m128i lo_lut = _mm_loadu_si128((const __m128i*)lo_lut_data); - __m128i hi_lut = _mm_loadu_si128((const __m128i*)hi_lut_data); - __m128i mask_0f = _mm_set1_epi8(0x0F); +}; +static size_t percent_encode_index_simd(const std::string_view input, + const uint8_t character_set[], + const ssse3_encode_luts& luts) { + const char* data = input.data(); + const size_t size = input.size(); size_t i = 0; for (; i + 15 < size; i += 16) { __m128i word = _mm_loadu_si128((const __m128i*)(data + i)); int high_mask = _mm_movemask_epi8(word); - __m128i lo_nibbles = _mm_and_si128(word, mask_0f); - __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), mask_0f); - __m128i matches = _mm_and_si128(_mm_shuffle_epi8(lo_lut, lo_nibbles), - _mm_shuffle_epi8(hi_lut, hi_nibbles)); + __m128i lo_nibbles = _mm_and_si128(word, luts.mask_0f); + __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), luts.mask_0f); + __m128i matches = _mm_and_si128(_mm_shuffle_epi8(luts.lo_lut, lo_nibbles), + _mm_shuffle_epi8(luts.hi_lut, hi_nibbles)); int match_mask = _mm_movemask_epi8(matches) | high_mask; if (match_mask != 0) { return i + __builtin_ctz(match_mask); @@ -494,6 +499,19 @@ size_t percent_encode_index(const std::string_view input, } return size; } + +size_t percent_encode_index(const std::string_view input, + const uint8_t character_set[]) { + const size_t size = input.size(); + if (size < 16) { + for (size_t i = 0; i < size; i++) { + if (character_sets::bit_at(character_set, input.data()[i])) return i; + } + return size; + } + ssse3_encode_luts luts(character_set); + return percent_encode_index_simd(input, character_set, luts); +} #elif ADA_NEON size_t percent_encode_index(const std::string_view input, const uint8_t character_set[]) { @@ -711,35 +729,19 @@ std::string percent_decode(const std::string_view input, size_t first_percent) { return dest; } -// SIMD-accelerated encoding loop shared by all percent_encode overloads. -// Scans [p, pend) for bytes matching character_set, encoding matches as %XX -// and bulk-appending clean runs. -static ada_really_inline void percent_encode_to(const char* p, const char* pend, - const uint8_t character_set[], - std::string& out) { +// SSSE3 encode loop that accepts prebuilt LUTs. +// Self-contained: includes the scalar tail for remaining bytes. #if ADA_SSSE3 - // Nibble decomposition LUTs (same algorithm as percent_encode_index). - uint8_t lo_lut_data[16] = {0}; - uint8_t hi_lut_data[16] = {0}; - for (int h = 0; h < 8; h++) { - hi_lut_data[h] = uint8_t(1) << h; - for (int l = 0; l < 16; l++) { - if (character_sets::bit_at(character_set, (h << 4) | l)) { - lo_lut_data[l] |= uint8_t(1) << h; - } - } - } - __m128i lo_lut = _mm_loadu_si128((const __m128i*)lo_lut_data); - __m128i hi_lut = _mm_loadu_si128((const __m128i*)hi_lut_data); - __m128i mask_0f = _mm_set1_epi8(0x0F); - +static ada_really_inline void percent_encode_to_simd( + const char* p, const char* pend, const uint8_t character_set[], + std::string& out, const ssse3_encode_luts& luts) { while (p + 15 < pend) { __m128i word = _mm_loadu_si128((const __m128i*)p); int high_mask = _mm_movemask_epi8(word); - __m128i lo_nibbles = _mm_and_si128(word, mask_0f); - __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), mask_0f); - __m128i matches = _mm_and_si128(_mm_shuffle_epi8(lo_lut, lo_nibbles), - _mm_shuffle_epi8(hi_lut, hi_nibbles)); + __m128i lo_nibbles = _mm_and_si128(word, luts.mask_0f); + __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi16(word, 4), luts.mask_0f); + __m128i matches = _mm_and_si128(_mm_shuffle_epi8(luts.lo_lut, lo_nibbles), + _mm_shuffle_epi8(luts.hi_lut, hi_nibbles)); int match_mask = _mm_movemask_epi8(matches) | high_mask; if (match_mask == 0) { out.append(p, 16); @@ -754,6 +756,27 @@ static ada_really_inline void percent_encode_to(const char* p, const char* pend, out.append(character_sets::hex + uint8_t(*p) * 4, 3); p++; } + while (p < pend) { + if (character_sets::bit_at(character_set, *p)) { + out.append(character_sets::hex + uint8_t(*p) * 4, 3); + } else { + out += *p; + } + p++; + } +} +#endif + +// Encoding loop for callers that don't have prebuilt LUTs (e.g. +// percent_encode with a pre-known index). On SSSE3 this builds its own +// LUTs and delegates; on NEON/scalar it runs inline. +static ada_really_inline void percent_encode_to(const char* p, const char* pend, + const uint8_t character_set[], + std::string& out) { +#if ADA_SSSE3 + ssse3_encode_luts luts(character_set); + percent_encode_to_simd(p, pend, character_set, out, luts); + return; #elif ADA_NEON uint8x16x2_t cs_table; cs_table.val[0] = vld1q_u8(character_set); @@ -798,6 +821,20 @@ static ada_really_inline void percent_encode_to(const char* p, const char* pend, std::string percent_encode(const std::string_view input, const uint8_t character_set[]) { +#if ADA_SSSE3 + if (input.size() >= 16) { + ssse3_encode_luts luts(character_set); + size_t first_idx = percent_encode_index_simd(input, character_set, luts); + if (first_idx == input.size()) return std::string(input); + std::string result; + result.reserve(input.length()); + result.append(input.substr(0, first_idx)); + percent_encode_to_simd(input.data() + first_idx, + input.data() + input.size(), character_set, result, + luts); + return result; + } +#endif size_t first_idx = percent_encode_index(input, character_set); if (first_idx == input.size()) { return std::string(input); @@ -815,6 +852,27 @@ bool percent_encode(const std::string_view input, const uint8_t character_set[], std::string& out) { ada_log("percent_encode ", input, " to output string while ", append ? "appending" : "overwriting"); +#if ADA_SSSE3 + if (input.size() >= 16) { + ssse3_encode_luts luts(character_set); + size_t first_idx = percent_encode_index_simd(input, character_set, luts); + ada_log("percent_encode done checking, moved to ", first_idx); + if (first_idx == input.size()) { + ada_log("percent_encode encoding not needed."); + return false; + } + if constexpr (!append) { + out.clear(); + } + ada_log("percent_encode appending ", first_idx, " bytes"); + out.append(input.substr(0, first_idx)); + ada_log("percent_encode processing ", input.size() - first_idx, " bytes"); + percent_encode_to_simd(input.data() + first_idx, + input.data() + input.size(), character_set, out, + luts); + return true; + } +#endif size_t first_idx = percent_encode_index(input, character_set); ada_log("percent_encode done checking, moved to ", first_idx);