Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions ggml/src/ggml-threading.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "ggml-threading.h"
#include <algorithm>
#include <mutex>
#include <thread>
#include <vector>

std::mutex ggml_critical_section_mutex;

Expand All @@ -10,3 +13,45 @@ void ggml_critical_section_start() {
void ggml_critical_section_end(void) {
ggml_critical_section_mutex.unlock();
}

size_t ggml_quantize_chunk_mt(
enum ggml_type type,
const float * src,
void * dst,
int64_t start,
int64_t nrows,
int64_t n_per_row,
const float * imatrix,
int n_threads) {
if (n_threads <= 1 || nrows <= 1) {
return ggml_quantize_chunk(type, src, dst, start, nrows, n_per_row, imatrix);
}

const int n_t = std::min((int64_t) n_threads, nrows);
const int64_t chunk = (nrows + n_t - 1) / n_t;

std::vector<size_t> results(n_t, 0);
std::vector<std::thread> threads;
threads.reserve(n_t - 1);

auto worker = [&](int t) {
const int64_t r0 = (int64_t)t * chunk;
const int64_t r1 = std::min(r0 + chunk, nrows);
const int64_t nrows_t = r1 - r0;
results[t] = ggml_quantize_chunk(type, src, dst, start + r0 * n_per_row, nrows_t, n_per_row, imatrix);
};

for (int t = 1; t < n_t; ++t) {
threads.emplace_back(worker, t);
}
worker(0);
for (auto & th : threads) {
th.join();
}

size_t total = 0;
for (int t = 0; t < n_t; ++t) {
total += results[t];
}
return total;
}
23 changes: 23 additions & 0 deletions ggml/src/ggml-threading.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@ extern "C" {
GGML_API void ggml_critical_section_start(void);
GGML_API void ggml_critical_section_end(void);

// Parallel version of ggml_quantize_chunk.
//
// Splits [start, start + nrows * n_per_row) across n_threads worker threads,
// each calling ggml_quantize_chunk on its row range. Threads write to
// non-overlapping regions of dst, so no locking is required.
//
// Falls back to the single-threaded ggml_quantize_chunk when n_threads <= 1
// or nrows <= 1. The primary motivation is iq4_nl, whose per-block NL search
// makes single-threaded throughput ~95x slower than other 4-bit types; this
// function recovers near-linear scaling with thread count.
//
// imatrix may be NULL for types that do not require it.
// Returns total bytes written (same contract as ggml_quantize_chunk).
GGML_API size_t ggml_quantize_chunk_mt(
enum ggml_type type,
const float * src,
void * dst,
int64_t start,
int64_t nrows,
int64_t n_per_row,
const float * imatrix,
int n_threads);

#ifdef __cplusplus
}
#endif