Blosc · FrancescAlted · Jun 4, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/bench/ctable/query-backends.py b/bench/ctable/query-backends.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Benchmark: CTable.where() across three evaluation backends.
+
+Tests how performance scales with table size (10M–500M rows) for the query:
+    (tips > 477) & (km > 190) & (lon < -140)
+
+Backends:
+  interpreted : miniexpr bytecode interpreter (default, no JIT)
+  tcc         : Tiny C Compiler JIT (fast compile, modest code quality)
+  cc          : system C compiler JIT (clang/gcc, -O3 + auto-vectorisation)
+
+Two timings are shown per backend:
+  cold  – first call, includes JIT compilation cost for tcc/cc
+  warm  – second call, kernel is cached (shared library already loaded)
+"""
+
+import os
+import sys
+import time
+from dataclasses import dataclass
+
+import numpy as np
+
+import blosc2
+
+
+# Table sizes (in rows) to benchmark; main() builds one table per entry, in order.
+SIZES = [10_000_000, 50_000_000, 100_000_000, 200_000_000, 500_000_000]
+BUILD_CHUNK = 10_000_000  # rows per extend() call to avoid large temp arrays
+
+# (label, BLOSC_ME_JIT value) pairs.  None means run_where() leaves the
+# environment variable unset for that run.
+BACKENDS = [
+    ("interpreted", None),
+    ("tcc",         "tcc"),
+    ("cc",          "cc"),
+]
+
+# NumPy structured dtype of the batches produced by make_chunk(); the field
+# names and widths mirror the Row schema below.
+NP_DTYPE = np.dtype([
+    ("passenger_count", np.int32),
+    ("shared",          np.bool_),
+    ("tips",            np.float32),
+    ("km",              np.float32),
+    ("lon",             np.float32),
+])
+
+
+@dataclass
+class Row:
+    """Column schema of the benchmark table (passed to ``blosc2.CTable`` in ``build_table``).
+
+    Field names and types match the ``NP_DTYPE`` structured dtype.
+    """
+    passenger_count: int   = blosc2.field(blosc2.int32())
+    shared:          bool  = blosc2.field(blosc2.bool())
+    tips:            float = blosc2.field(blosc2.float32())
+    km:              float = blosc2.field(blosc2.float32())
+    lon:             float = blosc2.field(blosc2.float32())
+
+def make_chunk(n: int, rng: np.random.Generator) -> np.ndarray:
+    """Return ``n`` random records as a structured array of dtype ``NP_DTYPE``.
+
+    Columns are drawn from ``rng`` in field order (passenger_count, shared,
+    tips, km, lon), so the output is reproducible for a given generator state.
+    """
+    records = np.empty(n, dtype=NP_DTYPE)
+    records["passenger_count"] = rng.integers(1, 7, n, dtype=np.int32)
+    records["shared"] = rng.integers(0, 2, n, dtype=np.bool_)
+
+    # Integer-valued floats: mantissa low bytes are zero → high compression ratio.
+    # Each entry is (column, low, high) with an exclusive upper bound.
+    float_columns = (
+        ("tips", 0, 501),     # 501 distinct values
+        ("km", -10, 201),     # 211 distinct values
+        ("lon", -150, 51),    # 201 distinct values
+    )
+    for column, low, high in float_columns:
+        records[column] = rng.integers(low, high, n).astype(np.float32)
+    return records
+
+
+def build_table(n_rows: int, rng: np.random.Generator) -> blosc2.CTable:
+    """Create a CTable with the ``Row`` schema and fill it with ``n_rows`` random rows.
+
+    Rows are generated by ``make_chunk`` and appended in batches of at most
+    ``BUILD_CHUNK`` rows, so no temporary array larger than one batch is built.
+    """
+    table = blosc2.CTable(Row, expected_size=n_rows)
+    for start in range(0, n_rows, BUILD_CHUNK):
+        # The last batch may be shorter than BUILD_CHUNK.
+        batch_len = min(BUILD_CHUNK, n_rows - start)
+        table.extend(make_chunk(batch_len, rng))
+    return table
+
+
+def run_where(ct: blosc2.CTable, blosc_me_jit: str | None) -> tuple[float, int]:
+    """Time one ``ct.where()`` call with ``BLOSC_ME_JIT`` set to ``blosc_me_jit``.
+
+    When ``blosc_me_jit`` is None the variable is removed from the environment
+    for the duration of the call.  Whatever value it had on entry is restored
+    before returning, also when the query raises.  Only the ``where()`` call
+    itself is timed; building the condition expression is not.
+
+    Query: ``(tips > 477) & (km > 190) & (lon < -140)``.  With the
+    integer-valued columns produced by ``make_chunk`` each sub-condition keeps
+    roughly 5% of the rows:
+      tips in 0..500    : tips > 477   keeps 478..500    -> 23/501 ≈ 4.6%
+      km   in -10..200  : km   > 190   keeps 191..200    -> 10/211 ≈ 4.7%
+      lon  in -150..50  : lon  < -140  keeps -150..-141  -> 10/201 ≈ 5.0%
+    Combined: 0.046 * 0.047 * 0.050 ≈ 0.011% of the rows.
+
+    Returns ``(elapsed_seconds, len(result))``.
+    """
+    previous = os.environ.pop("BLOSC_ME_JIT", None)
+    try:
+        if blosc_me_jit is not None:
+            os.environ["BLOSC_ME_JIT"] = blosc_me_jit
+        # The expression is built after the environment has been set up.
+        predicate = (ct.tips > 477) & (ct.km > 190) & (ct.lon < -140)
+        start = time.perf_counter()
+        matches = ct.where(predicate)
+        stop = time.perf_counter()
+        return stop - start, len(matches)
+    finally:
+        if previous is None:
+            os.environ.pop("BLOSC_ME_JIT", None)
+        else:
+            os.environ["BLOSC_ME_JIT"] = previous
+
+def fmt_row(n: int, timings: list[tuple[float, float]], n_matched: int) -> str:
+    """Format one result line: row count, a cold/warm pair per backend, match count.
+
+    Cells are joined with ``" | "``.  The row count is right-aligned in 12
+    columns and each timing in 8 columns with three decimals.
+    """
+    cells = [f"{n:>12,}"]
+    cells.extend(f"{cold:>8.3f} {warm:>8.3f}" for cold, warm in timings)
+    cells.append(f"  ({n_matched:,} matched)")
+    return " | ".join(cells)
+
+
+def main():
+    """Run the benchmark: one table per entry in ``SIZES``, every backend timed twice.
+
+    For each size a fresh table is built, then each backend in ``BACKENDS``
+    runs the query twice (cold, then warm) and one result row is printed.
+    The match count printed is the one from the last backend's cold run.  If
+    any run reports a different count, a warning goes to stderr, because the
+    timings are only comparable when all backends return the same result.
+    """
+    rng = np.random.default_rng(42)
+
+    # Header
+    backend_header = " | ".join(f"{'--- ' + name + ' ---':>17}" for name, _ in BACKENDS)
+    print(f"\n{'':>12} | {backend_header}")
+    subheader = " | ".join(f"{'cold(s)':>8} {'warm(s)':>8}" for _ in BACKENDS)
+    print(f"{'rows':>12} | {subheader}")
+    print("-" * (14 + 19 * len(BACKENDS)))
+
+    for n in SIZES:
+        print(f"  building {n:,} rows...", end=" ", flush=True)
+        ct = build_table(n, rng)
+        print("done", flush=True)
+
+        timings = []
+        counts = {}  # backend label -> (cold match count, warm match count)
+        n_matched = None
+        for name, backend in BACKENDS:
+            cold, n_matched = run_where(ct, backend)
+            warm, n_warm = run_where(ct, backend)
+            timings.append((cold, warm))
+            counts[name] = (n_matched, n_warm)
+
+        print(fmt_row(n, timings, n_matched), flush=True)
+
+        # Every run evaluates the same query on the same table, so all counts
+        # must be identical.
+        if len({count for pair in counts.values() for count in pair}) > 1:
+            print(
+                f"  WARNING: match counts differ between runs (cold, warm): {counts}",
+                file=sys.stderr,
+                flush=True,
+            )
+
+        del ct  # free memory before building the next (larger) table
+
+    print()
+    print("cold = first call (includes JIT compilation for tcc/cc)")
+    print("warm = second call (kernel cached, compilation cost amortised)")
+
+if __name__ == "__main__":
+    main()
diff --git a/doc/getting_started/overview.rst b/doc/getting_started/overview.rst
@@ -77,6 +77,8 @@ container objects in Python-Blosc2:
   `buffer protocol <https://docs.python.org/3/c-api/buffer.html>`_.
 * ``NDArray``: An N-Dimensional store that mirrors the NumPy API, enhanced with
   efficient compressed data storage.
+* ``CTable``: A columnar table for structured, record-oriented data with a
+  powerful query engine built on top of compressed ``NDArray`` columns.
 
 These containers are described in more detail below.
 
@@ -257,6 +259,96 @@ parameter in ``compute()`` or ``sum()`` functions). For a more in-depth look at
 this example, with performance comparisons, see this
 `compute-bigger blog post <https://ironarray.io/blog/compute-bigger>`_.
 
+Querying Columnar Data with CTable
+===================================
+
+``CTable`` is Python-Blosc2's columnar store for structured, record-oriented
+data.  Each column is a compressed ``NDArray``, so the same chunking,
+compression, and compute-engine machinery that powers ``NDArray`` expressions
+is available for tabular queries — with no data copy required.
+
+Schemas are defined with plain Python dataclasses, supporting a rich mix of
+types including integers, floats, booleans, and strings:
+
+.. code-block:: python
+
+    from dataclasses import dataclass
+    import blosc2
+
+
+    @dataclass
+    class Row:
+        passenger_count: int = blosc2.field(blosc2.int32())
+        shared: bool = blosc2.field(blosc2.bool())
+        tips: float = blosc2.field(blosc2.float32())
+        km: float = blosc2.field(blosc2.float32())
+        lon: float = blosc2.field(blosc2.float32())
+        company: str = blosc2.field(blosc2.string(max_length=50))
+
+
+    t = blosc2.CTable(Row, expected_size=10_000_000)
+
+Columns support the full lazy-expression syntax, so compound boolean filters
+are written naturally and evaluated in a single pass over the compressed data:
+
+.. code-block:: python
+
+    condition = (t.tips > 100) & (t.km > 0) & (t.lon < -10)
+    result = t.where(condition).sort_by("km")
+
+Beyond filtering and sorting, ``CTable`` offers:
+
+* **Aggregations and group-by** — ``groupby()``, ``sum()``, ``mean()``,
+  ``min()``, ``max()``, ``std()`` and more, optionally with a ``where=``
+  mask for conditional aggregation.
+* **Computed and generated columns** — columns whose values are derived from
+  other columns via a lazy expression, evaluated on the fly without storing
+  extra data.
+* **Automatic SUMMARY indexes** — per-block min/max indexes built
+  transparently at write time, enabling ``where()`` to skip entire blocks
+  that cannot contain matching rows, dramatically reducing I/O for
+  highly selective queries (those that match only a small fraction of rows).
+* **Schema validation** — type and constraint checking (``ge=``, ``le=``,
+  nullable, etc.) enforced at insert time, keeping data quality guarantees
+  inside the table itself.
+* **Null handling** — first-class nullable columns with ``notnull()``,
+  ``null_count``, and null-aware aggregations.
+* **Nested field paths** — hierarchically structured schemas expose columns
+  as ``t.payment.tips``, ``t.trip.begin.lon``, etc., keeping query code
+  readable even for wide, deeply nested records.
+* **Parquet and Arrow round-trips** — load from and save to Parquet or Apache
+  Arrow with a single call, making it easy to interoperate with the broader
+  data ecosystem.
+* **Persistent storage** — open and save tables to disk (``CTable.open()``,
+  ``CTable.save()``); in-memory and on-disk tables share the same API.
+
+.. code-block:: python
+
+    # Load from Parquet, filter, and persist the result
+    t = blosc2.CTable.from_parquet("trips.parquet")
+    result = t.where((t.tips > 100) & (t.km > 0)).sort_by("km")
+    result.save("filtered_trips.b2z")
+
+.. tip::
+
+   **Free ~30% speedup for large tables:** set the ``BLOSC_ME_JIT=cc``
+   environment variable to have filter expressions JIT-compiled by the system C
+   compiler (clang/gcc) with ``-O3`` and auto-vectorisation, instead of the
+   default bytecode interpreter.  The compiled kernel is cached on disk so
+   subsequent runs pay no compilation cost.
+
+   .. code-block:: bash
+
+      BLOSC_ME_JIT=cc python my_script.py
+
+   Benchmarks on tables from 50 M to 500 M rows show a consistent ~30%
+   speedup across Intel, AMD, and Apple Silicon hardware.  The one-time
+   compilation cost on Linux (gcc, ~30 ms) is negligible; on macOS (clang,
+   ~400 ms) it is only worth paying for large tables or repeated queries.
+   For small tables (< ~50 M rows) the default bytecode interpreter is
+   perfectly adequate.  See the :py:meth:`blosc2.LazyArray.compute` docstring
+   for the full list of ``BLOSC_ME_JIT`` values and options.
+
 Hopefully, this overview has provided a good understanding of Python-Blosc2's
 capabilities. To begin your journey with Python-Blosc2, proceed to the
 `installation instructions <installation>`_. Then explore the

diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst
@@ -315,16 +315,46 @@ Mutations
 ---------
 
 In addition to physical schema changes such as :meth:`CTable.add_column`,
-CTables can host **computed columns** backed by a lazy expression over stored
-columns.  Computed columns are read-only, use no extra storage, participate in
-display, filtering, sorting, and aggregates, and are persisted across
-:meth:`CTable.save`, :meth:`CTable.load`, and :meth:`CTable.open`.
-
-When a computed result should become a normal stored column, use
-:meth:`CTable.materialize_computed_column`.  The materialized column is a stored
-snapshot that can be indexed like any other stored column.  New rows inserted
-later via :meth:`CTable.append` or :meth:`CTable.extend` auto-fill omitted
-materialized-column values from the recorded expression metadata.
+CTables support two kinds of derived columns:
+
+**Computed columns** (:meth:`CTable.add_computed_column`) are purely virtual —
+they use no extra storage, are evaluated on demand, and are read-only.  They
+participate in display, filtering, sorting, and aggregates, and are persisted
+across save/open round-trips.  Because they have no physical storage, they
+**cannot be indexed**.
+
+**Generated columns** (:meth:`CTable.add_generated_column`) are physically
+stored.  Their values are computed once and written to disk; new rows appended
+later are filled in automatically.  Because the data is real, generated
+columns **can be indexed** with :meth:`CTable.create_index`, which makes
+``where()`` queries on them fast.
+
+**Practical rule**: use a computed column when you just need a derived value
+available for display, export, or occasional reads.  Use a generated column
+(optionally with ``create_index=True``) when you need to filter or sort by a
+derived value frequently — the index pays for itself after the first few
+queries.
+
+Both forms accept plain expression strings, :func:`blosc2.dsl_kernel`-decorated
+functions, and :class:`blosc2.LazyUDF` objects.  DSL kernels support full Python
+control flow (``if``/``else``, ``where()``, loops) and have their source
+persisted and recompiled on open.
+
+When passing a :class:`blosc2.LazyUDF` built with an explicit ``jit_backend=``
+(e.g. ``jit_backend="cc"`` to use the system C compiler instead of the default
+TCC), that choice is persisted in the column metadata and automatically restored
+on :func:`blosc2.open`.  This matters for kernels where one backend produces
+measurably faster code — the optimised backend stays active for the lifetime of
+the table without any extra configuration::
+
+    t.add_generated_column(
+        "score",
+        values=blosc2.lazyudf(my_kernel, (t.col_a, t.col_b), jit_backend="cc"),
+    )
+
+When a computed result should become a stored snapshot rather than a live
+virtual column, use :meth:`CTable.materialize_computed_column` to convert it
+in place.
 
 .. autosummary::