Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions bench/ctable/query-backends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Benchmark: CTable.where() across three evaluation backends.

Tests how performance scales with table size (10M–500M rows) for the query:
    (tips > 477) & (km > 190) & (lon < -140)

Backends:
interpreted : miniexpr bytecode interpreter (default, no JIT)
tcc : Tiny C Compiler JIT (fast compile, modest code quality)
cc : system C compiler JIT (clang/gcc, -O3 + auto-vectorisation)

Two timings are shown per backend:
cold – first call, includes JIT compilation cost for tcc/cc
warm – second call, kernel is cached (shared library already loaded)
"""

import os
import sys
import time
from dataclasses import dataclass

import numpy as np

import blosc2


# Row counts of the tables to benchmark, in increasing order.
SIZES = [millions * 1_000_000 for millions in (10, 50, 100, 200, 500)]

# Rows generated per extend() call; keeps the temporary NumPy arrays small.
BUILD_CHUNK = 10 * 1_000_000

# (display name, BLOSC_ME_JIT value) pairs. A value of None means the
# environment variable is left unset while the query runs.
BACKENDS = [("interpreted", None), ("tcc", "tcc"), ("cc", "cc")]

# Structured dtype of the synthetic rows: one int32, one bool and three
# float32 columns.
NP_DTYPE = np.dtype(
    [
        ("passenger_count", np.int32),
        ("shared", np.bool_),
        *((column, np.float32) for column in ("tips", "km", "lon")),
    ]
)


@dataclass
class Row:
    # Row schema handed to blosc2.CTable in build_table(). The fields mirror
    # NP_DTYPE: same names, same order, and int32 / bool / float32 element
    # types, which is the layout of the arrays that make_chunk() produces.
    passenger_count: int = blosc2.field(blosc2.int32())
    shared: bool = blosc2.field(blosc2.bool())
    # The three float32 columns below are the ones used by the where() filter.
    tips: float = blosc2.field(blosc2.float32())
    km: float = blosc2.field(blosc2.float32())
    lon: float = blosc2.field(blosc2.float32())


def make_chunk(n: int, rng: np.random.Generator) -> np.ndarray:
    """Return ``n`` random rows as a structured array of dtype ``NP_DTYPE``.

    Parameters
    ----------
    n:
        Number of rows to generate.
    rng:
        Generator that supplies every random draw. Columns are drawn in the
        order passenger_count, shared, tips, km, lon.

    Returns
    -------
    np.ndarray
        One-dimensional structured array of length ``n``.
    """
    rows = np.empty(n, dtype=NP_DTYPE)
    rows["passenger_count"] = rng.integers(1, 7, n, dtype=np.int32)
    rows["shared"] = rng.integers(0, 2, n, dtype=np.bool_)

    # Integer-valued floats: mantissa low bytes are zero → high compression ratio.
    # Each entry is (column, low, high) with ``high`` exclusive, giving
    # 501, 211 and 201 distinct values respectively.
    float_ranges = (("tips", 0, 501), ("km", -10, 201), ("lon", -150, 51))
    for column, low, high in float_ranges:
        rows[column] = rng.integers(low, high, n).astype(np.float32)
    return rows


def build_table(n_rows: int, rng: np.random.Generator) -> blosc2.CTable:
    """Create a ``CTable`` with the ``Row`` schema holding ``n_rows`` random rows.

    Rows are generated by ``make_chunk()`` and appended with ``extend()`` in
    batches of at most ``BUILD_CHUNK`` rows, so no single temporary array is
    larger than one batch.

    Parameters
    ----------
    n_rows:
        Total number of rows to insert; also passed as ``expected_size``.
    rng:
        Random generator forwarded to ``make_chunk()``.

    Returns
    -------
    blosc2.CTable
        The populated table.
    """
    table = blosc2.CTable(Row, expected_size=n_rows)
    for start in range(0, n_rows, BUILD_CHUNK):
        # The final batch may be shorter than BUILD_CHUNK.
        batch = min(BUILD_CHUNK, n_rows - start)
        table.extend(make_chunk(batch, rng))
    return table


def run_where(ct: blosc2.CTable, blosc_me_jit: str | None) -> tuple[float, int]:
    """Time a single ``ct.where()`` call under a given ``BLOSC_ME_JIT`` setting.

    While the query runs, the ``BLOSC_ME_JIT`` environment variable is set to
    ``blosc_me_jit``, or removed when ``blosc_me_jit`` is ``None``. The value
    it had on entry (including "not set") is put back before returning, even
    if the query raises.

    The filter is ``(tips > 477) & (km > 190) & (lon < -140)``. Against the
    integer-valued columns produced by ``make_chunk()`` each predicate keeps
    roughly 5% of the rows:
        tips in 0..500   : tips > 477   keeps 23 of 501 values ≈ 4.6%
        km   in -10..200 : km > 190     keeps 10 of 211 values ≈ 4.7%
        lon  in -150..50 : lon < -140   keeps 10 of 201 values ≈ 5.0%
    Combined: 0.046 * 0.047 * 0.050 ≈ 0.011%

    Parameters
    ----------
    ct:
        Table to query; must expose ``tips``, ``km`` and ``lon`` columns.
    blosc_me_jit:
        Value for ``BLOSC_ME_JIT``, or ``None`` to run with it unset.

    Returns
    -------
    tuple[float, int]
        Elapsed wall-clock seconds of the ``where()`` call alone (building the
        condition is not timed) and ``len()`` of its result.
    """
    previous = os.environ.get("BLOSC_ME_JIT")
    try:
        if blosc_me_jit is None:
            os.environ.pop("BLOSC_ME_JIT", None)
        else:
            os.environ["BLOSC_ME_JIT"] = blosc_me_jit

        # Build the lazy condition before starting the clock.
        condition = (ct.tips > 477) & (ct.km > 190) & (ct.lon < -140)

        started = time.perf_counter()
        result = ct.where(condition)
        finished = time.perf_counter()
        return finished - started, len(result)
    finally:
        # Put the environment back exactly as the caller had it.
        if previous is None:
            os.environ.pop("BLOSC_ME_JIT", None)
        else:
            os.environ["BLOSC_ME_JIT"] = previous


def fmt_row(n: int, timings: list[tuple[float, float]], n_matched: int) -> str:
    """Format one result line of the benchmark table.

    Parameters
    ----------
    n:
        Row count of the table, shown right-aligned in 12 columns with
        thousands separators.
    timings:
        One ``(cold, warm)`` pair of seconds per backend, each value shown
        with three decimals in an 8-column field.
    n_matched:
        Number of rows matched by the query, shown in the trailing cell.

    Returns
    -------
    str
        All cells joined by ``" | "``.
    """
    timing_cells = [f"{cold:>8.3f} {warm:>8.3f}" for cold, warm in timings]
    return " | ".join([f"{n:>12,}", *timing_cells, f" ({n_matched:,} matched)"])


def main():
    """Run the benchmark for every size in ``SIZES`` and print a results table.

    For each size a fresh table is built, then every backend in ``BACKENDS``
    is queried twice in a row: the first call is reported as "cold" and the
    second as "warm". The match count printed on each line comes from the
    cold run of the last backend.
    """
    rng = np.random.default_rng(42)

    # Header
    backend_header = " | ".join([f"{'--- ' + name + ' ---':>17}" for name, _ in BACKENDS])
    subheader = " | ".join([f"{'cold(s)':>8} {'warm(s)':>8}"] * len(BACKENDS))
    print(f"\n{'':>12} | {backend_header}")
    print(f"{'rows':>12} | {subheader}")
    print("-" * (14 + 19 * len(BACKENDS)))

    for n in SIZES:
        print(f" building {n:,} rows...", end=" ", flush=True)
        ct = build_table(n, rng)
        print("done", flush=True)

        timings = []
        n_matched = None
        for _, jit in BACKENDS:
            # Two back-to-back calls: the first pays any compilation cost.
            cold, n_matched = run_where(ct, jit)
            warm = run_where(ct, jit)[0]
            timings.append((cold, warm))

        print(fmt_row(n, timings, n_matched), flush=True)

        del ct  # free memory before building the next (larger) table

    print(
        "",
        "cold = first call (includes JIT compilation for tcc/cc)",
        "warm = second call (kernel cached, compilation cost amortised)",
        sep="\n",
    )


if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions doc/getting_started/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ container objects in Python-Blosc2:
`buffer protocol <https://docs.python.org/3/c-api/buffer.html>`_.
* ``NDArray``: An N-Dimensional store that mirrors the NumPy API, enhanced with
efficient compressed data storage.
* ``CTable``: A columnar table for structured, record-oriented data with a
powerful query engine built on top of compressed ``NDArray`` columns.

These containers are described in more detail below.

Expand Down Expand Up @@ -257,6 +259,96 @@ parameter in ``compute()`` or ``sum()`` functions). For a more in-depth look at
this example, with performance comparisons, see this
`compute-bigger blog post <https://ironarray.io/blog/compute-bigger>`_.

Querying Columnar Data with CTable
===================================

``CTable`` is Python-Blosc2's columnar store for structured, record-oriented
data. Each column is a compressed ``NDArray``, so the same chunking,
compression, and compute-engine machinery that powers ``NDArray`` expressions
is available for tabular queries — with no data copy required.

Schemas are defined with plain Python dataclasses, supporting a rich mix of
types including integers, floats, booleans, and strings:

.. code-block:: python

from dataclasses import dataclass
import blosc2


@dataclass
class Row:
passenger_count: int = blosc2.field(blosc2.int32())
shared: bool = blosc2.field(blosc2.bool())
tips: float = blosc2.field(blosc2.float32())
km: float = blosc2.field(blosc2.float32())
lon: float = blosc2.field(blosc2.float32())
company: str = blosc2.field(blosc2.string(max_length=50))


t = blosc2.CTable(Row, expected_size=10_000_000)

Columns support the full lazy-expression syntax, so compound boolean filters
are written naturally and evaluated in a single pass over the compressed data:

.. code-block:: python

condition = (t.tips > 100) & (t.km > 0) & (t.lon < -10)
result = t.where(condition).sort_by("km")

Beyond filtering and sorting, ``CTable`` offers:

* **Aggregations and group-by** — ``groupby()``, ``sum()``, ``mean()``,
``min()``, ``max()``, ``std()`` and more, optionally with a ``where=``
mask for conditional aggregation.
* **Computed and generated columns** — columns whose values are derived from
  other columns via a lazy expression. Computed columns are evaluated on the
  fly without storing extra data; generated columns are stored physically and
  can therefore be indexed.
* **Automatic SUMMARY indexes** — per-block min/max indexes built
transparently at write time, enabling ``where()`` to skip entire blocks
that cannot contain matching rows, dramatically reducing I/O for
high-selectivity queries.
* **Schema validation** — type and constraint checking (``ge=``, ``le=``,
nullable, etc.) enforced at insert time, keeping data quality guarantees
inside the table itself.
* **Null handling** — first-class nullable columns with ``notnull()``,
``null_count``, and null-aware aggregations.
* **Nested field paths** — hierarchically structured schemas expose columns
as ``t.payment.tips``, ``t.trip.begin.lon``, etc., keeping query code
readable even for wide, deeply nested records.
* **Parquet and Arrow round-trips** — load from and save to Parquet or Apache
Arrow with a single call, making it easy to interoperate with the broader
data ecosystem.
* **Persistent storage** — open and save tables to disk (``CTable.open()``,
``CTable.save()``); in-memory and on-disk tables share the same API.

.. code-block:: python

# Load from Parquet, filter, and persist the result
t = blosc2.CTable.from_parquet("trips.parquet")
result = t.where((t.tips > 100) & (t.km > 0)).sort_by("km")
result.save("filtered_trips.b2z")

.. tip::

**Free ~30% speedup for large tables:** set the ``BLOSC_ME_JIT=cc``
environment variable to have filter expressions JIT-compiled by the system C
compiler (clang/gcc) with ``-O3`` and auto-vectorisation, instead of the
default bytecode interpreter. The compiled kernel is cached on disk so
subsequent runs pay no compilation cost.

.. code-block:: bash

BLOSC_ME_JIT=cc python my_script.py

Benchmarks on tables from 50 M to 500 M rows show a consistent ~30%
speedup across Intel, AMD, and Apple Silicon hardware. The one-time
compilation cost on Linux (gcc, ~30 ms) is negligible; on macOS (clang,
~400 ms) it is only worth paying for large tables or repeated queries.
For small tables (< ~50 M rows) the default bytecode interpreter is
perfectly adequate. See the :py:meth:`blosc2.LazyArray.compute` docstring
for the full list of ``BLOSC_ME_JIT`` values and options.

Hopefully, this overview has provided a good understanding of Python-Blosc2's
capabilities. To begin your journey with Python-Blosc2, proceed to the
`installation instructions <installation>`_. Then explore the
Expand Down
50 changes: 40 additions & 10 deletions doc/reference/ctable.rst
Original file line number Diff line number Diff line change
Expand Up @@ -315,16 +315,46 @@ Mutations
---------

In addition to physical schema changes such as :meth:`CTable.add_column`,
CTables can host **computed columns** backed by a lazy expression over stored
columns. Computed columns are read-only, use no extra storage, participate in
display, filtering, sorting, and aggregates, and are persisted across
:meth:`CTable.save`, :meth:`CTable.load`, and :meth:`CTable.open`.

When a computed result should become a normal stored column, use
:meth:`CTable.materialize_computed_column`. The materialized column is a stored
snapshot that can be indexed like any other stored column. New rows inserted
later via :meth:`CTable.append` or :meth:`CTable.extend` auto-fill omitted
materialized-column values from the recorded expression metadata.
CTables support two kinds of derived columns:

**Computed columns** (:meth:`CTable.add_computed_column`) are purely virtual —
they use no extra storage, are evaluated on demand, and are read-only. They
participate in display, filtering, sorting, and aggregates, and are persisted
across save/open round-trips. Because they have no physical storage, they
**cannot be indexed**.

**Generated columns** (:meth:`CTable.add_generated_column`) are physically
stored. Their values are computed once and written to disk; new rows appended
later are filled in automatically. Because the data is real, generated
columns **can be indexed** with :meth:`CTable.create_index`, which makes
``where()`` queries on them fast.

**Practical rule**: use a computed column when you just need a derived value
available for display, export, or occasional reads. Use a generated column
(optionally with ``create_index=True``) when you need to filter or sort by a
derived value frequently — the index pays for itself after the first few
queries.

Both forms accept plain expression strings, :func:`blosc2.dsl_kernel`-decorated
functions, and :class:`blosc2.LazyUDF` objects. DSL kernels support full Python
control flow (``if``/``else``, ``where()``, loops) and have their source
persisted and recompiled on open.
Comment thread
FrancescAlted marked this conversation as resolved.

When passing a :class:`blosc2.LazyUDF` built with an explicit ``jit_backend=``
(e.g. ``jit_backend="cc"`` to use the system C compiler instead of the default
TCC), that choice is persisted in the column metadata and automatically restored
on :func:`blosc2.open`. This matters for kernels where one backend produces
measurably faster code — the optimised backend stays active for the lifetime of
the table without any extra configuration::

t.add_generated_column(
"score",
values=blosc2.lazyudf(my_kernel, (t.col_a, t.col_b), jit_backend="cc"),
)

When a computed result should become a stored snapshot rather than a live
virtual column, use :meth:`CTable.materialize_computed_column` to convert it
in place.

.. autosummary::

Expand Down
Loading
Loading