Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6cbce5e
chore: upgrade to zarr v3 sharding default
ilan-gold Mar 18, 2026
a15cdca
fix: setting
ilan-gold Mar 18, 2026
de96c4d
fix: shard first
ilan-gold Mar 18, 2026
fe98c53
Merge branch 'main' into ig/zarr_v3_shard_default
ilan-gold Mar 25, 2026
1edeaed
fix: reverse check order
ilan-gold Apr 1, 2026
ff3daf4
chore: relnote
ilan-gold Apr 1, 2026
47c6f21
fix: int as chunks
ilan-gold Apr 1, 2026
df1eedc
fix: check shard v3 for default
ilan-gold Apr 1, 2026
d92c134
Merge branch 'main' into ig/zarr_v3_shard_default
ilan-gold Apr 1, 2026
ccf4936
fix: group check
ilan-gold Apr 1, 2026
33e4627
fix: why?
ilan-gold Apr 1, 2026
abdc4ba
fix: more 1d chunking
ilan-gold Apr 1, 2026
2dfac05
fix: consolidated metadata + default sharding
ilan-gold Apr 1, 2026
daf059e
fix: tests again
ilan-gold Apr 1, 2026
acc4169
fix: documented intention
ilan-gold Apr 1, 2026
fe4a9a6
fix: revert old change
ilan-gold Apr 1, 2026
fb65c93
fix: more validation removal
ilan-gold Apr 1, 2026
5976147
fix: more check
ilan-gold Apr 1, 2026
bba236b
fix: remove test
ilan-gold Apr 1, 2026
254d705
fix: oops!
ilan-gold Apr 1, 2026
534f5b5
fix: last ones!
ilan-gold Apr 1, 2026
bd77b1a
final one?
ilan-gold Apr 2, 2026
8998fce
fix: specify chunks
ilan-gold Apr 2, 2026
e2aecd5
fix: remove unneeded warnings
ilan-gold Apr 2, 2026
926667b
fix: structured still unstable
ilan-gold Apr 2, 2026
f47f04e
fix: fixedlenutf32 still unstable
ilan-gold Apr 2, 2026
3f8013a
Merge branch 'main' into ig/zarr_v3_shard_default
ilan-gold Apr 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/2368.feat.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Write zarr v3 with automatic sharding by default {user}`ilan-gold`
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,11 @@ filterwarnings_when_strict = [
"default:(Observation|Variable) names are not unique. To make them unique:UserWarning",
"default::scipy.sparse.SparseEfficiencyWarning",
"default::dask.array.core.PerformanceWarning",
"default:anndata will no longer support zarr v2:DeprecationWarning",
"default:Consolidated metadata is:UserWarning",
# https://github.com/zarr-developers/zarr-python/pull/3781
"default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning",
"default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning",
"default:Automatic shard shape inference is experimental",
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm trying to figure out whether this warning is real or not. We might not be able to turn on auto-sharding here without either filtering this warning or providing defaults.

"default:Writing zarr v2:UserWarning",
# TODO: Remove in conjunction with or before https://github.com/scverse/anndata/pull/1707
"default:.*will obey copy-on-write semantics:FutureWarning",
]
Expand Down
12 changes: 8 additions & 4 deletions src/anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,8 @@ def write_basic(
f.create_dataset(k, data=elem, shape=elem.shape, dtype=dtype, **dataset_kwargs)
else:
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
if f.metadata.zarr_format == 3:
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)
# see https://github.com/zarr-developers/zarr-python/discussions/2712
if isinstance(elem, ZarrArray | H5Array):
Expand Down Expand Up @@ -518,7 +519,8 @@ def write_basic_dask_dask_dense(
is_h5 = isinstance(f, H5Group)
if not is_h5:
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
if f.metadata.zarr_format == 3:
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
if is_h5:
g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
else:
Expand Down Expand Up @@ -602,7 +604,8 @@ def write_vlen_string_array_zarr(
filters, fill_value = None, None
if f.metadata.zarr_format == 2:
filters, fill_value = [VLenUTF8()], ""
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
if f.metadata.zarr_format == 3:
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
f.create_array(
k,
shape=elem.shape,
Expand Down Expand Up @@ -727,7 +730,8 @@ def write_sparse_compressed(
attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs
)
else:
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
if f.metadata.zarr_format == 3:
dataset_kwargs = zarr_v3_sharding(dataset_kwargs)
arr = g.create_array(
attr_name, shape=attr.shape, dtype=dtype, **dataset_kwargs
)
Expand Down
6 changes: 4 additions & 2 deletions src/anndata/_io/specs/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,9 +483,11 @@ def read_elem_lazy(

Reading a dense matrix from a zarr store lazily:

>>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"])
>>> adata.layers["dense"] = ad.experimental.read_elem_lazy(
... g["layers/dense"], chunks=(500, 500)
... )
>>> adata.layers["dense"]
dask.array<from-zarr, shape=(2700, 32738), dtype=float32, chunksize=(169, 2047), chunktype=numpy.ndarray>
dask.array<from-zarr, shape=(2700, 32738), dtype=float32, chunksize=(500, 500), chunktype=numpy.ndarray>

Making a new anndata object from on-disk, with custom chunks:

Expand Down
11 changes: 2 additions & 9 deletions src/anndata/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,21 +456,15 @@ def validate_zarr_write_format(format: int, settings: SettingsManager):
if format not in {2, 3}:
msg = "non-v2 zarr on-disk format not supported"
raise ValueError(msg)
if format == 2 and getattr(settings, "auto_shard_zarr_v3", False):
msg = "Cannot set `zarr_write_format` to 2 with autosharding on. Please set to `False` `anndata.settings.auto_shard_zarr_v3`"
raise ValueError(msg)


def validate_zarr_sharding(auto_shard: bool, settings: SettingsManager): # noqa: FBT001
validate_bool(auto_shard, settings)
if auto_shard and settings.zarr_write_format == 2:
msg = "Cannot shard v2 format data. Please set `anndata.settings.zarr_write_format` to 3."
raise ValueError(msg)


settings.register(
"zarr_write_format",
default_value=2,
default_value=3,
description="Which version of zarr to write to when anndata must internally open a write-able zarr group.",
validate=validate_zarr_write_format,
get_from_env=lambda name, default: check_and_get_environ_var(
Expand Down Expand Up @@ -517,13 +511,12 @@ def validate_sparse_settings(val: Any, settings: SettingsManager) -> None:

settings.register(
"auto_shard_zarr_v3",
default_value=False,
default_value=True,
description="Whether or not to use zarr's auto computation of sharding for v3. For v2 this setting will be ignored. The setting will apply to all calls to anndata's writing mechanism (write_zarr / write_elem) and will **not** override any user-defined kwargs for shards.",
validate=validate_zarr_sharding,
get_from_env=check_and_get_bool,
)


settings.register(
"copy_on_write_X",
default_value=False,
Expand Down
4 changes: 2 additions & 2 deletions src/anndata/_settings.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ class _AnnDataSettingsManager(SettingsManager):
check_uniqueness: bool = True
copy_on_write_X: bool = False
allow_write_nullable_strings: bool | None = None
zarr_write_format: Literal[2, 3] = 2
zarr_write_format: Literal[2, 3] = 3
use_sparse_array_on_read: bool = False
min_rows_for_chunked_h5_copy: int = 1000
disallow_forward_slash_in_h5ad: bool = False
write_csr_csc_indices_with_min_possible_dtype: bool = False
auto_shard_zarr_v3: bool = False
auto_shard_zarr_v3: bool = True

settings: _AnnDataSettingsManager
5 changes: 3 additions & 2 deletions src/anndata/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,10 +1278,11 @@ def visititems_zarr(
visitor(key, maybe_group)


def check_all_sharded(g: ZarrGroup):
def check_all_sharded_v3(g: ZarrGroup):
def visit(key: str, arr: zarr.Array | zarr.Group):
# Check for recarray via https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
assert arr.metadata.zarr_format == 3
if isinstance(arr, zarr.Array) and arr.shape != () and arr.dtype.names is None:
assert arr.shards is not None
assert arr.shards is not None, arr

visititems_zarr(g, visitor=visit)
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def diskfmt(
if (fmt := request.param[0]) == "h5ad":
yield fmt
else:
with ad.settings.override(zarr_write_format=request.param[1]):
with ad.settings.override(
auto_shard_zarr_v3=request.param[1] == 3, zarr_write_format=request.param[1]
):
yield fmt


Expand Down
32 changes: 24 additions & 8 deletions tests/lazy/test_read.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

import json
from importlib.util import find_spec
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
Expand All @@ -9,6 +11,7 @@
import zarr

from anndata import AnnData
from anndata._settings import settings
from anndata.compat import DaskArray
from anndata.experimental import read_elem_lazy, read_lazy
from anndata.experimental.backed._io import ANNDATA_ELEMS
Expand All @@ -23,7 +26,7 @@

if TYPE_CHECKING:
from collections.abc import Callable
from pathlib import Path
from typing import Literal

from anndata._types import AnnDataElem

Expand Down Expand Up @@ -184,18 +187,31 @@ def test_view_of_view_to_memory(adata_remote: AnnData, adata_orig: AnnData):


@pytest.mark.zarr_io
def test_unconsolidated(tmp_path: Path, mtx_format):
adata = gen_adata((10, 10), mtx_format, **GEN_ADATA_NO_XARRAY_ARGS)
@pytest.mark.parametrize("zarr_version", [2, 3])
def test_unconsolidated(tmp_path: Path, zarr_version: Literal[2, 3]):
settings.zarr_write_format = zarr_version
adata = gen_adata((10, 10), **GEN_ADATA_NO_XARRAY_ARGS)
orig_pth = tmp_path / "orig.zarr"
adata.write_zarr(orig_pth)
(orig_pth / ".zmetadata").unlink()
if zarr_version == 2:
(orig_pth / ".zmetadata").unlink()
else:
z = zarr.open(orig_pth)
metadata = z.metadata.to_dict()
del metadata["consolidated_metadata"]
with Path.open(orig_pth / "zarr.json", mode="w") as f:
f.write(json.dumps(metadata))
store = AccessTrackingStore(orig_pth, read_only=True)
store.initialize_key_trackers(["obs/.zgroup", ".zgroup"])
store.initialize_key_trackers(
["obs/.zgroup"] if zarr_version == 2 else ["obs/zarr.json"]
)
with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"):
remote = read_lazy(store)
remote_to_memory = remote.to_memory()
assert_equal(remote_to_memory, adata)
store.assert_access_count("obs/.zgroup", 1)
store.assert_access_count(
f"obs/{'.zgroup' if zarr_version == 2 else 'zarr.json'}", 1
)


@pytest.mark.zarr_io
Expand Down Expand Up @@ -224,8 +240,8 @@ def test_h5_file_obj(tmp_path: Path):
def df_group(tmp_path_factory) -> zarr.Group:
df = gen_typed_df(120)
path = tmp_path_factory.mktemp("foo.zarr")
g = zarr.open_group(path, mode="w", zarr_format=2)
write_elem(g, "foo", df, dataset_kwargs={"chunks": 25})
g = zarr.open_group(path, mode="w")
write_elem(g, "foo", df, dataset_kwargs={"chunks": (25,)})
return zarr.open(path, mode="r")["foo"]


Expand Down
5 changes: 2 additions & 3 deletions tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,9 @@ def test_lazy_array_cache(
a_disk[3:5]
a_disk[6:7]
a_disk[8:9]
# Three hits for metadata in zarr v3:
# see https://github.com/zarr-developers/zarr-python/discussions/2760 for more info on the difference.
# 1 hit for metadata in zarr v3 for zarr.json:
# Then there is actual data access, 1 more when cached, 4 more otherwise.
c_expected = 4 if should_cache_indptr else 7
c_expected = 2 if should_cache_indptr else 5
assert store.get_access_count("X/indptr") == c_expected
for elem_not_indptr in elems - {"indptr"}:
assert (
Expand Down
4 changes: 2 additions & 2 deletions tests/test_concatenate_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from anndata._core.merge import _resolve_axis
from anndata.experimental.merge import as_group, concat_on_disk
from anndata.io import read_elem, write_elem
from anndata.tests.helpers import assert_equal, check_all_sharded, gen_adata
from anndata.tests.helpers import assert_equal, check_all_sharded_v3, gen_adata
from anndata.utils import asarray

if TYPE_CHECKING:
Expand Down Expand Up @@ -269,7 +269,7 @@ def test_concatenate_zarr_v3_shard(xxxm_adatas, tmp_path):
g = zarr.open(tmp_path)
assert g.metadata.zarr_format == 3

check_all_sharded(g)
check_all_sharded_v3(g)


def test_singleton(xxxm_adatas, tmp_path, file_format):
Expand Down
13 changes: 8 additions & 5 deletions tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
import pytest
import zarr

import anndata as ad
from anndata._core.anndata import AnnData
Expand All @@ -25,7 +26,7 @@
as_sparse_dask_array,
as_sparse_dask_matrix,
assert_equal,
check_all_sharded,
check_all_sharded_v3,
gen_adata,
)

Expand Down Expand Up @@ -127,8 +128,6 @@ def test_dask_distributed_write(
*,
auto_shard_zarr_v3: bool,
) -> None:
if auto_shard_zarr_v3 and ad.settings.zarr_write_format == 2:
pytest.skip(reason="Cannot shard v2 data")
import dask.array as da
import dask.distributed as dd
import numpy as np
Expand All @@ -144,8 +143,12 @@ def test_dask_distributed_write(
ad.io.write_elem(g, "", orig)
# TODO: See https://github.com/zarr-developers/zarr-python/issues/2716
with as_group(pth, mode="r") as g:
if auto_shard_zarr_v3:
check_all_sharded(g)
if (
auto_shard_zarr_v3
and ad.settings.zarr_write_format == 3
and isinstance(g, zarr.Group)
):
check_all_sharded_v3(g)
curr = ad.io.read_elem(g)

with pytest.raises(AssertionError):
Expand Down
38 changes: 1 addition & 37 deletions tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
as_cupy_sparse_dask_array,
as_dense_cupy_dask_array,
assert_equal,
check_all_sharded,
gen_adata,
visititems_zarr,
)
Expand Down Expand Up @@ -857,7 +856,7 @@ def test_chunking_1d_array(
chunks: tuple[int] | None,
expected_chunks: tuple[int],
):
write_elem(store, "foo", arr, dataset_kwargs={"chunks": 25})
write_elem(store, "foo", arr, dataset_kwargs={"chunks": (25,)})
arr = read_elem_lazy(store["foo"], chunks=chunks)
assert arr.chunksize == expected_chunks

Expand Down Expand Up @@ -912,41 +911,6 @@ def test_h5_unchunked(
assert arr.chunksize == expected_chunks


@pytest.mark.zarr_io
@pytest.mark.parametrize(
"override",
[
{"auto_shard_zarr_v3": True, "zarr_write_format": 3},
{"zarr_write_format": 3, "auto_shard_zarr_v3": True},
],
ids=["shard_first", "write_format_first"],
)
def test_write_auto_sharded(tmp_path: Path, override: dict):
path = tmp_path / "check.zarr"
adata = gen_adata((1000, 100), **GEN_ADATA_NO_XARRAY_ARGS)
with ad.settings.override(**override):
adata.write_zarr(path)

check_all_sharded(zarr.open(path))


@pytest.mark.zarr_io
def test_write_auto_sharded_against_v2_format():
with pytest.raises(ValueError, match=r"Cannot shard v2 format data."): # noqa: PT012, SIM117
with ad.settings.override(zarr_write_format=2):
with ad.settings.override(auto_shard_zarr_v3=True):
pass


@pytest.mark.zarr_io
def test_write_auto_cannot_set_v2_format_after_sharding():
with pytest.raises(ValueError, match=r"Cannot set `zarr_write_format` to 2"): # noqa: PT012, SIM117
with ad.settings.override(zarr_write_format=3):
with ad.settings.override(auto_shard_zarr_v3=True):
with ad.settings.override(zarr_write_format=2):
pass


@pytest.mark.zarr_io
def test_write_auto_sharded_does_not_override(tmp_path: Path):
z = open_write_group(tmp_path / "arr.zarr", zarr_format=3)
Expand Down
17 changes: 16 additions & 1 deletion tests/test_readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
_read_attr,
)
from anndata.tests.helpers import (
DEFAULT_KEY_TYPES,
GEN_ADATA_NO_XARRAY_ARGS,
as_dense_dask_array,
assert_equal,
check_all_sharded_v3,
gen_adata,
jnp,
jnp_array_or_idempotent,
Expand Down Expand Up @@ -410,6 +412,8 @@ def check_compressed(key, value):
def test_zarr_compression(
tmp_path: Path, zarr_write_format: Literal[2, 3], *, use_compression: bool
):
if zarr_write_format == 2:
ad.settings.auto_shard_zarr_v3 = False
ad.settings.zarr_write_format = zarr_write_format
pth = str(Path(tmp_path) / "adata.zarr")
adata = gen_adata((10, 8), **GEN_ADATA_NO_XARRAY_ARGS)
Expand Down Expand Up @@ -902,6 +906,17 @@ def test_io_dtype(tmp_path, diskfmt, dtype, roundtrip):
assert curr.X.dtype == dtype


def test_zarr_v3_sharded_default(tmp_path):
    """Check that `write_zarr` under the current settings yields a sharded zarr v3 store.

    No settings are overridden here: the store is written as-is and then
    handed to `check_all_sharded_v3` for verification.
    """
    store_path = tmp_path / "adata.zarr"

    key_types = {"obsm_types": DEFAULT_KEY_TYPES, "varm_types": DEFAULT_KEY_TYPES}
    adata = gen_adata((10, 20), **key_types)
    adata.write_zarr(store_path)

    written_group = zarr.open(store_path)
    check_all_sharded_v3(written_group)


def test_h5py_attr_limit(tmp_path):
N = 10_000
a = ad.AnnData(np.ones((5, 10)))
Expand Down Expand Up @@ -972,7 +987,7 @@ def test_write_elem_version_mismatch(tmp_path: Path):
g = zarr.open_group(
zarr_path,
mode="w",
zarr_format=2 if ad.settings.zarr_write_format == 3 else 3,
zarr_format=2,
)
ad.io.write_elem(g, "/", adata)
adata_roundtripped = ad.read_zarr(g)
Expand Down
Loading