diff --git a/docs/release-notes/2368.feat.md b/docs/release-notes/2368.feat.md new file mode 100644 index 000000000..7bc30bab6 --- /dev/null +++ b/docs/release-notes/2368.feat.md @@ -0,0 +1 @@ +Write zarr sharding + v3 by default {user}`ilan-gold` diff --git a/pyproject.toml b/pyproject.toml index 173c2ccaf..c403323b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,12 +171,11 @@ filterwarnings_when_strict = [ "default:(Observation|Variable) names are not unique. To make them unique:UserWarning", "default::scipy.sparse.SparseEfficiencyWarning", "default::dask.array.core.PerformanceWarning", - "default:anndata will no longer support zarr v2:DeprecationWarning", "default:Consolidated metadata is:UserWarning", + # https://github.com/zarr-developers/zarr-python/pull/3781 "default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning", "default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning", "default:Automatic shard shape inference is experimental", - "default:Writing zarr v2:UserWarning", # TODO: Remove in conjunction with or before https://github.com/scverse/anndata/pull/1707 "default:.*will obey copy-on-write semantics:FutureWarning", ] diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 43b084a00..d818bd50d 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -438,7 +438,8 @@ def write_basic( f.create_dataset(k, data=elem, shape=elem.shape, dtype=dtype, **dataset_kwargs) else: dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) - dataset_kwargs = zarr_v3_sharding(dataset_kwargs) + if f.metadata.zarr_format == 3: + dataset_kwargs = zarr_v3_sharding(dataset_kwargs) f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs) # see https://github.com/zarr-developers/zarr-python/discussions/2712 if isinstance(elem, ZarrArray | H5Array): @@ -518,7 +519,8 @@ def write_basic_dask_dask_dense( is_h5 = isinstance(f, H5Group) if not is_h5: dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) - dataset_kwargs = zarr_v3_sharding(dataset_kwargs) + if f.metadata.zarr_format == 3: + dataset_kwargs = zarr_v3_sharding(dataset_kwargs) if is_h5: g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) else: @@ -602,7 +604,8 @@ def write_vlen_string_array_zarr( filters, fill_value = None, None if f.metadata.zarr_format == 2: filters, fill_value = [VLenUTF8()], "" - dataset_kwargs = zarr_v3_sharding(dataset_kwargs) + if f.metadata.zarr_format == 3: + dataset_kwargs = zarr_v3_sharding(dataset_kwargs) f.create_array( k, shape=elem.shape, @@ -727,7 +730,8 @@ def write_sparse_compressed( attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs ) else: - dataset_kwargs = zarr_v3_sharding(dataset_kwargs) + if f.metadata.zarr_format == 3: + dataset_kwargs = zarr_v3_sharding(dataset_kwargs) arr = g.create_array( attr_name, shape=attr.shape, dtype=dtype, **dataset_kwargs ) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 51726e4e2..fbb0421ea 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -483,9 +483,11 @@ def read_elem_lazy( Reading a dense matrix from a zarr store lazily: - >>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"]) + >>> adata.layers["dense"] = ad.experimental.read_elem_lazy( + ... g["layers/dense"], chunks=(500, 500) + ... ) >>> adata.layers["dense"] - dask.array + dask.array Making a new anndata object from on-disk, with custom chunks: diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index 8c569cbc1..f064a2fcc 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -456,21 +456,15 @@ def validate_zarr_write_format(format: int, settings: SettingsManager): if format not in {2, 3}: msg = "non-v2 zarr on-disk format not supported" raise ValueError(msg) - if format == 2 and getattr(settings, "auto_shard_zarr_v3", False): - msg = "Cannot set `zarr_write_format` to 2 with autosharding on. Please set to `False` `anndata.settings.auto_shard_zarr_v3`" - raise ValueError(msg) def validate_zarr_sharding(auto_shard: bool, settings: SettingsManager): # noqa: FBT001 validate_bool(auto_shard, settings) - if auto_shard and settings.zarr_write_format == 2: - msg = "Cannot shard v2 format data. Please set `anndata.settings.zarr_write_format` to 3." - raise ValueError(msg) settings.register( "zarr_write_format", - default_value=2, + default_value=3, description="Which version of zarr to write to when anndata must internally open a write-able zarr group.", validate=validate_zarr_write_format, get_from_env=lambda name, default: check_and_get_environ_var( @@ -517,13 +511,12 @@ def validate_sparse_settings(val: Any, settings: SettingsManager) -> None: settings.register( "auto_shard_zarr_v3", - default_value=False, + default_value=True, description="Whether or not to use zarr's auto computation of sharding for v3. For v2 this setting will be ignored. The setting will apply to all calls to anndata's writing mechanism (write_zarr / write_elem) and will **not** override any user-defined kwargs for shards.", validate=validate_zarr_sharding, get_from_env=check_and_get_bool, ) - settings.register( "copy_on_write_X", default_value=False, diff --git a/src/anndata/_settings.pyi b/src/anndata/_settings.pyi index d55afdeda..c0bf5b3ac 100644 --- a/src/anndata/_settings.pyi +++ b/src/anndata/_settings.pyi @@ -41,11 +41,11 @@ class _AnnDataSettingsManager(SettingsManager): check_uniqueness: bool = True copy_on_write_X: bool = False allow_write_nullable_strings: bool | None = None - zarr_write_format: Literal[2, 3] = 2 + zarr_write_format: Literal[2, 3] = 3 use_sparse_array_on_read: bool = False min_rows_for_chunked_h5_copy: int = 1000 disallow_forward_slash_in_h5ad: bool = False write_csr_csc_indices_with_min_possible_dtype: bool = False - auto_shard_zarr_v3: bool = False + auto_shard_zarr_v3: bool = True settings: _AnnDataSettingsManager diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 784951da9..8813c1f9f 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1278,10 +1278,11 @@ def visititems_zarr( visitor(key, maybe_group) -def check_all_sharded(g: ZarrGroup): +def check_all_sharded_v3(g: ZarrGroup): def visit(key: str, arr: zarr.Array | zarr.Group): # Check for recarray via https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes + assert arr.metadata.zarr_format == 3 if isinstance(arr, zarr.Array) and arr.shape != () and arr.dtype.names is None: - assert arr.shards is not None + assert arr.shards is not None, arr visititems_zarr(g, visitor=visit) diff --git a/tests/conftest.py b/tests/conftest.py index bfc37a7b4..e2379c766 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,7 +43,9 @@ def diskfmt( if (fmt := request.param[0]) == "h5ad": yield fmt else: - with ad.settings.override(zarr_write_format=request.param[1]): + with ad.settings.override( + auto_shard_zarr_v3=request.param[1] == 3, zarr_write_format=request.param[1] + ): yield fmt diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index c8e3a3104..77e2ce81f 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -1,6 +1,8 @@ from __future__ import annotations +import json from importlib.util import find_spec +from pathlib import Path from typing import TYPE_CHECKING import numpy as np @@ -9,6 +11,7 @@ import zarr from anndata import AnnData +from anndata._settings import settings from anndata.compat import DaskArray from anndata.experimental import read_elem_lazy, read_lazy from anndata.experimental.backed._io import ANNDATA_ELEMS @@ -23,7 +26,7 @@ if TYPE_CHECKING: from collections.abc import Callable - from pathlib import Path + from typing import Literal from anndata._types import AnnDataElem @@ -184,18 +187,31 @@ def test_view_of_view_to_memory(adata_remote: AnnData, adata_orig: AnnData): @pytest.mark.zarr_io -def test_unconsolidated(tmp_path: Path, mtx_format): - adata = gen_adata((10, 10), mtx_format, **GEN_ADATA_NO_XARRAY_ARGS) +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_unconsolidated(tmp_path: Path, zarr_version: Literal[2, 3]): + settings.zarr_write_format = zarr_version + adata = gen_adata((10, 10), **GEN_ADATA_NO_XARRAY_ARGS) orig_pth = tmp_path / "orig.zarr" adata.write_zarr(orig_pth) - (orig_pth / ".zmetadata").unlink() + if zarr_version == 2: + (orig_pth / ".zmetadata").unlink() + else: + z = zarr.open(orig_pth) + metadata = z.metadata.to_dict() + del metadata["consolidated_metadata"] + with Path.open(orig_pth / "zarr.json", mode="w") as f: + f.write(json.dumps(metadata)) store = AccessTrackingStore(orig_pth, read_only=True) - store.initialize_key_trackers(["obs/.zgroup", ".zgroup"]) + store.initialize_key_trackers( + ["obs/.zgroup"] if zarr_version == 2 else ["obs/zarr.json"] + ) with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"): remote = read_lazy(store) remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) - store.assert_access_count("obs/.zgroup", 1) + store.assert_access_count( + f"obs/{'.zgroup' if zarr_version == 2 else 'zarr.json'}", 1 + ) @pytest.mark.zarr_io @@ -224,8 +240,8 @@ def test_h5_file_obj(tmp_path: Path): def df_group(tmp_path_factory) -> zarr.Group: df = gen_typed_df(120) path = tmp_path_factory.mktemp("foo.zarr") - g = zarr.open_group(path, mode="w", zarr_format=2) - write_elem(g, "foo", df, dataset_kwargs={"chunks": 25}) + g = zarr.open_group(path, mode="w") + write_elem(g, "foo", df, dataset_kwargs={"chunks": (25,)}) return zarr.open(path, mode="r")["foo"] diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 3112dd75e..5ff7a2328 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -395,10 +395,9 @@ def test_lazy_array_cache( a_disk[3:5] a_disk[6:7] a_disk[8:9] - # Three hits for metadata in zarr v3: - # see https://github.com/zarr-developers/zarr-python/discussions/2760 for more info on the difference. + # 1 hit for metadata in zarr v3 for zarr.json: # Then there is actual data access, 1 more when cached, 4 more otherwise. - c_expected = 4 if should_cache_indptr else 7 + c_expected = 2 if should_cache_indptr else 5 assert store.get_access_count("X/indptr") == c_expected for elem_not_indptr in elems - {"indptr"}: assert ( diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index 194649166..5f8b4d190 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -16,7 +16,7 @@ from anndata._core.merge import _resolve_axis from anndata.experimental.merge import as_group, concat_on_disk from anndata.io import read_elem, write_elem -from anndata.tests.helpers import assert_equal, check_all_sharded, gen_adata +from anndata.tests.helpers import assert_equal, check_all_sharded_v3, gen_adata from anndata.utils import asarray if TYPE_CHECKING: @@ -269,7 +269,7 @@ def test_concatenate_zarr_v3_shard(xxxm_adatas, tmp_path): g = zarr.open(tmp_path) assert g.metadata.zarr_format == 3 - check_all_sharded(g) + check_all_sharded_v3(g) def test_singleton(xxxm_adatas, tmp_path, file_format): diff --git a/tests/test_dask.py b/tests/test_dask.py index a88139c90..025ffa0d0 100644 --- a/tests/test_dask.py +++ b/tests/test_dask.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd import pytest +import zarr import anndata as ad from anndata._core.anndata import AnnData @@ -25,7 +26,7 @@ as_sparse_dask_array, as_sparse_dask_matrix, assert_equal, - check_all_sharded, + check_all_sharded_v3, gen_adata, ) @@ -127,8 +128,6 @@ def test_dask_distributed_write( *, auto_shard_zarr_v3: bool, ) -> None: - if auto_shard_zarr_v3 and ad.settings.zarr_write_format == 2: - pytest.skip(reason="Cannot shard v2 data") import dask.array as da import dask.distributed as dd import numpy as np @@ -144,8 +143,12 @@ def test_dask_distributed_write( ad.io.write_elem(g, "", orig) # TODO: See https://github.com/zarr-developers/zarr-python/issues/2716 with as_group(pth, mode="r") as g: - if auto_shard_zarr_v3: - check_all_sharded(g) + if ( + auto_shard_zarr_v3 + and ad.settings.zarr_write_format == 3 + and isinstance(g, zarr.Group) + ): + check_all_sharded_v3(g) curr = ad.io.read_elem(g) with pytest.raises(AssertionError): diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 7b4d4a356..b17b1f028 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -31,7 +31,6 @@ as_cupy_sparse_dask_array, as_dense_cupy_dask_array, assert_equal, - check_all_sharded, gen_adata, visititems_zarr, ) @@ -857,7 +856,7 @@ def test_chunking_1d_array( chunks: tuple[int] | None, expected_chunks: tuple[int], ): - write_elem(store, "foo", arr, dataset_kwargs={"chunks": 25}) + write_elem(store, "foo", arr, dataset_kwargs={"chunks": (25,)}) arr = read_elem_lazy(store["foo"], chunks=chunks) assert arr.chunksize == expected_chunks @@ -912,41 +911,6 @@ def test_h5_unchunked( assert arr.chunksize == expected_chunks -@pytest.mark.zarr_io -@pytest.mark.parametrize( - "override", - [ - {"auto_shard_zarr_v3": True, "zarr_write_format": 3}, - {"zarr_write_format": 3, "auto_shard_zarr_v3": True}, - ], - ids=["shard_first", "write_format_first"], -) -def test_write_auto_sharded(tmp_path: Path, override: dict): - path = tmp_path / "check.zarr" - adata = gen_adata((1000, 100), **GEN_ADATA_NO_XARRAY_ARGS) - with ad.settings.override(**override): - adata.write_zarr(path) - - check_all_sharded(zarr.open(path)) - - -@pytest.mark.zarr_io -def test_write_auto_sharded_against_v2_format(): - with pytest.raises(ValueError, match=r"Cannot shard v2 format data."): # noqa: PT012, SIM117 - with ad.settings.override(zarr_write_format=2): - with ad.settings.override(auto_shard_zarr_v3=True): - pass - - -@pytest.mark.zarr_io -def test_write_auto_cannot_set_v2_format_after_sharding(): - with pytest.raises(ValueError, match=r"Cannot set `zarr_write_format` to 2"): # noqa: PT012, SIM117 - with ad.settings.override(zarr_write_format=3): - with ad.settings.override(auto_shard_zarr_v3=True): - with ad.settings.override(zarr_write_format=2): - pass - - @pytest.mark.zarr_io def test_write_auto_sharded_does_not_override(tmp_path: Path): z = open_write_group(tmp_path / "arr.zarr", zarr_format=3) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 3359b2ff8..467b16b44 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -29,9 +29,11 @@ _read_attr, ) from anndata.tests.helpers import ( + DEFAULT_KEY_TYPES, GEN_ADATA_NO_XARRAY_ARGS, as_dense_dask_array, assert_equal, + check_all_sharded_v3, gen_adata, jnp, jnp_array_or_idempotent, @@ -410,6 +412,8 @@ def check_compressed(key, value): def test_zarr_compression( tmp_path: Path, zarr_write_format: Literal[2, 3], *, use_compression: bool ): + if zarr_write_format == 2: + ad.settings.auto_shard_zarr_v3 = False ad.settings.zarr_write_format = zarr_write_format pth = str(Path(tmp_path) / "adata.zarr") adata = gen_adata((10, 8), **GEN_ADATA_NO_XARRAY_ARGS) @@ -902,6 +906,17 @@ def test_io_dtype(tmp_path, diskfmt, dtype, roundtrip): assert curr.X.dtype == dtype +def test_zarr_v3_sharded_default(tmp_path): + pth = tmp_path / "adata.zarr" + + orig = gen_adata( + (10, 20), obsm_types=DEFAULT_KEY_TYPES, varm_types=DEFAULT_KEY_TYPES + ) + orig.write_zarr(pth) + + check_all_sharded_v3(zarr.open(pth)) + + def test_h5py_attr_limit(tmp_path): N = 10_000 a = ad.AnnData(np.ones((5, 10))) @@ -972,7 +987,7 @@ def test_write_elem_version_mismatch(tmp_path: Path): g = zarr.open_group( zarr_path, mode="w", - zarr_format=2 if ad.settings.zarr_write_format == 3 else 3, + zarr_format=2, ) ad.io.write_elem(g, "/", adata) adata_roundtripped = ad.read_zarr(g)