diff --git a/.Rbuildignore b/.Rbuildignore index c5eeb6fa..5c997ee0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,3 +18,4 @@ ^data$ ^codecov\.yml$ ^benchmarks$ +^\.venv$ diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index eb5db458..1dda7f2c 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -19,9 +19,10 @@ jobs: steps: - uses: actions/checkout@v6 - - uses: r-lib/actions/setup-r@v2 + - name: Setup R and Bioconductor + uses: grimbough/bioc-actions/setup-bioc@v1 with: - use-public-rspm: true + bioc-version: devel - name: Install air run: curl -LsSf https://github.com/posit-dev/air/releases/latest/download/air-installer.sh | sh diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 775ebf16..7363e69f 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -36,9 +36,10 @@ jobs: with: python-version: "3.x" - - uses: r-lib/actions/setup-r@v2 + - name: Setup R and Bioconductor + uses: grimbough/bioc-actions/setup-bioc@v1 with: - use-public-rspm: true + bioc-version: devel - uses: r-lib/actions/setup-r-dependencies@v2 with: diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml index 10cc5b44..3ea48236 100644 --- a/.github/workflows/pr-commands.yaml +++ b/.github/workflows/pr-commands.yaml @@ -24,9 +24,10 @@ jobs: with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: r-lib/actions/setup-r@v2 + - name: Setup R and Bioconductor + uses: grimbough/bioc-actions/setup-bioc@v1 with: - use-public-rspm: true + bioc-version: devel - uses: r-lib/actions/setup-r-dependencies@v2 with: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index befbce78..b0aaa2b0 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -26,10 +26,10 @@ jobs: sudo apt-get update sudo apt-get -y install hdf5-tools libsz2 libaec-dev - - name: Setup R - uses: r-lib/actions/setup-r@v2 + - 
name: Setup R and Bioconductor + uses: grimbough/bioc-actions/setup-bioc@v1 with: - use-public-rspm: true + bioc-version: devel - name: Install R dependencies uses: r-lib/actions/setup-r-dependencies@v2 diff --git a/.gitignore b/.gitignore index 2579b686..994b93d1 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,4 @@ benchmarks/results_*.txt vignettes/data/*.h5ad /doc/ /Meta/ -/data/ +/data/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 8995b385..dd323095 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,6 +49,7 @@ Suggests: knitr, processx, rhdf5 (>= 2.52.1), + Rarr (>= 1.11.12), rmarkdown, S4Vectors, Seurat, diff --git a/NAMESPACE b/NAMESPACE index 0efc422a..68725b83 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,7 +17,9 @@ export(as_AnnData) export(generate_dataset) export(get_generator_types) export(read_h5ad) +export(read_zarr) export(write_h5ad) +export(write_zarr) importFrom(Matrix,as.matrix) importFrom(Matrix,sparseMatrix) importFrom(Matrix,t) @@ -35,3 +37,4 @@ importFrom(reticulate,r_to_py) importFrom(rlang,`%||%`) importFrom(rlang,caller_env) importFrom(stats,setNames) +importFrom(utils,tail) diff --git a/R/AbstractAnnData.R b/R/AbstractAnnData.R index 7d44d211..734ba695 100644 --- a/R/AbstractAnnData.R +++ b/R/AbstractAnnData.R @@ -290,6 +290,37 @@ AbstractAnnData <- R6::R6Class( ) }, #' @description + #' Convert to a [`ZarrAnnData`] + #' + #' See [as_ZarrAnnData()] for more details on the conversion + #' + #' @param file See [as_ZarrAnnData()] + #' @param compression See [as_ZarrAnnData()] + #' @param mode See [as_ZarrAnnData()] + #' + #' @return A [`ZarrAnnData`] object + as_ZarrAnnData = function( + file, + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ), + mode = c("w-", "r", "r+", "a", "w", "x") + ) { + as_ZarrAnnData( + adata = self, + file = file, + compression = compression, + mode = mode + ) + }, + #' @description #' Write the `AnnData` object to an H5AD file #' #' 
See [write_h5ad()] for details @@ -313,6 +344,37 @@ AbstractAnnData <- R6::R6Class( chunk_size = chunk_size, mode = mode ) + }, + #' @description + #' Write the `AnnData` object to a Zarr file + #' + #' See [write_zarr()] for details + #' + #' @param path See [write_zarr()] + #' @param compression See [write_zarr()] + #' @param mode See [write_zarr()] + #' + #' @return `path` invisibly + write_zarr = function( + path, + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ), + mode = c("w-", "r", "r+", "a", "w", "x") + ) { + write_zarr( + object = self, + path, + compression = compression, + mode = mode + ) } ), private = list( diff --git a/R/AnnData-usage.R b/R/AnnData-usage.R index 416f7b30..79f13007 100644 --- a/R/AnnData-usage.R +++ b/R/AnnData-usage.R @@ -13,6 +13,7 @@ #' #' - [InMemoryAnnData] stores data in memory #' - [HDF5AnnData] provides an interface to a H5AD file +#' - [ZarrAnnData] provides an interface to a Zarr store #' - [ReticulateAnnData] wraps a Python `AnnData` object via \pkg{reticulate} #' #' See the class documentation for details. 
@@ -89,6 +90,7 @@ #' \item{`as_Seurat()`}{Convert to [`SeuratObject::Seurat`], see [as_Seurat()]} #' \item{`as_InMemoryAnnData()`}{Convert to [`InMemoryAnnData`], as [as_InMemoryAnnData()]} #' \item{`as_HDF5AnnData()`}{Convert to [`HDF5AnnData`], see [as_HDF5AnnData()]} +#' \item{`as_ZarrAnnData()`}{Convert to [`ZarrAnnData`], see [as_ZarrAnnData()]} #' \item{`as_ReticulateAnnData()`}{Convert to [`ReticulateAnnData`], see [as_ReticulateAnnData()]} #' } #' @@ -102,6 +104,14 @@ #' } #' } #' +#' \describe{ +#' \item{ +#' `write_zarr()` +#' }{ +#' Write the `AnnData` object to a Zarr store, see [write_zarr()] +#' } +#' } +#' #' ## General methods: #' #' \describe{ @@ -113,6 +123,7 @@ #' \describe{ #' \item{[AnnData()]}{Create an [InMemoryAnnData] object} #' \item{[read_h5ad()]}{Read an `AnnData` from a H5AD file} +#' \item{[read_zarr()]}{Read an `AnnData` from a Zarr store} #' \item{[as_AnnData()]}{Convert other objects to an `AnnData` object} #' } #' @@ -122,6 +133,7 @@ #' inherit from #' @seealso [InMemoryAnnData] for the in-memory implementation of `AnnData` #' @seealso [HDF5AnnData] for the HDF5-backed implementation of `AnnData` +#' @seealso [ZarrAnnData] for the Zarr-backed implementation of `AnnData` #' @seealso [ReticulateAnnData] for the reticulate-based implementation that wraps Python AnnData objects #' #' @name AnnData-usage diff --git a/R/Rarr_utils.R b/R/Rarr_utils.R new file mode 100644 index 00000000..22e5cfc4 --- /dev/null +++ b/R/Rarr_utils.R @@ -0,0 +1,105 @@ +# Zarr metadata files used to identify valid Zarr nodes (arrays or groups) +ZARR_METADATA_FILES <- c(".zarray", ".zattrs", ".zgroup", "zarr.json") + +#' create_zarr_group +#' +#' Create a Zarr group +#' +#' @param store The location of the Zarr store +#' @param name Name of the group +#' @param version Zarr version +#' +#' @return `NULL` +#' +#' @noRd +create_zarr_group <- function(store, name, version = "v2") { + # Split "a/b/c" into c("a", "b", "c") + split_name <- strsplit(name, split = "/", 
fixed = TRUE)[[1]] + if (length(split_name) > 1) { + # Build cumulative paths: c("a", "a/b", "a/b/c") + split_name <- vapply( + seq_along(split_name), + function(x) paste(split_name[seq_len(x)], collapse = "/"), + FUN.VALUE = character(1) + ) + # Keep only the target and its immediate parent: + # split_name[1] = "a/b/c" (target), split_name[2] = "a/b" (parent) + split_name <- rev(tail(split_name, 2)) + # Recursively ensure the parent group exists before creating the target + if (!dir.exists(file.path(store, split_name[2]))) { + create_zarr_group(store = store, name = split_name[2]) + } + } + dir.create(file.path(store, split_name[1]), showWarnings = FALSE) + switch( + version, + v2 = { + write( + "{\"zarr_format\":2}", + file = file.path(store, split_name[1], ".zgroup") + ) + }, + v3 = { + cli_abort("Currently only zarr v2 is supported!") + }, + cli_abort("Only zarr v2 is supported. Use version = 'v2'") + ) +} + +#' create_zarr +#' +#' Create Zarr store +#' +#' @param store The location of the Zarr store +#' @param version Zarr version +#' +#' @return `NULL` +#' +#' @noRd +create_zarr <- function(store, version = "v2") { + prefix <- basename(store) + dir <- gsub(paste0(prefix, "$"), "", store) + create_zarr_group(store = dir, name = prefix, version = version) +} + +#' is_zarr_empty +#' +#' Check if a Zarr store is empty +#' +#' @param store The location of the Zarr store +#' +#' @return Returns `TRUE` if the Zarr store is empty +#' +#' @noRd +is_zarr_empty <- function(store) { + files <- list.files(store, recursive = FALSE, full.names = FALSE) + all(files %in% ZARR_METADATA_FILES) +} + +#' Zarr path exists +#' +#' Check that a path in Zarr exists +#' +#' @return Whether the `target_path` exists in `store` +#' @noRd +#' +#' @param store Path to a Zarr store +#' @param target_path The path within the store to test for +zarr_path_exists <- function(store, target_path) { + zarr <- file.path(store, target_path) + if (!dir.exists(zarr)) { + FALSE + } else { + list_files 
<- list.files( + path = zarr, + full.names = FALSE, + recursive = FALSE, + all.files = TRUE + ) + if (any(ZARR_METADATA_FILES %in% list_files)) { + TRUE + } else { + FALSE + } + } +} diff --git a/R/ZarrAnnData.R b/R/ZarrAnnData.R new file mode 100644 index 00000000..4458e044 --- /dev/null +++ b/R/ZarrAnnData.R @@ -0,0 +1,560 @@ +#' @title ZarrAnnData +#' +#' @description +#' Implementation of a Zarr-backed `AnnData` object. This class provides an +#' interface to a Zarr file and minimal data is stored in memory until it is +#' requested by the user. It is primarily designed as an intermediate object +#' when reading/writing Zarr files but can be useful for accessing parts of +#' large files. +#' +#' See [AnnData-usage] for details on creating and using `AnnData` objects. +#' +#' @return A `ZarrAnnData` object +#' +#' @seealso [AnnData-usage] for details on creating and using `AnnData` objects +#' +#' @family AnnData classes +ZarrAnnData <- R6::R6Class( + "ZarrAnnData", # nolint + inherit = AbstractAnnData, + cloneable = FALSE, + private = list( + .zarrobj = NULL, + .compression = NULL, + .readonly = NULL, + + .check_file_valid = function() { + if (!zarr_path_exists(private$.zarrobj, "/")) { + cli_abort( + "The Zarr path does not exist or is not a valid Zarr store" + ) + } + }, + + .check_writeable = function() { + if (isTRUE(private$.readonly)) { + cli_abort( + "Cannot write to a Zarr store opened in read-only mode.", + call = rlang::caller_env() + ) + } + } + ), + active = list( + #' @field X See [AnnData-usage] + X = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_X, status=done + read_zarr_element(private$.zarrobj, "X") |> + private$.add_matrix_dimnames("X") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_X, status=done + private$.validate_aligned_array( + value, + "X", + shape = c(self$n_obs(), self$n_vars()), + expected_rownames = self$obs_names, + 
expected_colnames = self$var_names + ) |> + write_zarr_element( + private$.zarrobj, + "X", + private$.compression + ) + } + }, + #' @field layers See [AnnData-usage] + layers = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_layers, status=done + read_zarr_element(private$.zarrobj, "layers") |> + private$.add_mapping_dimnames("layers") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_layers, status=done + private$.validate_aligned_mapping( + value, + "layers", + c(self$n_obs(), self$n_vars()), + expected_rownames = self$obs_names, + expected_colnames = self$var_names + ) |> + write_zarr_element( + private$.zarrobj, + "layers", + private$.compression + ) + } + }, + #' @field obsm See [AnnData-usage] + obsm = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obsm, status=done + read_zarr_element(private$.zarrobj, "obsm") |> + private$.add_mapping_dimnames("obsm") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_obsm, status=done + private$.validate_aligned_mapping( + value, + "obsm", + c(self$n_obs()), + expected_rownames = self$obs_names, + strip_rownames = TRUE, + strip_colnames = FALSE, + warn_colnames = TRUE + ) |> + write_zarr_element( + private$.zarrobj, + "obsm", + private$.compression + ) + } + }, + #' @field varm See [AnnData-usage] + varm = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_varm, status=done + read_zarr_element(private$.zarrobj, "varm") |> + private$.add_mapping_dimnames("varm") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_varm, status=done + private$.validate_aligned_mapping( + value, + "varm", + c(self$n_vars()), + expected_rownames = self$var_names, + strip_rownames = TRUE, + strip_colnames = FALSE, + 
warn_colnames = TRUE + ) |> + write_zarr_element( + private$.zarrobj, + "varm", + private$.compression + ) + } + }, + #' @field obsp See [AnnData-usage] + obsp = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obsp, status=done + read_zarr_element(private$.zarrobj, "obsp") |> + private$.add_mapping_dimnames("obsp") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_obsp, status=done + private$.validate_aligned_mapping( + value, + "obsp", + c(self$n_obs(), self$n_obs()), + expected_rownames = self$obs_names, + expected_colnames = self$obs_names + ) |> + write_zarr_element( + private$.zarrobj, + "obsp", + private$.compression + ) + } + }, + #' @field varp See [AnnData-usage] + varp = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_varp, status=done + read_zarr_element(private$.zarrobj, "varp") |> + private$.add_mapping_dimnames("varp") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_varp, status=done + private$.validate_aligned_mapping( + value, + "varp", + c(self$n_vars(), self$n_vars()), + expected_rownames = self$var_names, + expected_colnames = self$var_names + ) |> + write_zarr_element( + private$.zarrobj, + "varp", + private$.compression + ) + } + }, + #' @field obs See [AnnData-usage] + obs = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obs, status=done + read_zarr_element(private$.zarrobj, "obs") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_obs, status=done + private$.validate_obsvar_dataframe(value, "obs") |> + write_zarr_element( + private$.zarrobj, + "obs", + private$.compression + ) + } + }, + #' @field var See [AnnData-usage] + var = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # 
trackstatus: class=ZarrAnnData, feature=get_var, status=done + read_zarr_element(private$.zarrobj, "var") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_var, status=done + private$.validate_obsvar_dataframe(value, "var") |> + write_zarr_element( + private$.zarrobj, + "var", + private$.compression + ) + } + }, + #' @field obs_names See [AnnData-usage] + obs_names = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obs_names, status=done + read_zarr_element_keys(private$.zarrobj, "obs", dim = "rows") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_obs_names, status=done + rownames(self$obs) <- value + } + }, + #' @field var_names See [AnnData-usage] + var_names = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_var_names, status=done + read_zarr_element_keys(private$.zarrobj, "var", dim = "rows") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_var_names, status=done + rownames(self$var) <- value + } + }, + #' @field uns See [AnnData-usage] + uns = function(value) { + private$.check_file_valid() + + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_uns, status=done + read_zarr_element(private$.zarrobj, "uns") + } else { + private$.check_writeable() + # trackstatus: class=ZarrAnnData, feature=set_uns, status=done + private$.validate_named_list( + value, + "uns", + warn_matrix_dimnames = TRUE + ) |> + write_zarr_element( + private$.zarrobj, + "uns", + private$.compression + ) + } + } + ), + public = list( + #' @description + #' `ZarrAnnData` constructor + #' + #' @param file The file name (character) of the `.zarr` file. If this file + #' already exists, other arguments must be `NULL`. 
+ #' @param X See the `X` slot in [AnnData-usage] + #' @param layers See the `layers` slot in [AnnData-usage] + #' @param obs See the `obs` slot in [AnnData-usage] + #' @param var See the `var` slot in [AnnData-usage] + #' @param obsm See the `obsm` slot in [AnnData-usage] + #' @param varm See the `varm` slot in [AnnData-usage] + #' @param obsp See the `obsp` slot in [AnnData-usage] + #' @param varp See the `varp` slot in [AnnData-usage] + #' @param uns See the `uns` slot in [AnnData-usage] + #' @param shape Shape tuple (e.g. `c(n_obs, n_vars)`). Can be provided if + #' both `X` or `obs` and `var` are not provided. + #' @param mode The mode to open the Zarr file. See [as_ZarrAnnData()] for + #' details + #' @param compression The compression algorithm to use. See + #' [as_ZarrAnnData()] for details + #' + #' @details + #' The constructor creates a new Zarr `AnnData` interface object. This can + #' be used either to connect to an existing `.zarr` file or to + #' create a new one. If any additional slot arguments are set an existing + #' file will be overwritten. + initialize = function( + file, + X = NULL, + obs = NULL, + var = NULL, + layers = NULL, + obsm = NULL, + varm = NULL, + obsp = NULL, + varp = NULL, + uns = NULL, + shape = NULL, + mode = c("a", "r", "r+", "w", "w-", "x"), + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ) + ) { + check_requires("ZarrAnnData", "Rarr", where = "Bioc") + + compression <- match.arg(compression) + mode <- match.arg(mode) + + private$.compression <- compression + + is_readonly <- FALSE + + if (is.character(file)) { + if (mode == "a") { + if (dir.exists(file)) { + mode <- "r+" + } else { + mode <- "w-" + } + } + + if (!dir.exists(file) && mode %in% c("r", "r+")) { + cli_abort( + paste( + "File {.file {file}} does not exist but mode is set to {.val {mode}}.", + "If you want to create a new file, use a different mode (e.g. 
'w-').", + "See {.help read_zarr} or {.help write_zarr} for more information." + ), + call = rlang::caller_env() + ) + } + + if (dir.exists(file) && mode %in% c("w-", "x")) { + cli_abort( + paste( + "File {.file {file}} already exists but mode is set to {.val {mode}}.", + "If you want to overwrite the file, use a different mode (e.g. 'w').", + "See {.help read_zarr} or {.help write_zarr} for more information." + ), + call = rlang::caller_env() + ) + } + + if (mode %in% c("w", "w-", "x")) { + create_zarr(file) + } else if (mode == "r") { + is_readonly <- TRUE + } + } else { + cli_abort( + paste( + "{.arg file} must be a {.cls character}" + ) + ) + } + + if (!zarr_path_exists(file, "/")) { + cli_abort( + paste( + "{.arg file} must be a valid zarr store/file" + ) + ) + } + + is_empty <- is_zarr_empty(file) + + if (!is_readonly) { + if (!is_empty) { + cli_warn( + paste( + "An non-empty file is opened in read/write mode.", + "Use with caution, as this can lead to data corruption." + ) + ) + } else { + shape <- get_shape(obs, var, X, shape) + obs <- get_initial_obs(obs, X, shape) + var <- get_initial_var(var, X, shape) + write_empty_zarr(file, obs, var, compression) + } + } + + # File is supposed to exist by now. Check if it is a valid Zarr file + attrs <- Rarr::read_zarr_attributes(file) + if (!all(c("encoding-type", "encoding-version") %in% names(attrs))) { + cli_abort(c( + "File {.file {file}} is not a valid AnnData-Zarr file." + )) + } + + # Set the file path + private$.zarrobj <- file + private$.readonly <- is_readonly + + if (is_readonly) { + # if any of these variables are not NULL, throw an error + are_null <- vapply( + .anndata_slots, + function(x) is.null(get(x)), + logical(1) + ) + if (!all(are_null)) { + cli_abort( + paste0( + "Error trying to write data (", + paste(.anndata_slots[!are_null], collapse = ", "), + ") to a Zarr file opened in read-only mode." 
+ ) + ) + } + } else { + for (slot in .anndata_slots) { + value <- get(slot) + if (!is.null(value)) { + self[[slot]] <- value + } + } + } + + self + }, + + #' @description See the `n_obs` field in [AnnData-usage] + n_obs = function() { + length(self$obs_names) + }, + + #' @description See the `n_vars` field in [AnnData-usage] + n_vars = function() { + length(self$var_names) + }, + + #' @description See [AnnData-usage] + obs_keys = function() { + read_zarr_element_keys(private$.zarrobj, "obs", dim = "cols") + }, + #' @description See [AnnData-usage] + var_keys = function() { + read_zarr_element_keys(private$.zarrobj, "var", dim = "cols") + }, + #' @description See [AnnData-usage] + layers_keys = function() { + read_zarr_element_keys(private$.zarrobj, "layers") + }, + #' @description See [AnnData-usage] + obsm_keys = function() { + read_zarr_element_keys(private$.zarrobj, "obsm") + }, + #' @description See [AnnData-usage] + varm_keys = function() { + read_zarr_element_keys(private$.zarrobj, "varm") + }, + #' @description See [AnnData-usage] + obsp_keys = function() { + read_zarr_element_keys(private$.zarrobj, "obsp") + }, + #' @description See [AnnData-usage] + varp_keys = function() { + read_zarr_element_keys(private$.zarrobj, "varp") + }, + #' @description See [AnnData-usage] + uns_keys = function() { + read_zarr_element_keys(private$.zarrobj, "uns") + } + ) +) + +#' Convert an `AnnData` to a `ZarrAnnData` +#' +#' Convert another `AnnData` object to a [`ZarrAnnData`] object +#' +#' @param adata An `AnnData` object to be converted to [`ZarrAnnData`] +#' @param file The file name (character) of the `.zarr` file +#' @param compression The compression algorithm to use when writing the +#' Zarr file. Can be one of `"none"`, `"gzip"`, `"blosc"`, `"zstd"`, +#' `"lzma"`, `"bz2"`, `"zlib"` or `"lz4"`. Defaults to `"none"`. 
+#' @param mode The mode to open the Zarr file: +#' +#' * `a` creates a new file or opens an existing one for read/write +#' * `r` opens an existing file for reading +#' * `r+` opens an existing file for read/write +#' * `w` creates a file, truncating any existing ones +#' * `w-`/`x` are synonyms, creating a file and failing if it already exists +#' +#' @return A [`ZarrAnnData`] object with the same data as the input `AnnData` +#' object. +#' @keywords internal +#' +#' @family object converters +#' +# nolint start: object_name_linter +as_ZarrAnnData <- function( + # nolint end: object_name_linter + adata, + file, + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ), + mode = c("w-", "r", "r+", "a", "w", "x") +) { + if (!(inherits(adata, "AbstractAnnData"))) { + cli_abort( + "{.arg adata} must be a {.cls AbstractAnnData} but has class {.cls {class(adata)}}" + ) + } + + mode <- match.arg(mode) + ZarrAnnData$new( + file = file, + X = adata$X, + obs = adata$obs, + var = adata$var, + obsm = adata$obsm, + varm = adata$varm, + layers = adata$layers, + obsp = adata$obsp, + varp = adata$varp, + uns = adata$uns, + shape = adata$shape(), + mode = mode, + compression = compression + ) +} diff --git a/R/anndataR-package.R b/R/anndataR-package.R index 97507bf7..516c3348 100644 --- a/R/anndataR-package.R +++ b/R/anndataR-package.R @@ -9,6 +9,7 @@ #' @importFrom R6 R6Class #' @importFrom rlang `%||%` #' @importFrom stats setNames +#' @importFrom utils tail ## usethis namespace: end NULL diff --git a/R/anndata_constructors.R b/R/anndata_constructors.R index 5b6f3692..ec440df3 100644 --- a/R/anndata_constructors.R +++ b/R/anndata_constructors.R @@ -5,6 +5,7 @@ anndata_constructors <- function() { list( "HDF5AnnData" = HDF5AnnData, "InMemoryAnnData" = InMemoryAnnData, + "ZarrAnnData" = ZarrAnnData, "ReticulateAnnData" = ReticulateAnnData ) } @@ -16,7 +17,12 @@ anndata_constructors <- function() { #' #' @noRd get_anndata_constructor <- 
function( - class = c("HDF5AnnData", "InMemoryAnnData", "ReticulateAnnData") + class = c( + "HDF5AnnData", + "InMemoryAnnData", + "ZarrAnnData", + "ReticulateAnnData" + ) ) { # TODO: also support directly passing the correct class? class <- match.arg(class) diff --git a/R/as_AnnData.R b/R/as_AnnData.R index 39c15126..4826532e 100644 --- a/R/as_AnnData.R +++ b/R/as_AnnData.R @@ -185,7 +185,12 @@ as_AnnData <- function( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = NULL, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c( + "InMemory", + "HDF5AnnData", + "ZarrAnnData", + "ReticulateAnnData" + ), ... ) { UseMethod("as_AnnData", x) @@ -205,7 +210,12 @@ as_AnnData.SingleCellExperiment <- function( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = TRUE, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c( + "InMemory", + "HDF5AnnData", + "ZarrAnnData", + "ReticulateAnnData" + ), ... ) { from_SingleCellExperiment( @@ -238,7 +248,12 @@ as_AnnData.Seurat <- function( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = NULL, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c( + "InMemory", + "HDF5AnnData", + "ZarrAnnData", + "ReticulateAnnData" + ), ... ) { from_Seurat( diff --git a/R/from_Seurat.R b/R/from_Seurat.R index a40d110b..e6fa29f5 100644 --- a/R/from_Seurat.R +++ b/R/from_Seurat.R @@ -30,7 +30,12 @@ from_Seurat <- function( obsp_mapping = TRUE, varp_mapping = TRUE, uns_mapping = TRUE, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c( + "InMemory", + "HDF5AnnData", + "ZarrAnnData", + "ReticulateAnnData" + ), ... 
) { check_requires("Converting Seurat to AnnData", c("SeuratObject", "Seurat")) diff --git a/R/from_SingleCellExperiment.R b/R/from_SingleCellExperiment.R index 559681e2..4897718c 100644 --- a/R/from_SingleCellExperiment.R +++ b/R/from_SingleCellExperiment.R @@ -28,7 +28,12 @@ from_SingleCellExperiment <- function( obsp_mapping = TRUE, varp_mapping = TRUE, uns_mapping = TRUE, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c( + "InMemory", + "HDF5AnnData", + "ZarrAnnData", + "ReticulateAnnData" + ), ... ) { check_requires( diff --git a/R/read_h5ad_helpers.R b/R/read_h5ad_helpers.R index 887e7fcd..5598b9f4 100644 --- a/R/read_h5ad_helpers.R +++ b/R/read_h5ad_helpers.R @@ -266,41 +266,13 @@ read_h5ad_sparse_array <- function( on.exit(rhdf5::H5Gclose(h5group), add = TRUE) attrs <- rhdf5::h5readAttributes(file, name, native = FALSE) - x_data <- as.vector(h5group$data) - # dgCMatrix/dgRMatrix x slot must be double - if (!is.double(x_data)) { - x_data <- as.double(x_data) - } - indices <- as.integer(as.vector(h5group$indices)) - indptr <- as.integer(as.vector(h5group$indptr)) - shape <- as.integer(as.vector(attrs[["shape"]])) - - # The Matrix package validity checks require that indices are sorted within - # each major axis group (row indices within columns for CSC, column indices - # within rows for CSR). For sparse matrices in Python order isn't guaranteed, - # so we sort if needed. 
- if (length(indices) > 1L) { - row_lengths <- diff(indptr) - group_ids <- rep.int(seq_along(row_lengths), row_lengths) - ord <- order(group_ids, indices) - if (is.unsorted(ord)) { - indices <- indices[ord] - x_data <- x_data[ord] - } - } - - if (type == "csc_matrix") { - # Directly construct dgCMatrix (CSC format) to avoid overhead of constructing - # a general sparseMatrix and then coercing to dgCMatrix - # Slots: i = row indices (0-based), p = col pointers, x = values, Dim - mtx <- new("dgCMatrix", i = indices, p = indptr, x = x_data, Dim = shape) - } else if (type == "csr_matrix") { - # Directly construct dgRMatrix (CSR format) - # Slots: j = column indices (0-based), p = row pointers, x = values, Dim - mtx <- new("dgRMatrix", j = indices, p = indptr, x = x_data, Dim = shape) - } - - mtx + construct_sparse_matrix( + data = as.vector(h5group$data), + indices = as.vector(h5group$indices), + indptr = as.vector(h5group$indptr), + shape = as.vector(attrs[["shape"]]), + type = type + ) } #' Read H5AD recarray diff --git a/R/read_zarr.R b/R/read_zarr.R new file mode 100644 index 00000000..91dfb64c --- /dev/null +++ b/R/read_zarr.R @@ -0,0 +1,69 @@ +#' Read Zarr +#' +#' Read data from a Zarr store +#' +#' @param path Path to the Zarr store to read +#' @param as The type of object to return. One of: +#' +#' * `"InMemoryAnnData"`: Read the Zarr store into memory as an +#' [`InMemoryAnnData`] object +#' * `"ZarrAnnData"`: Read the Zarr store as a [`ZarrAnnData`] object +#' * `"SingleCellExperiment"`: Read the Zarr store as a +#' [`SingleCellExperiment::SingleCellExperiment`] object +#' * `"Seurat"`: Read the Zarr store as a +#' [`SeuratObject::Seurat`] object +#' @param mode The mode to open the Zarr file. +#' +#' * `a` creates a new file or opens an existing one for read/write. +#' * `r` opens an existing file for reading. +#' * `r+` opens an existing file for read/write. +#' * `w` creates a file, truncating any existing ones. 
+#' * `w-`/`x` are synonyms, creating a file and failing if it already exists. +#' @param ... Extra arguments provided to the `as_*` conversion function for the +#' object specified by `as` +#' +#' @return The object specified by `as` +#' @export +#' +#' @family AnnData creators +#' +#' @examples +#' # Please use "example_v3.zarr.zip" for AnnData stored as Zarr version 3 +#' zarr_dir <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +#' td <- tempdir(check = TRUE) +#' unzip(zarr_dir, exdir = td) +#' zarr_store <- file.path(td, "example_v2.zarr") +#' +#' # Read the Zarr as a SingleCellExperiment object +#' if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { +#' sce <- read_zarr(zarr_store, as = "SingleCellExperiment") +#' } +#' +#' # Read the Zarr as a Seurat object +#' if (requireNamespace("SeuratObject", quietly = TRUE)) { +#' seurat <- read_zarr(zarr_store, as = "Seurat") +#' } +read_zarr <- function( + path, + as = c("InMemoryAnnData", "ZarrAnnData", "SingleCellExperiment", "Seurat"), + mode = c("r", "r+", "a", "w", "w-", "x"), + ... +) { + as <- match.arg(as) + mode <- match.arg(mode) + + zarr_adata <- ZarrAnnData$new(path, mode = mode) + + if (as == "ZarrAnnData") { + return(zarr_adata) + } + + adata <- switch( + as, + "SingleCellExperiment" = zarr_adata$as_SingleCellExperiment(...), + "Seurat" = zarr_adata$as_Seurat(...), + "InMemoryAnnData" = zarr_adata$as_InMemoryAnnData(...) 
+ ) + + adata +} diff --git a/R/read_zarr_helpers.R b/R/read_zarr_helpers.R new file mode 100644 index 00000000..0cf42f4f --- /dev/null +++ b/R/read_zarr_helpers.R @@ -0,0 +1,562 @@ +#' Read Zarr encoding +#' +#' Read the encoding and version of an element in a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' +#' @return A named list with names type and version +#' +#' @noRd +read_zarr_encoding <- function(store, name) { + tryCatch( + { + attrs <- Rarr::read_zarr_attributes(file.path(store, name)) + list( + type = attrs[["encoding-type"]], + version = attrs[["encoding-version"]] + ) + }, + error = function(e) { + cli_abort( + "Encoding attributes not found for element {.val {name}} in {.path {store}}" + ) + } + ) +} + +#' Read Zarr element +#' +#' Read an element from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param type The encoding type of the element to read +#' @param version The encoding version of the element to read +#' @param stop_on_error Whether to stop on error or generate a warning instead +#' @param ... Extra arguments passed to individual reading functions +#' +#' @details +#' Encoding is automatically determined from the element using +#' `read_zarr_encoding` and used to select the appropriate reading function. +#' +#' @return Value depending on the encoding +#' +#' @noRd +read_zarr_element <- function( + store, + name, + type = NULL, + version = NULL, + stop_on_error = FALSE, + ... 
+) { + if (!zarr_path_exists(store, name)) { + return(NULL) + } + + if (is.null(type)) { + encoding_list <- read_zarr_encoding(store, name) + type <- encoding_list$type + version <- encoding_list$version + } + + read_fun <- switch( + type, + "null" = read_zarr_null, + "array" = read_zarr_dense_array, + "rec-array" = read_zarr_rec_array, + "csr_matrix" = read_zarr_csr_matrix, + "csc_matrix" = read_zarr_csc_matrix, + "dataframe" = read_zarr_data_frame, + "dict" = read_zarr_mapping, + "string" = read_zarr_string_scalar, + "numeric-scalar" = read_zarr_numeric_scalar, + "categorical" = read_zarr_categorical, + "string-array" = read_zarr_string_array, + "nullable-integer" = read_zarr_nullable_integer, + "nullable-boolean" = read_zarr_nullable_boolean, + cli_abort( + "No function for reading Zarr encoding {.cls {type}} for element {.val {name}}" + ) + ) + + tryCatch( + { + read_fun(store = store, name = name, version = version, ...) + }, + error = function(e) { + msg <- cli::cli_fmt(cli::cli_bullets(c( + paste0( + "Error reading element {.field {name}} of type {.cls {type}}" + ), + "i" = conditionMessage(e) + ))) + if (stop_on_error) { + cli_abort(msg) + } else { + cli_warn(msg) + NULL + } + } + ) +} + +#' Read Zarr null +#' +#' Read a null value from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return `NULL` +#' @noRd +read_zarr_null <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + + NULL +} + +#' Read Zarr dense array +#' +#' Read a dense array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A matrix or a vector if 1D +#' +#' @noRd +read_zarr_dense_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + data <- 
Rarr::read_zarr_array(file.path(store, name)) + + data +} + +read_zarr_csr_matrix <- function(store, name, version) { + read_zarr_sparse_array( + store = store, + name = name, + version = version, + type = "csr_matrix" + ) +} + +read_zarr_csc_matrix <- function(store, name, version) { + read_zarr_sparse_array( + store = store, + name = name, + version = version, + type = "csc_matrix" + ) +} + +#' Read Zarr sparse array +#' +#' Read a sparse array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' @param type Type of the sparse matrix, either "csr_matrix" or "csc_matrix" +#' +#' @return A sparse matrix: a `dgRMatrix` for "csr_matrix" or a `dgCMatrix` for "csc_matrix" +#' @importFrom Matrix sparseMatrix +#' +#' @noRd +read_zarr_sparse_array <- function( + store, + name, + version = "0.1.0", + type = c("csr_matrix", "csc_matrix") +) { + version <- match.arg(version) + type <- match.arg(type) + + attrs <- Rarr::read_zarr_attributes(file.path(store, name)) + + construct_sparse_matrix( + data = as.vector(Rarr::read_zarr_array(file.path(store, name, "data"))), + indices = as.vector(Rarr::read_zarr_array(file.path( + store, + name, + "indices" + ))), + indptr = as.vector(Rarr::read_zarr_array(file.path(store, name, "indptr"))), + shape = as.vector(unlist(attrs$shape, use.names = FALSE)), + type = type + ) +} + +#' Read Zarr recarray +#' +#' Read a recarray from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @details +#' A "record array" (recarray) is a Python NumPy array type that contains +#' "fields" that can be indexed using attributes (similar to columns in a +#' spreadsheet). See https://numpy.org/doc/stable/reference/generated/numpy.recarray.html +#' for details. +#' +#' They are used by **scanpy** to store marker gene testing results. 
+#' +#' @return A named list of 1D arrays +#' +#' @noRd +read_zarr_rec_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + Rarr::read_zarr_array(file.path(store, name)) |> + lapply(as.vector) +} + +#' Read Zarr nullable boolean +#' +#' Read a nullable boolean from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A boolean vector +#' +#' @noRd +read_zarr_nullable_boolean <- function(store, name, version = "0.1.0") { + as.logical(read_zarr_nullable(store, name, version)) +} + +#' Read Zarr nullable integer +#' +#' Read a nullable integer from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return An integer vector +#' +#' @noRd +read_zarr_nullable_integer <- function(store, name, version = "0.1.0") { + as.integer(read_zarr_nullable(store, name, version)) +} + +#' Read Zarr nullable +#' +#' Read a nullable vector (boolean or integer) from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A nullable vector +#' +#' @noRd +read_zarr_nullable <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + + mask <- Rarr::read_zarr_array(file.path(store, paste0(name, "/mask"))) + values <- Rarr::read_zarr_array(file.path(store, paste0(name, "/values"))) + + # Get values and set missing + element <- values + element[mask] <- NA + + element +} + +#' Read Zarr string array +#' +#' Read a string array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A character vector/matrix +#' 
+#' @noRd +read_zarr_string_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + data <- Rarr::read_zarr_array(file.path(store, name)) + + # convert "NA" to NA (as in rhdf5:::.h5postProcessDataset) + data[data == "NA"] <- NA + + data +} + +#' Read Zarr categorical +#' +#' Read a categorical from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A factor +#' +#' @noRd +read_zarr_categorical <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + codes <- Rarr::read_zarr_array(file.path(store, paste0(name, "/codes"))) + categories <- Rarr::read_zarr_array(file.path( + store, + paste0(name, "/categories") + )) + + # Get codes and convert to 1-based indexing + codes <- codes + 1L + + # Set missing values + codes[codes == 0L] <- NA_integer_ + + levels <- categories + + attributes <- Rarr::read_zarr_attributes(file.path(store, name)) + ordered <- attributes[["ordered"]] + + factor(levels[codes], levels = levels, ordered = ordered) +} + +#' Read Zarr string scalar +#' +#' Read a string scalar from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A character vector of length 1 +#' +#' @noRd +read_zarr_string_scalar <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + as.character(Rarr::read_zarr_array(file.path(store, name))) +} + +#' Read Zarr numeric scalar +#' +#' Read a numeric scalar from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A numeric vector of length 1 +#' +#' @noRd +read_zarr_numeric_scalar <- function(store, name, version = "0.2.0") { + version <- 
match.arg(version) + + value <- Rarr::read_zarr_array(file.path(store, name)) + + # convert array to vector + value <- as.vector(value) + + value +} + +#' Read Zarr mapping +#' +#' Read a mapping from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A named list +#' +#' @noRd +read_zarr_mapping <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + items <- read_zarr_mapping_keys(store, name, version) + read_zarr_collection(store, name, items) +} + +#' Read Zarr data frame +#' +#' Read a data frame from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A data.frame +#' +#' @noRd +read_zarr_data_frame <- function( + store, + name, + version = "0.2.0" +) { + version <- match.arg(version) + + dim_keys <- read_zarr_data_frame_keys(store, name, version) + data <- read_zarr_collection(store, name, dim_keys$cols) + + as.data.frame( + row.names = dim_keys$rows, + data, + check.names = FALSE, + fix.empty.names = FALSE + ) +} + +#' Read multiple Zarr datatypes +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param item_names Vector of item names (in order) +#' +#' @return A named list +#' +#' @noRd +read_zarr_collection <- function(store, name, item_names) { + items <- lapply( + item_names, + function(item_name) { + new_name <- paste0(name, "/", item_name) + encoding <- read_zarr_encoding(store, new_name) + read_zarr_element( + store = store, + name = new_name, + type = encoding$type, + version = encoding$version + ) + } + ) + names(items) <- item_names + items +} + +#' Read Zarr element keys +#' +#' Read the keys of an element from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the 
element within the Zarr store +#' @param type The encoding type of the element to read +#' @param version The encoding version of the element to read +#' @param stop_on_error Whether to stop on error or generate a warning instead +#' @param ... Extra arguments passed to individual reading functions +#' +#' @return A character vector of keys +#' +#' @noRd +read_zarr_element_keys <- function( + store, + name, + type = NULL, + version = NULL, + stop_on_error = FALSE, + ... +) { + if (!zarr_path_exists(store, name)) { + return(NULL) + } + + if (is.null(type)) { + encoding_list <- read_zarr_encoding(store, name) + type <- encoding_list$type + version <- encoding_list$version + } + + read_fun <- switch( + type, + "dataframe" = read_zarr_data_frame_keys, + "dict" = read_zarr_mapping_keys, + cli_abort( + "No function for reading keys for Zarr encoding {.cls {type}} for element {.val {name}}" + ) + ) + + tryCatch( + { + read_fun(store = store, name = name, version = version, ...) + }, + error = function(e) { + msg <- cli::cli_fmt(cli::cli_bullets(c( + paste0( + "Error reading element keys for {.field {name}} of type {.cls {type}}" + ), + "i" = conditionMessage(e) + ))) + if (stop_on_error) { + cli_abort(msg) + } else { + cli_warn(msg) + NULL + } + } + ) +} + +#' Read Zarr mapping keys +#' +#' Read keys for a mapping (dict) from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return A character vector of item names +#' +#' @noRd +read_zarr_mapping_keys <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + + items <- list.dirs( + path = file.path(store, name), + recursive = FALSE, + full.names = FALSE + ) + items[!items %in% ZARR_METADATA_FILES] +} + +#' Read Zarr data frame keys +#' +#' Read the row names (index) and/or column names of a data frame from a Zarr +#' store +#' +#' @param store A Zarr store instance +#' 
@param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' @param dim Dimension to read keys for: `"both"`, `"rows"`, or `"cols"` +#' +#' @return A character vector if `dim` is `"rows"` or `"cols"`, or a list with +#' elements `"rows"` and `"cols"` if `dim` is `"both"` +#' +#' @noRd +read_zarr_data_frame_keys <- function( + store, + name, + version = "0.2.0", + dim = c("both", "rows", "cols") +) { + version <- match.arg(version) + dim <- match.arg(dim) + + attrs <- Rarr::read_zarr_attributes(file.path(store, name)) + index_name <- attrs[["_index"]] + column_order <- attrs[["column-order"]] + + if (dim == "both") { + list( + rows = as.vector(read_zarr_element(store, file.path(name, index_name))), + cols = as.character(column_order) + ) + } else if (dim == "rows") { + as.vector(read_zarr_element(store, file.path(name, index_name))) + } else if (dim == "cols") { + as.character(column_order) + } +} diff --git a/R/utils.R b/R/utils.R index 0d3fc2a5..d1ce170b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -260,3 +260,57 @@ warn_matrix_dimnames_not_writeable <- function( invisible() } + +#' Construct a sparse matrix from CSR/CSC components +#' +#' Build a `dgCMatrix` or `dgRMatrix` from raw data, index, and pointer vectors. +#' +#' @param data Non-zero values. Coerced to `double`. +#' @param indices Column indices (CSC) or row indices (CSR), 0-based. Coerced +#' to `integer`. +#' @param indptr Index pointers, 0-based. Coerced to `integer`. +#' @param shape Matrix dimensions. Coerced to `integer`. +#' @param type Either `"csc_matrix"` or `"csr_matrix"`. +#' +#' @return A `dgCMatrix` (CSC) or `dgRMatrix` (CSR). 
+#' +#' @noRd +construct_sparse_matrix <- function( + data, + indices, + indptr, + shape, + type = c("csc_matrix", "csr_matrix") +) { + type <- match.arg(type) + + data <- as.double(data) + indices <- as.integer(indices) + indptr <- as.integer(indptr) + shape <- as.integer(shape) + + # The Matrix package validity checks require that indices are sorted within + # each major axis group (row indices within columns for CSC, column indices + # within rows for CSR). For sparse matrices in Python order isn't guaranteed, + # so we sort if needed. + if (length(indices) > 1L) { + row_lengths <- diff(indptr) + group_ids <- rep.int(seq_along(row_lengths), row_lengths) + ord <- order(group_ids, indices) + if (is.unsorted(ord)) { + indices <- indices[ord] + data <- data[ord] + } + } + + if (type == "csc_matrix") { + # Directly construct dgCMatrix (CSC format) to avoid overhead of constructing + # a general sparseMatrix and then coercing to dgCMatrix + # Slots: i = row indices (0-based), p = col pointers, x = values, Dim + new("dgCMatrix", i = indices, p = indptr, x = data, Dim = shape) + } else if (type == "csr_matrix") { + # Directly construct dgRMatrix (CSR format) + # Slots: j = column indices (0-based), p = row pointers, x = values, Dim + new("dgRMatrix", j = indices, p = indptr, x = data, Dim = shape) + } +} diff --git a/R/write_zarr.R b/R/write_zarr.R new file mode 100644 index 00000000..4f71010a --- /dev/null +++ b/R/write_zarr.R @@ -0,0 +1,128 @@ +#' Write Zarr +#' +#' Write a Zarr file +#' +#' @param object The object to write, either a +#' [`SingleCellExperiment::SingleCellExperiment`] or a +#' [`SeuratObject::Seurat`] object +#' @param path Path of the file to write to +#' @param compression The compression algorithm to use when writing the Zarr +#' file. Can be one of `"none"`, `"gzip"`, `"blosc"`, `"zstd"`, +#' `"lzma"`, `"bz2"`, `"zlib"`, `"lz4"`. Defaults to `"none"`. +#' See `help("compressors", package = "Rarr")`. +#' @param mode The mode to open the Zarr file. 
+#' +#' * `a` creates a new file or opens an existing one for read/write +#' * `r+` opens an existing file for read/write +#' * `w` creates a file, truncating any existing ones +#' * `w-`/`x` are synonyms creating a file and failing if it already exists +#' @param ... Additional arguments passed to [as_AnnData()] +#' +#' @details +#' +#' ## `NULL` values +#' +#' For compatibility with changes in Python **anndata** 0.12.0, `NULL` values +#' in `uns` are written to Zarr files as a `NULL` dataset (instead of not being +#' written at all). To disable this behaviour, set +#' `options(anndataR.write_null = FALSE)`. This may be required to allow the file +#' to be read by older versions of Python **anndata**. +#' +#' @return `path` invisibly +#' @export +#' +#' @examples +#' adata <- AnnData( +#' X = matrix(1:5, 3L, 5L), +#' layers = list( +#' A = matrix(5:1, 3L, 5L), +#' B = matrix(letters[1:5], 3L, 5L) +#' ), +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) +#' ) +#' zarr_store <- tempfile(fileext = ".zarr") +#' adata$write_zarr(zarr_store) +#' +#' # Write a SingleCellExperiment as a Zarr store +#' if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { +#' ncells <- 100 +#' counts <- matrix(rpois(20000, 5), ncol = ncells) +#' logcounts <- log2(counts + 1) +#' +#' pca <- matrix(runif(ncells * 5), ncells) +#' tsne <- matrix(rnorm(ncells * 2), ncells) +#' +#' sce <- SingleCellExperiment::SingleCellExperiment( +#' assays = list(counts = counts, logcounts = logcounts), +#' reducedDims = list(PCA = pca, tSNE = tsne) +#' ) +#' +#' adata <- as_AnnData(sce) +#' zarr_store <- tempfile(fileext = ".zarr") +#' adata$write_zarr(zarr_store) +#' } +#' +#' # Write a Seurat object as a Zarr store +#' if (requireNamespace("Seurat", quietly = TRUE)) { +#' library(Seurat) +#' +#' counts <- matrix(1:15, 5L, 3L) +#' dimnames(counts) <- list( +#' LETTERS[1:5], +#' letters[1:3] +#' ) +#' cell.metadata <- data.frame( +#' row.names 
= letters[1:3], +#' cell = 1:3 +#' ) +#' obj <- CreateSeuratObject(counts, meta.data = cell.metadata) +#' gene.metadata <- data.frame( +#' row.names = LETTERS[1:5], +#' gene = 1:5 +#' ) +#' obj[["RNA"]] <- AddMetaData(GetAssay(obj), gene.metadata) +#' +#' adata <- as_AnnData(obj) +#' zarr_store <- tempfile(fileext = ".zarr") +#' adata$write_zarr(zarr_store) +#' } +write_zarr <- function( + object, + path, + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ), + mode = c("w-", "r", "r+", "a", "w", "x"), + ... +) { + mode <- match.arg(mode) + adata <- if (inherits(object, "AbstractAnnData")) { + object$as_ZarrAnnData( + path, + compression = compression, + mode = mode + ) + } else { + as_AnnData( + object, + output_class = "ZarrAnnData", + file = path, + compression = compression, + mode = mode, + ... + ) + } + + rm(adata) + gc() + + invisible(path) +} diff --git a/R/write_zarr_helpers.R b/R/write_zarr_helpers.R new file mode 100644 index 00000000..8ed9f6e6 --- /dev/null +++ b/R/write_zarr_helpers.R @@ -0,0 +1,743 @@ +#' Write Zarr element +#' +#' Write an element to a Zarr store +#' +#' @param value The value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of the algorithms accepted by [write_zarr()]. Defaults to `"none"`. +#' @param stop_on_error Whether to stop on error or generate a warning instead +#' @param ... Additional arguments passed to writing functions +#' +#' @noRd +#' +#' @details +#' `write_zarr_element()` should always be used instead of any of the specific +#' writing functions as it contains additional boilerplate to make sure +#' elements are written correctly. +write_zarr_element <- function( + value, + store, + name, + compression = c( + "none", + "gzip", + "blosc", + "zstd", + "lzma", + "bz2", + "zlib", + "lz4" + ), + stop_on_error = FALSE, + ... 
+) { + compression <- match.arg(compression) + + # Sparse matrices + write_fun <- + if (is.null(value)) { + write_zarr_null + } else if (inherits(value, "sparseMatrix")) { + # Sparse matrices + write_zarr_sparse_array + } else if (is.factor(value)) { + # Categoricals + write_zarr_categorical + } else if (is.list(value)) { + # Lists and data frames + if (is.data.frame(value)) { + write_zarr_data_frame + } else { + write_zarr_mapping + } + } else if (is.character(value)) { + # Character values + if (length(value) == 1 && !is.matrix(value)) { + write_zarr_string_scalar + } else { + write_zarr_string_array + } + } else if (is.numeric(value) || inherits(value, "denseMatrix")) { + # Numeric values + if (length(value) == 1 && !is.matrix(value)) { + write_zarr_numeric_scalar + } else if (is.integer(value) && any(is.na(value))) { + write_zarr_nullable_integer + } else { + write_zarr_dense_array + } + } else if (is.logical(value)) { + # Logical values + if (any(is.na(value))) { + write_zarr_nullable_boolean + } else if (length(value) == 1) { + # Single Booleans should be written as numeric scalars + write_zarr_numeric_scalar + } else { + write_zarr_dense_array + } + } else { + # Fail if unknown + cli_abort(c( + "Writing {.cls {class(value)}} objects to Zarr is not supported", + "i" = "Attempting to write to {.path {name}} in {.file {store}}" + )) + } + + # Delete the path if it already exists + if (zarr_path_exists(store, name)) { + unlink(file.path(store, name), recursive = TRUE) + } + + tryCatch( + { + write_fun( + value = value, + store = store, + name = name, + compression = compression, + ... 
+ ) + }, + error = function(e) { + message <- paste0( + "Could not write element '", + name, + "' of type '", + class(value), + "':\n", + conditionMessage(e) + ) + if (stop_on_error) { + cli_abort(message) + } else { + cli_warn(message) + NULL + } + } + ) +} + +#' Write Zarr encoding +#' +#' Write Zarr encoding attributes to an element in a Zarr store +#' +#' @noRd +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param encoding The encoding type to set +#' @param version The encoding version to set +write_zarr_encoding <- function(store, name, encoding, version) { + Rarr::write_zarr_attributes( + file.path(store, name), + new.zattrs = list(`encoding-type` = encoding, `encoding-version` = version) + ) +} + +#' Write Zarr null +#' +#' Write a null dataset to a Zarr store +#' +#' @param value Value to write, not used +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression Not used as there is no value +#' @param version Encoding version of the element to write +#' +#' @noRd +write_zarr_null <- function( + value, + store, + name, + compression, + version = "0.1.0" +) { + if (isFALSE(getOption("anndataR.write_null", "TRUE"))) { + return(invisible(NULL)) + } + + Rarr::create_empty_zarr_array( + file.path(store, name), + dim = 0, + chunk_dim = 0, + data_type = "logical", + zarr_version = 2L + ) + + write_zarr_encoding(store, name, "null", version) +} + +#' Write Zarr dense array +#' +#' Write a dense array to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of the algorithms accepted by [write_zarr()]. 
+#' @param version Encoding version of the element to write +write_zarr_dense_array <- function( + value, + store, + name, + compression, + version = "0.2.0" +) { + version <- match.arg(version) + + # matrices of type 'dgeMatrix' can simply be converted to a matrix + if (inherits(value, "denseMatrix")) { + value <- as.matrix(value) + } + + zarr_write_compressed( + store, + name, + value, + compression + ) + + # Write attributes + write_zarr_encoding(store, name, "array", version) +} + +#' Write Zarr sparse array +#' +#' Write a sparse array to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of the algorithms accepted by [write_zarr()]. +#' @param version Encoding version of the element to write +write_zarr_sparse_array <- function( + value, + store, + name, + compression, + version = "0.1.0" +) { + version <- match.arg(version) + + # check types + stopifnot(inherits(value, "sparseMatrix")) + + if (inherits(value, "RsparseMatrix")) { + type <- "csr_matrix" + indices_attr <- "j" + } else if (inherits(value, "CsparseMatrix")) { + type <- "csc_matrix" + indices_attr <- "i" + } else { + cli_abort(c( + "Unsupported matrix format in {.path {name}}", + "i" = "Supported matrices inherit from {.cls RsparseMatrix} or {.cls CsparseMatrix}" + )) + } + + # Write sparse matrix + create_zarr_group(store, name) + zarr_write_compressed( + store, + paste0(name, "/indices"), + attr(value, indices_attr), + compression + ) + zarr_write_compressed( + store, + paste0(name, "/indptr"), + value@p, + compression + ) + zarr_write_compressed( + store, + paste0(name, "/data"), + value@x, + compression + ) + + # Add encoding + write_zarr_encoding(store, name, type, version) + + # Write shape attribute + Rarr::write_zarr_attributes(file.path(store, name), list(shape = dim(value))) +} + +#' 
Write Zarr nullable boolean +#' +#' Write a nullable boolean to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of the algorithms accepted by [write_zarr()]. +#' @param version Encoding version of the element to write +write_zarr_nullable_boolean <- function( + value, + store, + name, + compression, + version = "0.1.0" +) { + # write mask and values + create_zarr_group(store, name) + value_no_na <- value + value_no_na[is.na(value_no_na)] <- FALSE + + zarr_write_compressed( + store, + paste0(name, "/values"), + value_no_na, + compression + ) + zarr_write_compressed( + store, + paste0(name, "/mask"), + is.na(value), + compression + ) + + # Write attributes + write_zarr_encoding(store, name, "nullable-boolean", version) +} + +#' Write Zarr nullable integer +#' +#' Write a nullable integer to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of the algorithms accepted by [write_zarr()]. 
+#' @param version Encoding version of the element to write +write_zarr_nullable_integer <- function( + value, + store, + name, + compression, + version = "0.1.0" +) { + # write mask and values + create_zarr_group(store, name) + value_no_na <- value + value_no_na[is.na(value_no_na)] <- -1L + + zarr_write_compressed( + store, + paste0(name, "/values"), + value_no_na, + compression + ) + zarr_write_compressed( + store, + paste0(name, "/mask"), + is.na(value), + compression + ) + + # Write attributes + write_zarr_encoding(store, name, "nullable-integer", version) +} + +#' Write Zarr string array +#' +#' Write a string array to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_string_array <- function( + value, + store, + name, + compression, + version = "0.2.0" +) { + dims <- dim(value) %||% length(value) + + # replace NA to "NA" (as in rhdf5:::.h5postProcessDataset) + # to read as "NA" -> NA later after Rarr:read_zarr_array + value[is.na(value)] <- "NA" + + if (any(dims == 0)) { + Rarr::create_empty_zarr_array( + file.path(store, name), + dim = dims, + chunk_dim = dims, + data_type = " 1) "C" else "F", + # TODO: string arrays require vlen-utf8 filter support + # see https://github.com/Huber-group-EMBL/Rarr/issues/98 + data_type = " 1) "C" else "F", + compressor = .get_compressor(compression), + zarr_version = 2L + ) +} + +#' Get Zarr compressor +#' +#' Convert a compression name to the corresponding Rarr compressor object. +#' +#' @param compression The compression algorithm name. One of `"none"`, +#' `"gzip"`, `"blosc"`, `"zstd"`, `"lzma"`, `"bz2"`, `"zlib"`, `"lz4"`. +#' +#' @return A Rarr compressor object, or `NULL` for no compression. 
+#' +#' @noRd +.get_compressor <- function(compression) { + switch( + compression, + "none" = NULL, + "zstd" = Rarr::use_zstd(), + "blosc" = Rarr::use_blosc(), + "gzip" = Rarr::use_gzip(), + "lzma" = Rarr::use_lzma(), + "bz2" = Rarr::use_bz2(), + "zlib" = Rarr::use_zlib(), + "lz4" = Rarr::use_lz4() + ) +} diff --git a/README.md b/README.md index aae4be08..40126588 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ ## Features of {anndataR} - Provide an `R6` class to work with `AnnData` objects in R (either in-memory or on-disk) -- Read/write `*.h5ad` files natively +- Read/write `*.h5ad` files and `*.zarr` stores natively - Convert to/from `SingleCellExperiment` objects - Convert to/from `Seurat` objects @@ -49,6 +49,8 @@ the task you want to perform. - To read/write `*.h5ad` files, install [rhdf5](https://www.bioconductor.org/packages/rhdf5): `BiocManager::install("rhdf5")` +- To read/write `*.zarr` stores, install [Rarr](https://www.bioconductor.org/packages/Rarr): + `BiocManager::install("Rarr")` - To convert to/from `SingleCellExperiment` objects, install [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html): `BiocManager::install("SingleCellExperiment")` - To convert to/from `Seurat` objects, install [SeuratObject](https://cran.r-project.org/package=SeuratObject): diff --git a/benchmarks/lib/helpers.R b/benchmarks/lib/helpers.R index 1eac9faf..5393ddd7 100644 --- a/benchmarks/lib/helpers.R +++ b/benchmarks/lib/helpers.R @@ -83,6 +83,23 @@ generate_bench_h5ad <- function(x_type, n_obs, n_vars, cache_dir) { path } +#' Convert an H5AD bench file to a Zarr store and cache it +#' +#' @param x_type Matrix type key (matches h5ad_paths names) +#' @param h5ad_path Path to the corresponding H5AD file +#' @param cache_dir Directory to cache generated stores +#' @return Path to the generated Zarr store directory +generate_bench_zarr <- function(x_type, h5ad_path, cache_dir) { + path <- file.path(cache_dir, 
paste0("bench_", x_type, ".zarr")) + if (dir.exists(path)) { + return(path) + } + ad <- reticulate::import("anndata", convert = FALSE) + adata_py <- ad$read_h5ad(h5ad_path) + adata_py$write_zarr(path) + path +} + # --------------------------------------------------------------------------- # bench::mark → BMF JSON conversion # --------------------------------------------------------------------------- diff --git a/benchmarks/run_benchmarks.R b/benchmarks/run_benchmarks.R index 1de4a4a3..42a401aa 100644 --- a/benchmarks/run_benchmarks.R +++ b/benchmarks/run_benchmarks.R @@ -73,6 +73,22 @@ h5ad_paths <- setNames( ) cat("\n") +cat("Generating Zarr test data (converting from H5AD)...\n") +zarr_paths <- setNames( + vapply( + x_types, + function(xt) { + cat(sprintf(" %s... ", xt)) + path <- generate_bench_zarr(xt, h5ad_paths[[xt]], cache_dir) + cat("done\n") + path + }, + character(1) + ), + x_types +) +cat("\n") + # --------------------------------------------------------------------------- # Run selected suites # --------------------------------------------------------------------------- @@ -88,12 +104,12 @@ for (suite in suites_to_run) { suite_results <- switch( suite, - read = bench_read(h5ad_paths, opts$iterations, x_types), - write = bench_write(h5ad_paths, opts$iterations, x_types), - get = bench_get(h5ad_paths, opts$iterations), - set = bench_set(h5ad_paths, opts$iterations), - convert = bench_convert(h5ad_paths, opts$iterations, x_types), - subset = bench_subset(h5ad_paths, opts$iterations), + read = bench_read(h5ad_paths, opts$iterations, x_types, zarr_paths), + write = bench_write(h5ad_paths, opts$iterations, x_types, zarr_paths), + get = bench_get(h5ad_paths, opts$iterations, zarr_paths), + set = bench_set(h5ad_paths, opts$iterations, zarr_paths), + convert = bench_convert(h5ad_paths, opts$iterations, x_types, zarr_paths), + subset = bench_subset(h5ad_paths, opts$iterations, zarr_paths), { warning("Unknown suite: ", suite) list() diff --git 
a/benchmarks/suites/bench_convert.R b/benchmarks/suites/bench_convert.R index a679ea35..c01e1547 100644 --- a/benchmarks/suites/bench_convert.R +++ b/benchmarks/suites/bench_convert.R @@ -5,7 +5,7 @@ # format conversions (InMemory↔SCE, InMemory↔Seurat). # ============================================================================= -bench_convert <- function(h5ad_paths, iterations, x_types) { +bench_convert <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() # --- Backend conversions (per X type) --- @@ -47,6 +47,43 @@ bench_convert <- function(h5ad_paths, iterations, x_types) { ) } + # --- Zarr ↔ InMemory conversions (per X type) --- + for (xt in x_types) { + zarr_path <- zarr_paths[[xt]] + + # Zarr → InMemory + env <- new.env(parent = globalenv()) + env$.ad <- read_zarr(zarr_path, as = "ZarrAnnData") + + results <- c( + results, + run_one_benchmark( + name = paste0("convert_Zarr_to_InMemory_", xt), + expr = quote(.ad$as_InMemoryAnnData()), + iterations = iterations, + env = env + ) + ) + + # InMemory → Zarr + env2 <- new.env(parent = globalenv()) + env2$.ad <- read_zarr(zarr_path, as = "InMemoryAnnData") + + results <- c( + results, + run_one_benchmark( + name = paste0("convert_InMemory_to_Zarr_", xt), + expr = quote({ + .tmp <- tempfile() + .result <- .ad$as_ZarrAnnData(.tmp) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env2 + ) + ) + } + # --- Format conversions (using float_csparse as representative) --- path <- h5ad_paths[["float_csparse"]] ad <- read_h5ad(path, as = "InMemoryAnnData") diff --git a/benchmarks/suites/bench_get.R b/benchmarks/suites/bench_get.R index 08c28114..589722bd 100644 --- a/benchmarks/suites/bench_get.R +++ b/benchmarks/suites/bench_get.R @@ -42,15 +42,23 @@ colnames = quote(colnames(.ad)) ) -bench_get <- function(h5ad_paths, iterations) { +bench_get <- function(h5ad_paths, iterations, zarr_paths) { results <- list() path <- h5ad_paths[["float_csparse"]] - for (backend in 
c("InMemoryAnnData", "HDF5AnnData")) { - short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5" + for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) { + short <- switch(backend, + InMemoryAnnData = "InMemory", + HDF5AnnData = "HDF5", + ZarrAnnData = "Zarr" + ) # Open the AnnData - ad <- read_h5ad(path, as = backend) + ad <- if (backend == "ZarrAnnData") { + read_zarr(zarr_paths[["float_csparse"]], as = "ZarrAnnData") + } else { + read_h5ad(path, as = backend) + } # --- Slot getters --- for (slot in .bench_slots) { diff --git a/benchmarks/suites/bench_read.R b/benchmarks/suites/bench_read.R index 605f9f4a..bc6dcc6b 100644 --- a/benchmarks/suites/bench_read.R +++ b/benchmarks/suites/bench_read.R @@ -5,7 +5,7 @@ # across different X matrix types. # ============================================================================= -bench_read <- function(h5ad_paths, iterations, x_types) { +bench_read <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() for (xt in x_types) { @@ -37,5 +37,32 @@ bench_read <- function(h5ad_paths, iterations, x_types) { ) } + # Read from Zarr store + for (xt in x_types) { + path <- zarr_paths[[xt]] + + # Read Zarr → InMemoryAnnData + results <- c( + results, + run_one_benchmark( + name = paste0("read_zarr_InMemory_", xt), + expr = quote(read_zarr(.path, as = "InMemoryAnnData")), + setup = bquote(.path <- .(path)), + iterations = iterations + ) + ) + + # Open Zarr lazily → ZarrAnnData + results <- c( + results, + run_one_benchmark( + name = paste0("read_zarr_Zarr_", xt), + expr = quote(read_zarr(.path, as = "ZarrAnnData")), + setup = bquote(.path <- .(path)), + iterations = iterations + ) + ) + } + results } diff --git a/benchmarks/suites/bench_set.R b/benchmarks/suites/bench_set.R index 8f751ba1..c3be772b 100644 --- a/benchmarks/suites/bench_set.R +++ b/benchmarks/suites/bench_set.R @@ -4,7 +4,7 @@ # Benchmarks setting every AnnData slot on both InMemory and HDF5 backends. 
# ============================================================================= -bench_set <- function(h5ad_paths, iterations) { +bench_set <- function(h5ad_paths, iterations, zarr_paths) { results <- list() path <- h5ad_paths[["float_csparse"]] @@ -22,12 +22,24 @@ bench_set <- function(h5ad_paths, iterations) { "uns" ) - for (backend in c("InMemoryAnnData", "HDF5AnnData")) { - short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5" + for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) { + short <- switch(backend, + InMemoryAnnData = "InMemory", + HDF5AnnData = "HDF5", + ZarrAnnData = "Zarr" + ) for (slot in slots) { - # For HDF5, we need a fresh writable copy for each slot - if (backend == "HDF5AnnData") { + # Each backend needs a fresh writable instance per slot + if (backend == "ZarrAnnData") { + # Copy Zarr store directory so each slot gets a fresh writable copy + zarr_path <- zarr_paths[["float_csparse"]] + tmp_parent <- tempfile() + dir.create(tmp_parent, recursive = TRUE) + file.copy(zarr_path, tmp_parent, recursive = TRUE) + tmp <- file.path(tmp_parent, basename(zarr_path)) + ad <- read_zarr(tmp, as = "ZarrAnnData", mode = "r+") + } else if (backend == "HDF5AnnData") { tmp <- tempfile(fileext = ".h5ad") file.copy(path, tmp) ad <- suppressWarnings( @@ -55,7 +67,9 @@ bench_set <- function(h5ad_paths, iterations) { ) ) - if (backend == "HDF5AnnData") { + if (backend == "ZarrAnnData") { + unlink(tmp, recursive = TRUE) + } else if (backend == "HDF5AnnData") { ad$close() unlink(tmp) } diff --git a/benchmarks/suites/bench_subset.R b/benchmarks/suites/bench_subset.R index f6a8b5f6..dc9fa039 100644 --- a/benchmarks/suites/bench_subset.R +++ b/benchmarks/suites/bench_subset.R @@ -5,13 +5,21 @@ # materialization back to concrete implementations. 
# ============================================================================= -bench_subset <- function(h5ad_paths, iterations) { +bench_subset <- function(h5ad_paths, iterations, zarr_paths) { results <- list() path <- h5ad_paths[["float_csparse"]] - for (backend in c("InMemoryAnnData", "HDF5AnnData")) { - short <- if (backend == "InMemoryAnnData") "InMemory" else "HDF5" - ad <- read_h5ad(path, as = backend) + for (backend in c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData")) { + short <- switch(backend, + InMemoryAnnData = "InMemory", + HDF5AnnData = "HDF5", + ZarrAnnData = "Zarr" + ) + ad <- if (backend == "ZarrAnnData") { + read_zarr(zarr_paths[["float_csparse"]], as = "ZarrAnnData") + } else { + read_h5ad(path, as = backend) + } n_obs <- ad$n_obs() n_vars <- ad$n_vars() @@ -123,7 +131,22 @@ bench_subset <- function(h5ad_paths, iterations) { ) ) - # Clean up + # --- Materialize view → Zarr --- + results <- c( + results, + run_one_benchmark( + name = paste0("materialize_to_Zarr_", short), + expr = quote({ + .tmp <- tempfile() + .result <- .view$as_ZarrAnnData(.tmp) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env4 + ) + ) + + # Clean up (ZarrAnnData holds no persistent file handles) if (backend == "HDF5AnnData") { ad$close() } diff --git a/benchmarks/suites/bench_write.R b/benchmarks/suites/bench_write.R index 569b0788..e1acfddf 100644 --- a/benchmarks/suites/bench_write.R +++ b/benchmarks/suites/bench_write.R @@ -5,7 +5,7 @@ # with different compression settings and X matrix types. 
# ============================================================================= -bench_write <- function(h5ad_paths, iterations, x_types) { +bench_write <- function(h5ad_paths, iterations, x_types, zarr_paths) { results <- list() compressions <- c("none", "gzip") @@ -57,5 +57,50 @@ bench_write <- function(h5ad_paths, iterations, x_types) { } } + # Write to Zarr store + for (xt in x_types) { + path <- zarr_paths[[xt]] + + for (compression in compressions) { + # Write from InMemoryAnnData → Zarr + env <- new.env(parent = globalenv()) + env$.ad <- read_zarr(path, as = "InMemoryAnnData") + env$.compression <- compression + + results <- c( + results, + run_one_benchmark( + name = paste0("write_zarr_InMemory_", xt, "_", compression), + expr = quote({ + .tmp <- tempfile() + .ad$as_ZarrAnnData(.tmp, compression = .compression) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env + ) + ) + + # Write from ZarrAnnData → Zarr + env2 <- new.env(parent = globalenv()) + env2$.ad <- read_zarr(path, as = "ZarrAnnData") + env2$.compression <- compression + + results <- c( + results, + run_one_benchmark( + name = paste0("write_zarr_Zarr_", xt, "_", compression), + expr = quote({ + .tmp <- tempfile() + .ad$as_ZarrAnnData(.tmp, compression = .compression) + unlink(.tmp, recursive = TRUE) + }), + iterations = iterations, + env = env2 + ) + ) + } + } + results } diff --git a/inst/WORDLIST b/inst/WORDLIST index cf69d9fb..96ed5ffc 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,9 +1,7 @@ AbstractAnnData AnnData AnnDataView -aram Bioc -bioc BiocCheck BiocManager CMD @@ -14,24 +12,29 @@ InMemoryAnnData LZF Lifecycle ORCID -pkgdown R's +Rarr ReticulateAnnData -roundtrip SCE Seurat SeuratObject SingleCellExperiment Zarr +ZarrAnnData anndata +aram +bioc dynverse hdf knitr mudata obs +pkgdown png py +py's rhdf +roundtrip scanpy scverse theislab diff --git a/inst/extdata/example.h5ad b/inst/extdata/example.h5ad index a009bfb8..da25ad52 100644 Binary files 
a/inst/extdata/example.h5ad and b/inst/extdata/example.h5ad differ diff --git a/inst/extdata/example_v2.zarr.zip b/inst/extdata/example_v2.zarr.zip new file mode 100644 index 00000000..174f60c1 Binary files /dev/null and b/inst/extdata/example_v2.zarr.zip differ diff --git a/inst/extdata/example_v3.zarr.zip b/inst/extdata/example_v3.zarr.zip new file mode 100644 index 00000000..a28ef934 Binary files /dev/null and b/inst/extdata/example_v3.zarr.zip differ diff --git a/inst/scripts/example_h5ad.py b/inst/scripts/example_files.py similarity index 59% rename from inst/scripts/example_h5ad.py rename to inst/scripts/example_files.py index 9142caaf..24fd19b7 100644 --- a/inst/scripts/example_h5ad.py +++ b/inst/scripts/example_files.py @@ -1,27 +1,59 @@ -# python v3.13.5 -import anndata # anndata v0.11.4 -import scanpy # scanpy v1.11.4 -import numpy # numpy v2.2.6 -import pandas # pandas v2.3.0 -import scipy.sparse # scipy v1.14.1 - -# This script uses Python to create an example H5AD file for testing +# /// script +# requires-python = "==3.14.4" +# dependencies = [ +# "anndata==0.12.10", +# "igraph==1.0.0", +# "leidenalg==0.11.0", +# "scanpy==1.12.1", +# "scipy==1.17.1", +# "zarr==3.1.6", +# ] +# /// +import os +import shutil +import zipfile + +import anndata +import numpy +import pandas +import scanpy +import scipy.sparse + +# This script uses Python to create example H5AD and Zarr files for testing # interoperability between languages. It is designed to be a small but # relatively complex file that tests reading of different types and data -# structures. The standard scanpy workflow has also been applied to populate +# structures. +# +# In order to run the script, install uv (https://docs.astral.sh/uv/) and run: +# +# uv run inst/scripts/example_files.py +# +# The standard scanpy workflow has also been applied to populate # some of the most common information from real analyses. It should be updated # to test new issues as they are discovered. 
# # NOTE: When updating this script for the {anndataR} example H5AD file please # update the package versions used above, update the script version, date and -# changelog below and format the file using Python Black -# (https://black.readthedocs.io/en/stable/). +# changelog below and format the file using Ruff (https://docs.astral.sh/ruff/): # -# Version: 0.2.0 -# Date: 2023-05-11 +# ruff format inst/scripts/example_files.py && ruff check --select I --fix inst/scripts/example_files.py +# +# Version: 0.4.1 +# Date: 2026-04-15 # # CHANGELOG # +# v0.4.1 (2026-04-15) +# - Replace requirements.yml with uv dependency comments +# - Update package versions to latest stable versions +# - Add progress messages +# - Use Ruff for formatting +# v0.4.0 (2025-11-24) +# - Add zarr example +# - Add requirements.yml +# v0.3.0 (2025-08-04) +# - Add adata.varp["test_varp"] to test reading of varp +# - Update package versions to latest stable versions # v0.3.0 (2025-08-04) # - Add adata.varp["test_varp"] to test reading of varp # - Update package versions to latest stable versions @@ -36,6 +68,8 @@ numpy.random.seed(0) +print(">>> Creating AnnData...") + # Randomly generate a counts matrix counts = numpy.random.poisson(2, size=(50, 100)) @@ -74,20 +108,52 @@ adata.uns["String2D"] = [[f"row{i}col{j}" for i in range(10)] for j in range(5)] adata.uns["DataFrameEmpty"] = pandas.DataFrame(index=adata.obs.index) +print("\n>>> Running scanpy workflow...") + # Run the standard scanpy workflow +print("Calculating QC metrics...") scanpy.pp.calculate_qc_metrics(adata, percent_top=None, inplace=True) +print("Normalizing..") scanpy.pp.normalize_total(adata, inplace=True) adata.layers["dense_X"] = adata.X.copy().toarray() scanpy.pp.log1p(adata) +print("Finding highly variable genes...") scanpy.pp.highly_variable_genes(adata) +print("Calculating PCA...") scanpy.tl.pca(adata) +print("Finding neighbors...") scanpy.pp.neighbors(adata) +print("Calculating UMAP...") scanpy.tl.umap(adata) 
+print("Calculating Leiden clusters...") scanpy.tl.leiden(adata) +print("Calculating marker genes...") scanpy.tl.rank_genes_groups(adata, "leiden") # add varp to test reading of varp adata.varp["test_varp"] = numpy.random.rand(adata.n_vars, adata.n_vars) -# Write the H5AD file +print("\n>>> Writing H5AD file...") adata.write_h5ad("inst/extdata/example.h5ad", compression="gzip") + +# Write Zarr files in both v2 and v3 formats and zip them +os.chdir("inst/extdata/") +for fmt in (2, 3): + anndata.settings.zarr_write_format = fmt + + zarr_dir = f"example_v{fmt}.zarr" + zip_path = f"{zarr_dir}.zip" + + print(f"\n>>> Writing Zarr v{fmt} file...") + adata.write_zarr(zarr_dir) + + print(f"Zipping Zarr v{fmt} file...") + + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z: + for root, dirs, files in os.walk(zarr_dir): + for file in files: + z.write(os.path.join(root, file)) + + shutil.rmtree(zarr_dir) + +print("\n>>> Done!") diff --git a/man/AbstractAnnData.Rd b/man/AbstractAnnData.Rd index 4f7a1dc6..3da4ca48 100644 --- a/man/AbstractAnnData.Rd +++ b/man/AbstractAnnData.Rd @@ -20,7 +20,8 @@ Other AnnData classes: \code{\link{AnnDataView}}, \code{\link{HDF5AnnData}}, \code{\link{InMemoryAnnData}}, -\code{\link{ReticulateAnnData}} +\code{\link{ReticulateAnnData}}, +\code{\link{ZarrAnnData}} } \concept{AnnData classes} \section{Active bindings}{ @@ -70,7 +71,9 @@ Other AnnData classes: \item \href{#method-AbstractAnnData-as_InMemoryAnnData}{\code{AbstractAnnData$as_InMemoryAnnData()}} \item \href{#method-AbstractAnnData-as_ReticulateAnnData}{\code{AbstractAnnData$as_ReticulateAnnData()}} \item \href{#method-AbstractAnnData-as_HDF5AnnData}{\code{AbstractAnnData$as_HDF5AnnData()}} +\item \href{#method-AbstractAnnData-as_ZarrAnnData}{\code{AbstractAnnData$as_ZarrAnnData()}} \item \href{#method-AbstractAnnData-write_h5ad}{\code{AbstractAnnData$write_h5ad()}} +\item \href{#method-AbstractAnnData-write_zarr}{\code{AbstractAnnData$write_zarr()}} \item 
\href{#method-AbstractAnnData-clone}{\code{AbstractAnnData$clone()}} } } @@ -355,6 +358,36 @@ An \code{\link{HDF5AnnData}} object } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-AbstractAnnData-as_ZarrAnnData}{}}} +\subsection{Method \code{as_ZarrAnnData()}}{ +Convert to a \code{\link{ZarrAnnData}} + +See \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}} for more details on the conversion +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{AbstractAnnData$as_ZarrAnnData( + file, + compression = c("none", "gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4"), + mode = c("w-", "r", "r+", "a", "w", "x") +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{file}}{See \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}}} + +\item{\code{compression}}{See \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}}} + +\item{\code{mode}}{See \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A \code{\link{ZarrAnnData}} object +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-AbstractAnnData-write_h5ad}{}}} \subsection{Method \code{write_h5ad()}}{ @@ -388,6 +421,36 @@ See \code{\link[=write_h5ad]{write_h5ad()}} for details } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-AbstractAnnData-write_zarr}{}}} +\subsection{Method \code{write_zarr()}}{ +Write the \code{AnnData} object to a Zarr file + +See \code{\link[=write_zarr]{write_zarr()}} for details +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{AbstractAnnData$write_zarr( + path, + compression = c("none", "gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4"), + mode = c("w-", "r", "r+", "a", "w", "x") +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{See \code{\link[=write_zarr]{write_zarr()}}} + +\item{\code{compression}}{See \code{\link[=write_zarr]{write_zarr()}}} + +\item{\code{mode}}{See \code{\link[=write_zarr]{write_zarr()}}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +\code{path} invisibly +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-AbstractAnnData-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/AnnData-usage.Rd b/man/AnnData-usage.Rd index de99618e..5f3ec266 100644 --- a/man/AnnData-usage.Rd +++ b/man/AnnData-usage.Rd @@ -18,6 +18,7 @@ abstract \link{AbstractAnnData} class and store and access data in different way \itemize{ \item \link{InMemoryAnnData} stores data in memory \item \link{HDF5AnnData} provides an interface to a H5AD file +\item \link{ZarrAnnData} provides an interface to a Zarr store \item \link{ReticulateAnnData} wraps a Python \code{AnnData} object via \pkg{reticulate} } @@ -124,6 +125,7 @@ Convert to \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExpe \item{\code{as_Seurat()}}{Convert to \code{\link[SeuratObject:Seurat-class]{SeuratObject::Seurat}}, see \code{\link[=as_Seurat]{as_Seurat()}}} \item{\code{as_InMemoryAnnData()}}{Convert to \code{\link{InMemoryAnnData}}, as \code{\link[=as_InMemoryAnnData]{as_InMemoryAnnData()}}} \item{\code{as_HDF5AnnData()}}{Convert to \code{\link{HDF5AnnData}}, see \code{\link[=as_HDF5AnnData]{as_HDF5AnnData()}}} +\item{\code{as_ZarrAnnData()}}{Convert to \code{\link{ZarrAnnData}}, see \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}}} \item{\code{as_ReticulateAnnData()}}{Convert to \code{\link{ReticulateAnnData}}, see \code{\link[=as_ReticulateAnnData]{as_ReticulateAnnData()}}} } } @@ -137,6 +139,14 @@ Convert to \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExpe Write the \code{AnnData} object to an HDF5 file, see \code{\link[=write_h5ad]{write_h5ad()}} } } + +\describe{ +\item{ +\code{write_zarr()} +}{ +Write the \code{AnnData} object to a Zarr store, see \code{\link[=write_zarr]{write_zarr()}} +} +} } \subsection{General methods:}{ @@ -153,6 +163,7 @@ Write the \code{AnnData} object to an HDF5 file, see \code{\link[=write_h5ad]{wr \describe{ \item{\code{\link[=AnnData]{AnnData()}}}{Create an \link{InMemoryAnnData} object} 
\item{\code{\link[=read_h5ad]{read_h5ad()}}}{Read an \code{AnnData} from a H5AD file} +\item{\code{\link[=read_zarr]{read_zarr()}}}{Read an \code{AnnData} from a Zarr store} \item{\code{\link[=as_AnnData]{as_AnnData()}}}{Convert other objects to an \code{AnnData} object} } } @@ -168,5 +179,7 @@ inherit from \link{HDF5AnnData} for the HDF5-backed implementation of \code{AnnData} +\link{ZarrAnnData} for the Zarr-backed implementation of \code{AnnData} + \link{ReticulateAnnData} for the reticulate-based implementation that wraps Python AnnData objects } diff --git a/man/AnnData.Rd b/man/AnnData.Rd index f8074138..a1399c7b 100644 --- a/man/AnnData.Rd +++ b/man/AnnData.Rd @@ -66,6 +66,7 @@ adata Other AnnData creators: \code{\link{as_AnnData}()}, -\code{\link{read_h5ad}()} +\code{\link{read_h5ad}()}, +\code{\link{read_zarr}()} } \concept{AnnData creators} diff --git a/man/AnnDataView.Rd b/man/AnnDataView.Rd index b0a7131e..b5740cc4 100644 --- a/man/AnnDataView.Rd +++ b/man/AnnDataView.Rd @@ -34,7 +34,8 @@ Other AnnData classes: \code{\link{AbstractAnnData}}, \code{\link{HDF5AnnData}}, \code{\link{InMemoryAnnData}}, -\code{\link{ReticulateAnnData}} +\code{\link{ReticulateAnnData}}, +\code{\link{ZarrAnnData}} } \concept{AnnData classes} \section{Super class}{ @@ -83,6 +84,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$as_ReticulateAnnData()
  • anndataR::AbstractAnnData$as_Seurat()
  • anndataR::AbstractAnnData$as_SingleCellExperiment()
  • +
  • anndataR::AbstractAnnData$as_ZarrAnnData()
  • anndataR::AbstractAnnData$layers_keys()
  • anndataR::AbstractAnnData$n_obs()
  • anndataR::AbstractAnnData$n_vars()
  • @@ -96,6 +98,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$varm_keys()
  • anndataR::AbstractAnnData$varp_keys()
  • anndataR::AbstractAnnData$write_h5ad()
  • +
  • anndataR::AbstractAnnData$write_zarr()
  • }} diff --git a/man/HDF5AnnData.Rd b/man/HDF5AnnData.Rd index 3100894f..310204c1 100644 --- a/man/HDF5AnnData.Rd +++ b/man/HDF5AnnData.Rd @@ -22,7 +22,8 @@ Other AnnData classes: \code{\link{AbstractAnnData}}, \code{\link{AnnDataView}}, \code{\link{InMemoryAnnData}}, -\code{\link{ReticulateAnnData}} +\code{\link{ReticulateAnnData}}, +\code{\link{ZarrAnnData}} } \concept{AnnData classes} \section{Super class}{ @@ -80,9 +81,11 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$as_ReticulateAnnData()
  • anndataR::AbstractAnnData$as_Seurat()
  • anndataR::AbstractAnnData$as_SingleCellExperiment()
  • +
  • anndataR::AbstractAnnData$as_ZarrAnnData()
  • anndataR::AbstractAnnData$print()
  • anndataR::AbstractAnnData$shape()
  • anndataR::AbstractAnnData$write_h5ad()
  • +
  • anndataR::AbstractAnnData$write_zarr()
  • }} diff --git a/man/InMemoryAnnData.Rd b/man/InMemoryAnnData.Rd index d397cb6e..5be3f56b 100644 --- a/man/InMemoryAnnData.Rd +++ b/man/InMemoryAnnData.Rd @@ -41,7 +41,8 @@ Other AnnData classes: \code{\link{AbstractAnnData}}, \code{\link{AnnDataView}}, \code{\link{HDF5AnnData}}, -\code{\link{ReticulateAnnData}} +\code{\link{ReticulateAnnData}}, +\code{\link{ZarrAnnData}} } \concept{AnnData classes} \section{Super class}{ @@ -89,6 +90,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$as_ReticulateAnnData()
  • anndataR::AbstractAnnData$as_Seurat()
  • anndataR::AbstractAnnData$as_SingleCellExperiment()
  • +
  • anndataR::AbstractAnnData$as_ZarrAnnData()
  • anndataR::AbstractAnnData$layers_keys()
  • anndataR::AbstractAnnData$n_obs()
  • anndataR::AbstractAnnData$n_vars()
  • @@ -102,6 +104,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$varm_keys()
  • anndataR::AbstractAnnData$varp_keys()
  • anndataR::AbstractAnnData$write_h5ad()
  • +
  • anndataR::AbstractAnnData$write_zarr()
  • }} diff --git a/man/ReticulateAnnData.Rd b/man/ReticulateAnnData.Rd index 2b5ac2cc..8d4a3daa 100644 --- a/man/ReticulateAnnData.Rd +++ b/man/ReticulateAnnData.Rd @@ -22,7 +22,8 @@ Other AnnData classes: \code{\link{AbstractAnnData}}, \code{\link{AnnDataView}}, \code{\link{HDF5AnnData}}, -\code{\link{InMemoryAnnData}} +\code{\link{InMemoryAnnData}}, +\code{\link{ZarrAnnData}} } \concept{AnnData classes} \section{Super class}{ @@ -72,6 +73,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$as_ReticulateAnnData()
  • anndataR::AbstractAnnData$as_Seurat()
  • anndataR::AbstractAnnData$as_SingleCellExperiment()
  • +
  • anndataR::AbstractAnnData$as_ZarrAnnData()
  • anndataR::AbstractAnnData$layers_keys()
  • anndataR::AbstractAnnData$obs_keys()
  • anndataR::AbstractAnnData$obsm_keys()
  • @@ -83,6 +85,7 @@ Other AnnData classes:
  • anndataR::AbstractAnnData$varm_keys()
  • anndataR::AbstractAnnData$varp_keys()
  • anndataR::AbstractAnnData$write_h5ad()
  • +
  • anndataR::AbstractAnnData$write_zarr()
  • }} diff --git a/man/ZarrAnnData.Rd b/man/ZarrAnnData.Rd new file mode 100644 index 00000000..4e48bc3d --- /dev/null +++ b/man/ZarrAnnData.Rd @@ -0,0 +1,257 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ZarrAnnData.R +\name{ZarrAnnData} +\alias{ZarrAnnData} +\title{ZarrAnnData} +\value{ +A \code{ZarrAnnData} object +} +\description{ +Implementation of a Zarr-backed \code{AnnData} object. This class provides an +interface to a Zarr file and minimal data is stored in memory until it is +requested by the user. It is primarily designed as an intermediate object +when reading/writing Zarr files but can be useful for accessing parts of +large files. + +See \link{AnnData-usage} for details on creating and using \code{AnnData} objects. +} +\seealso{ +\link{AnnData-usage} for details on creating and using \code{AnnData} objects + +Other AnnData classes: +\code{\link{AbstractAnnData}}, +\code{\link{AnnDataView}}, +\code{\link{HDF5AnnData}}, +\code{\link{InMemoryAnnData}}, +\code{\link{ReticulateAnnData}} +} +\concept{AnnData classes} +\section{Super class}{ +\code{\link[anndataR:AbstractAnnData]{anndataR::AbstractAnnData}} -> \code{ZarrAnnData} +} +\section{Active bindings}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{X}}{See \link{AnnData-usage}} + +\item{\code{layers}}{See \link{AnnData-usage}} + +\item{\code{obsm}}{See \link{AnnData-usage}} + +\item{\code{varm}}{See \link{AnnData-usage}} + +\item{\code{obsp}}{See \link{AnnData-usage}} + +\item{\code{varp}}{See \link{AnnData-usage}} + +\item{\code{obs}}{See \link{AnnData-usage}} + +\item{\code{var}}{See \link{AnnData-usage}} + +\item{\code{obs_names}}{See \link{AnnData-usage}} + +\item{\code{var_names}}{See \link{AnnData-usage}} + +\item{\code{uns}}{See \link{AnnData-usage}} +} +\if{html}{\out{
    }} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-ZarrAnnData-new}{\code{ZarrAnnData$new()}} +\item \href{#method-ZarrAnnData-n_obs}{\code{ZarrAnnData$n_obs()}} +\item \href{#method-ZarrAnnData-n_vars}{\code{ZarrAnnData$n_vars()}} +\item \href{#method-ZarrAnnData-obs_keys}{\code{ZarrAnnData$obs_keys()}} +\item \href{#method-ZarrAnnData-var_keys}{\code{ZarrAnnData$var_keys()}} +\item \href{#method-ZarrAnnData-layers_keys}{\code{ZarrAnnData$layers_keys()}} +\item \href{#method-ZarrAnnData-obsm_keys}{\code{ZarrAnnData$obsm_keys()}} +\item \href{#method-ZarrAnnData-varm_keys}{\code{ZarrAnnData$varm_keys()}} +\item \href{#method-ZarrAnnData-obsp_keys}{\code{ZarrAnnData$obsp_keys()}} +\item \href{#method-ZarrAnnData-varp_keys}{\code{ZarrAnnData$varp_keys()}} +\item \href{#method-ZarrAnnData-uns_keys}{\code{ZarrAnnData$uns_keys()}} +} +} +\if{html}{\out{ +
    Inherited methods + +
    +}} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-new}{}}} +\subsection{Method \code{new()}}{ +\code{ZarrAnnData} constructor +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$new( + file, + X = NULL, + obs = NULL, + var = NULL, + layers = NULL, + obsm = NULL, + varm = NULL, + obsp = NULL, + varp = NULL, + uns = NULL, + shape = NULL, + mode = c("a", "r", "r+", "w", "w-", "x"), + compression = c("none", "gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4") +)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{file}}{The file name (character) of the \code{.zarr} file. If this file +already exists, other arguments must be \code{NULL}.} + +\item{\code{X}}{See the \code{X} slot in \link{AnnData-usage}} + +\item{\code{obs}}{See the \code{obs} slot in \link{AnnData-usage}} + +\item{\code{var}}{See the \code{var} slot in \link{AnnData-usage}} + +\item{\code{layers}}{See the \code{layers} slot in \link{AnnData-usage}} + +\item{\code{obsm}}{See the \code{obsm} slot in \link{AnnData-usage}} + +\item{\code{varm}}{See the \code{varm} slot in \link{AnnData-usage}} + +\item{\code{obsp}}{See the \code{obsp} slot in \link{AnnData-usage}} + +\item{\code{varp}}{See the \code{varp} slot in \link{AnnData-usage}} + +\item{\code{uns}}{See the \code{uns} slot in \link{AnnData-usage}} + +\item{\code{shape}}{Shape tuple (e.g. \code{c(n_obs, n_vars)}). Can be provided if +both \code{X} or \code{obs} and \code{var} are not provided.} + +\item{\code{mode}}{The mode to open the Zarr file. See \code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}} for +details} + +\item{\code{compression}}{The compression algorithm to use. See +\code{\link[=as_ZarrAnnData]{as_ZarrAnnData()}} for details} +} +\if{html}{\out{
    }} +} +\subsection{Details}{ +The constructor creates a new Zarr \code{AnnData} interface object. This can +be used either to connect to an existing \code{.zarr} file or to +create a new one. If any additional slot arguments are set, an existing +file will be overwritten. +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-n_obs}{}}} +\subsection{Method \code{n_obs()}}{ +See the \code{n_obs} field in \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$n_obs()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-n_vars}{}}} +\subsection{Method \code{n_vars()}}{ +See the \code{n_vars} field in \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$n_vars()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-obs_keys}{}}} +\subsection{Method \code{obs_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$obs_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-var_keys}{}}} +\subsection{Method \code{var_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$var_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-layers_keys}{}}} +\subsection{Method \code{layers_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$layers_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-obsm_keys}{}}} +\subsection{Method \code{obsm_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$obsm_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-varm_keys}{}}} +\subsection{Method \code{varm_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$varm_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-obsp_keys}{}}} +\subsection{Method \code{obsp_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$obsp_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-varp_keys}{}}} +\subsection{Method \code{varp_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$varp_keys()}\if{html}{\out{
    }} +} + +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ZarrAnnData-uns_keys}{}}} +\subsection{Method \code{uns_keys()}}{ +See \link{AnnData-usage} +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{ZarrAnnData$uns_keys()}\if{html}{\out{
    }} +} + +} +} diff --git a/man/as_AnnData.Rd b/man/as_AnnData.Rd index b9fd4028..6867c8b3 100644 --- a/man/as_AnnData.Rd +++ b/man/as_AnnData.Rd @@ -18,7 +18,7 @@ as_AnnData( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = NULL, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c("InMemory", "HDF5AnnData", "ZarrAnnData", "ReticulateAnnData"), ... ) @@ -34,7 +34,7 @@ as_AnnData( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = TRUE, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c("InMemory", "HDF5AnnData", "ZarrAnnData", "ReticulateAnnData"), ... ) @@ -50,7 +50,7 @@ as_AnnData( varp_mapping = TRUE, uns_mapping = TRUE, assay_name = NULL, - output_class = c("InMemory", "HDF5AnnData", "ReticulateAnnData"), + output_class = c("InMemory", "HDF5AnnData", "ZarrAnnData", "ReticulateAnnData"), ... ) } @@ -243,7 +243,8 @@ as_AnnData(sce) \seealso{ Other AnnData creators: \code{\link{AnnData}()}, -\code{\link{read_h5ad}()} +\code{\link{read_h5ad}()}, +\code{\link{read_zarr}()} Other object converters: \code{\link{as_HDF5AnnData}()}, @@ -251,6 +252,7 @@ Other object converters: \code{\link{as_ReticulateAnnData}()}, \code{\link{as_Seurat}()}, \code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{AnnData creators} diff --git a/man/as_HDF5AnnData.Rd b/man/as_HDF5AnnData.Rd index 13ca7e33..fe3c8bbe 100644 --- a/man/as_HDF5AnnData.Rd +++ b/man/as_HDF5AnnData.Rd @@ -50,6 +50,7 @@ Other object converters: \code{\link{as_ReticulateAnnData}()}, \code{\link{as_Seurat}()}, \code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{object converters} diff --git a/man/as_InMemoryAnnData.Rd b/man/as_InMemoryAnnData.Rd index c1e0108a..d803276e 100644 --- a/man/as_InMemoryAnnData.Rd +++ b/man/as_InMemoryAnnData.Rd @@ -35,6 +35,7 @@ Other object converters: 
\code{\link{as_ReticulateAnnData}()}, \code{\link{as_Seurat}()}, \code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{object converters} diff --git a/man/as_ReticulateAnnData.Rd b/man/as_ReticulateAnnData.Rd index 79a75ada..d62660c0 100644 --- a/man/as_ReticulateAnnData.Rd +++ b/man/as_ReticulateAnnData.Rd @@ -41,6 +41,7 @@ Other object converters: \code{\link{as_InMemoryAnnData}()}, \code{\link{as_Seurat}()}, \code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{object converters} diff --git a/man/as_Seurat.Rd b/man/as_Seurat.Rd index 9da4e682..dfafff57 100644 --- a/man/as_Seurat.Rd +++ b/man/as_Seurat.Rd @@ -159,6 +159,7 @@ Other object converters: \code{\link{as_InMemoryAnnData}()}, \code{\link{as_ReticulateAnnData}()}, \code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{object converters} diff --git a/man/as_SingleCellExperiment.Rd b/man/as_SingleCellExperiment.Rd index 6058f00e..c6b4d4fc 100644 --- a/man/as_SingleCellExperiment.Rd +++ b/man/as_SingleCellExperiment.Rd @@ -165,6 +165,7 @@ Other object converters: \code{\link{as_InMemoryAnnData}()}, \code{\link{as_ReticulateAnnData}()}, \code{\link{as_Seurat}()}, +\code{\link{as_ZarrAnnData}()}, \code{\link{reticulate-helpers}} } \concept{object converters} diff --git a/man/as_ZarrAnnData.Rd b/man/as_ZarrAnnData.Rd new file mode 100644 index 00000000..1bb8c985 --- /dev/null +++ b/man/as_ZarrAnnData.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ZarrAnnData.R +\name{as_ZarrAnnData} +\alias{as_ZarrAnnData} +\title{Convert an \code{AnnData} to an \code{ZarrAnnData}} +\usage{ +as_ZarrAnnData( + adata, + file, + compression = c("none", "gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4"), + mode = c("w-", "r", "r+", "a", "w", "x") +) +} +\arguments{ +\item{adata}{An 
\code{AnnData} object to be converted to \code{\link{ZarrAnnData}}} + +\item{file}{The file name (character) of the \code{.zarr} file} + +\item{compression}{The compression algorithm to use when writing the +Zarr file. Can be one of \code{"none"}, \code{"gzip"}, \code{"blosc"}, \code{"zstd"}, +\code{"lzma"}, \code{"bz2"}, \code{"zlib"} or \code{"lz4"}. Defaults to \code{"none"}.} + +\item{mode}{The mode to open the Zarr file: +\itemize{ +\item \code{a} creates a new file or opens an existing one for read/write +\item \code{r} opens an existing file for reading +\item \verb{r+} opens an existing file for read/write +\item \code{w} creates a file, truncating any existing ones +\item \verb{w-}/\code{x} are synonyms, creating a file and failing if it already exists +}} +} +\value{ +A \code{\link{ZarrAnnData}} object with the same data as the input \code{AnnData} +object. +} +\description{ +Convert another \code{AnnData} object to an \code{\link{ZarrAnnData}} object +} +\seealso{ +Other object converters: +\code{\link{as_AnnData}()}, +\code{\link{as_HDF5AnnData}()}, +\code{\link{as_InMemoryAnnData}()}, +\code{\link{as_ReticulateAnnData}()}, +\code{\link{as_Seurat}()}, +\code{\link{as_SingleCellExperiment}()}, +\code{\link{reticulate-helpers}} +} +\concept{object converters} +\keyword{internal} diff --git a/man/read_h5ad.Rd b/man/read_h5ad.Rd index fe9bd5bd..b6aaed24 100644 --- a/man/read_h5ad.Rd +++ b/man/read_h5ad.Rd @@ -59,6 +59,7 @@ if (requireNamespace("SeuratObject", quietly = TRUE)) { \seealso{ Other AnnData creators: \code{\link{AnnData}()}, -\code{\link{as_AnnData}()} +\code{\link{as_AnnData}()}, +\code{\link{read_zarr}()} } \concept{AnnData creators} diff --git a/man/read_zarr.Rd b/man/read_zarr.Rd new file mode 100644 index 00000000..02575b75 --- /dev/null +++ b/man/read_zarr.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_zarr.R +\name{read_zarr} +\alias{read_zarr} +\title{Read Zarr} +\usage{ 
+read_zarr( + path, + as = c("InMemoryAnnData", "ZarrAnnData", "SingleCellExperiment", "Seurat"), + mode = c("r", "r+", "a", "w", "w-", "x"), + ... +) +} +\arguments{ +\item{path}{Path to the Zarr store to read} + +\item{as}{The type of object to return. One of: +\itemize{ +\item \code{"InMemoryAnnData"}: Read the Zarr store into memory as an +\code{\link{InMemoryAnnData}} object +\item \code{"ZarrAnnData"}: Read the Zarr store as an \code{\link{ZarrAnnData}} object +\item \code{"SingleCellExperiment"}: Read the Zarr store as a +\code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment::SingleCellExperiment}} object +\item \code{"Seurat"}: Read the Zarr store as a +\code{\link[SeuratObject:Seurat-class]{SeuratObject::Seurat}} object +}} + +\item{mode}{The mode to open the Zarr file. +\itemize{ +\item \code{a} creates a new file or opens an existing one for read/write. +\item \code{r} opens an existing file for reading. +\item \verb{r+} opens an existing file for read/write. +\item \code{w} creates a file, truncating any existing ones. +\item \verb{w-}/\code{x} are synonyms, creating a file and failing if it already exists. 
+}} + +\item{...}{Extra arguments provided to the \verb{as_*} conversion function for the +object specified by \code{as}} +} +\value{ +The object specified by \code{as} +} +\description{ +Read data from a Zarr store +} +\examples{ +# Please use "example_v3.zarr.zip" for AnnData stored as Zarr version 3 +zarr_dir <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_dir, exdir = td) +zarr_store <- file.path(td, "example_v2.zarr") + +# Read the Zarr as a SingleCellExperiment object +if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { + sce <- read_zarr(zarr_store, as = "SingleCellExperiment") +} + +# Read the Zarr as a Seurat object +if (requireNamespace("SeuratObject", quietly = TRUE)) { + seurat <- read_zarr(zarr_store, as = "Seurat") +} +} +\seealso{ +Other AnnData creators: +\code{\link{AnnData}()}, +\code{\link{as_AnnData}()}, +\code{\link{read_h5ad}()} +} +\concept{AnnData creators} diff --git a/man/reticulate-helpers.Rd b/man/reticulate-helpers.Rd index f0759403..17c5cd95 100644 --- a/man/reticulate-helpers.Rd +++ b/man/reticulate-helpers.Rd @@ -71,6 +71,7 @@ Other object converters: \code{\link{as_InMemoryAnnData}()}, \code{\link{as_ReticulateAnnData}()}, \code{\link{as_Seurat}()}, -\code{\link{as_SingleCellExperiment}()} +\code{\link{as_SingleCellExperiment}()}, +\code{\link{as_ZarrAnnData}()} } \concept{object converters} diff --git a/man/write_zarr.Rd b/man/write_zarr.Rd new file mode 100644 index 00000000..002379c3 --- /dev/null +++ b/man/write_zarr.Rd @@ -0,0 +1,109 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_zarr.R +\name{write_zarr} +\alias{write_zarr} +\title{Write Zarr} +\usage{ +write_zarr( + object, + path, + compression = c("none", "gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4"), + mode = c("w-", "r", "r+", "a", "w", "x"), + ... 
+) +} +\arguments{ +\item{object}{The object to write, either a +\code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment::SingleCellExperiment}} or a +\code{\link[SeuratObject:Seurat-class]{SeuratObject::Seurat}} object} + +\item{path}{Path of the file to write to} + +\item{compression}{The compression algorithm to use when writing the Zarr +file. Can be one of \code{"none"}, \code{"gzip"}, \code{"blosc"}, \code{"zstd"}, +\code{"lzma"}, \code{"bz2"}, \code{"zlib"}, \code{"lz4"}. Defaults to \code{"none"}. +See \code{help("compressors", package = "Rarr")}.} + +\item{mode}{The mode to open the Zarr file. +\itemize{ +\item \code{a} creates a new file or opens an existing one for read/write +\item \verb{r+} opens an existing file for read/write +\item \code{w} creates a file, truncating any existing ones +\item \verb{w-}/\code{x} are synonyms creating a file and failing if it already exists +}} + +\item{...}{Additional arguments passed to \code{\link[=as_AnnData]{as_AnnData()}}} +} +\value{ +\code{path} invisibly +} +\description{ +Write a Zarr file +} +\details{ +\subsection{\code{NULL} values}{ + +For compatibility with changes in Python \strong{anndata} 0.12.0, \code{NULL} values +in \code{uns} are written to Zarr files as a \code{NULL} dataset (instead of not being +written at all). To disable this behaviour, set +\code{options(anndataR.write_null = FALSE)}. This may be required to allow the file +to be read by older versions of Python \strong{anndata}. 
+} +} +\examples{ +adata <- AnnData( + X = matrix(1:5, 3L, 5L), + layers = list( + A = matrix(5:1, 3L, 5L), + B = matrix(letters[1:5], 3L, 5L) + ), + obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), + var = data.frame(row.names = letters[1:5], gene = 1:5) +) +zarr_store <- tempfile(fileext = ".zarr") +adata$write_zarr(zarr_store) + +# Write a SingleCellExperiment as a Zarr store +if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { + ncells <- 100 + counts <- matrix(rpois(20000, 5), ncol = ncells) + logcounts <- log2(counts + 1) + + pca <- matrix(runif(ncells * 5), ncells) + tsne <- matrix(rnorm(ncells * 2), ncells) + + sce <- SingleCellExperiment::SingleCellExperiment( + assays = list(counts = counts, logcounts = logcounts), + reducedDims = list(PCA = pca, tSNE = tsne) + ) + + adata <- as_AnnData(sce) + zarr_store <- tempfile(fileext = ".zarr") + adata$write_zarr(zarr_store) +} + +# Write a Seurat as a Zarr +if (requireNamespace("Seurat", quietly = TRUE)) { + library(Seurat) + + counts <- matrix(1:15, 5L, 3L) + dimnames(counts) <- list( + LETTERS[1:5], + letters[1:3] + ) + cell.metadata <- data.frame( + row.names = letters[1:3], + cell = 1:3 + ) + obj <- CreateSeuratObject(counts, meta.data = cell.metadata) + gene.metadata <- data.frame( + row.names = LETTERS[1:5], + gene = 1:5 + ) + obj[["RNA"]] <- AddMetaData(GetAssay(obj), gene.metadata) + + adata <- as_AnnData(obj) + zarr_store <- tempfile(fileext = ".zarr") + adata$write_zarr(zarr_store) +} +} diff --git a/tests/testthat/helper-roundtrip.R b/tests/testthat/helper-roundtrip.R new file mode 100644 index 00000000..e64a425b --- /dev/null +++ b/tests/testthat/helper-roundtrip.R @@ -0,0 +1,48 @@ +#' Get format config for roundtrip tests +#' +#' Get a list of backend-specific values for a given file format +#' +#' @param fmt Either `"h5ad"` or `"zarr"` +#' @return A named list with elements: `backend`, `ext`, `r_read_fun`, +#' `r_write_fun`, `py_read_method`, `py_write_method` +get_fmt_config <- 
function(fmt = c("h5ad", "zarr")) { + fmt <- match.arg(fmt) + + if (fmt == "zarr") { + skip_if_no_zarr() # nolint: object_usage_linter + list( + backend = "ZarrAnnData", + ext = ".zarr", + r_read_fun = read_zarr, + r_write_fun = write_zarr, + py_read_method = "read_zarr", + py_write_method = "write_zarr" + ) + } else { + list( + backend = "HDF5AnnData", + ext = ".h5ad", + r_read_fun = read_h5ad, + r_write_fun = write_h5ad, + py_read_method = "read_h5ad", + py_write_method = "write_h5ad" + ) + } +} + +#' Expect AnnData print output to match +#' +#' Compares the print output of an R AnnData object with a Python AnnData +#' object, normalising backend class names before comparing. +#' +#' @param adata_r An R AnnData object +#' @param adata_py A Python AnnData object +expect_anndata_print_equal <- function(adata_r, adata_py) { + str_r <- capture.output(print(adata_r)) + str_py <- capture.output(print(adata_py)) + + # Normalise class names in R output to match Python output + str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) + + expect_equal(str_r, str_py) +} diff --git a/tests/testthat/helper-skip_if_no_zarr.R b/tests/testthat/helper-skip_if_no_zarr.R new file mode 100644 index 00000000..7e43774e --- /dev/null +++ b/tests/testthat/helper-skip_if_no_zarr.R @@ -0,0 +1,14 @@ +# helper function to skip tests if we don't have the Python 'zarr' module +# or the R {reticulate} package +skip_if_no_zarr <- function() { + testthat::skip_if_not_installed("reticulate") + reticulate::py_require("zarr") + testthat::skip_if_not( + reticulate::py_module_available("zarr"), + message = "Python zarr module not available for testing" + ) + + # TODO: Remove when this warning is removed from anndata + wn <- reticulate::import("warnings") + wn$filterwarnings("ignore", message="Writing zarr v2 data will no longer be the default") +} diff --git a/tests/testthat/test-Zarr-read.R b/tests/testthat/test-Zarr-read.R new file mode 100644 index 00000000..cf6fab81 --- /dev/null +++ 
b/tests/testthat/test-Zarr-read.R @@ -0,0 +1,169 @@ +skip_if_not_installed("Rarr") + +for (zarr_version in c("v2", "v3")) { + zarr_zip <- system.file( + "extdata", + paste0("example_", zarr_version, ".zarr.zip"), + package = "anndataR" + ) + td <- tempdir(check = TRUE) + unzip(zarr_zip, exdir = td) + store <- file.path(td, paste0("example_", zarr_version, ".zarr")) + + test_that(paste("reading Zarr", zarr_version, "encoding works"), { + encoding <- read_zarr_encoding(store, "obs") + expect_equal(names(encoding), c("type", "version")) + }) + + test_that(paste("reading Zarr", zarr_version, "dense matrices works"), { + mat <- read_zarr_dense_array(store, "layers/dense_counts") + expect_true(is.matrix(mat)) + expect_type(mat, "integer") + expect_equal(dim(mat), c(50, 100)) + + mat <- read_zarr_dense_array(store, "layers/dense_X") + expect_true(is.matrix(mat)) + expect_type(mat, "double") + expect_equal(dim(mat), c(50, 100)) + }) + + test_that(paste("reading Zarr", zarr_version, "sparse matrices works"), { + mat <- read_zarr_sparse_array(store, "layers/csc_counts", type = "csc") + expect_s4_class(mat, "dgCMatrix") + expect_equal(dim(mat), c(50, 100)) + + mat <- read_zarr_sparse_array(store, "layers/counts", type = "csr") + expect_s4_class(mat, "dgRMatrix") + expect_equal(dim(mat), c(50, 100)) + }) + + # TODO: Re-enable when recarrays are handled consistently, see https://github.com/scverse/anndataR/issues/409 + test_that(paste("reading Zarr", zarr_version, "recarrays works"), { + if (zarr_version == "v3") { + skip("Read support for Zarr v3 rec arrays is not implemented yet") + } + array_list <- read_zarr_rec_array( + store, + "uns/rank_genes_groups/logfoldchanges" + ) + expect_true(is.list(array_list)) + for (array in array_list) { + expect_true(is.vector(array)) + expect_type(array, "double") + expect_equal(length(array), 6) + } + }) + + test_that(paste("reading Zarr", zarr_version, "1D numeric arrays works"), { + array_1d <- read_zarr_dense_array(store, "obs/Int") + 
expect_equal(array_1d, array(0L:49L)) + + array_1d <- read_zarr_dense_array(store, "obs/Float") + expect_equal(array_1d, array(rep(42.42, 50))) + }) + + test_that( + paste("reading Zarr", zarr_version, "1D sparse numeric arrays works"), + { + array_1d <- read_zarr_sparse_array(store, "uns/Sparse1D", type = "csc") + expect_s4_class(array_1d, "dgCMatrix") + expect_equal(dim(array_1d), c(1, 6)) + } + ) + + test_that(paste("reading Zarr", zarr_version, "1D nullable arrays works"), { + array_1d <- read_zarr_nullable_integer(store, "obs/IntNA") + expect_vector(array_1d, ptype = integer(), size = 50) + expect_true(any(is.na(array_1d))) + + array_1d <- read_zarr_dense_array(store, "obs/FloatNA") + expected <- array(rep(42.42, 50)) + expected[1] <- NA + expect_equal(array_1d, expected) + + array_1d <- read_zarr_nullable_boolean(store, "obs/BoolNA") + expect_vector(array_1d, ptype = logical(), size = 50) + expect_true(any(is.na(array_1d))) + }) + + test_that(paste("reading Zarr", zarr_version, "string scalars works"), { + scalar <- read_zarr_string_scalar(store, "uns/StringScalar") + expect_equal(scalar, "A string") + }) + + test_that(paste("reading Zarr", zarr_version, "numeric scalars works"), { + scalar <- read_zarr_numeric_scalar(store, "uns/IntScalar") + expect_equal(scalar, 1) + }) + + test_that(paste("reading Zarr", zarr_version, "string arrays works"), { + array <- read_zarr_string_array(store, "uns/String") + expect_equal(array, array(paste0("String ", 0L:9L))) + + array <- read_zarr_string_array(store, "uns/String2D") + expect_true(is.matrix(array)) + expect_type(array, "character") + expect_equal(dim(array), c(5, 10)) + }) + + test_that(paste("reading Zarr", zarr_version, "mappings works"), { + if (zarr_version == "v3") { + # TODO: Remove when v3 recarray support is implemented + mapping <- suppressWarnings(read_zarr_mapping(store, "uns")) + } else { + mapping <- read_zarr_mapping(store, "uns") + } + expect_type(mapping, "list") + expect_type(names(mapping), 
"character") + }) + + test_that(paste("reading Zarr", zarr_version, "dataframes works"), { + df <- read_zarr_data_frame(store, "obs") + expect_s3_class(df, "data.frame") + expect_equal( + colnames(df), + c( + "Float", + "FloatNA", + "Int", + "IntNA", + "Bool", + "BoolNA", + "n_genes_by_counts", + "log1p_n_genes_by_counts", + "total_counts", + "log1p_total_counts", + "leiden" + ) + ) + }) + + test_that( + paste("reading Zarr", zarr_version, "as SingleCellExperiment works"), + { + skip_if_not_installed("SingleCellExperiment") + + if (zarr_version == "v3") { + # TODO: Remove when v3 recarray support is implemented + sce <- suppressWarnings(read_zarr(store, as = "SingleCellExperiment")) + } else { + sce <- read_zarr(store, as = "SingleCellExperiment") + } + + expect_s4_class(sce, "SingleCellExperiment") + } + ) + + test_that(paste("reading Zarr", zarr_version, "as Seurat works"), { + skip_if_not_installed("SeuratObject") + + if (zarr_version == "v3") { + # TODO: Remove when v3 recarray support is implemented + seurat <- suppressWarnings(read_zarr(store, as = "Seurat")) + } else { + seurat <- read_zarr(store, as = "Seurat") + } + + expect_s4_class(seurat, "Seurat") + }) +} diff --git a/tests/testthat/test-Zarr-write.R b/tests/testthat/test-Zarr-write.R new file mode 100644 index 00000000..cfbe525e --- /dev/null +++ b/tests/testthat/test-Zarr-write.R @@ -0,0 +1,255 @@ +skip_if_not_installed("Rarr") + +store <- tempfile(fileext = ".zarr") +if (dir.exists(store)) { + unlink(store, recursive = TRUE) +} + +create_zarr(store = store) + +test_that("Writing Zarr dense arrays works", { + array <- matrix(rnorm(20), nrow = 5, ncol = 4) + + expect_silent(write_zarr_element( + array, + store, + "dense_array", + compression = "none" + )) + expect_true(zarr_path_exists(store, "dense_array")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "dense_array")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], 
"array") +}) + +test_that("Writing Zarr dense 3D arrays works", { + value <- array(rnorm(60), dim = c(5, 4, 3)) + + expect_silent( + write_zarr_element( + value, + store, + "dense_3d_array" + ) + ) + expect_true(zarr_path_exists(store, "dense_3d_array")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "dense_3d_array")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "array") +}) + +test_that("Writing Zarr sparse arrays works", { + array <- matrix(rnorm(20), nrow = 5, ncol = 4) + + csc_array <- as(array, "CsparseMatrix") + expect_silent(write_zarr_element( + csc_array, + store, + "csc_array", + compression = "none" + )) + expect_true(zarr_path_exists(store, "csc_array")) + expect_true(zarr_path_exists(store, "csc_array/data")) + expect_true(zarr_path_exists(store, "csc_array/indices")) + expect_true(zarr_path_exists(store, "csc_array/indptr")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "csc_array")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "csc_matrix") + + csr_array <- as(array, "RsparseMatrix") + expect_silent(write_zarr_element( + csr_array, + store, + "csr_array", + compression = "none" + )) + expect_true(zarr_path_exists(store, "csr_array")) + expect_true(zarr_path_exists(store, "csr_array/data")) + expect_true(zarr_path_exists(store, "csr_array/indices")) + expect_true(zarr_path_exists(store, "csr_array/indptr")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "csr_array")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "csr_matrix") +}) + +test_that("Writing dgeMatrix", { + value <- matrix(rnorm(20), nrow = 5, ncol = 4) |> + as("dMatrix") |> + as("generalMatrix") |> + as("unpackedMatrix") + + expect_silent( + write_zarr_element(value, store, "dgematrix") + ) + expect_true(zarr_path_exists(store, "dgematrix")) + attrs 
<- Rarr::read_zarr_attributes(file.path(store, "dgematrix")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "array") +}) + +test_that("Writing Zarr nullable booleans works", { + nullable <- c(TRUE, TRUE, FALSE, FALSE, FALSE) + nullable[5] <- NA + + expect_silent(write_zarr_element(nullable, store, "nullable_bool")) + expect_true(zarr_path_exists(store, "nullable_bool")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "nullable_bool")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "nullable-boolean") +}) + +test_that("Writing Zarr nullable integers works", { + nullable <- as.integer(1:5) + nullable[5] <- NA + + expect_silent(write_zarr_element(nullable, store, "nullable_int")) + expect_true(zarr_path_exists(store, "nullable_int")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "nullable_int")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "nullable-integer") +}) + +test_that("Writing Zarr string arrays works", { + string <- LETTERS[1:5] + + write_zarr_element(string, store, "string_array") + expect_true(zarr_path_exists(store, "string_array")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "string_array")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "string-array") + + string2d <- matrix(LETTERS[1:20], nrow = 5, ncol = 4) + + expect_silent(write_zarr_element(string2d, store, "string_array2D")) + expect_true(zarr_path_exists(store, "string_array2D")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "string_array2D")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "string-array") +}) + +test_that("Writing Zarr categoricals works", { + categorical <- factor(LETTERS[1:5]) + + 
expect_no_error(write_zarr_element(categorical, store, "categorical")) + expect_true(zarr_path_exists(store, "categorical")) + expect_true(zarr_path_exists(store, "categorical/categories")) + expect_true(zarr_path_exists(store, "categorical/codes")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "categorical")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "categorical") +}) + +test_that("Writing Zarr string scalars works", { + string <- "A" + + expect_silent(write_zarr_element(string, store, "string_scalar")) + expect_true(zarr_path_exists(store, "string_scalar")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "string_scalar")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "string") +}) + +test_that("Writing Zarr numeric scalars works", { + number <- 1.0 + + expect_silent(write_zarr_element(number, store, "numeric_scalar")) + expect_true(zarr_path_exists(store, "numeric_scalar")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "numeric_scalar")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "numeric-scalar") +}) + +test_that("Writing Zarr mappings works", { + mapping <- list( + array = matrix(rnorm(20), nrow = 5, ncol = 4), + sparse = as(matrix(rnorm(20), nrow = 5, ncol = 4), "CsparseMatrix"), + string = LETTERS[1:5], + numeric = rnorm(5), + scalar = 2 + ) + + expect_silent(write_zarr_element( + mapping, + store, + "mapping", + compression = "none" + )) + expect_true(zarr_path_exists(store, "mapping")) + expect_true(zarr_path_exists(store, "mapping/array")) + expect_true(zarr_path_exists(store, "mapping/sparse")) + expect_true(zarr_path_exists(store, "mapping/sparse/data")) + expect_true(zarr_path_exists(store, "mapping/sparse/indices")) + expect_true(zarr_path_exists(store, "mapping/sparse/indptr")) + 
expect_true(zarr_path_exists(store, "mapping/string")) + expect_true(zarr_path_exists(store, "mapping/numeric")) + expect_true(zarr_path_exists(store, "mapping/scalar")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "mapping")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "dict") +}) + +test_that("Writing Zarr data frames works", { + df <- data.frame( + Letters = letters[1:5], + Numbers = 1:5 + ) + + expect_silent(write_zarr_element(df, store, "dataframe")) + expect_true(zarr_path_exists(store, "dataframe")) + expect_true(zarr_path_exists(store, "dataframe/Letters")) + expect_true(zarr_path_exists(store, "dataframe/Numbers")) + expect_true(zarr_path_exists(store, "dataframe/_index")) + attrs <- Rarr::read_zarr_attributes(file.path(store, "dataframe")) + expect_true(all(c("encoding-type", "encoding-version") %in% names(attrs))) + expect_equal(attrs[["encoding-type"]], "dataframe") + expect_true(all(c("_index", "column-order") %in% names(attrs))) + expect_equal(attrs[["_index"]], "_index") + expect_identical( + as.character(attrs[["column-order"]]), + c("Letters", "Numbers") + ) +}) + +test_that("writing Zarr from SingleCellExperiment works", { + skip_if_not_installed("SingleCellExperiment") + store <- tempfile(fileext = ".zarr") + sce <- generate_dataset(format = "SingleCellExperiment") + write_zarr(sce, store) + expect_true(dir.exists(store)) +}) + +test_that("writing Zarr from Seurat works", { + skip_if_not_installed("SeuratObject") + store <- tempfile(fileext = ".zarr") + sce <- generate_dataset(format = "Seurat") + write_zarr(sce, store) + expect_true(dir.exists(store)) +}) + +dir_size <- function(path) { + files <- list.files(path, recursive = TRUE, full.names = TRUE) + sum(file.info(files)$size, na.rm = TRUE) +} + +test_that("writing compressed files works for Zarr", { + dummy <- generate_dataset(100, 200) + non_random_X <- matrix(5, 100, 200) # nolint + + adata <- AnnData( + X = 
non_random_X, + obs = dummy$obs, + var = dummy$var + ) + + store_none <- tempfile(fileext = ".zarr") + store_compressed <- tempfile(fileext = ".zarr") + + write_zarr(adata, store_none, compression = "none") + + comp_list <- c("gzip", "blosc", "zstd", "lzma", "bz2", "zlib", "lz4") + for (comp in comp_list) { + write_zarr(adata, store_compressed, compression = comp) + expect_true(dir_size(store_none) > dir_size(store_compressed)) + unlink(store_compressed, recursive = TRUE) + } +}) diff --git a/tests/testthat/test-ZarrAnnData.R b/tests/testthat/test-ZarrAnnData.R new file mode 100644 index 00000000..7d61d9d6 --- /dev/null +++ b/tests/testthat/test-ZarrAnnData.R @@ -0,0 +1,368 @@ +skip_if_not_installed("Rarr") + +file <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(file, exdir = td) +store <- file.path(td, "example_v2.zarr") + +test_that("opening Zarr works", { + adata <- ZarrAnnData$new(store, mode = "r") + expect_true(inherits(adata, "ZarrAnnData")) +}) + +adata <- ZarrAnnData$new(store, mode = "r") + +# GETTERS ---------------------------------------------------------------- +# trackstatus: class=ZarrAnnData, feature=test_get_X, status=done +test_that("reading X works", { + X <- adata$X + expect_s4_class(X, "dgRMatrix") + expect_equal(dim(X), c(50, 100)) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_layers, status=done +test_that("reading layers works", { + layers <- adata$layers + expect_true(is.list(layers), "list") + expect_equal( + names(layers), + c("counts", "csc_counts", "dense_X", "dense_counts") + ) +}) + +test_that("reading obsm works", { + obsm <- adata$obsm + expect_true(is.list(obsm), "list") + expect_equal( + names(obsm), + c("X_pca", "X_umap") + ) +}) + +test_that("reading varm works", { + varm <- adata$varm + expect_true(is.list(varm), "list") + expect_equal( + names(varm), + c("PCs") + ) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_obsp, status=done 
+test_that("reading obsp works", { + obsp <- adata$obsp + expect_true(is.list(obsp), "list") + expect_equal( + names(obsp), + c("connectivities", "distances") + ) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_varp, status=done +test_that("reading varp works", { + varp <- adata$varp + expect_true(is.list(varp), "list") + expect_equal( + names(varp), + c("test_varp") + ) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_obs, status=done +test_that("reading obs works", { + obs <- adata$obs + expect_s3_class(obs, "data.frame") + expect_equal( + colnames(obs), + c( + "Float", + "FloatNA", + "Int", + "IntNA", + "Bool", + "BoolNA", + "n_genes_by_counts", + "log1p_n_genes_by_counts", + "total_counts", + "log1p_total_counts", + "leiden" + ) + ) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_var, status=done +test_that("reading var works", { + var <- adata$var + expect_s3_class(var, "data.frame") + expect_equal( + colnames(var), + c( + "String", + "n_cells_by_counts", + "mean_counts", + "log1p_mean_counts", + "pct_dropout_by_counts", + "total_counts", + "log1p_total_counts", + "highly_variable", + "means", + "dispersions", + "dispersions_norm" + ) + ) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_obs_names, status=done +test_that("reading obs names works", { + obs_names <- adata$obs_names + expect_vector(obs_names, ptype = character(), size = 50) +}) + +# trackstatus: class=ZarrAnnData, feature=test_get_var_names, status=done +test_that("reading var names works", { + var_names <- adata$var_names + expect_vector(var_names, ptype = character(), size = 100) +}) + +# SETTERS ---------------------------------------------------------------- +test_that("creating empty Zarr works", { + empty_store <- tempfile(fileext = ".zarr") + expect_silent(ZarrAnnData$new(empty_store)) + unlink(empty_store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_X, status=done +test_that("writing X works", { + store <- 
tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + X <- matrix(rnorm(10 * 20), nrow = 10, ncol = 20) + expect_silent(zarr$X <- X) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_layers, status=done +test_that("writing layers works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + X <- matrix(rnorm(10 * 20), nrow = 10, ncol = 20) + expect_silent(zarr$layers <- list(layer1 = X, layer2 = X)) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_obs, status=done +test_that("writing obs works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + obs <- data.frame( + Letters = LETTERS[1:10], + Numbers = 1:10, + row.names = paste0("Row", 1:10) + ) + zarr$obs <- obs + expect_identical(zarr$obs_names, paste0("Row", 1:10)) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_var, status=done +test_that("writing var works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + var <- data.frame( + Letters = LETTERS[1:20], + Numbers = 1:20, + row.names = paste0("Row", 1:20) + ) + zarr$var <- var + expect_identical(zarr$var_names, paste0("Row", 1:20)) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_obs_names, status=done +test_that("writing obs names works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = 
store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + zarr$obs_names <- LETTERS[1:10] + expect_identical(zarr$obs_names, LETTERS[1:10]) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_var_names, status=done +test_that("writing var names works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + zarr$var_names <- LETTERS[1:20] + expect_identical(zarr$var_names, LETTERS[1:20]) + unlink(store, recursive = TRUE) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_obsm, status=done +test_that("writing obsm works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + obsm_x <- matrix(rnorm(10 * 5), nrow = 10, ncol = 5) + zarr$obsm <- list(X = obsm_x) + # obsm should now have rownames added on-the-fly + expected_obsm_x <- obsm_x + rownames(expected_obsm_x) <- zarr$obs_names + expect_identical(zarr$obsm$X, expected_obsm_x) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_varm, status=done +test_that("writing varm works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + varm_x <- matrix(rnorm(20 * 5), nrow = 20, ncol = 5) + zarr$varm <- list(PCs = varm_x) + # varm should now have rownames added on-the-fly + expected_varm_x <- varm_x + rownames(expected_varm_x) <- zarr$var_names + expect_identical(zarr$varm$PCs, expected_varm_x) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_obsp, status=done +test_that("writing obsp works", { + store <- tempfile(fileext = ".zarr") + 
create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + + obsp_x <- matrix(rnorm(10 * 10), nrow = 10, ncol = 10) + zarr$obsp <- list(connectivities = obsp_x) + # obsp should now have dimnames added on-the-fly + expected_obsp_x <- obsp_x + dimnames(expected_obsp_x) <- list(zarr$obs_names, zarr$obs_names) + expect_identical(zarr$obsp$connectivities, expected_obsp_x) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_varp, status=done +test_that("writing varp works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + varp_x <- matrix(rnorm(20 * 20), nrow = 20, ncol = 20) + zarr$varp <- list(connectivities = varp_x) + # varp should now have dimnames added on-the-fly + expected_varp_x <- varp_x + dimnames(expected_varp_x) <- list(zarr$var_names, zarr$var_names) + expect_identical(zarr$varp$connectivities, expected_varp_x) +}) + +# trackstatus: class=ZarrAnnData, feature=test_set_uns, status=done +test_that("writing uns works", { + store <- tempfile(fileext = ".zarr") + create_zarr(store = store) + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + zarr <- ZarrAnnData$new(store, obs = obs, var = var) + zarr$uns <- list( + foo = "bar", + baz = c(1, 2, 3), + nested = list( + nested_foo = "nested_bar", + nested_baz = c(4L, 5L, 6L) + ) + ) + expect_identical(zarr$uns$foo, "bar") + expect_equal(zarr$uns$baz, c(1, 2, 3), ignore_attr = TRUE) + expect_identical(zarr$uns$nested$nested_foo, "nested_bar") + expect_equal(zarr$uns$nested$nested_baz, c(4L, 5L, 6L), ignore_attr = TRUE) +}) + +# ERROR HANDLING --------------------------------------------------------- +test_that("opening a non-existent path in read mode errors", { + expect_error( + ZarrAnnData$new(tempfile(fileext = ".zarr"), mode = "r"), 
+ "does not exist" + ) +}) + +test_that("opening a non-existent path in r+ mode errors", { + expect_error( + ZarrAnnData$new(tempfile(fileext = ".zarr"), mode = "r+"), + "does not exist" + ) +}) + +test_that("opening an existing file in exclusive-create mode errors", { + store <- tempfile(fileext = ".zarr") + create_zarr(store) + on.exit(unlink(store, recursive = TRUE)) + expect_error( + ZarrAnnData$new(store, mode = "w-"), + "already exists" + ) +}) + +test_that("writing to a read-only store errors", { + store <- tempfile(fileext = ".zarr") + obs <- data.frame(row.names = 1:10) + var <- data.frame(row.names = 1:20) + ZarrAnnData$new(store, obs = obs, var = var) + on.exit(unlink(store, recursive = TRUE)) + + zarr_ro <- ZarrAnnData$new(store, mode = "r") + expect_error( + zarr_ro$X <- matrix(rnorm(10 * 20), nrow = 10, ncol = 20), + "read-only" + ) +}) + +# CONVERSION ------------------------------------------------------------- +test_that("as_ZarrAnnData() round-trip from InMemoryAnnData works", { + mem <- AnnData( + X = matrix(1:20, nrow = 4, ncol = 5), + obs = data.frame(a = 1:4, row.names = paste0("obs", 1:4)), + var = data.frame(b = 1:5, row.names = paste0("var", 1:5)), + layers = list(counts = matrix(21:40, nrow = 4, ncol = 5)), + uns = list(foo = "bar") + ) + + store <- tempfile(fileext = ".zarr") + on.exit(unlink(store, recursive = TRUE)) + + zarr <- as_ZarrAnnData(mem, file = store) + expect_true(inherits(zarr, "ZarrAnnData")) + expect_equal(zarr$obs_names, mem$obs_names) + expect_equal(zarr$var_names, mem$var_names) + expect_equal(as.matrix(zarr$X), mem$X, ignore_attr = TRUE) + expect_equal( + as.matrix(zarr$layers$counts), + mem$layers$counts, + ignore_attr = TRUE + ) + expect_equal(zarr$uns$foo, mem$uns$foo) +}) diff --git a/tests/testthat/test-h5ad-zarr.R b/tests/testthat/test-h5ad-zarr.R new file mode 100644 index 00000000..f03fc964 --- /dev/null +++ b/tests/testthat/test-h5ad-zarr.R @@ -0,0 +1,199 @@ +skip_if_not_installed("rhdf5") 
+skip_if_not_installed("Rarr") + +# h5ad file +filename <- system.file("extdata", "example.h5ad", package = "anndataR") +file <- rhdf5::H5Fopen(filename, flags = "H5F_ACC_RDONLY", native = FALSE) + +# zarr file +zarr_dir <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_dir, exdir = td) +store <- file.path(td, "example_v2.zarr") + +# helper to compare h5ad and zarr reads for the same path +expect_equal_h5ad_zarr <- function(h5ad_fn, zarr_fn, path, ...) { + expect_equal(h5ad_fn(file, path, ...), zarr_fn(store, path, ...)) +} + +# compare rec arrays of h5ad and zarr +compare_rec_array <- function(rec_array_h5ad, rec_array_zarr, test_fun) { + test_fun(length(rec_array_h5ad), length(rec_array_zarr[[1]])) + test_fun(do.call(rbind, rec_array_h5ad), { + array_list_zarr_mat <- do.call(cbind, rec_array_zarr) + rownames(array_list_zarr_mat) <- + paste(0:(nrow(array_list_zarr_mat) - 1)) + array_list_zarr_mat + }) +} + +test_that("reading dense matrices is the same for h5ad and zarr", { + for (path in c("layers/dense_counts", "layers/dense_X")) { + expect_equal_h5ad_zarr(read_h5ad_dense_array, read_zarr_dense_array, path) + } +}) + +test_that("reading sparse matrices is same for h5ad and zarr", { + sparse_mats <- list( + list(path = "layers/csc_counts", type = "csc"), + list(path = "layers/counts", type = "csr") + ) + for (mat in sparse_mats) { + expect_equal_h5ad_zarr( + read_h5ad_sparse_array, + read_zarr_sparse_array, + mat$path, + type = mat$type + ) + } +}) + +test_that("reading recarrays is the the same for h5ad and zarr", { + # h5ad returns a list of 6 arrays of length 100 + array_list_h5ad <- read_h5ad_rec_array( + file, + "uns/rank_genes_groups/logfoldchanges" + ) + # zarr returns a list of 100 arrays of length 6 + array_list_zarr <- read_zarr_rec_array( + store, + "uns/rank_genes_groups/logfoldchanges" + ) + compare_rec_array(array_list_h5ad, array_list_zarr, expect_equal) +}) + +test_that("reading 1D 
numeric arrays is the same for h5ad and zarr", { + for (path in c("obs/Int", "obs/Float")) { + expect_equal_h5ad_zarr(read_h5ad_dense_array, read_zarr_dense_array, path) + } +}) + +test_that("reading 1D sparse numeric arrays is the same for h5ad and zarr", { + expect_equal_h5ad_zarr( + read_h5ad_sparse_array, + read_zarr_sparse_array, + "uns/Sparse1D", + type = "csc" + ) +}) + +test_that("reading 1D nullable arrays is the same for h5ad and zarr", { + expect_equal_h5ad_zarr( + read_h5ad_nullable_integer, + read_zarr_nullable_integer, + "obs/IntNA" + ) + expect_equal_h5ad_zarr( + read_h5ad_dense_array, + read_zarr_dense_array, + "obs/FloatNA" + ) + for (path in c("obs/Bool", "obs/BoolNA")) { + expect_equal_h5ad_zarr( + read_h5ad_nullable_boolean, + read_zarr_nullable_boolean, + path + ) + } +}) + +test_that("reading string scalars is the same for h5ad and zarr", { + expect_equal_h5ad_zarr( + read_h5ad_string_scalar, + read_zarr_string_scalar, + "uns/StringScalar" + ) +}) + +test_that("reading numeric scalars is the same for h5ad and zarr", { + expect_equal_h5ad_zarr( + read_h5ad_numeric_scalar, + read_zarr_numeric_scalar, + "uns/IntScalar" + ) +}) + +test_that("reading string arrays is the same for h5ad and zarr", { + for (path in c("uns/String", "uns/String2D")) { + expect_equal_h5ad_zarr(read_h5ad_string_array, read_zarr_string_array, path) + } +}) + +# TODO: Re-enable when recarays are handled consistently, see https://github.com/scverse/anndataR/issues/409 +test_that("reading mappings is the same for h5ad and zarr", { + skip( + "skipping test for mappings since rec arrays are read differently + across h5ad and zarr" + ) + # since rec arrays are read differently across h5ad and zarr, + # we compare all elements individually + mapping_h5ad <- read_h5ad_mapping(file, "uns") + mapping_zarr <- read_zarr_mapping(store, "uns") + for (nm in names(mapping_h5ad)) { + if (!nm %in% "rank_genes_groups") { + expect_equal(mapping_h5ad[[nm]], mapping_zarr[[nm]]) + } else { + 
map_ranks_h5ad <- mapping_h5ad$rank_genes_groups + map_ranks_zarr <- mapping_zarr$rank_genes_groups + lapply( + names(map_ranks_h5ad)[!names(map_ranks_h5ad) %in% "params"], + function(nmr) { + print(nmr) + compare_rec_array( + map_ranks_h5ad[[nmr]], + map_ranks_zarr[[nmr]], + expect_equal + ) + } + ) + } + } +}) + +tmp <- read_zarr_element(store, "uns/neighbors/params/random_state") +tmp2 <- read_h5ad_element(file, "uns/neighbors/params/random_state") + +test_that("reading dataframes is the the same for h5ad and zarr", { + expect_equal_h5ad_zarr(read_h5ad_data_frame, read_zarr_data_frame, "obs") +}) + +rhdf5::H5Fclose(file) + +test_that("reading H5AD as SingleCellExperiment is the same for h5ad and zarr", { + skip_if_not_installed("SingleCellExperiment") + skip_if_not_installed("S4Vectors") + sce_h5ad <- read_h5ad(filename, as = "SingleCellExperiment") + sce_zarr <- read_zarr(store, as = "SingleCellExperiment") + # TODO: Update when recarays are handled consistently, see https://github.com/scverse/anndataR/issues/409 + S4Vectors::metadata(sce_zarr) <- S4Vectors::metadata(sce_h5ad) + expect_equal(sce_h5ad, sce_zarr) +}) + +test_that("reading H5AD as Seurat is the same for h5ad and zarr", { + skip_if_not_installed("Seurat") + sce_h5ad <- read_h5ad(filename, as = "Seurat") + sce_zarr <- read_zarr(store, as = "Seurat") + # TODO: Update when recarays are handled consistently, see https://github.com/scverse/anndataR/issues/409 + expect_warning( + Seurat::Misc(sce_zarr, "rank_genes_groups") <- + Seurat::Misc(sce_h5ad, "rank_genes_groups"), + "Overwriting miscellanous" + ) + # TODO: neighbors/params/random_state and + # leiden/params/random_state read as 0 in Python anndata but as an empty + # array in Zarr + expect_warning( + Seurat::Misc(sce_zarr, "neighbors") <- + Seurat::Misc(sce_h5ad, "neighbors"), + "Overwriting miscellanous" + ) + expect_warning( + Seurat::Misc(sce_zarr, "leiden") <- + Seurat::Misc(sce_h5ad, "leiden"), + "Overwriting miscellanous" + ) + # Sort Misc 
by name to make comparison order-agnostic + sce_h5ad@misc <- sce_h5ad@misc[sort(names(sce_h5ad@misc))] + sce_zarr@misc <- sce_zarr@misc[sort(names(sce_zarr@misc))] + expect_equal(sce_h5ad, sce_zarr) +}) diff --git a/tests/testthat/test-roundtrip-X.R b/tests/testthat/test-roundtrip-X.R index 3b6d5bde..4380c5eb 100644 --- a/tests/testthat/test-roundtrip-X.R +++ b/tests/testthat/test-roundtrip-X.R @@ -15,152 +15,167 @@ test_names <- names(da$matrix_generators) # -> https://github.com/scverse/anndata/blob/2a2c0e3198c298a5c80a73ac343c63203b5ca133/src/anndata/_core/anndata.py#L2164-L2172 # nolint test_names <- test_names[!grepl("_3d$", test_names)] -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = name, - obs_types = list(), - var_types = list(), - layer_types = list(), - obsm_types = list(), - varm_types = list(), - obsp_types = list(), - varp_types = list(), - uns_types = list(), - nested_uns_types = list() - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - file_r2 <- withr::local_file( - tempfile(paste0("anndata_r2_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that(paste0("Reading an AnnData with X '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("X"), - dtype = name, - process = "read", - known_issues = known_issues +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = name, + obs_types = list(), + var_types = list(), + layer_types = list(), + obsm_types = list(), + varm_types = list(), + obsp_types = list(), + varp_types = list(), + 
uns_types = list(), + nested_uns_types = list() ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - unlist(reticulate::py_to_r(adata_py$shape)) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - }) - - test_that( - paste0("Comparing an anndata with X '", name, "' with reticulate works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("X"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - # Extract X matrices, removing dimnames for comparison since - # R AnnData adds dimnames on-the-fly but Python doesn't preserve them - actual_x <- adata_r$X - expected_x <- py_to_r(adata_py$X) - dimnames(actual_x) <- NULL - dimnames(expected_x) <- NULL - - expect_equal( - actual_x, - expected_x, - tolerance = 1e-6 - ) - } - ) - - gc() - - test_that(paste0("Writing an AnnData with X '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("X"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) + ) + file_r2 <- withr::local_file( + tempfile(paste0("anndata_r2_", name), fileext = fmt_config$ext) ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - # read from file - adata_py2 <- ad$read_h5ad(file_r) + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read 
from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0("Reading an AnnData with X '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("X"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + expect_equal( + adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } + ) - # expect that the objects are the same - expect_equal_py( - adata_py2$X, - adata_py$X + test_that( + paste0( + "Comparing an anndata with X '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("X"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + # Extract X matrices, removing dimnames for comparison since + # R AnnData adds dimnames on-the-fly but Python doesn't preserve them + actual_x <- adata_r$X + expected_x <- py_to_r(adata_py$X) + dimnames(actual_x) <- NULL + dimnames(expected_x) <- NULL + + expect_equal( + actual_x, + expected_x, + tolerance = 1e-6 + ) + } ) - }) - - skip_if_no_h5diff() - # Get all R datatypes that are equivalent to the python datatype (name) - res <- Filter(function(x) x[[1]] == name, matrix_equivalences) - r_datatypes <- vapply(res, function(x) x[[2]], character(1)) - - for (r_name in r_datatypes) { - test_msg <- paste0( - "Comparing a python generated .h5ad with X '", - name, - "' with an R generated .h5ad '", - r_name, - "' works" + + gc() + + test_that( + paste0("Writing an AnnData with X '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = 
c("X"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect that the objects are the same + expect_equal_py( + adata_py2$X, + adata_py$X + ) + } ) - test_that(test_msg, { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("X"), - dtype = c(name, r_name), - process = c("h5diff"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - # generate an R h5ad - adata_r <- r_generate_dataset(10L, 20L, x_type = list(r_name)) - write_h5ad(adata_r, file_r2, mode = "w") - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, "X") - - # run h5diff - res <- processx::run( - "h5diff", - c("-v2", file_py, file_r2, "/X"), - error_on_status = FALSE - ) - - expect_equal(res$status, 0, info = res$stdout) - }) + + if (fmt == "h5ad") { + skip_if_no_h5diff() + # Get all R datatypes that are equivalent to the python datatype (name) + res <- Filter(function(x) x[[1]] == name, matrix_equivalences) + r_datatypes <- vapply(res, function(x) x[[2]], character(1)) + + for (r_name in r_datatypes) { + test_msg <- paste0( + "Comparing a python generated .h5ad with X '", + name, + "' with an R generated .h5ad '", + r_name, + "' works" + ) + test_that(test_msg, { + msg <- message_if_known( + backend = "HDF5AnnData", + slot = c("X"), + dtype = c(name, r_name), + process = c("h5diff"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + # generate an R h5ad + adata_r <- r_generate_dataset(10L, 20L, x_type = list(r_name)) + write_h5ad(adata_r, file_r2, mode = "w") + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, "X") + + # run h5diff + res <- processx::run( + "h5diff", + c("-v2", file_py, file_r2, "/X"), + 
error_on_status = FALSE + ) + + expect_equal(res$status, 0, info = res$stdout) + }) + } + } } } diff --git a/tests/testthat/test-roundtrip-empty.R b/tests/testthat/test-roundtrip-empty.R index eb8b19f6..8d275aa0 100644 --- a/tests/testthat/test-roundtrip-empty.R +++ b/tests/testthat/test-roundtrip-empty.R @@ -12,73 +12,80 @@ bi <- reticulate::import_builtins() known_issues <- read_known_issues() -# first generate a python h5ad -adata_py <- ad$AnnData() - name <- "empty" -# create a couple of paths -file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") -) -file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") -) - -# write to file -adata_py$write_h5ad(file_py) -# Read it back in to get the version as read from disk -adata_py <- ad$read_h5ad(file_py) - -test_that(paste0("Reading an AnnData with layer '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("none"), - dtype = name, - process = "read", - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - unlist(reticulate::py_to_r(adata_py$shape)) - ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) -}) +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) -gc() + # first generate a python adata + adata_py <- ad$AnnData() -test_that(paste0("Writing an AnnData with layer '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("none"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) + ) + file_r <- withr::local_file( + 
tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - - # read from file - adata_py2 <- ad$read_h5ad(file_r) - # check that the print output is the same - expect_equal( - unlist(reticulate::py_to_r(adata_py2$shape)), - unlist(reticulate::py_to_r(adata_py$shape)) + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0("Reading an AnnData with layer '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("none"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + expect_equal( + adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } ) - # check that the print output is the same - str_py2 <- capture.output(print(adata_py2)) - str_py <- capture.output(print(adata_py)) - expect_equal(str_py2, str_py) -}) + gc() + + test_that( + paste0("Writing an AnnData with layer '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("none"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # check that the shape is the same + expect_equal( + unlist(reticulate::py_to_r(adata_py2$shape)), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + + # check that the print output is the 
same + str_py2 <- capture.output(print(adata_py2)) + str_py <- capture.output(print(adata_py)) + expect_equal(str_py2, str_py) + } + ) +} diff --git a/tests/testthat/test-roundtrip-layers.R b/tests/testthat/test-roundtrip-layers.R index a8d8b885..1081e86c 100644 --- a/tests/testthat/test-roundtrip-layers.R +++ b/tests/testthat/test-roundtrip-layers.R @@ -11,177 +11,188 @@ known_issues <- read_known_issues() test_names <- names(da$matrix_generators) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(), - var_types = list(), - layer_types = list(name), - obsm_types = list(), - varm_types = list(), - obsp_types = list(), - varp_types = list(), - uns_types = list(), - nested_uns_types = list() - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - file_r2 <- withr::local_file( - tempfile(paste0("anndata_r2_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that(paste0("Reading an AnnData with layer '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("layers"), - dtype = name, - process = "read", - known_issues = known_issues +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(), + var_types = list(), + layer_types = list(name), + obsm_types = list(), + varm_types = list(), + obsp_types = list(), + varp_types = list(), + uns_types = list(), + nested_uns_types = list() ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - 
unlist(reticulate::py_to_r(adata_py$shape)) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) ) - expect_equal( - adata_r$layers_keys(), - bi$list(adata_py$layers$keys()) + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - }) - - gc() - - test_that( - paste0( - "Comparing an anndata with layer '", - name, - "' with reticulate works" - ), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("layers"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - # R AnnData now adds dimnames on-the-fly, but Python doesn't preserve them - # So we need to strip dimnames for comparison - actual_mat <- adata_r$layers[[name]] - expected_mat <- py_to_r(py_get_item(adata_py$layers, name)) - dimnames(actual_mat) <- NULL - dimnames(expected_mat) <- NULL - - expect_equal( - actual_mat, - expected_mat, - tolerance = 1e-6 - ) - } - ) - - gc() - - test_that(paste0("Writing an AnnData with layer '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("layers"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + file_r2 <- withr::local_file( + tempfile(paste0("anndata_r2_", name), fileext = fmt_config$ext) ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - # read from file - adata_py2 <- ad$read_h5ad(file_r) - - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$layers$keys()), - name + # write to file + 
adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0("Reading an AnnData with layer '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("layers"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + expect_equal( + adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + expect_equal( + adata_r$layers_keys(), + bi$list(adata_py$layers$keys()) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } ) - # expect that the objects are the same - expect_equal_py( - py_get_item(adata_py2$layers, name), - py_get_item(adata_py$layers, name) + gc() + + test_that( + paste0( + "Comparing an anndata with layer '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("layers"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + # R AnnData now adds dimnames on-the-fly, but Python doesn't preserve them + # So we need to strip dimnames for comparison + actual_mat <- adata_r$layers[[name]] + expected_mat <- py_to_r(py_get_item(adata_py$layers, name)) + dimnames(actual_mat) <- NULL + dimnames(expected_mat) <- NULL + + expect_equal( + actual_mat, + expected_mat, + tolerance = 1e-6 + ) + } ) - }) - - gc() - - skip_if_no_h5diff() - # Get all R datatypes that are equivalent to the python datatype (name) - res <- Filter(function(x) x[[1]] == name, matrix_equivalences) - r_datatypes <- vapply(res, function(x) x[[2]], character(1)) - - for (r_name in r_datatypes) { - 
test_msg <- paste0( - "Comparing a python generated .h5ad with layer '", - name, - "' with an R generated .h5ad '", - r_name, - "' works" + + gc() + + test_that( + paste0("Writing an AnnData with layer '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("layers"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$layers$keys()), + name + ) + + # expect that the objects are the same + expect_equal_py( + py_get_item(adata_py2$layers, name), + py_get_item(adata_py$layers, name) + ) + } ) - test_that(test_msg, { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("X"), - dtype = c(name, r_name), - process = c("h5diff"), - known_issues = known_issues - ) - - skip_if(!is.null(msg), message = msg) - - # generate an R h5ad - adata_r <- r_generate_dataset(10L, 20L, layer_types = list(r_name)) - write_h5ad(adata_r, file_r2, mode = "w") - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/layers/", r_name)) - - # run h5diff - res <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/layers/", name), - paste0("/layers/", r_name) - ), - error_on_status = FALSE - ) - - expect_equal(res$status, 0, info = res$stdout) - }) + + gc() + + if (fmt == "h5ad") { + skip_if_no_h5diff() + # Get all R datatypes that are equivalent to the python datatype (name) + res <- Filter(function(x) x[[1]] == name, matrix_equivalences) + r_datatypes <- vapply(res, function(x) x[[2]], character(1)) + + for (r_name in r_datatypes) { + test_msg <- paste0( + "Comparing a python generated .h5ad with layer '", + name, + "' with an R generated .h5ad 
'", + r_name, + "' works" + ) + test_that(test_msg, { + msg <- message_if_known( + backend = "HDF5AnnData", + slot = c("X"), + dtype = c(name, r_name), + process = c("h5diff"), + known_issues = known_issues + ) + + skip_if(!is.null(msg), message = msg) + + # generate an R h5ad + adata_r <- r_generate_dataset(10L, 20L, layer_types = list(r_name)) + write_h5ad(adata_r, file_r2, mode = "w") + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/layers/", r_name)) + + # run h5diff + res <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/layers/", name), + paste0("/layers/", r_name) + ), + error_on_status = FALSE + ) + + expect_equal(res$status, 0, info = res$stdout) + }) + } + } } } diff --git a/tests/testthat/test-roundtrip-obsmvarm.R b/tests/testthat/test-roundtrip-obsmvarm.R index 22e8017b..ae769d8f 100644 --- a/tests/testthat/test-roundtrip-obsmvarm.R +++ b/tests/testthat/test-roundtrip-obsmvarm.R @@ -15,7 +15,7 @@ test_names <- c( ) # temporary workaround for -# https://github.com/LouiseDck/dummy-anndata/issues/12 +# https://github.com/data-intuitive/dummy-anndata/issues/12 test_names <- setdiff( test_names, c( @@ -28,221 +28,238 @@ test_names <- setdiff( ) ) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(), - var_types = list(), - layer_types = list(), - obsm_types = list(name), - varm_types = list(name), - obsp_types = list(), - varp_types = list(), - uns_types = list(), - nested_uns_types = list() - ) +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - file_r2 <- withr::local_file( - tempfile(paste0("anndata_r2_", name), fileext = ".h5ad") - ) + for (name in test_names) { + # first 
generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(), + var_types = list(), + layer_types = list(), + obsm_types = list(name), + varm_types = list(name), + obsp_types = list(), + varp_types = list(), + uns_types = list(), + nested_uns_types = list() + ) - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that( - paste0("Reading an AnnData with obsm and varm '", name, "' works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsm", "varm"), - dtype = name, - process = "read", - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - unlist(reticulate::py_to_r(adata_py$shape)) - ) - expect_equal( - adata_r$obsm_keys(), - bi$list(adata_py$obsm$keys()) - ) - expect_equal( - adata_r$varm_keys(), - bi$list(adata_py$varm$keys()) - ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - } - ) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) + ) + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) + ) + file_r2 <- withr::local_file( + tempfile(paste0("anndata_r2_", name), fileext = fmt_config$ext) + ) - test_that( - paste0( - "Comparing an anndata with obsm and varm '", - name, - "' with reticulate works" - ), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsm", "varm"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - # R AnnData now adds dimnames 
on-the-fly, but Python doesn't preserve them - # So we need to strip dimnames for comparison - actual_obsm <- adata_r$obsm[[name]] - expected_obsm <- py_to_r(py_get_item(adata_py$obsm, name)) - dimnames(actual_obsm) <- NULL - dimnames(expected_obsm) <- NULL - - expect_equal( - actual_obsm, - expected_obsm, - tolerance = 1e-6 - ) - - actual_varm <- adata_r$varm[[name]] - expected_varm <- py_to_r(py_get_item(adata_py$varm, name)) - dimnames(actual_varm) <- NULL - dimnames(expected_varm) <- NULL - - expect_equal( - actual_varm, - expected_varm, - tolerance = 1e-6 - ) - } - ) + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) - gc() - - test_that( - paste0("Writing an AnnData with obsm and varm '", name, "' works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsm", "varm"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - - # read from file - adata_py2 <- ad$read_h5ad(file_r) - - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$obsm$keys()), - name - ) - expect_contains( - bi$list(adata_py2$obsm$keys()), - name - ) - - # expect that the objects are the same - expect_equal_py( - py_get_item(adata_py2$obsm, name), - py_get_item(adata_py$obsm, name) - ) - expect_equal_py( - py_get_item(adata_py2$varm, name), - py_get_item(adata_py$varm, name) - ) - } - ) + test_that( + paste0( + "Reading an AnnData with obsm and varm '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obsm", "varm"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + 
expect_equal( + adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + expect_equal( + adata_r$obsm_keys(), + bi$list(adata_py$obsm$keys()) + ) + expect_equal( + adata_r$varm_keys(), + bi$list(adata_py$varm$keys()) + ) - skip_if_no_h5diff() - # Get all R datatypes that are equivalent to the python datatype (name) - res <- Filter(function(x) x[[1]] == name, all_equivalences) - r_datatypes <- vapply(res, function(x) x[[2]], character(1)) - - for (r_name in r_datatypes) { - test_msg <- paste0( - "Comparing a python generated .h5ad with obsm and varm '", - name, - "' with an R generated .h5ad '", - r_name, - "' works" + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } ) - test_that(test_msg, { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsm", "varm"), - dtype = c(name, r_name), - process = c("h5diff"), - known_issues = known_issues - ) - - skip_if(!is.null(msg), message = msg) - # generate an R h5ad - adata_r <- r_generate_dataset( - 10L, - 20L, - obsm_types = list(r_name), - varm_types = list(r_name) - ) - write_h5ad(adata_r, file_r2, mode = "w") - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/obsm/", r_name)) - - # run h5diff - res_obsm <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/obsm/", name), - paste0("/obsm/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_obsm$status, 0, info = res_obsm$stdout) - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/varm/", r_name)) - - res_varm <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/varm/", name), - paste0("/varm/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_varm$status, 0, info = res_varm$stdout) - }) + + test_that( + paste0( + "Comparing an anndata with obsm and varm '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- 
message_if_known( + backend = fmt_config$backend, + slot = c("obsm", "varm"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + # R AnnData now adds dimnames on-the-fly, but Python doesn't preserve them + # So we need to strip dimnames for comparison + actual_obsm <- adata_r$obsm[[name]] + expected_obsm <- py_to_r(py_get_item(adata_py$obsm, name)) + dimnames(actual_obsm) <- NULL + dimnames(expected_obsm) <- NULL + + expect_equal( + actual_obsm, + expected_obsm, + tolerance = 1e-6 + ) + + actual_varm <- adata_r$varm[[name]] + expected_varm <- py_to_r(py_get_item(adata_py$varm, name)) + dimnames(actual_varm) <- NULL + dimnames(expected_varm) <- NULL + + expect_equal( + actual_varm, + expected_varm, + tolerance = 1e-6 + ) + } + ) + + gc() + + test_that( + paste0( + "Writing an AnnData with obsm and varm '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obsm", "varm"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$obsm$keys()), + name + ) + expect_contains( + bi$list(adata_py2$varm$keys()), + name + ) + + # expect that the objects are the same + expect_equal_py( + py_get_item(adata_py2$obsm, name), + py_get_item(adata_py$obsm, name) + ) + expect_equal_py( + py_get_item(adata_py2$varm, name), + py_get_item(adata_py$varm, name) + ) + } + ) + + if (fmt == "h5ad") { + skip_if_no_h5diff() + # Get all R datatypes that are equivalent to the python datatype (name) + res <- Filter(function(x) x[[1]] == name, all_equivalences) 
+ r_datatypes <- vapply(res, function(x) x[[2]], character(1)) + + for (r_name in r_datatypes) { + test_msg <- paste0( + "Comparing a python generated .h5ad with obsm and varm '", + name, + "' with an R generated .h5ad '", + r_name, + "' works" + ) + test_that(test_msg, { + msg <- message_if_known( + backend = "HDF5AnnData", + slot = c("obsm", "varm"), + dtype = c(name, r_name), + process = c("h5diff"), + known_issues = known_issues + ) + + skip_if(!is.null(msg), message = msg) + # generate an R h5ad + adata_r <- r_generate_dataset( + 10L, + 20L, + obsm_types = list(r_name), + varm_types = list(r_name) + ) + write_h5ad(adata_r, file_r2, mode = "w") + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/obsm/", r_name)) + + # run h5diff + res_obsm <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/obsm/", name), + paste0("/obsm/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_obsm$status, 0, info = res_obsm$stdout) + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/varm/", r_name)) + + res_varm <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/varm/", name), + paste0("/varm/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_varm$status, 0, info = res_varm$stdout) + }) + } + } } } diff --git a/tests/testthat/test-roundtrip-obspvarp.R b/tests/testthat/test-roundtrip-obspvarp.R index 0e0f045d..2fff728c 100644 --- a/tests/testthat/test-roundtrip-obspvarp.R +++ b/tests/testthat/test-roundtrip-obspvarp.R @@ -11,220 +11,237 @@ known_issues <- read_known_issues() test_names <- names(da$matrix_generators) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(), - var_types = list(), - layer_types = list(), - obsm_types = list(), - varm_types = list(), - obsp_types = list(name), - varp_types = list(name), - uns_types = list(), - 
nested_uns_types = list() - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - file_r2 <- withr::local_file( - tempfile(paste0("anndata_r2_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that( - paste0("Reading an AnnData with obsp and varp '", name, "' works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsp", "varp"), - dtype = name, - process = "read", - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - unlist(reticulate::py_to_r(adata_py$shape)) - ) - expect_equal( - adata_r$obsp_keys(), - bi$list(adata_py$obsp$keys()) - ) - expect_equal( - adata_r$varp_keys(), - bi$list(adata_py$varp$keys()) - ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - } - ) - - test_that( - paste0( - "Comparing an anndata with obsp and varp '", - name, - "' with reticulate works" - ), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsp", "varp"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - # R AnnData now adds dimnames on-the-fly, but Python doesn't preserve them - # So we need to strip dimnames for comparison - actual_obsp <- adata_r$obsp[[name]] - expected_obsp <- py_to_r(py_get_item(adata_py$obsp, name)) - dimnames(actual_obsp) <- NULL - dimnames(expected_obsp) <- NULL - - expect_equal( - 
actual_obsp, - expected_obsp, - tolerance = 1e-6 - ) - - actual_varp <- adata_r$varp[[name]] - expected_varp <- py_to_r(py_get_item(adata_py$varp, name)) - dimnames(actual_varp) <- NULL - dimnames(expected_varp) <- NULL - - expect_equal( - actual_varp, - expected_varp, - tolerance = 1e-6 - ) - } - ) - - gc() - - test_that( - paste0("Writing an AnnData with obsp and varp '", name, "' works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsp", "varp"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - - # read from file - adata_py2 <- ad$read_h5ad(file_r) - - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$obsp$keys()), - name - ) - expect_contains( - bi$list(adata_py2$varp$keys()), - name - ) - - # expect that the objects are the same - expect_equal_py( - py_get_item(adata_py2$obsp, name), - py_get_item(adata_py$obsp, name) - ) - expect_equal_py( - py_get_item(adata_py2$varp, name), - py_get_item(adata_py$varp, name) - ) - } - ) - - skip_if_no_h5diff() - # Get all R datatypes that are equivalent to the python datatype (name) - res <- Filter(function(x) x[[1]] == name, matrix_equivalences) - r_datatypes <- vapply(res, function(x) x[[2]], character(1)) - - for (r_name in r_datatypes) { - test_msg <- paste0( - "Comparing a python generated .h5ad with obsp and varp '", - name, - "' with an R generated .h5ad '", - r_name, - "' works" +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(), + var_types = list(), + layer_types = list(), + obsm_types = list(), + varm_types = list(), + obsp_types = list(name), + varp_types = list(name), + uns_types = list(), + nested_uns_types = list() + ) + + # create a couple of 
paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) + ) + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - test_that(test_msg, { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsp", "varp"), - dtype = c(name, r_name), - process = c("h5diff"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - # generate an R h5ad - adata_r <- r_generate_dataset( - 10L, - 20L, - obsp_types = list(r_name), - varp_types = list(r_name) - ) - write_h5ad(adata_r, file_r2, mode = "w") - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/obsp/", r_name)) - - # run h5diff - res_obsp <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/obsp/", name), - paste0("/obsp/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_obsp$status, 0, info = res_obsp$stdout) - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/varp/", r_name)) - - res_varp <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/varp/", name), - paste0("/varp/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_varp$status, 0, info = res_varp$stdout) - }) + file_r2 <- withr::local_file( + tempfile(paste0("anndata_r2_", name), fileext = fmt_config$ext) + ) + + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0( + "Reading an AnnData with obsp and varp '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obsp", "varp"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + expect_equal( + 
adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + expect_equal( + adata_r$obsp_keys(), + bi$list(adata_py$obsp$keys()) + ) + expect_equal( + adata_r$varp_keys(), + bi$list(adata_py$varp$keys()) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } + ) + + test_that( + paste0( + "Comparing an anndata with obsp and varp '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obsp", "varp"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + # R AnnData now adds dimnames on-the-fly, but Python doesn't preserve them + # So we need to strip dimnames for comparison + actual_obsp <- adata_r$obsp[[name]] + expected_obsp <- py_to_r(py_get_item(adata_py$obsp, name)) + dimnames(actual_obsp) <- NULL + dimnames(expected_obsp) <- NULL + + expect_equal( + actual_obsp, + expected_obsp, + tolerance = 1e-6 + ) + + actual_varp <- adata_r$varp[[name]] + expected_varp <- py_to_r(py_get_item(adata_py$varp, name)) + dimnames(actual_varp) <- NULL + dimnames(expected_varp) <- NULL + + expect_equal( + actual_varp, + expected_varp, + tolerance = 1e-6 + ) + } + ) + + gc() + + test_that( + paste0( + "Writing an AnnData with obsp and varp '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obsp", "varp"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$obsp$keys()), + name + ) + 
expect_contains( + bi$list(adata_py2$varp$keys()), + name + ) + + # expect that the objects are the same + expect_equal_py( + py_get_item(adata_py2$obsp, name), + py_get_item(adata_py$obsp, name) + ) + expect_equal_py( + py_get_item(adata_py2$varp, name), + py_get_item(adata_py$varp, name) + ) + } + ) + + if (fmt == "h5ad") { + skip_if_no_h5diff() + # Get all R datatypes that are equivalent to the python datatype (name) + res <- Filter(function(x) x[[1]] == name, matrix_equivalences) + r_datatypes <- vapply(res, function(x) x[[2]], character(1)) + + for (r_name in r_datatypes) { + test_msg <- paste0( + "Comparing a python generated .h5ad with obsp and varp '", + name, + "' with an R generated .h5ad '", + r_name, + "' works" + ) + test_that(test_msg, { + msg <- message_if_known( + backend = "HDF5AnnData", + slot = c("obsp", "varp"), + dtype = c(name, r_name), + process = c("h5diff"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + # generate an R h5ad + adata_r <- r_generate_dataset( + 10L, + 20L, + obsp_types = list(r_name), + varp_types = list(r_name) + ) + write_h5ad(adata_r, file_r2, mode = "w") + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/obsp/", r_name)) + + # run h5diff + res_obsp <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/obsp/", name), + paste0("/obsp/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_obsp$status, 0, info = res_obsp$stdout) + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/varp/", r_name)) + + res_varp <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/varp/", name), + paste0("/varp/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_varp$status, 0, info = res_varp$stdout) + }) + } + } } } diff --git a/tests/testthat/test-roundtrip-obsvar.R b/tests/testthat/test-roundtrip-obsvar.R index d89529bb..90ac6707 100644 --- 
a/tests/testthat/test-roundtrip-obsvar.R +++ b/tests/testthat/test-roundtrip-obsvar.R @@ -11,196 +11,219 @@ known_issues <- read_known_issues() test_names <- names(da$vector_generators) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(name), - var_types = list(name), - layer_types = list(), - obsm_types = list(), - varm_types = list(), - obsp_types = list(), - varp_types = list(), - uns_types = list(), - nested_uns_types = list() - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - file_r2 <- withr::local_file( - tempfile(paste0("anndata_r2_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that(paste0("reading an AnnData with obs and var '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obs", "var"), - dtype = name, - process = "read", - known_issues = known_issues +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(name), + var_types = list(name), + layer_types = list(), + obsm_types = list(), + varm_types = list(), + obsp_types = list(), + varp_types = list(), + uns_types = list(), + nested_uns_types = list() ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - expect_equal( - adata_r$shape(), - unlist(reticulate::py_to_r(adata_py$shape)) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) ) - expect_equal( - adata_r$obs_keys(), - bi$list(adata_py$obs_keys()) + file_r <- 
withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - expect_equal( - adata_r$var_keys(), - bi$list(adata_py$var_keys()) + file_r2 <- withr::local_file( + tempfile(paste0("anndata_r2_", name), fileext = fmt_config$ext) ) - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - }) - - test_that( - paste0( - "Comparing an anndata with obs and var '", - name, - "' with reticulate works" - ), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obs", "var"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - expect_equal( - adata_r$obs[[name]], - py_to_r(adata_py$obs)[[name]], - tolerance = 1e-6 - ) - expect_equal( - adata_r$var[[name]], - py_to_r(adata_py$var)[[name]], - tolerance = 1e-6 - ) - } - ) - - gc() - - test_that(paste0("Writing an AnnData with obs and var '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obsp", "varp"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0( + "reading an AnnData with obs and var '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obs", "var"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + expect_equal( + adata_r$shape(), + unlist(reticulate::py_to_r(adata_py$shape)) + ) + expect_equal( + 
adata_r$obs_keys(), + reticulate::py_to_r(adata_py$obs$columns$tolist()) + ) + expect_equal( + adata_r$var_keys(), + reticulate::py_to_r(adata_py$var$columns$tolist()) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - # read from file - adata_py2 <- ad$read_h5ad(file_r) - - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$obs$keys()), - name - ) - expect_contains( - bi$list(adata_py2$var$keys()), - name + test_that( + paste0( + "Comparing an anndata with obs and var '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obs", "var"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + expect_equal( + adata_r$obs[[name]], + py_to_r(adata_py$obs)[[name]], + tolerance = 1e-6 + ) + expect_equal( + adata_r$var[[name]], + py_to_r(adata_py$var)[[name]], + tolerance = 1e-6 + ) + } ) - # expect that the objects are the same - expect_equal_py(adata_py2$obs, adata_py$obs) - expect_equal_py(adata_py2$var, adata_py$var) - }) - - skip_if_no_h5diff() - # Get all R datatypes that are equivalent to the python datatype (name) - res <- Filter(function(x) x[[1]] == name, vector_equivalences) - r_datatypes <- vapply(res, function(x) x[[2]], character(1)) - - for (r_name in r_datatypes) { - test_msg <- paste0( - "Comparing a python generated .h5ad with obs and var '", - name, - "' with an R generated .h5ad '", - r_name, - "' works" + gc() + + test_that( + paste0( + "Writing an AnnData with obs and var '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("obs", 
"var"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$obs$keys()), + name + ) + expect_contains( + bi$list(adata_py2$var$keys()), + name + ) + + # expect that the objects are the same + expect_equal_py(adata_py2$obs, adata_py$obs) + expect_equal_py(adata_py2$var, adata_py$var) + } ) - test_that(test_msg, { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("obs", "var"), - dtype = c(name, r_name), - process = c("h5diff"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - # generate an R h5ad - adata_r <- r_generate_dataset( - 10L, - 20L, - obs_types = list(r_name), - var_types = list(r_name) - ) - - write_h5ad(adata_r, file_r2) - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/obs/", r_name)) - - # run h5diff - res_obs <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/obs/", name), - paste0("/obs/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_obs$status, 0, info = res_obs$stdout) - - # Remove the rhdf5-NA.OK for comparison - hdf5_clear_rhdf5_attributes(file_r2, paste0("/var/", r_name)) - - res_var <- processx::run( - "h5diff", - c( - "-v2", - file_py, - file_r2, - paste0("/var/", name), - paste0("/var/", r_name) - ), - error_on_status = FALSE - ) - expect_equal(res_var$status, 0, info = res_var$stdout) - }) + + if (fmt == "h5ad") { + skip_if_no_h5diff() + # Get all R datatypes that are equivalent to the python datatype (name) + res <- Filter(function(x) x[[1]] == name, vector_equivalences) + r_datatypes <- vapply(res, function(x) x[[2]], character(1)) + + for (r_name in r_datatypes) { + test_msg <- paste0( 
+ "Comparing a python generated .h5ad with obs and var '", + name, + "' with an R generated .h5ad '", + r_name, + "' works" + ) + test_that(test_msg, { + msg <- message_if_known( + backend = "HDF5AnnData", + slot = c("obs", "var"), + dtype = c(name, r_name), + process = c("h5diff"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + # generate an R h5ad + adata_r <- r_generate_dataset( + 10L, + 20L, + obs_types = list(r_name), + var_types = list(r_name) + ) + + write_h5ad(adata_r, file_r2) + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/obs/", r_name)) + + # run h5diff + res_obs <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/obs/", name), + paste0("/obs/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_obs$status, 0, info = res_obs$stdout) + + # Remove the rhdf5-NA.OK for comparison + hdf5_clear_rhdf5_attributes(file_r2, paste0("/var/", r_name)) + + res_var <- processx::run( + "h5diff", + c( + "-v2", + file_py, + file_r2, + paste0("/var/", name), + paste0("/var/", r_name) + ), + error_on_status = FALSE + ) + expect_equal(res_var$status, 0, info = res_var$stdout) + }) + } + } } } diff --git a/tests/testthat/test-roundtrip-uns-nested.R b/tests/testthat/test-roundtrip-uns-nested.R index fdf06f2f..029c55b7 100644 --- a/tests/testthat/test-roundtrip-uns-nested.R +++ b/tests/testthat/test-roundtrip-uns-nested.R @@ -15,113 +15,134 @@ test_names <- c( names(da$scalar_generators) ) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(), - var_types = list(), - layer_types = list(), - obsm_types = list(), - varm_types = list(), - obsp_types = list(), - varp_types = list(), - uns_types = list(), - nested_uns_types = list(name) - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - 
tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that(paste0("Reading an AnnData with uns_nested '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("uns_nested"), - dtype = name, - process = "read", - known_issues = known_issues +for (fmt in c("h5ad", "zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(), + var_types = list(), + layer_types = list(), + obsm_types = list(), + varm_types = list(), + obsp_types = list(), + varp_types = list(), + uns_types = list(), + nested_uns_types = list(name) ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - expect_equal( - names(adata_r$uns$nested), - bi$list(adata_py$uns$nested$keys()) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - }) - - test_that( - paste0( - "Comparing an anndata with uns_nested '", - name, - "' with reticulate works" - ), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("uns_nested"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - py_value <- convert_py_value(adata_py$uns$nested[[name]], name) - - expect_equal( - adata_r$uns[["nested"]][[name]], - py_value - ) - } - ) - - gc() - - test_that(paste0("Writing an AnnData with uns_nested '", name, "' works"), { - msg <- 
message_if_known( - backend = "HDF5AnnData", - slot = c("uns_nested"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - - # read from file - adata_py2 <- ad$read_h5ad(file_r) + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0( + "Reading an AnnData with uns_nested '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns_nested"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + expect_equal( + names(adata_r$uns$nested), + bi$list(adata_py$uns$nested$keys()) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } + ) - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$uns$nested$keys()), - name + test_that( + paste0( + "Comparing an anndata with uns_nested '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns_nested"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + py_value <- convert_py_value(adata_py$uns$nested[[name]], name) + + expect_equal( + adata_r$uns[["nested"]][[name]], + py_value + ) + } ) - # expect that the objects are the same - expect_equal_py( - py_get_item(adata_py2$uns$nested, name), - py_get_item(adata_py$uns$nested, 
name) + gc() + + test_that( + paste0( + "Writing an AnnData with uns_nested '", + name, + "' (", + fmt, + ") works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns_nested"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$uns$nested$keys()), + name + ) + + # expect that the objects are the same + expect_equal_py( + py_get_item(adata_py2$uns$nested, name), + py_get_item(adata_py$uns$nested, name) + ) + } ) - }) + } } diff --git a/tests/testthat/test-roundtrip-uns.R b/tests/testthat/test-roundtrip-uns.R index a77bb00d..45324ced 100644 --- a/tests/testthat/test-roundtrip-uns.R +++ b/tests/testthat/test-roundtrip-uns.R @@ -15,109 +15,122 @@ test_names <- c( names(da$scalar_generators) ) -for (name in test_names) { - # first generate a python h5ad - adata_py <- da$generate_dataset( - x_type = NULL, - obs_types = list(), - var_types = list(), - layer_types = list(), - obsm_types = list(), - varm_types = list(), - obsp_types = list(), - varp_types = list(), - uns_types = list(name), - nested_uns_types = list() - ) - - # create a couple of paths - file_py <- withr::local_file( - tempfile(paste0("anndata_py_", name), fileext = ".h5ad") - ) - file_r <- withr::local_file( - tempfile(paste0("anndata_r_", name), fileext = ".h5ad") - ) - - # write to file - adata_py$write_h5ad(file_py) - # Read it back in to get the version as read from disk - adata_py <- ad$read_h5ad(file_py) - - test_that(paste0("Reading an AnnData with uns '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("uns"), - dtype = name, - process = "read", - known_issues = known_issues +for (fmt in c("h5ad", 
"zarr")) { + fmt_config <- get_fmt_config(fmt) + + for (name in test_names) { + # first generate a python adata + adata_py <- da$generate_dataset( + x_type = NULL, + obs_types = list(), + var_types = list(), + layer_types = list(), + obsm_types = list(), + varm_types = list(), + obsp_types = list(), + varp_types = list(), + uns_types = list(name), + nested_uns_types = list() ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - expect_equal( - names(adata_r$uns), - bi$list(adata_py$uns$keys()) + # create a couple of paths + file_py <- withr::local_file( + tempfile(paste0("anndata_py_", name), fileext = fmt_config$ext) ) - - # check that the print output is the same (normalize class names) - str_r <- capture.output(print(adata_r)) - str_py <- capture.output(print(adata_py)) - str_r <- gsub("[^ ]*AnnData", "AnnData", str_r) - expect_equal(str_r, str_py) - }) - - test_that( - paste0("Comparing an anndata with uns '", name, "' with reticulate works"), - { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("uns"), - dtype = name, - process = c("read", "reticulate"), - known_issues = known_issues - ) - skip_if(!is.null(msg), message = msg) - - adata_r <- read_h5ad(file_py, as = "HDF5AnnData") - - py_value <- convert_py_value(adata_py$uns[[name]], name) - - expect_equal( - adata_r$uns[[name]], - py_value - ) - } - ) - - gc() - - test_that(paste0("Writing an AnnData with uns '", name, "' works"), { - msg <- message_if_known( - backend = "HDF5AnnData", - slot = c("uns"), - dtype = name, - process = c("read", "write"), - known_issues = known_issues + file_r <- withr::local_file( + tempfile(paste0("anndata_r_", name), fileext = fmt_config$ext) ) - skip_if(!is.null(msg), message = msg) - adata_r <- read_h5ad(file_py, as = "InMemoryAnnData") - write_h5ad(adata_r, file_r) - - # read from file - adata_py2 <- ad$read_h5ad(file_r) + # write to file + adata_py[[fmt_config$py_write_method]](file_py) + # Read it back in to get 
the version as read from disk + adata_py <- ad[[fmt_config$py_read_method]](file_py) + + test_that( + paste0("Reading an AnnData with uns '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns"), + dtype = name, + process = "read", + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + expect_equal( + names(adata_r$uns), + bi$list(adata_py$uns$keys()) + ) + + # check that the print output is the same (normalize class names) + expect_anndata_print_equal(adata_r, adata_py) + } + ) - # expect name is one of the keys - expect_contains( - bi$list(adata_py2$uns$keys()), - name + test_that( + paste0( + "Comparing an anndata with uns '", + name, + "' (", + fmt, + ") with reticulate works" + ), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns"), + dtype = name, + process = c("read", "reticulate"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = fmt_config$backend) + + py_value <- convert_py_value(adata_py$uns[[name]], name) + + expect_equal( + adata_r$uns[[name]], + py_value + ) + } ) - # expect that the objects are the same - expect_equal_py( - py_get_item(adata_py2$uns, name), - py_get_item(adata_py$uns, name) + gc() + + test_that( + paste0("Writing an AnnData with uns '", name, "' (", fmt, ") works"), + { + msg <- message_if_known( + backend = fmt_config$backend, + slot = c("uns"), + dtype = name, + process = c("read", "write"), + known_issues = known_issues + ) + skip_if(!is.null(msg), message = msg) + + adata_r <- fmt_config$r_read_fun(file_py, as = "InMemoryAnnData") + fmt_config$r_write_fun(adata_r, file_r) + + # read from file + adata_py2 <- ad[[fmt_config$py_read_method]](file_r) + + # expect name is one of the keys + expect_contains( + bi$list(adata_py2$uns$keys()), + name + ) + + # expect that the objects are 
the same + expect_equal_py( + py_get_item(adata_py2$uns, name), + py_get_item(adata_py$uns, name) + ) + } ) - }) + } } diff --git a/vignettes/anndataR.Rmd b/vignettes/anndataR.Rmd index 4bd48cc6..e04ff553 100644 --- a/vignettes/anndataR.Rmd +++ b/vignettes/anndataR.Rmd @@ -27,7 +27,7 @@ library(SingleCellExperiment) # Introduction -`r Biocpkg("anndataR")` allows users to work with `.h5ad` files, interact with `AnnData` objects and convert to/from `SingleCellExperiment` or `Seurat` objects. +`r Biocpkg("anndataR")` allows users to work with `.h5ad` files and `.zarr` stores, interact with `AnnData` objects and convert to/from `SingleCellExperiment` or `Seurat` objects. This enables users to move data easily between the different programming languages and analysis ecosystems needed to perform single-cell data analysis. This package builds on our experience developing and using other interoperability packages and aims to provide a first-class R `AnnData` experience. @@ -37,8 +37,8 @@ This package builds on our experience developing and using other interoperabilit Existing packages provide similar functionality to `r Biocpkg("anndataR")` but there are some important differences: - `r Biocpkg("zellkonverter")` provides conversion of `SingleCellExperiment` objects to/from `AnnData` and reading/writing of `.h5ad` files. - This is facilitated via `r CRANpkg("reticulate")` using `r Biocpkg("basilisk")` to manage Python environments (native reading of `.h5ad` files is also possible). - In contrast, `r Biocpkg("anndataR")` provides a native R H5AD interface, removing the need for Python dependencies. + This is facilitated via `r CRANpkg("reticulate")` using `r Biocpkg("basilisk")` to manage Python environments (native reading of `.h5ad` is also possible). + In contrast, `r Biocpkg("anndataR")` provides a native R H5AD and Zarr interface, removing the need for Python dependencies. Conversion to/from `Seurat` objects is also supported. 
- `r CRANpkg("anndata")` (on CRAN) is a wrapper around the Python _anndata_ package. It provides a nicer interface from within R but still requires a Python environment. @@ -97,6 +97,29 @@ There is also a HDF5-backed `AnnData` object: adata <- read_h5ad(h5ad_path, as = "HDF5AnnData") ``` +Similarly, these functionalities are provided for `.zarr` stores too. + +```{r zarr-path} +zarr_path <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_path, exdir = td) +zarr_path <- file.path(td, "example_v2.zarr") +``` + +```{r read-zarr} +# in-memory +adata <- read_zarr(zarr_path) + +# as SingleCellExperiment +sce <- read_zarr(zarr_path, as = "SingleCellExperiment") + +# as Seurat +obj <- read_zarr(zarr_path, as = "Seurat") + +# as Zarr-backed +adata <- read_zarr(zarr_path, as = "ZarrAnnData") +``` + See `r vignette("usage_python")` for interacting with a Python `AnnData` via `r CRANpkg("reticulate")`. ## Using `AnnData` objects @@ -187,6 +210,19 @@ tmpfile <- tempfile(fileext = ".h5ad") write_h5ad(obj, tmpfile) ``` +Similarly, we can write `AnnData` and other objects to `.zarr` stores too. + +```{r write-to-disk-zarr} +tmpfile <- tempfile(fileext = ".zarr") +adata$write_zarr(tmpfile) # Alternatively, write_zarr(adata, tmpfile) + +tmpfile <- tempfile(fileext = ".zarr") +write_zarr(sce, tmpfile) + +tmpfile <- tempfile(fileext = ".zarr") +write_zarr(obj, tmpfile) +``` + ## Subsetting `AnnData` objects {#subsetting} `r Biocpkg("anndataR")` provides standard R subsetting methods that work with familiar bracket notation. These methods return `AnnDataView` objects that provide lazy evaluation for efficient memory usage. 
diff --git a/vignettes/articles/development_status.Rmd b/vignettes/articles/development_status.Rmd index d60d9870..8e86fd87 100644 --- a/vignettes/articles/development_status.Rmd +++ b/vignettes/articles/development_status.Rmd @@ -67,7 +67,7 @@ status_lines_proc <- status_lines |> # combine with missing fields status_lines_required <- crossing( - class = c("InMemoryAnnData", "HDF5AnnData", "Seurat", "SingleCellExperiment"), + class = c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData", "Seurat", "SingleCellExperiment"), prefix = c("get_", "test_get_", "set_", "test_set_"), slot = c( "X", diff --git a/vignettes/articles/software_design.Rmd b/vignettes/articles/software_design.Rmd index e7004d87..a1836cce 100644 --- a/vignettes/articles/software_design.Rmd +++ b/vignettes/articles/software_design.Rmd @@ -30,15 +30,11 @@ Ideally, this package will be a complete replacement for all of these packages, # Core features * An `r CRANpkg("R6")` `AnnData` class to work with objects in R -* In-memory (`InMemoryAnnData`), HDF5-backed (`HDF5AnnData`) and Python-backed (`ReticulateAnnData`) back ends with a consistent interface -* Read/write `.h5ad` files natively +* In-memory (`InMemoryAnnData`), HDF5-backed (`HDF5AnnData`), Zarr-backed (`ZarrAnnData`) and Python-backed (`ReticulateAnnData`) back ends with a consistent interface +* Read/write `.h5ad` files and `.zarr` stores natively * Convert to/from `SingleCellExperiment` objects * Convert to/from `Seurat` objects -## Planned features - -* Zarr-backed back end (`ZarrAnnData`) - ## `AnnData` classes The different `AnnData` classes provide a consistent interface but store and access data in different ways: @@ -48,9 +44,10 @@ The different `AnnData` classes provide a consistent interface but store and acc It is want you will want to use in most cases where you want to interact with an `AnnData` object. - The `HDF5AnnData` provides an interface to a H5AD file and minimal data is stored in memory until it is requested by the user. 
It is primarily designed as an intermediate object when reading/writing H5AD files but can be useful for accessing parts of large files. +- The `ZarrAnnData` provides an interface to a Zarr store and minimal data is stored in memory until it is requested by the user. + It is primarily designed as an intermediate object when reading/writing Zarr stores but can be useful for accessing parts of large stores. - The `ReticulateAnnData` accesses data stored in an `AnnData` object in a concurrent Python session. This comes with the overhead and complexity of using `r CRANpkg("reticulate")` but is sometimes useful to access functionality that has not yet been implemented in `r Biocpkg("anndataR")`. -- The planned `ZarrAnnData` will provide an interface to an `AnnData` Zarr store, similar to `HDF5AnnData`. - An `AnnDataView` is returned when subsetting an `AnnData` object and provides access to a subset of the data in the referenced object. Some functionality (such as setting slots) requires converting to one of the full classes. diff --git a/vignettes/usage_seurat.Rmd b/vignettes/usage_seurat.Rmd index 7607d15d..948af675 100644 --- a/vignettes/usage_seurat.Rmd +++ b/vignettes/usage_seurat.Rmd @@ -32,7 +32,7 @@ You can install them using the following code: install.packages("Seurat") ``` -# Reading H5AD files to a `Seurat` Object +# Reading H5AD files and Zarr stores to a `Seurat` Object Using an example `.h5ad` file included in the package, we will demonstrate how to read an `.h5ad` file and convert it to a `Seurat` object. 
@@ -58,6 +58,27 @@ seurat_obj <- adata$as_Seurat() seurat_obj ``` +Similarly, we can read from a Zarr store which we also demonstrate with an example `.zarr` store: + +```{r prep-file-zarr} +# Please use "example_v3.zarr.zip" for AnnData stored as Zarr version 3 +zarr_path <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_path, exdir = td) +zarr_path <- file.path(td, "example_v2.zarr") + +seurat_obj_zarr <- read_zarr(zarr_path, as = "Seurat") +seurat_obj_zarr +``` + +or + +```{r convert-seurat-zarr} +adata <- read_zarr(zarr_path) +seurat_obj_zarr <- adata$as_Seurat() +seurat_obj_zarr +``` + # Mapping between `AnnData` and `Seurat` Figure \@ref(fig:mapping) shows the structures of the `AnnData` and `Seurat` objects and how `r Biocpkg("anndataR")` maps between them. @@ -102,19 +123,21 @@ seurat_obj The mapping arguments can also be passed directly to `read_h5ad()`. -# Writing a `Seurat` object to a H5AD file +# Writing a `Seurat` object to a H5AD file or Zarr store -The reverse conversion is also possible, allowing you to convert the `Seurat` object back to an `AnnData` object, or to just write out the `Seurat` object as an `.h5ad` file. +The reverse conversion is also possible, allowing you to convert the `Seurat` object back to an `AnnData` object, or to just write out the `Seurat` object as an `.h5ad` file or `.zarr` store. -```{r write-seurat} +```{r write-seurat, eval=FALSE} write_h5ad(seurat_obj, tempfile(fileext = ".h5ad")) +write_zarr(seurat_obj, tempfile(fileext = ".zarr")) ``` This is equivalent to converting the `Seurat` object to an `AnnData` object and then writing it out: -```{r convert-to-anndata} +```{r convert-to-anndata, eval=FALSE} adata <- as_AnnData(seurat_obj) adata$write_h5ad(tempfile(fileext = ".h5ad")) +adata$write_zarr(tempfile(fileext = ".zarr")) ``` You can again customize the conversion process by providing specific mappings for each slot in the `AnnData` object. 
@@ -137,7 +160,7 @@ adata <- as_AnnData( adata ``` -The mapping arguments can also be passed directly to `write_h5ad()`. +The mapping arguments can also be passed directly to `write_h5ad()` or `write_zarr()`. # Session info diff --git a/vignettes/usage_singlecellexperiment.Rmd b/vignettes/usage_singlecellexperiment.Rmd index f6d95f66..af10dcc5 100644 --- a/vignettes/usage_singlecellexperiment.Rmd +++ b/vignettes/usage_singlecellexperiment.Rmd @@ -35,7 +35,7 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) { BiocManager::install("SingleCellExperiment") ``` -# Reading H5AD files to a `SingleCellExperiment` object +# Reading H5AD files and Zarr stores to a `SingleCellExperiment` object Using an example `.h5ad` file included in the package, we will demonstrate how to read an `.h5ad` file and convert it to a `SingleCellExperiment` object. @@ -61,6 +61,27 @@ sce <- adata$as_SingleCellExperiment() sce ``` +Similarly, we can read from a Zarr store which we also demonstrate with an example `.zarr` store: + +```{r prep-file-zarr} +# Please use "example_v3.zarr.zip" for AnnData stored as Zarr version 3 +zarr_path <- system.file("extdata", "example_v2.zarr.zip", package = "anndataR") +td <- tempdir(check = TRUE) +unzip(zarr_path, exdir = td) +zarr_path <- file.path(td, "example_v2.zarr") + +sce_zarr <- read_zarr(zarr_path, as = "SingleCellExperiment") +sce_zarr +``` + +or + +```{r read-zarr} +adata <- read_zarr(zarr_path) +sce_zarr <- adata$as_SingleCellExperiment() +sce_zarr +``` + # Mapping between `AnnData` and `SingleCellExperiment` Figure \@ref(fig:mapping) shows the structures of the `AnnData` and `SingleCellExperiment` objects and how `r Biocpkg("anndataR")` maps between them. @@ -104,19 +125,21 @@ adata$as_SingleCellExperiment( The mapping arguments can also be passed directly to `read_h5ad()`. 
-# Writing a `SingleCellExperiment` object to H5AD file +# Writing a `SingleCellExperiment` object to a H5AD file or Zarr store -The reverse conversion is also possible, allowing you to convert a `SingleCellExperiment` object back to an `AnnData` object, or to just write out the `SingleCellExperiment` object as an `.h5ad` file. +The reverse conversion is also possible, allowing you to convert a `SingleCellExperiment` object back to an `AnnData` object, or to just write out the `SingleCellExperiment` object as an `.h5ad` file or `.zarr` store. -```{r write-sce} +```{r write-sce, eval=FALSE} write_h5ad(sce_obj, tempfile(fileext = ".h5ad")) +write_zarr(sce_obj, tempfile(fileext = ".zarr")) ``` This is equivalent to converting the `SingleCellExperiment` object to an `AnnData` object and then writing it out: -```{r convert-and-write} +```{r convert-and-write, eval=FALSE} adata <- as_AnnData(sce_obj) adata$write_h5ad(tempfile(fileext = ".h5ad")) +adata$write_zarr(tempfile(fileext = ".zarr")) ``` You can again customize the conversion process by providing specific mappings for each slot in the `AnnData` object. For more details, see `?as_AnnData`. @@ -136,7 +159,7 @@ as_AnnData( ) ``` -The mapping arguments can also be passed directly to `write_h5ad()`. +The mapping arguments can also be passed directly to `write_h5ad()` or `write_zarr()`. # Session info