diff --git a/README.md b/README.md index 177c1d1..75b4fe0 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,59 @@ Template folder for creating new E2E tests. Contains a complete framework with: Use `./tests/create-test.sh ` to create a new test from this template. +### csi-ceph + +Reference testkit that provisions a full Rook-managed Ceph cluster and a +csi-ceph-backed `StorageClass` end-to-end, then verifies a plain `PVC` +bound against that class. + +Built around `testkit.EnsureCephStorageClass` (see +[docs/FUNCTIONS_GLOSSARY.md](docs/FUNCTIONS_GLOSSARY.md#ceph-storageclass-testkit)), +which handles: enabling `sds-node-configurator` + `sds-elastic` + `csi-ceph` +modules, optionally provisioning an `sds-local-volume` Thick `StorageClass` +for OSD backing, seeding `rook-config-override` (for things like +`ms_crc_data=false`), creating Rook `CephCluster` + `CephBlockPool`, and +wiring `CephClusterConnection` / `CephClusterAuthentication` / +`CephStorageClass` csi-ceph CRs. + +The testkit itself only runs a smoke check; downstream repos (e.g. +`csi-ceph`) can import `github.com/deckhouse/storage-e2e/pkg/testkit` and +reuse `EnsureCephStorageClass` inside their own Ginkgo specs. + +Testkit-specific env variables: + +- `CSI_CEPH_OSD_STORAGE_CLASS` — pre-existing block-mode StorageClass used to + back Rook OSD PVCs. When empty, an `sds-local-volume` Thick SC is + auto-provisioned via `EnsureDefaultStorageClass`. +- `CSI_CEPH_MODULE_PULL_OVERRIDE` — image tag for `csi-ceph`'s + ModulePullOverride (dev registries only, e.g. when testing a PR build). + +#### `modulePullOverride` env templating + +Any module entry in `cluster_config.yml` may reference an env var with the +`${VAR}` form in `modulePullOverride`. `storage-e2e` resolves those at config +load time, so CI can point a module at a per-PR/MR image without editing the +YAML between runs: + +```yaml +dkpParameters: + modules: + - name: csi-ceph + modulePullOverride: "${MODULE_IMAGE_TAG}" # CI must set MODULE_IMAGE_TAG, e.g. "pr131" on GitHub or "mr131" on GitLab +``` + +If a referenced env var is unset, `LoadClusterConfig` fails fast with +`module "" references env var ${VAR} in modulePullOverride but it is not set` +instead of silently falling back to `main` — so a missing variable in CI is +caught before bootstrap, not after a 30-minute wrong-image install. + +Run: + +```bash +source tests/csi-ceph/test_exports +go test -timeout=240m -v ./tests/csi-ceph -count=1 +``` + ### csi-all-stress-tests Stress tests for all CSI storage drivers.
This test suite: diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 4d06918..a109a0a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -23,6 +23,7 @@ storage-e2e/ │ ├── config/ # Configuration management │ │ ├── config.go # Main configuration struct │ │ ├── env.go # Environment variable parsing +│ │ ├── overrides.go # ${VAR} expansion in modulePullOverride at config load time │ │ ├── types.go # Configuration type definitions │ │ └── images.go # OS image definitions │ │ @@ -75,6 +76,12 @@ storage-e2e/ │ ├── kubernetes/ # Public Kubernetes utilities │ │ ├── apply.go # YAML manifest application │ │ ├── blockdevice.go # BlockDevice operations +│ │ ├── cephblockpool.go # Rook CephBlockPool operations +│ │ ├── cephcluster.go # Rook CephCluster operations +│ │ ├── cephfilesystem.go # Rook CephFilesystem operations +│ │ ├── cephclusterconnection.go # csi-ceph connection/auth CRs +│ │ ├── cephcredentials.go # Rook Ceph credential discovery +│ │ ├── cephstorageclass.go # csi-ceph CephStorageClass CR │ │ ├── client.go # Clientset/dynamic client with retry │ │ ├── localstorageclass.go # LocalStorageClass CR operations │ │ ├── lvmvolumegroup.go # LVMVolumeGroup operations @@ -83,17 +90,25 @@ storage-e2e/ │ │ ├── nodegroup.go # NodeGroup operations │ │ ├── nodes.go # Node listing, taints, labels │ │ ├── pod.go # Pod operations +│ │ ├── pod_exec.go # Pods/exec helpers + DistrolessReader for distroless containers +│ │ ├── poll.go # Generic readiness poller (per-call timeout, WARN on net errors) │ │ ├── pvc.go # PVC operations +│ │ ├── rookconfigoverride.go # Rook ceph.conf override ConfigMap │ │ ├── secrets.go # Secret operations │ │ ├── storageclass.go # StorageClass get/wait/default +│ │ ├── storageclass_manage.go # Global default StorageClass management │ │ ├── virtualdisk.go # VirtualDisk attach/detach -│ │ └── vmpod.go # VM pod lookup +│ │ ├── vmpod.go # VM pod lookup +│ │ └── volumesnapshotclass.go # VolumeSnapshotClass helpers │ │ │ ├── retry/ # Generic retry with exponential backoff │ │ └── retry.go │ │ │ └── testkit/ # Test framework utilities -│ ├── storageclass.go # Default StorageClass provisioning +│ ├── ceph.go # EnsureCephStorageClass (Rook + csi-ceph) +│ ├── ceph_cluster.go # EnsureCephCluster (Rook only, no csi-ceph) +│ ├── ceph_crc.go # Ceph CRC tuning helpers +│ ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) │ └── stress-tests.go # Stress test runner │ ├── tests/ # Test suites @@ -326,6 +341,7 @@ Tests use Ginkgo's lifecycle hooks: config/ ├── config.go # Main configuration operations ├── env.go # Environment variable definitions and validation +├── overrides.go # ${VAR} expansion in modulePullOverride at config load time ├── types.go # Configuration type definitions └── images.go # OS image URL definitions ``` @@ -486,6 +502,12 @@ pkg/ ├── kubernetes/ │ ├── apply.go # YAML manifest application │ ├── blockdevice.go # BlockDevice operations +│ ├── cephblockpool.go # Rook CephBlockPool CRUD + wait +│ ├── cephcluster.go # Rook CephCluster CRUD + wait +│ ├── cephfilesystem.go # Rook CephFilesystem CRUD + wait +│ ├── cephclusterconnection.go # csi-ceph CephClusterConnection/Auth CRs +│ ├── cephcredentials.go # Read fsid/mons/admin-key from Rook secrets +│ ├── cephstorageclass.go # csi-ceph CephStorageClass CR │ ├── client.go # Clientset/dynamic client with retry │ ├── localstorageclass.go # LocalStorageClass CR operations │ ├── lvmvolumegroup.go # LVMVolumeGroup operations @@ -494,15 +516,23 @@ pkg/ │ ├── nodegroup.go # NodeGroup operations │ ├── 
nodes.go # Node listing, taints, labels │ ├── pod.go # Pod operations +│ ├── pod_exec.go # Exec helpers + DistrolessReader (ephemeral-container session) +│ ├── poll.go # pollResourceUntilReady helper for Wait*Ready callers │ ├── pvc.go # PVC operations +│ ├── rookconfigoverride.go # Rook global ceph.conf override │ ├── secrets.go # Secret operations │ ├── storageclass.go # StorageClass get/wait/default +│ ├── storageclass_manage.go # Global default-SC management │ ├── virtualdisk.go # VirtualDisk attach/detach -│ └── vmpod.go # VM pod lookup +│ ├── vmpod.go # VM pod lookup +│ └── volumesnapshotclass.go # VolumeSnapshotClass helpers ├── retry/ │ └── retry.go # Generic retry with exponential backoff └── testkit/ - ├── storageclass.go # Default StorageClass provisioning + ├── ceph.go # EnsureCephStorageClass / EnsureDefaultCephStorageClass + ├── ceph_cluster.go # EnsureCephCluster (Rook-only, no csi-ceph) + ├── ceph_crc.go # Ceph CRC tuning helpers + ├── storageclass.go # EnsureDefaultStorageClass (sds-local-volume) └── stress-tests.go # Stress test runner ``` @@ -730,7 +760,8 @@ logger.Error("Failed to create resource: %v", err) | `TEST_CLUSTER_VIRTUAL_MACHINE_CLASS_NAME` | `generic` | VM class for VMs on the base cluster in `alwaysCreateNew`. If set to another name (DNS-1123 subdomain) and the class does not exist, it is created from `generic` with `spec.cpu.type: Host`, **`spec.nodeSelector` / `spec.tolerations` cleared**, sizing policies retained from template, labeled `storage-e2e.deckhouse.io/auto-created=true`, and left after cleanup | | `TEST_CLUSTER_CLEANUP` | `false` | Cleanup cluster after tests | | `LOG_LEVEL` | `debug` | Log level (debug/info/warn/error) | -| `KUBE_CONFIG_PATH` | - | Fallback kubeconfig path | +| `KUBE_CONFIG_PATH` | - | Explicit kubeconfig path. Used when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` from the master fails. If unset and SSH also fails, `GetKubeconfig` returns an error (no silent fallback to `~/.kube/config`). | +| `MODULE_IMAGE_TAG` (and any other custom name) | - | Any `${VAR}` placeholder used inside `modulePullOverride:` in `cluster_config.yml` is expanded at config load time by `internal/config/overrides.ExpandEnvInModulePullOverride`. Missing/empty placeholders fail fast with an explicit error so CI can point modules at `pr` / `mr` images via a single env var without editing the YAML between runs. | ### Commander Variables (only when `TEST_CLUSTER_CREATE_MODE=commander`) diff --git a/docs/FUNCTIONS_GLOSSARY.md b/docs/FUNCTIONS_GLOSSARY.md index 64d4b4a..592a386 100644 --- a/docs/FUNCTIONS_GLOSSARY.md +++ b/docs/FUNCTIONS_GLOSSARY.md @@ -16,6 +16,7 @@ All exported functions available in the `pkg/` directory, grouped by resource. - [Pod](#pod) - [PVC (PersistentVolumeClaim)](#pvc-persistentvolumeclaim) - [StorageClass](#storageclass) +- [VolumeSnapshotClass](#volumesnapshotclass) - [BlockDevice](#blockdevice) - [LVMVolumeGroup](#lvmvolumegroup) - [LocalStorageClass](#localstorageclass) @@ -24,8 +25,18 @@ All exported functions available in the `pkg/` directory, grouped by resource. 
- [Secrets](#secrets) - [Modules](#modules) - [Retry](#retry) +- [Rook Config Override](#rook-config-override) +- [Ceph Credentials](#ceph-credentials) +- [CephCluster (Rook)](#cephcluster-rook) +- [CephBlockPool (Rook)](#cephblockpool-rook) +- [CephFilesystem (Rook)](#cephfilesystem-rook) +- [CephClusterConnection / CephClusterAuthentication (csi-ceph)](#cephclusterconnection--cephclusterauthentication-csi-ceph) +- [CephStorageClass (csi-ceph)](#cephstorageclass-csi-ceph) - [Default StorageClass (Testkit)](#default-storageclass-testkit) +- [Ceph StorageClass (Testkit)](#ceph-storageclass-testkit) +- [Ceph Cluster (Testkit) — no csi-ceph wiring](#ceph-cluster-testkit--no-csi-ceph-wiring) - [Stress Tests (Testkit)](#stress-tests-testkit) +- [Ceph CRC (Testkit)](#ceph-crc-testkit) --- @@ -137,6 +148,16 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `WaitForAllPodsReadyInNamespace(ctx, kubeconfig, namespace, timeout)` — Waits for all pods in a namespace to be in Ready condition. - `WaitForPodsStatus(ctx, clientset, namespace, labelSelector, status, expectedCount, maxAttempts, interval)` — Waits for pods matching a label selector to reach a specific status (Running, Completed, etc.). +`pkg/kubernetes/pod_exec.go` + +- `ExecInPod(ctx, kubeconfig, namespace, pod, container, cmd) (stdout, stderr, err)` — Runs a command inside a container via the apiserver's `pods/exec` subresource (SPDY). Returns stdout and stderr separately; the container must ship every binary referenced by `cmd`. Use this when the container has a usable shell/userland. +- `ReadFileFromPod(ctx, kubeconfig, namespace, pod, container, path)` — `ExecInPod` + `cat `. Convenience wrapper for non-distroless images. +- `ReadFileFromDistrolessPod(ctx, kubeconfig, namespace, pod, targetContainer, path, opts)` — Reads a file from a distroless / scratch container that ships no `cat`/`sh`/`tar`. Injects a short-lived ephemeral container (image from `opts.DebugImage`, defaults to `DefaultDebugImage = "busybox:1.36"`) with `targetContainerName=targetContainer`, polls until it goes Running (`opts.StartupTimeout`, defaults to 60s), then `cat /proc/1/root` — `/proc/1/root` is the kernel-exposed FS root of PID 1 in the target container, which the ephemeral container can see thanks to the shared PID namespace. Adding the ephemeral container goes through the dedicated `/pods//ephemeralcontainers` subresource, so existing containers and the pod sandbox are NOT restarted, `metadata.generation` is not bumped, and ReplicaSet/DaemonSet observation is unaffected — downstream rollout / `checksum/...` annotation assertions still see a clean signal. Caveat: ephemeral containers cannot be removed once added, but each call generates a unique name and the `sleep 60` command exits on its own; entries pile up in `pod.status.ephemeralContainerStatuses` until the next pod recycle. Internally a one-shot wrapper around `OpenDistrolessReader` + `(*DistrolessReader).ReadFile`. +- `OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) (*DistrolessReader, error)` — Long-lived variant of `ReadFileFromDistrolessPod`: injects ONE ephemeral container (sleeps for `opts.SessionTTL`, defaults to `DefaultDistrolessSessionTTL` = 30 min) and returns a session that can serve arbitrarily many cheap reads. Use this for polling loops (e.g. `Eventually(...)` waiting for a file's content to flip) so the ephemeral-container cold start is paid once instead of per iteration. 
+- `(*DistrolessReader) ReadFile(ctx, path)` — `cat /proc/1/root` against the pre-injected ephemeral container. Cheap — just a `pods/exec` round-trip; no apiserver mutations. +- `(*DistrolessReader) PodName()` — Name of the pod this reader is bound to. Used by callers that need to detect rollouts (the pod name changes when the workload-controller recycles the pod) and re-`OpenDistrolessReader` against the new pod. +- `(*DistrolessReader) EphemeralName()` — Auto-generated name of the injected ephemeral container, mostly for logs. + ## PVC (PersistentVolumeClaim) `pkg/kubernetes/pvc.go` @@ -155,6 +176,17 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `GetStorageClass(ctx, kubeconfig, name)` — Returns the `*storagev1.StorageClass` with the given name, or `(nil, nil)` if it does not exist. - `SetGlobalDefaultStorageClass(ctx, kubeconfig, storageClassName)` — Updates the "global" ModuleConfig to set `spec.settings.storageClass` to the given name, making it the cluster default. +`pkg/kubernetes/storageclass_manage.go` + +- `CreateStorageClass(ctx, kubeconfig, cfg)` — Creates a `storage.k8s.io/v1 StorageClass` directly from `StorageClassCreateConfig` (`Name`, `Provisioner`, `Parameters`, `VolumeBindingMode`, `ReclaimPolicy`, `AllowExpansion`, `MakeDefault`, plus optional extra labels/annotations). When `MakeDefault=true` both the GA and beta `is-default-class` annotations are set. Idempotent: `AlreadyExists` is logged and treated as success. + +## VolumeSnapshotClass + +`pkg/kubernetes/volumesnapshotclass.go` + +- `CreateVolumeSnapshotClass(ctx, kubeconfig, cfg)` — Creates a `snapshot.storage.k8s.io/v1 VolumeSnapshotClass` from `VolumeSnapshotClassConfig` (`Name`, `Driver`, `DeletionPolicy` defaulting to `Delete`, `Parameters`, `MakeDefault`). Idempotent: `AlreadyExists` is logged and treated as success. +- `WaitForVolumeSnapshotClass(ctx, kubeconfig, name, timeout)` — Polls until the named VolumeSnapshotClass is Get-able. + ## BlockDevice `pkg/kubernetes/blockdevice.go` @@ -223,6 +255,69 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `IsSSHConnectionError(err)` — Checks if an error specifically indicates SSH connection failure requiring reconnection. - `WithRetryAfter(cfg, err)` — Returns a modified retry config that respects `RetryAfterSeconds` hints from Kubernetes API errors. +## Rook Config Override + +`pkg/kubernetes/rookconfigoverride.go` + +- `SetRookConfigOverride(ctx, kubeconfig, namespace, globals)` — Creates or updates the `rook-config-override` ConfigMap in the Rook operator namespace. The provided map is rendered under `[global]` and Rook picks it up into every Ceph daemon's `ceph.conf` (used for `ms_crc_data`, `bdev_enable_discard`, and similar knobs). Keys are sorted for stable output. +- `DeleteRookConfigOverride(ctx, kubeconfig, namespace)` — Removes the ConfigMap; safe if it does not exist. +- `RenderCephGlobalConfig(globals)` — Pure helper that renders a `[global]` section for `ceph.conf` from a `map[string]string`. Keys are sorted so the output is byte-stable across calls with logically-equivalent maps (used by `SetRookConfigOverride` to avoid spurious ConfigMap updates and by callers that need to compare the desired vs. live ConfigMap content before deciding to roll daemons). 
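A minimal usage sketch (not code from the repo) of how the Rook config-override helpers compose in a test. Assumptions beyond the glossary text: the package is importable as `github.com/deckhouse/storage-e2e/pkg/kubernetes`, `kubeconfig` is the kubeconfig path used elsewhere in the suite, `SetRookConfigOverride` returns only an `error`, `RenderCephGlobalConfig` returns the rendered string, and the Rook operator namespace is the Deckhouse `d8-sds-elastic`. The function name `seedCephOverrides` is hypothetical; verify the real signatures before copying.

```go
package example

import (
	"context"
	"fmt"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// seedCephOverrides pins ms_crc_data=false before the CephCluster is created.
// Namespace value and return shapes are assumptions made for this sketch.
func seedCephOverrides(ctx context.Context, kubeconfig string) error {
	globals := map[string]string{
		"ms_crc_data": "false",
	}

	// RenderCephGlobalConfig is a pure helper: useful for logging or diffing
	// the desired [global] section before touching the live ConfigMap.
	fmt.Printf("desired ceph.conf overrides:\n%s\n", kubernetes.RenderCephGlobalConfig(globals))

	// Creates or updates the rook-config-override ConfigMap; Rook propagates
	// the values into every Ceph daemon's ceph.conf.
	return kubernetes.SetRookConfigOverride(ctx, kubeconfig, "d8-sds-elastic", globals)
}
```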
+ +## Ceph Credentials + +`pkg/kubernetes/cephcredentials.go` + +- `WaitForCephCredentials(ctx, kubeconfig, namespace, timeout)` — Polls Rook's `rook-ceph-mon` Secret and `rook-ceph-mon-endpoints` ConfigMap until all pieces required to connect a CSI client to the cluster (`fsid`, admin user, admin key, monitor endpoints) are present. Returns a `*CephCredentials`. + +## CephCluster (Rook) + +`pkg/kubernetes/cephcluster.go` + +- `CreateCephCluster(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephCluster` CR using `CephClusterConfig` (image, mon/mgr counts, network provider, OSD storage class / count / size, data-dir host path, etc.). Idempotent. **Fail-fast:** if an existing CR has `metadata.deletionTimestamp != nil`, returns an error instead of trying to update a Terminating object (which would silently no-op and trap the next `WaitForCephClusterReady` for 15-20 minutes). +- `WaitForCephClusterReady(ctx, kubeconfig, namespace, name, timeout)` — Blocks until `status.state == "Created"` (or `status.phase == "Ready"`). HEALTH_WARN is tolerated so single-OSD test clusters still succeed. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. **Fail-fast** when the CR comes back with `deletionTimestamp != nil` — there's no point waiting for Ready on a Terminating object. +- `DeleteCephCluster(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Does NOT garbage-collect OSD data on host disks. Pair with `WaitForCephClusterGone` if the next step depends on the CR being fully GC'd (e.g. before re-creating the cluster, or to detect a stuck `cephcluster.ceph.rook.io` finalizer early). +- `WaitForCephClusterGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR returns NotFound (default `CephClusterGoneTimeout` = 10m when timeout is 0). Logs deletionTimestamp / finalizers progress periodically, so a stuck finalizer (typical after a teardown that left dependents alive — see `DeletionIsBlocked`) is visible immediately instead of after a silent timeout. Fail-fast on timeout: does NOT auto-strip finalizers — investigate the cluster manually before re-running. + +## CephBlockPool (Rook) + +`pkg/kubernetes/cephblockpool.go` + +- `CreateCephBlockPool(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephBlockPool` from `CephBlockPoolConfig` (replicated with optional `requireSafeReplicaSize` override, or erasure-coded with `dataChunks`/`codingChunks`; `failureDomain`). **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephBlockPoolReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephBlockPool(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Pair with `WaitForCephBlockPoolGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`. +- `WaitForCephBlockPoolGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephBlockPoolGoneTimeout` = 5m). Logs progress periodically. 
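The Gone-waiters above exist to enforce a delete order, so a hedged sketch of that order may help. Assumptions, not verified signatures: the same `github.com/deckhouse/storage-e2e/pkg/kubernetes` import path, `kubeconfig` as a path string, `time.Duration` timeouts, error-only returns, and `0` falling back to the documented Gone timeouts; `teardownPoolThenCluster` is a hypothetical name for illustration.

```go
package example

import (
	"context"
	"time"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// teardownPoolThenCluster: the CephBlockPool must be fully garbage-collected
// before the parent CephCluster is deleted, otherwise Rook records
// DeletionIsBlocked / ObjectHasDependents and the CR sticks in Terminating.
func teardownPoolThenCluster(ctx context.Context, kubeconfig, namespace, cluster, pool string) error {
	if err := kubernetes.DeleteCephBlockPool(ctx, kubeconfig, namespace, pool); err != nil {
		return err
	}
	// 0 is assumed to fall back to the documented CephBlockPoolGoneTimeout (5m).
	if err := kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, namespace, pool, 0); err != nil {
		return err
	}
	if err := kubernetes.DeleteCephCluster(ctx, kubeconfig, namespace, cluster); err != nil {
		return err
	}
	// Only now can the CephCluster finalizer clear without dependents in the way.
	return kubernetes.WaitForCephClusterGone(ctx, kubeconfig, namespace, cluster, 10*time.Minute)
}
```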
+ +## CephFilesystem (Rook) + +`pkg/kubernetes/cephfilesystem.go` + +- `CreateCephFilesystem(ctx, kubeconfig, cfg)` — Creates or updates a Rook `CephFilesystem` from `CephFilesystemConfig` (one replicated metadata pool + one replicated data pool, configurable `failureDomain`, `MetadataServerActiveCount`, optional `RequireSafeReplicaSize`). Idempotent. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout)` — Polls until `status.phase == "Ready"`, with a fallback that also accepts `status.conditions[type=Ready,status=True]` for Rook revisions that populate conditions before phase. Each Get is bounded by `PollGetTimeout` (30s) and consecutive Get failures emit WARN, so a dropped SSH tunnel surfaces in seconds instead of after the readyTimeout. Fail-fast on `deletionTimestamp != nil`. +- `DeleteCephFilesystem(ctx, kubeconfig, namespace, name)` — Fire-and-forget delete; NotFound is treated as success. Pair with `WaitForCephFilesystemGone` to make sure the parent CephCluster's deletion isn't blocked by `ObjectHasDependents`. +- `WaitForCephFilesystemGone(ctx, kubeconfig, namespace, name, timeout)` — Polls until the CR is GC'd (default `CephFilesystemGoneTimeout` = 5m). Logs progress periodically. +- `CephFSDataPoolFullName(fsName, dataPoolName)` — Returns the full Ceph pool name (`-`) that should be passed to `CephStorageClass.spec.cephFS.pool`. + +## CephClusterConnection / CephClusterAuthentication (csi-ceph) + +`pkg/kubernetes/cephclusterconnection.go` + +- `CreateCephClusterAuthentication(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterAuthentication` CR (`userID` + `userKey`) used by csi-ceph to log in to Ceph. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephClusterAuthentication(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success. +- `WaitForCephClusterAuthenticationGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterAuthenticationGoneTimeout` = 1m). +- `CreateCephClusterConnection(ctx, kubeconfig, cfg)` — Creates or updates a `CephClusterConnection` CR (`clusterID == fsid`, `monitors`, `userID`, `userKey`). `clusterID` is immutable: existing-resource updates leave it unchanged and only sync monitors/user. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephClusterConnection(ctx, kubeconfig, name)` — Fire-and-forget delete; NotFound is treated as success. +- `WaitForCephClusterConnectionGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephClusterConnectionGoneTimeout` = 1m). +- `WaitForCephClusterConnectionCreated(ctx, kubeconfig, name, timeout)` — Polls until csi-ceph reports `status.phase == "Created"` (credentials + monitors validated against the live Ceph cluster). + +## CephStorageClass (csi-ceph) + +`pkg/kubernetes/cephstorageclass.go` + +- `CreateCephStorageClass(ctx, kubeconfig, cfg)` — Creates or updates a csi-ceph `CephStorageClass` CR (RBD by default; CephFS when `Type == "CephFS"` and `CephFSName` / `CephFSPool` are set). The csi-ceph controller provisions a corresponding core `storage.k8s.io/v1 StorageClass` as a side effect. **Fail-fast** when the existing CR has `deletionTimestamp != nil`. +- `DeleteCephStorageClass(ctx, kubeconfig, name)` — Fire-and-forget delete; the controller removes the backing StorageClass. 
+- `WaitForCephStorageClassGone(ctx, kubeconfig, name, timeout)` — Polls until the CR is GC'd (default `CephStorageClassGoneTimeout` = 1m). +- `WaitForCephStorageClassCreated(ctx, kubeconfig, name, timeout)` — Polls until `status.phase == "Created"`. + ## Default StorageClass (Testkit) `pkg/testkit/storageclass.go` @@ -230,6 +325,20 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `CreateDefaultStorageClass(ctx, kubeconfig, cfg)` — High-level helper: discovers nodes, enables sds-node-configurator/sds-local-volume modules, labels nodes, optionally attaches VirtualDisks, creates LVMVolumeGroups (Thick or Thin with thin pool), creates LocalStorageClass, waits for StorageClass. Configured via `DefaultStorageClassConfig`. - `EnsureDefaultStorageClass(ctx, kubeconfig, cfg)` — Idempotent wrapper around `CreateDefaultStorageClass`. Checks if StorageClass already exists, skips creation if so, then sets it as the cluster default via "global" ModuleConfig. +## Ceph StorageClass (Testkit) + +`pkg/testkit/ceph.go` + +- `EnsureCephStorageClass(ctx, kubeconfig, cfg)` — High-level end-to-end helper that turns an empty test cluster into one with a working csi-ceph `StorageClass`. Steps: (1) enable `sds-node-configurator`, `sds-elastic`, `csi-ceph` modules and wait Ready; (2) optionally call `EnsureDefaultStorageClass` to auto-provision a sds-local-volume SC for OSDs when `OSDStorageClass` is empty; (3) seed `rook-config-override` with `GlobalCephConfigOverrides` (e.g. `ms_crc_data=false`); (4) create Rook `CephCluster` and wait Created; (5) create the backing pool primitive — `CephBlockPool` (when `Type == "RBD"`, default) or `CephFilesystem` (when `Type == "CephFS"`) — and wait Ready; (6) read fsid/monitors/admin-key from Rook-managed secrets; (7) wire csi-ceph by creating `CephClusterAuthentication` + `CephClusterConnection`; (8) create the matching `CephStorageClass` (RBD pool or `-` for CephFS) and wait for the backing core StorageClass. Idempotent; returns the resulting StorageClass name. +- `EnsureDefaultCephStorageClass(ctx, kubeconfig, cfg)` — `EnsureCephStorageClass` + `SetGlobalDefaultStorageClass` so new PVCs without an explicit `storageClassName` use the provisioned Ceph (RBD or CephFS) class. +- `TeardownCephStorageClass(ctx, kubeconfig, cfg)` — Reverse of `EnsureCephStorageClass`. After every Delete it now waits for the CR to be fully GC'd via the matching `WaitForXxxGone` helper. Order is: `CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → (`CephBlockPool` or `CephFilesystem` per `cfg.Type`) → `CephCluster` (unless `SkipClusterTeardown`) → `rook-config-override` ConfigMap. Without those waits the parent `CephCluster` would be deleted before its dependents are gone, Rook would record `DeletionIsBlocked / ObjectHasDependents`, and the next test run would either find a stuck Terminating CR or hang in `WaitForCephClusterReady`. Fail-fast on a Wait*Gone timeout: errors are aggregated and returned, no auto-strip of finalizers — investigate the cluster manually before re-running. NotFound is still treated as success; subsequent deletions are still attempted on partial failures. + +## Ceph Cluster (Testkit) — no csi-ceph wiring + +`pkg/testkit/ceph_cluster.go` + +- `EnsureCephCluster(ctx, kubeconfig, cfg)` — "Stop-before-csi-ceph" variant of `EnsureCephStorageClass`: brings up a Rook-managed Ceph cluster + CephBlockPool via sds-elastic alone. 
Steps: (1) enable `sds-node-configurator` + `sds-elastic` (does **not** enable `csi-ceph`); (2) resolve/provision OSD backing StorageClass (reuses `EnsureDefaultStorageClass`); (3) seed `rook-config-override` with `GlobalCephConfigOverrides`; (4) create Rook `CephCluster` and wait Created; (5) create `CephBlockPool` and wait Ready. Does not create `CephClusterConnection`/`CephClusterAuthentication`/`CephStorageClass`. Useful when tests need a live Ceph backend to talk to directly (e.g. from within csi-ceph's own e2e) without the testkit preselecting a csi-ceph-backed StorageClass. Idempotent; returns the pool name. + ## Stress Tests (Testkit) `pkg/testkit/stress-tests.go` @@ -239,3 +348,14 @@ All exported functions available in the `pkg/` directory, grouped by resource. - `(*Config) Validate()` — Validates the stress test configuration (namespace, storage class, PVC size, mode-specific params). - `(*StressTestRunner) Run(ctx)` — Executes the stress test based on configured mode: flog, check_fs_only, check_cloning, check_restoring_from_snapshot, snapshot_only, or snapshot_resize_cloning. - `CleanupStressNamespaces(ctx, kubeconfig)` — Deletes all namespaces with the `load-test=true` label. + +## Ceph CRC (Testkit) + +`pkg/testkit/ceph_crc.go` + +- `EnableServerCRC(ctx, kubeconfig, namespace)` — Sets `ms_crc_data=true` on the server side: rewrites `rook-config-override` and rolling-restarts every Rook-managed Ceph daemon Deployment (mon/mgr/osd/mds/rgw) plus the rook-operator. Use when a test wants Ceph pinned in the explicit CRC-on state. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(true))`. +- `DisableServerCRC(ctx, kubeconfig, namespace)` — Same as `EnableServerCRC` but flips Ceph into `ms_crc_data=false`. Paired with a csi-ceph client that defaults to `msCrcData=true` this reproduces the msCrcData matrix mismatch case. Thin wrapper over `SetMsCrcDataOnServer(..., ptr(false))`. +- `ResetServerCRCToDefault(ctx, kubeconfig, namespace)` — Removes `ms_crc_data` from `rook-config-override` so Ceph falls back to its compile-time default (`true`). Convenient for `AfterAll` / `AfterEach` restoration. Thin wrapper over `SetMsCrcDataOnServer(..., nil)`. +- `SetMsCrcDataOnServer(ctx, kubeconfig, namespace, enabled *bool)` — Lower-level primitive behind the three readability wrappers. Rewrites `rook-config-override` so that only `ms_crc_data=` ends up under `[global]` (`nil` removes the key entirely). Idempotent: when the ConfigMap already encodes the desired state, nothing is restarted. Otherwise it (1) rolling-restarts Rook-managed Ceph daemons via `RestartCephDaemons`, (2) restarts the rook-operator via `RestartRookOperator`, and (3) waits for every `CephFilesystem` in the namespace to come back to Ready. Prefer the named wrappers at call sites; this primitive exists so a boolean test parameter (e.g. a CRC matrix) doesn't have to branch. +- `RestartCephDaemons(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts every Rook-managed Ceph daemon Deployment that consumes `/etc/ceph/ceph.conf` — the selector covers `rook-ceph-mon`, `rook-ceph-mgr`, `rook-ceph-osd`, `rook-ceph-mds`, `rook-ceph-rgw` — and waits for each to reach its desired Ready replica count. All five roles are bounced because a global ConfigMap knob like `ms_crc_data` lives in `ceph.conf` and any daemon left running with the old value (typically MDS) silently breaks the messenger handshake and degrades CephFS / blocks csi-cephfs PVCs in Pending. Operator restart is intentionally out of scope here — see `RestartRookOperator`. 
+- `RestartRookOperator(ctx, kubeconfig, namespace, timeout)` — Rollout-restarts the rook-operator Deployment in the given namespace and waits for the new pod to become Ready. Required after every wire-protocol bounce: the operator runs as a Ceph admin client (admin keyring + baked-in `ceph.conf`), and without a pod restart it keeps retrying with the stale `ceph.conf`, which surfaces in the cephcluster CR as `HEALTH_ERR` / `state: Error` until the next reconcile. Deckhouse-specific naming: the Deployment name is derived from the namespace by stripping the leading `d8-` prefix (`d8-sds-elastic` → `sds-elastic`). Vanilla Rook (`rook-ceph-operator` in `rook-ceph`) is not supported. diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md index e7d995f..3725397 100644 --- a/docs/WORKLOG.md +++ b/docs/WORKLOG.md @@ -55,3 +55,34 @@ All notable changes to this repository are documented here. New entries are appe - **Add** `.cursor/rules/todo-command.mdc`: `/todo` command for managing `docs/TODO.md` - **Add** `.cursor/rules/backward-compatibility.mdc`: rule to guard backward compatibility of exported `pkg/` API — ask before breaking changes, mark worklog with `[Possible compatibility break]` - **Add** `.cursor/rules/versatile-functions.mdc`: rule to ensure new functions are general-purpose and reusable — return data not decisions, no hardcoded names, compose from existing functions, no empty wrappers + +## 2026-05-05 + +- **Add** `internal/config/overrides.go` + `_test.go`: `ExpandEnvInModulePullOverride` resolves `${VAR}` placeholders in `modulePullOverride` at config load time; missing env fails fast with an explicit error so CI can point modules at `pr` / `mr` images via a single env var (`MODULE_IMAGE_TAG`) without editing `cluster_config.yml`. +- **Update** `internal/cluster/cluster.go::LoadClusterConfig` and `pkg/cluster/cluster.go::loadClusterConfigFromPath`: hook `ExpandEnvInModulePullOverride` right after `yaml.Unmarshal`. +- **Update** `README.md`: documented `${VAR}` form in `modulePullOverride` and the fail-fast behavior on unset env vars. +- **Refactor** `internal/config/env.go`: extracted `ApplyDefaults()` out of `ValidateEnvironment` so suites that don't call validation still get defaults for `SSH_VM_USER` / `SSH_PRIVATE_KEY` / `SSH_PUBLIC_KEY` / `TEST_CLUSTER_NAMESPACE` / `YAML_CONFIG_FILENAME` / `TEST_CLUSTER_CLEANUP`. +- **Update** `pkg/cluster/cluster.go::CreateTestCluster`: call `config.ApplyDefaults()` defensively + fall back to `config.YAMLConfigFilenameDefaultValue` when the filename arg is empty. +- **Bugfix** `pkg/cluster/setup.go::executeDhctlBootstrap`: pass `FORCE_NO_PRIVATE_KEYS=true` and `USE_AGENT_WITH_NO_PRIVATE_KEYS=true` env vars into the `dhctl bootstrap` container so `lib-connection` stops opening `/root/.ssh/id_rsa` and authenticates exclusively via the mounted ssh-agent socket — fixes "Failed to read private keys from flags" on passphrase-protected keys. +- **Bugfix** `pkg/cluster/vms.go::generateCloudInitUserData`: pin apt to `mirror.yandex.ru` and force IPv4 (`Acquire::ForceIPv4=true`) in cloud-init, so `package_update` and Docker install stop stalling when `archive.ubuntu.com` IPs are partially unreachable. +- **Refactor** `internal/infrastructure/ssh/client.go::StartTunnel` (both `*client` and `*jumpHostClient`): extracted shared `runTunnelLoop` + `tunnelDialer`. 
On dial failure that looks like a dropped SSH session, the loop now logs a visible WARN, calls the existing `reconnect()` (retry + exponential backoff), and retries the dial once with the freshly rebuilt session. Fixes the "test hangs 20 minutes silently after Wi-Fi flap" failure mode. +- **Add** `pkg/kubernetes/poll.go`: `pollResourceUntilReady` centralizes the `WaitFor*Ready` loops with a per-call `PollGetTimeout` (30s) on every Get and WARN logging once consecutive Get failures cross 3, so a dropped tunnel surfaces in seconds instead of after the 20-minute readyTimeout. +- **Refactor** `pkg/kubernetes/cephcluster.go`, `pkg/kubernetes/cephblockpool.go`, `pkg/kubernetes/cephfilesystem.go`: `WaitForCephClusterReady` / `WaitForCephBlockPoolReady` / `WaitForCephFilesystemReady` migrated to `pollResourceUntilReady`. Public signatures unchanged. +- **Add** `pkg/kubernetes/pod_exec.go`: `ExecInPod` (pods/exec via SPDY), `ReadFileFromPod` (`cat ` wrapper for non-distroless images), and `ReadFileFromDistrolessPod` (single-shot ephemeral container injection that reads through `/proc/1/root` thanks to the shared PID namespace; uses the dedicated `ephemeralcontainers` subresource so the target pod and its sandbox are NOT restarted and `metadata.generation` is not bumped — keeps downstream rollout assertions clean). +- **Add** `pkg/kubernetes/pod_exec.go::DistrolessReader` + `OpenDistrolessReader`: long-lived ephemeral-container session for cheap repeated reads. `(*DistrolessReader).ReadFile` is a plain `pods/exec` round-trip against the already-running ephemeral container; `(*DistrolessReader).PodName()` lets callers detect rollouts and re-open against the new pod. Pays the ephemeral-container cold start once instead of per `Eventually` iteration. +- **Add** `pkg/kubernetes/poll.go::pollResourceUntilGone` + per-CR `WaitForCephClusterGone` / `WaitForCephBlockPoolGone` / `WaitForCephFilesystemGone` / `WaitForCephClusterAuthenticationGone` / `WaitForCephClusterConnectionGone` / `WaitForCephStorageClassGone` helpers. Logs `deletionTimestamp` and finalizers progress periodically so a stuck finalizer is visible immediately. Fail-fast on timeout — no auto-strip of finalizers; the operator must investigate before re-running. +- **Update** Ceph CR `Create*` helpers (`CreateCephCluster` / `CreateCephBlockPool` / `CreateCephFilesystem` / `CreateCephClusterAuthentication` / `CreateCephClusterConnection` / `CreateCephStorageClass`) and `WaitFor*Ready`: now fail fast when the live object has `metadata.deletionTimestamp != nil`. Prevents the framework from updating a Terminating object (silent no-op) or waiting 20 minutes on Ready for an object that's being garbage-collected. +- **Refactor** `pkg/testkit/ceph.go::TeardownCephStorageClass`: explicitly `WaitFor*Gone` after every Delete in the right order (`CephStorageClass` → `CephClusterConnection` → `CephClusterAuthentication` → `CephBlockPool` or `CephFilesystem` → `CephCluster` → `rook-config-override`). Without these waits the parent `CephCluster` was deleted before its dependents were gone, Rook recorded `DeletionIsBlocked / ObjectHasDependents`, and the next test run either found a stuck Terminating CR or hung in `WaitForCephClusterReady`. Errors are aggregated; NotFound is treated as success. +- **Update** `pkg/testkit/ceph_crc.go::RestartCephDaemons`: extended the daemon selector from `mon,mgr,osd` to `mon,mgr,osd,mds,rgw`. 
A global `ms_crc_data` flip lives in `ceph.conf` and any unrestarted daemon (typically MDS) silently breaks the messenger handshake — degrades CephFS and pins csi-cephfs PVCs in Pending. `rgw` is included for forward-compat with future S3 tests. +- **Add** `pkg/testkit/ceph_crc.go::RestartRookOperator`: rollout-restarts the rook-operator Deployment after a wire-protocol bounce so it picks up the new `ceph.conf` instead of pinning the cephcluster CR in `HEALTH_ERR`. Deployment name is derived from the namespace by stripping the leading `d8-` prefix (Deckhouse module convention, e.g. `d8-sds-elastic` → `sds-elastic`); vanilla Rook is not supported. +- **Update** `pkg/testkit/ceph_crc.go::SetMsCrcDataOnServer`: after rewriting `rook-config-override` the helper now (1) calls `RestartCephDaemons` for the extended selector, (2) calls `RestartRookOperator`, then (3) waits for every `CephFilesystem` in the namespace to come back to Ready. This is what unblocks the CephFS half of the msCrcData matrix — previously a flip silently left MDS / operator out of sync. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: noted that the three `WaitForCeph*Ready` helpers now apply a per-call deadline and emit WARN on consecutive Get failures. +- **Update** `docs/ARCHITECTURE.md`: added `pkg/kubernetes/poll.go` to Section 1.1 and Section 3.6, added `pkg/kubernetes/cephfilesystem.go` (carry-over from the prior commit), added `internal/config/overrides.go` to Section 3.1. + +## 2026-05-06 + +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when SSH retrieval of `/etc/kubernetes/{super-admin,admin}.conf` fails, the function now fails fast unless `KUBE_CONFIG_PATH` is set explicitly. The previously considered fallback to `clientcmd.NewDefaultClientConfigLoadingRules` (KUBECONFIG / `~/.kube/config`) was dropped before release to preserve the original fail-fast contract — a silent fallback to the developer's personal kubeconfig is too risky in CI and on machines whose `kubectl` already points at an unrelated cluster. +- **Update** `docs/FUNCTIONS_GLOSSARY.md`: documented `OpenDistrolessReader` + `*DistrolessReader` methods, `CreateStorageClass`, `CreateVolumeSnapshotClass` / `WaitForVolumeSnapshotClass`, `RenderCephGlobalConfig`, and the full `pkg/testkit/ceph_crc.go` surface (`EnableServerCRC` / `DisableServerCRC` / `ResetServerCRCToDefault` / `SetMsCrcDataOnServer` / `RestartCephDaemons` / `RestartRookOperator`); added matching TOC entries. +- **Update** `docs/ARCHITECTURE.md`: added `internal/config/overrides.go` to Section 1.1 (was only in Section 3.1), added `pkg/kubernetes/pod_exec.go` to Section 1.1 and Section 3.6, documented `KUBE_CONFIG_PATH` semantics and `${VAR}` expansion (`MODULE_IMAGE_TAG`) in Section 7. +- **Update** `internal/cluster/cluster.go::GetKubeconfig`: when the SSH-side kubeconfig fetch fails and `KUBE_CONFIG_PATH` is unset, the function now runs two cheap probes (`test -f /etc/kubernetes/{super-admin,admin}.conf` for existence, then `sudo -n -l /bin/cat ` for a matching NOPASSWD rule) to classify the failure and returns a structured, actionable error. The error embeds a ready-to-paste `/etc/sudoers.d/e2e-kubeconfig` snippet for the most common cause (passworded sudo on the master) and a `KUBE_CONFIG_PATH` escape hatch. Original SSH error is still wrapped via `%w` so `errors.Is`/`errors.As` keep working. 
+- **Bugfix** `internal/cluster/cluster.go::getKubeconfigRemoteShell`: dropped the `sudo -n sh -c '...'` wrapper and now invokes `sudo -n /bin/cat ` directly (with a `||` fallback from `super-admin.conf` to `admin.conf`). The wrapper made the privileged binary `/bin/sh`, so the recommended fine-grained `NOPASSWD: /bin/cat /etc/kubernetes/{super-admin,admin}.conf` sudoers rule did not match and `GetKubeconfig` failed with "sudo on the master requires a password" even after the operator pasted the recommended snippet. Aligned the diagnostic probe in `classifyKubeconfigFetchFailure` to `sudo -n -l /bin/cat ` for the same reason — `sudo -n true` returned 0 under common `NOPASSWD: ALL` configurations and would mask the real problem on hosts that only allow `cat`. diff --git a/go.mod b/go.mod index 764c44d..698c7ce 100644 --- a/go.mod +++ b/go.mod @@ -35,13 +35,16 @@ require ( github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kr/fs v0.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/openshift/custom-resource-status v1.1.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.7 // indirect diff --git a/go.sum b/go.sum index e68a8bc..5089189 100644 --- a/go.sum +++ b/go.sum @@ -5,6 +5,8 @@ github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lpr github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -107,6 +109,8 @@ github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2c github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= 
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -133,6 +137,8 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -143,6 +149,7 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWu github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 1cd7469..f5acf09 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -48,6 +48,7 @@ import ( "github.com/deckhouse/storage-e2e/internal/config" "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" ) // LoadClusterConfig loads and validates a cluster configuration from a YAML file @@ -73,6 +74,14 @@ func LoadClusterConfig(configFilename string) (*config.ClusterDefinition, error) return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration if err := validateClusterConfig(&clusterDef); err != nil { return nil, fmt.Errorf("config validation failed: %w", err) @@ -182,7 +191,20 @@ func expandPath(path string) (string, error) { // getKubeconfigRemoteShell prints kubeconfig for use with client-go. It prefers // /etc/kubernetes/super-admin.conf (Kubernetes 1.29+ unified kubeconfig) when the file // exists, and falls back to /etc/kubernetes/admin.conf otherwise. 
-const getKubeconfigRemoteShell = "sudo -n sh -c 'if [ -f /etc/kubernetes/super-admin.conf ]; then cat /etc/kubernetes/super-admin.conf; else cat /etc/kubernetes/admin.conf; fi'" +// +// The two `sudo -n /bin/cat ...` invocations are intentionally NOT wrapped in +// `sudo -n sh -c '...'`. With a wrapper the privileged binary is /bin/sh, so a +// minimal sudoers rule of the form +// +// user ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf +// +// would NOT match and sudo would still ask for a password. By calling /bin/cat +// directly we make this command work with the same fine-grained NOPASSWD rule +// that the buildKubeconfigFetchError diagnostic recommends. The 2>/dev/null on +// the first try suppresses the "permission denied / no such file" noise so the +// fallback to admin.conf produces clean kubeconfig content on stdout. +const getKubeconfigRemoteShell = "sudo -n /bin/cat /etc/kubernetes/super-admin.conf 2>/dev/null " + + "|| sudo -n /bin/cat /etc/kubernetes/admin.conf" // GetKubeconfig connects to the master node via SSH, retrieves kubeconfig (preferring // super-admin.conf over admin.conf when available), and returns a rest.Config that can @@ -216,35 +238,56 @@ func GetKubeconfig(ctx context.Context, masterIP, user, keyPath string, sshClien kubeconfigPath := filepath.Join(outputDir, fmt.Sprintf("kubeconfig-%s.yml", masterIP)) - var kubeconfigContent []byte + var ( + kubeconfigContent []byte + // kubeconfigSource is a short, human-readable tag identifying where the + // kubeconfig came from. It's printed at the end of GetKubeconfig so it + // is always obvious in test logs which cluster we're actually about to + // hit — important after diagnosing wrong-cluster bugs that look like + // "stale lock" or "unexpected modules". + kubeconfigSource string + ) // Read kubeconfig via SSH: prefer super-admin.conf when present (see getKubeconfigRemoteShell). - kubeconfigContentStr, err := sshClient.Exec(ctx, getKubeconfigRemoteShell) - if err != nil { - // SSH retrieval failed (likely due to sudo password requirement) - // Try to use KUBE_CONFIG_PATH if set, otherwise notify user - if config.KubeConfigPath != "" { - // Expand path to handle ~ and resolve symlinks if present - resolvedPath, err := expandPath(config.KubeConfigPath) - if err != nil { - return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, err) - } - // Read kubeconfig content from the provided file - kubeconfigContent, err = os.ReadFile(resolvedPath) - if err != nil { - return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, err) - } - } else { - // KUBE_CONFIG_PATH not set, notify user and fail - return nil, "", fmt.Errorf("failed to read kubeconfig from master (this may occur if sudo requires a password). "+ - "Please download the kubeconfig file manually and provide its full path via KUBE_CONFIG_PATH environment variable. "+ - "Original error: %w", err) - } - } else { + kubeconfigContentStr, sshErr := sshClient.Exec(ctx, getKubeconfigRemoteShell) + switch { + case sshErr == nil: // SSH succeeded - use the content from SSH kubeconfigContent = []byte(kubeconfigContentStr) + kubeconfigSource = fmt.Sprintf("SSH(%s@%s:/etc/kubernetes/{super-admin,admin}.conf)", user, masterIP) + + case config.KubeConfigPath != "": + // SSH retrieval failed (likely due to sudo password requirement) and the + // caller pointed us at a specific kubeconfig file via KUBE_CONFIG_PATH. 
+ resolvedPath, expandErr := expandPath(config.KubeConfigPath) + if expandErr != nil { + return nil, "", fmt.Errorf("failed to expand KUBE_CONFIG_PATH (%s): %w", config.KubeConfigPath, expandErr) + } + readContent, readErr := os.ReadFile(resolvedPath) + if readErr != nil { + return nil, "", fmt.Errorf("failed to read kubeconfig from KUBE_CONFIG_PATH (%s): %w", resolvedPath, readErr) + } + kubeconfigContent = readContent + kubeconfigSource = fmt.Sprintf("KUBE_CONFIG_PATH=%s", resolvedPath) + + default: + // SSH failed and the caller did not opt into a specific kubeconfig via + // KUBE_CONFIG_PATH. Fail fast rather than silently picking up the + // developer's ~/.kube/config / $KUBECONFIG, which has historically + // caused tests to acquire stale locks on unrelated SAN clusters or + // deploy modules against the wrong stand. Classify the failure so the + // returned error tells the operator which knob to turn. + cause := classifyKubeconfigFetchFailure(ctx, sshClient) + return nil, "", buildKubeconfigFetchError(user, masterIP, sshErr, cause) } + // Always stamp the kubeconfig source + the resulting current-context/server + // in the log. With this single line a developer reading the output knows + // for sure which cluster the test is about to talk to, regardless of which + // of the three resolution paths fired above. + finalCtx, finalServer := kubeconfigContextSummary(kubeconfigContent) + logger.Info("Loaded kubeconfig (source=%s, current-context=%q, server=%q)", kubeconfigSource, finalCtx, finalServer) + // Write kubeconfig content to file (always write a working copy, regardless of source) kubeconfigFile, err := os.Create(kubeconfigPath) if err != nil { @@ -348,3 +391,128 @@ func UpdateKubeconfigPort(kubeconfigPath string, localPort int) error { return nil } + +// kubeconfigContextSummary parses a serialized kubeconfig and returns its +// current-context name and the matching cluster's `server:` URL. Used purely +// for human-readable log lines that identify which cluster the test is about +// to talk to. On any parse failure the helper returns "" / "" +// rather than an error: failing here would defeat its only purpose, which is +// to make the surrounding log message safer to print under partial failures. +func kubeconfigContextSummary(content []byte) (currentContext, server string) { + currentContext = "" + server = "" + if len(content) == 0 { + return + } + cfg, err := clientcmd.Load(content) + if err != nil || cfg == nil { + return + } + if cfg.CurrentContext != "" { + currentContext = cfg.CurrentContext + } + if ctx, ok := cfg.Contexts[cfg.CurrentContext]; ok && ctx != nil { + if cl, ok := cfg.Clusters[ctx.Cluster]; ok && cl != nil && cl.Server != "" { + server = cl.Server + } + } + return +} + +// kubeconfigFetchCause discriminates the most likely reason +// getKubeconfigRemoteShell exited non-zero. Used solely to choose the +// human-readable error template — the original SSH error is always +// preserved via %w wrapping, so callers' errors.Is/errors.As keep working. +type kubeconfigFetchCause int + +const ( + causeUnknown kubeconfigFetchCause = iota + causeSudoPasswordRequired + causeKubeconfigMissing +) + +// classifyKubeconfigFetchFailure runs two cheap probes against the master +// to figure out the most likely reason getKubeconfigRemoteShell failed. +// Best-effort: any probe-time error is treated as "unknown" rather than +// surfaced — we are already in an error path and the original sshErr is +// what callers care about. 
+// +// Order matters and matches what we actually need to know: +// 1. Do the kubeconfig files even exist on this host? `test -f` runs as +// the SSH user without sudo and returns 0 even when the file is +// root:root 0600, because it only checks the inode. If both files are +// missing this is almost certainly a non-control-plane node and no +// sudoers tweak will help. +// 2. If at least one file exists, are we allowed to `cat` it without a +// password? We probe with `sudo -n -l /bin/cat `: -l makes sudo +// just look up the rule (no execution), and with -n it exits non-zero +// when no matching NOPASSWD rule applies. Crucially this matches the +// SAME granular rule the diagnostic recommends, so a misconfiguration +// where the operator added `NOPASSWD: /bin/sh` (or only NOPASSWD: ALL) +// does NOT mask the real "missing /bin/cat rule" cause. +func classifyKubeconfigFetchFailure(ctx context.Context, sshClient ssh.SSHClient) kubeconfigFetchCause { + if _, err := sshClient.Exec(ctx, + "test -f /etc/kubernetes/super-admin.conf || test -f /etc/kubernetes/admin.conf"); err != nil { + return causeKubeconfigMissing + } + if _, err := sshClient.Exec(ctx, + "sudo -n -l /bin/cat /etc/kubernetes/super-admin.conf >/dev/null 2>&1 || "+ + "sudo -n -l /bin/cat /etc/kubernetes/admin.conf >/dev/null 2>&1"); err != nil { + return causeSudoPasswordRequired + } + return causeUnknown +} + +// buildKubeconfigFetchError renders an actionable, multi-line error for +// the caller to print. Each branch lists the same kind of remediation +// (sudoers tweak, KUBE_CONFIG_PATH escape, SSH check) but in the order +// most relevant for the detected cause. The returned error always wraps +// the original sshErr so errors.Is(err, &ssh.ExitError{...}) still works. +func buildKubeconfigFetchError(user, masterIP string, sshErr error, cause kubeconfigFetchCause) error { + sudoersLine := fmt.Sprintf( + "%s ALL=(root) NOPASSWD: /bin/cat /etc/kubernetes/super-admin.conf, /bin/cat /etc/kubernetes/admin.conf", + user, + ) + sudoersFix := "echo '" + sudoersLine + "' | sudo tee /etc/sudoers.d/e2e-kubeconfig && sudo chmod 0440 /etc/sudoers.d/e2e-kubeconfig" + + switch cause { + case causeSudoPasswordRequired: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "sudo on the master requires a password (sudo -n exited non-zero).\n"+ + "Pick ONE remedy:\n"+ + " 1) Allow passwordless cat of the two kubeconfig files (run on the master):\n"+ + " %s\n"+ + " 2) Point the test at a local kubeconfig instead (no SSH/sudo at all):\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, sshErr) + + case causeKubeconfigMissing: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s): "+ + "neither /etc/kubernetes/super-admin.conf nor /etc/kubernetes/admin.conf exists on the host — "+ + "this looks like a non-control-plane node.\n"+ + "Pick ONE remedy:\n"+ + " 1) Make sure SSH_HOST points at a Kubernetes control-plane (master) node "+ + "(check SSH_HOST/SSH_USER, and SSH_JUMP_HOST if you use one).\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + "Original SSH error: %w", + user, masterIP, sshErr) + + default: + return fmt.Errorf( + "failed to read kubeconfig from master via SSH (%s@%s) "+ + "and KUBE_CONFIG_PATH is not set.\n"+ + "Pick ONE remedy:\n"+ + " 1) If sudo on the master requires a password, allow passwordless cat of the kubeconfig files:\n"+ + 
" %s\n"+ + " 2) Set KUBE_CONFIG_PATH to a kubeconfig file on your local machine:\n"+ + " export KUBE_CONFIG_PATH=$HOME/.kube/config\n"+ + " 3) Fix SSH credentials so the master is reachable as %s with key-based auth.\n"+ + "Original SSH error: %w", + user, masterIP, sudoersFix, user, sshErr) + } +} + diff --git a/internal/config/env.go b/internal/config/env.go index 23b3a0a..37dcae6 100644 --- a/internal/config/env.go +++ b/internal/config/env.go @@ -242,8 +242,16 @@ func EffectiveVirtualMachineClassName() string { return n } -func ValidateEnvironment() error { - // Default values for environment variables +// ApplyDefaults populates package-level config variables that have a documented +// default value but were not provided through the environment. It is idempotent +// and safe to call multiple times. +// +// Suites that don't call ValidateEnvironment() (because they don't need its +// required-variable checks) should still call ApplyDefaults() — otherwise +// optional variables like SSH_VM_USER stay empty and propagate as user="" all +// the way to the SSH server, where it shows up as "Invalid user" / publickey +// rejection that is hard to attribute to a missing default. +func ApplyDefaults() { if YAMLConfigFilename == "" { YAMLConfigFilename = YAMLConfigFilenameDefaultValue } @@ -264,6 +272,10 @@ func ValidateEnvironment() error { if TestClusterNamespace == "" { TestClusterNamespace = TestClusterNamespaceDefaultValue } +} + +func ValidateEnvironment() error { + ApplyDefaults() TestClusterVirtualMachineClassName = strings.TrimSpace(TestClusterVirtualMachineClassName) if TestClusterVirtualMachineClassName == "" { diff --git a/internal/config/overrides.go b/internal/config/overrides.go new file mode 100644 index 0000000..5eed2fc --- /dev/null +++ b/internal/config/overrides.go @@ -0,0 +1,68 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "fmt" + "os" + "regexp" +) + +// envVarRefPattern matches ${NAME} placeholders. We accept only the braced +// form (no bare $NAME) to keep substitution intent explicit and avoid +// accidentally rewriting tags that legitimately contain a dollar sign. +var envVarRefPattern = regexp.MustCompile(`\$\{([A-Za-z_][A-Za-z0-9_]*)\}`) + +// ExpandEnvInModulePullOverride expands ${VAR} references in each module's +// ModulePullOverride field. If a referenced env var is not set, returns an +// error pointing at the offending module so CI fails loudly instead of +// silently falling back to the "main" default in configureModulePullOverride. +// +// This lets test suites declare in YAML which modules should track a CI-built +// image without hard-coding any tag: +// +// modules: +// - name: csi-ceph +// modulePullOverride: "${MODULE_IMAGE_TAG}" +// +// CI then sets MODULE_IMAGE_TAG=pr (GitHub) or mr (GitLab), and the +// resulting ModulePullOverride CR points at the right image without anyone +// editing the YAML per run. +// +// Use this hook right after yaml.Unmarshal of cluster_config.yml. 
Modules +// without any placeholder are left untouched. +func ExpandEnvInModulePullOverride(def *ClusterDefinition) error { + for _, m := range def.DKPParameters.Modules { + if m == nil || m.ModulePullOverride == "" { + continue + } + matches := envVarRefPattern.FindAllStringSubmatch(m.ModulePullOverride, -1) + if len(matches) == 0 { + continue + } + for _, ms := range matches { + if _, ok := os.LookupEnv(ms[1]); !ok { + return fmt.Errorf( + "module %q references env var ${%s} in modulePullOverride but it is not set", + m.Name, ms[1], + ) + } + } + m.ModulePullOverride = os.Expand(m.ModulePullOverride, os.Getenv) + } + return nil +} diff --git a/internal/config/overrides_test.go b/internal/config/overrides_test.go new file mode 100644 index 0000000..dfe79e6 --- /dev/null +++ b/internal/config/overrides_test.go @@ -0,0 +1,149 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "os" + "strings" + "testing" +) + +func TestExpandEnvInModulePullOverride_NoPlaceholder(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: ""}, + {Name: "sds-elastic"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "" { + t.Errorf("csi-ceph: got %q, want empty", got) + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "" { + t.Errorf("sds-elastic: got %q, want empty", got) + } +} + +func TestExpandEnvInModulePullOverride_Expands(t *testing.T) { + t.Setenv("MODULE_IMAGE_TAG", "pr131") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${MODULE_IMAGE_TAG}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("got %q, want %q", got, "pr131") + } +} + +func TestExpandEnvInModulePullOverride_MissingEnvFails(t *testing.T) { + // Use t.Setenv to register cleanup that restores the original value (if + // any) after the test, then os.Unsetenv to actually drop it for this run. 
+ const name = "MISSING_TAG_FOR_TEST" + t.Setenv(name, "anything") + if err := os.Unsetenv(name); err != nil { + t.Fatalf("os.Unsetenv: %v", err) + } + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "snapshot-controller", ModulePullOverride: "main"}, + {Name: "csi-ceph", ModulePullOverride: "${" + name + "}"}, + }, + }, + } + err := ExpandEnvInModulePullOverride(def) + if err == nil { + t.Fatalf("expected error for missing env, got nil") + } + if !strings.Contains(err.Error(), "csi-ceph") { + t.Errorf("error should mention offending module name, got: %v", err) + } + if !strings.Contains(err.Error(), name) { + t.Errorf("error should mention env var name %q, got: %v", name, err) + } +} + +func TestExpandEnvInModulePullOverride_PerModuleEnvs(t *testing.T) { + t.Setenv("CSI_CEPH_TAG", "pr131") + t.Setenv("SDS_ELASTIC_TAG", "mr41") + + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${CSI_CEPH_TAG}"}, + {Name: "sds-elastic", ModulePullOverride: "${SDS_ELASTIC_TAG}"}, + {Name: "snapshot-controller", ModulePullOverride: "main"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "pr131" { + t.Errorf("csi-ceph: got %q, want %q", got, "pr131") + } + if got := def.DKPParameters.Modules[1].ModulePullOverride; got != "mr41" { + t.Errorf("sds-elastic: got %q, want %q", got, "mr41") + } + if got := def.DKPParameters.Modules[2].ModulePullOverride; got != "main" { + t.Errorf("snapshot-controller: got %q, want %q", got, "main") + } +} + +func TestExpandEnvInModulePullOverride_MultiplePlaceholdersInOneString(t *testing.T) { + t.Setenv("PREFIX", "branch") + t.Setenv("NAME", "ms-crc") + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{ + {Name: "csi-ceph", ModulePullOverride: "${PREFIX}-${NAME}"}, + }, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got := def.DKPParameters.Modules[0].ModulePullOverride; got != "branch-ms-crc" { + t.Errorf("got %q, want %q", got, "branch-ms-crc") + } +} + +func TestExpandEnvInModulePullOverride_NilModuleSliceEntry(t *testing.T) { + def := &ClusterDefinition{ + DKPParameters: DKPParameters{ + Modules: []*ModuleConfig{nil, {Name: "csi-ceph", ModulePullOverride: "main"}}, + }, + } + if err := ExpandEnvInModulePullOverride(def); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/internal/infrastructure/ssh/client.go b/internal/infrastructure/ssh/client.go index e2a24d1..a9654b5 100644 --- a/internal/infrastructure/ssh/client.go +++ b/internal/infrastructure/ssh/client.go @@ -404,94 +404,20 @@ func (c *client) reconnect(ctx context.Context) error { // StartTunnel starts an SSH tunnel with port forwarding from local to remote // It returns a function to stop the tunnel and an error if the tunnel fails to start func (c *client) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Check context before starting - if err := ctx.Err(); err != nil { - return nil, fmt.Errorf("context error before starting tunnel: %w", err) - } - - listener, err := net.Listen("tcp", "127.0.0.1:"+localPort) - if err != nil { - return nil, fmt.Errorf("failed to listen on local port %s: %w", localPort, err) - } - - stopChan := make(chan struct{}) - - go func() { - defer 
listener.Close() - for { - // Check context and stop channel - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - } - - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if err := listener.(*net.TCPListener).SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } - - localConn, err := listener.Accept() - if err != nil { - // Listener closed or error occurred - select { - case <-ctx.Done(): - return - case <-stopChan: - return - default: - // Continue if not stopped - continue - } + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s local:%s -> remote:%s", c.user, c.host, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + sc := c.sshClient + c.mu.Unlock() + if sc == nil { + return nil, fmt.Errorf("ssh client is not initialized") } - - go func() { - defer localConn.Close() - remoteConn, err := c.sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() - } - }() - - stop := func() error { - close(stopChan) - return listener.Close() + return sc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, } - - return stop, nil + return runTunnelLoop(ctx, localPort, dialer) } // Exec executes a command on the remote host with automatic retry and reconnection @@ -667,7 +593,7 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo } // Create SSH config for target host - targetConfig, _, err := createSSHConfig(targetUser, targetKeyPath) + targetConfig, targetKeyInfo, err := createSSHConfig(targetUser, targetKeyPath) if err != nil { jumpClient.Close() return nil, fmt.Errorf("failed to create SSH config for target host: %w", err) @@ -696,14 +622,22 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo targetConn, err := jumpClient.Dial("tcp", targetAddr) if err != nil { - lastErr = fmt.Errorf("failed to dial target host %s@%s through jump host: %w", targetUser, targetAddr, err) + lastErr = fmt.Errorf("failed to dial target host %q@%s through jump host %q@%s: %w", + targetUser, targetAddr, jumpUser, jumpAddr, err) continue } targetClientConn, targetChans, targetReqs, err := ssh.NewClientConn(targetConn, targetAddr, targetConfig) if err != nil { targetConn.Close() - lastErr = fmt.Errorf("failed to establish SSH connection to target host: %w", err) + lastErr = fmt.Errorf( + "failed to establish SSH connection to target host %q@%s (via jump %q@%s): %w\n"+ + " Key used: %s (algorithm: %s, fingerprint: %s)\n"+ + " Hint: verify SSH_VM_USER (current=%q) is correct for this VM image and that the key's public part is in %s@%s:~/.ssh/authorized_keys", + targetUser, targetAddr, jumpUser, jumpAddr, err, + targetKeyInfo.Path, targetKeyInfo.Algorithm, targetKeyInfo.Fingerprint, + targetUser, targetUser, targetAddr, + ) continue } @@ 
-713,7 +647,8 @@ func NewClientWithJumpHost(jumpUser, jumpHost, jumpKeyPath, targetUser, targetHo if targetClient == nil { jumpClient.Close() - return nil, fmt.Errorf("failed to connect to target host after %d attempts: %w", maxRetries, lastErr) + return nil, fmt.Errorf("failed to connect to target host %q@%s after %d attempts: %w", + targetUser, targetAddr, maxRetries, lastErr) } // Start keepalive for both connections @@ -906,17 +841,65 @@ func (c *jumpHostClient) reconnect(ctx context.Context) error { return fmt.Errorf("failed to reconnect after %d attempts: %w", config.SSHRetryCount, lastErr) } -// StartTunnel starts an SSH tunnel with port forwarding from local to remote +// StartTunnel starts an SSH tunnel with port forwarding from local to remote. +// Like the non-jump-host variant, dial errors that look like a dropped SSH +// session trigger a reconnect attempt against jump+target before the next +// retry — Wi-Fi flaps on the developer's laptop are by far the most common +// way for the tunnel to die mid-test. func (c *jumpHostClient) StartTunnel(ctx context.Context, localPort, remotePort string) (func() error, error) { - // Use the target client's StartTunnel method - // We need to access the underlying client's StartTunnel - // Since we can't directly call it, we'll implement it here - return startTunnelOnClient(ctx, c.targetClient, localPort, remotePort) + dialer := tunnelDialer{ + describe: fmt.Sprintf("%s@%s via jump %s@%s local:%s -> remote:%s", + c.targetUser, c.targetHost, c.jumpUser, c.jumpHost, localPort, remotePort), + dial: func() (net.Conn, error) { + c.mu.Lock() + tc := c.targetClient + c.mu.Unlock() + if tc == nil { + return nil, fmt.Errorf("jump-host target client is not initialized") + } + return tc.Dial("tcp", "127.0.0.1:"+remotePort) + }, + reconnect: c.reconnect, + } + return runTunnelLoop(ctx, localPort, dialer) +} + +// tunnelDialer abstracts the per-tunnel concerns that runTunnelLoop needs to +// know about: how to open a fresh remote connection through the active SSH +// session, how to re-establish that session when it dies, and a human-readable +// description for log messages. +type tunnelDialer struct { + // describe identifies the tunnel in WARN/INFO logs. It should encode user, + // host(s) and ports — enough to distinguish concurrent tunnels. + describe string + // dial opens a fresh TCP connection to the remote endpoint via the *current* + // SSH client. Implementations must read the underlying *ssh.Client under + // whatever mutex guards it (so reconnect updates are visible). + dial func() (net.Conn, error) + // reconnect tries to rebuild the broken SSH session(s). Called once per + // accepted local connection when dial fails with a connection-style error. + // May itself perform retries with backoff. + reconnect func(ctx context.Context) error } -// startTunnelOnClient starts a tunnel on a raw ssh.Client -func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, remotePort string) (func() error, error) { - // Check context before starting +// runTunnelLoop runs the accept loop for an SSH tunnel. +// +// Compared to the previous inline implementation it adds two things: +// +// 1. **Auto-reconnect on dial failure.** When sshClient.Dial returns a +// connection-style error (EOF, connection lost, broken pipe…) we kick +// dialer.reconnect and retry the dial once with the freshly rebuilt +// SSH session. 
Without this, a Wi-Fi flap on the developer's laptop +// killed the SSH session permanently, the tunnel listener stayed up +// happily accepting local connects, but every Dial through the dead +// session returned EOF — and the test process spent the entire 20-min +// readiness timeout silently retrying client-go GETs through a port +// that nobody answered. See poll.go for the related per-call deadline. +// 2. **Visible WARN log when reconnect kicks in.** Previously the failure +// was swallowed (`return`); now we emit a WARN every time the tunnel +// has to be rebuilt so users can correlate "tests slowed down" with +// "wifi flapped". +func runTunnelLoop(ctx context.Context, localPort string, dialer tunnelDialer) (func() error, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("context error before starting tunnel: %w", err) } @@ -931,7 +914,6 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, go func() { defer listener.Close() for { - // Check context and stop channel select { case <-ctx.Done(): return @@ -940,63 +922,26 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, default: } - // Set deadline for Accept based on context deadline if available - if deadline, ok := ctx.Deadline(); ok { - if tcpListener, ok := listener.(*net.TCPListener); ok { - if err := tcpListener.SetDeadline(deadline); err != nil { - // If setting deadline fails, continue without it - } - } + // Short Accept deadline so the loop can re-check ctx/stopChan + // promptly even when no clients are connecting; a deadline tied + // to ctx.Deadline() fired only at the very end of the test. + if tcpListener, ok := listener.(*net.TCPListener); ok { + _ = tcpListener.SetDeadline(time.Now().Add(500 * time.Millisecond)) } localConn, err := listener.Accept() if err != nil { - // Listener closed or error occurred select { case <-ctx.Done(): return case <-stopChan: return default: - // Continue if not stopped continue } } - go func() { - defer localConn.Close() - remoteConn, err := sshClient.Dial("tcp", "127.0.0.1:"+remotePort) - if err != nil { - // Connection failed, just return - the error will be visible to the client - return - } - defer remoteConn.Close() - - // Copy data bidirectionally with context support - done := make(chan struct{}, 2) - go func() { - _, _ = copyWithContext(ctx, localConn, remoteConn) - done <- struct{}{} - }() - go func() { - _, _ = copyWithContext(ctx, remoteConn, localConn) - done <- struct{}{} - }() - - // Wait for either direction to finish or context cancellation - select { - case <-ctx.Done(): - return - case <-done: - // One direction finished, wait for the other - select { - case <-ctx.Done(): - return - case <-done: - // Both directions finished - } - } - }() + go handleTunnelConnection(ctx, localConn, dialer) } }() @@ -1004,10 +949,64 @@ func startTunnelOnClient(ctx context.Context, sshClient *ssh.Client, localPort, close(stopChan) return listener.Close() } - return stop, nil } +// handleTunnelConnection serves a single accepted local connection. On the +// first dial failure that looks like a dead SSH session we call +// dialer.reconnect and retry once. After that, further failures are surfaced +// to the local client by closing localConn (which causes client-go on the +// other side to see EOF and retry through the freshly opened tunnel on the +// next request). 
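To make the `tunnelDialer` / `runTunnelLoop` split concrete, here is a minimal in-package sketch (a hypothetical test, not part of this diff) that drives `runTunnelLoop` against a local TCP echo listener with a no-op `reconnect`. The fixed local port, the echo server, and the assumption that such a test can live in the same `ssh` package are all assumptions of the sketch; the per-connection handler it exercises follows just below.

```go
package ssh

import (
	"bufio"
	"context"
	"io"
	"net"
	"testing"
)

// TestRunTunnelLoopEcho (illustrative sketch): the "remote" side is a local
// echo listener, dial simply connects to it, and reconnect is a no-op.
func TestRunTunnelLoopEcho(t *testing.T) {
	remote, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatal(err)
	}
	defer remote.Close()
	go func() {
		for {
			c, err := remote.Accept()
			if err != nil {
				return
			}
			go func(c net.Conn) { defer c.Close(); _, _ = io.Copy(c, c) }(c)
		}
	}()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	const localPort = "18080" // assumption: free on the test host
	stop, err := runTunnelLoop(ctx, localPort, tunnelDialer{
		describe:  "echo test local:" + localPort,
		dial:      func() (net.Conn, error) { return net.Dial("tcp", remote.Addr().String()) },
		reconnect: func(context.Context) error { return nil },
	})
	if err != nil {
		t.Fatal(err)
	}
	defer func() { _ = stop() }()

	conn, err := net.Dial("tcp", "127.0.0.1:"+localPort)
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()
	if _, err := conn.Write([]byte("ping\n")); err != nil {
		t.Fatal(err)
	}
	line, err := bufio.NewReader(conn).ReadString('\n')
	if err != nil || line != "ping\n" {
		t.Fatalf("echo through tunnel failed: got %q, err %v", line, err)
	}
}
```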
+func handleTunnelConnection(ctx context.Context, localConn net.Conn, dialer tunnelDialer) { + defer localConn.Close() + + remoteConn, err := dialer.dial() + if err != nil { + if !isConnectionError(err) { + // Non-connection errors (e.g. invalid address) won't be fixed by a + // reconnect — drop the local conn so the client sees the failure. + logger.Debug("SSH tunnel %s dial failed (non-retryable): %v", dialer.describe, err) + return + } + + logger.Warn("SSH tunnel %s dial failed (%v); attempting to reconnect SSH session", dialer.describe, err) + if rcErr := dialer.reconnect(ctx); rcErr != nil { + logger.Warn("SSH tunnel %s reconnect failed: %v", dialer.describe, rcErr) + return + } + logger.Info("SSH tunnel %s SSH session reconnected; retrying dial", dialer.describe) + + remoteConn, err = dialer.dial() + if err != nil { + logger.Warn("SSH tunnel %s dial still failing after reconnect: %v", dialer.describe, err) + return + } + } + defer remoteConn.Close() + + done := make(chan struct{}, 2) + go func() { + _, _ = copyWithContext(ctx, localConn, remoteConn) + done <- struct{}{} + }() + go func() { + _, _ = copyWithContext(ctx, remoteConn, localConn) + done <- struct{}{} + }() + + select { + case <-ctx.Done(): + return + case <-done: + select { + case <-ctx.Done(): + return + case <-done: + } + } +} + // Exec executes a command on the remote host with automatic retry and reconnection func (c *jumpHostClient) Exec(ctx context.Context, cmd string) (string, error) { var output string diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index b57f334..6d6d41f 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -149,6 +149,14 @@ func loadClusterConfigFromPath(configPath string) (*config.ClusterDefinition, er return nil, fmt.Errorf("failed to parse YAML config: %w", err) } + // Expand ${VAR} placeholders in modulePullOverride fields. CI uses this to + // pass a per-PR/MR image tag via a single env var (e.g. MODULE_IMAGE_TAG) + // without editing the YAML between runs. Missing envs fail fast here so we + // don't silently regress to "main" on accidentally unset variables. + if err := config.ExpandEnvInModulePullOverride(&clusterDef); err != nil { + return nil, fmt.Errorf("expand env in modulePullOverride: %w", err) + } + // Validate the configuration (using the same validation logic as internal/cluster) if len(clusterDef.Masters) == 0 { return nil, fmt.Errorf("at least one master node is required") @@ -194,6 +202,22 @@ func CreateTestCluster( ctx context.Context, yamlConfigFilename string, ) (*TestClusterResources, error) { + // Apply env-var defaults defensively so suites that don't call + // config.ValidateEnvironment() (e.g. csi-ceph e2e) still get sensible + // values for SSH_VM_USER / SSH_PRIVATE_KEY / SSH_PUBLIC_KEY / + // TEST_CLUSTER_NAMESPACE / YAML_CONFIG_FILENAME / TEST_CLUSTER_CLEANUP + // instead of empty strings that surface as obscure failures (e.g. + // user="" -> sshd "Invalid user", or "" filename -> directory read). + config.ApplyDefaults() + + // Belt-and-suspenders: function arg also has a documented default. Without + // this, an empty filename gets joined with the test-package directory and + // yields a path to the directory itself, failing later with a confusing + // "is a directory" read error. + if yamlConfigFilename == "" { + yamlConfigFilename = config.YAMLConfigFilenameDefaultValue + } + logger.Step(1, "Loading cluster configuration from %s", yamlConfigFilename) // Find the test package directory by walking the call stack. 
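As an illustration of the call shape this enables, a downstream suite can invoke `CreateTestCluster` directly without first calling `config.ValidateEnvironment`, relying on the defensively applied defaults and the empty-filename fallback. The package name and the overall timeout below are hypothetical; only the `CreateTestCluster` signature comes from this change.

```go
package csiceph_test // hypothetical downstream suite package

import (
	"context"
	"testing"
	"time"

	"github.com/deckhouse/storage-e2e/pkg/cluster"
)

func TestBootstrapWithDefaults(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Hour)
	defer cancel()

	// "" falls back to the documented YAML_CONFIG_FILENAME default, and
	// CreateTestCluster calls config.ApplyDefaults() itself, so optional
	// variables such as SSH_VM_USER get sensible values even though this
	// suite never calls config.ValidateEnvironment().
	resources, err := cluster.CreateTestCluster(ctx, "")
	if err != nil {
		t.Fatalf("cluster bootstrap failed: %v", err)
	}
	_ = resources // used by the rest of the suite
}
```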
diff --git a/pkg/cluster/vms.go b/pkg/cluster/vms.go index 61c4e3f..2507b87 100644 --- a/pkg/cluster/vms.go +++ b/pkg/cluster/vms.go @@ -591,10 +591,39 @@ func getCVMINameFromImageURL(imageURL string) string { return name } +// cloudInitAptMirror configures cloud-init to use mirror.yandex.ru as the +// Ubuntu apt mirror for both the primary archive and security pools, and +// pins apt to IPv4. Default Ubuntu mirrors (archive.ubuntu.com / +// security.ubuntu.com) round-robin across many IPs and are partially +// unreachable from some Flant infra (e.g. some egress paths block all the +// IPv6 endpoints, and most IPv4 ones time out for archive.ubuntu.com), +// which makes Step 9 (Wait for Docker) and per-node package_update very flaky +// or outright stall. mirror.yandex.ru carries main/universe/multiverse/restricted +// for the same suites and is reachable in those environments. +// +// The leading newline keeps the indentation flush with the rest of the +// cloud-config when interpolated mid-document. +const cloudInitAptMirror = `apt: + primary: + - arches: [default] + uri: http://mirror.yandex.ru/ubuntu + security: + - arches: [default] + uri: http://mirror.yandex.ru/ubuntu +` + +// cloudInitForceIPv4 disables IPv6 for apt to avoid 30-second connection +// timeouts on every package fetch when the host lacks working IPv6 egress. +// Written via write_files so it is in effect before package_update runs. +const cloudInitForceIPv4Apt = ` - path: /etc/apt/apt.conf.d/99force-ipv4 + content: | + Acquire::ForceIPv4 "true"; +` + // generateCloudInitUserData generates cloud-init user data for VM provisioning (cluster nodes) func generateCloudInitUserData(hostname, sshPubKey string) string { return fmt.Sprintf(`#cloud-config -package_update: true +%spackage_update: true packages: - tmux - htop @@ -619,7 +648,7 @@ users: ssh_authorized_keys: - %s write_files: - - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf +%s - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf content: | # Разрешить TCP forwarding AllowTcpForwarding yes @@ -635,14 +664,14 @@ runcmd: - systemctl daemon-reload - systemctl enable --now qemu-guest-agent.service - echo 'source /root/.kubectl_aliases' >> /root/.bashrc -`, sshPubKey, hostname) +`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname) } // generateSetupNodeCloudInit generates cloud-init user data for the setup/bootstrap node. // This includes Docker which is required for running the Deckhouse installer. func generateSetupNodeCloudInit(hostname, sshPubKey string) string { return fmt.Sprintf(`#cloud-config -package_update: true +%spackage_update: true packages: - tmux - htop @@ -664,7 +693,7 @@ users: ssh_authorized_keys: - %s write_files: - - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf +%s - path: /etc/ssh/sshd_config.d/allow_tcp_forwarding.conf content: | # Разрешить TCP forwarding AllowTcpForwarding yes @@ -675,7 +704,7 @@ runcmd: - systemctl daemon-reload - systemctl enable --now qemu-guest-agent.service - systemctl enable --now docker.service -`, sshPubKey, hostname) +`, cloudInitAptMirror, sshPubKey, cloudInitForceIPv4Apt, hostname) } // RemoveAllVMs forcefully stops and deletes virtual machines, virtual disks, and virtual images. 
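A small regression test along these lines would pin the apt-mirror and IPv4 interpolation down. This is a sketch only, assuming the generator stays unexported and the test lives in `package cluster`; the assertions mirror the constants and template added above.

```go
package cluster

import (
	"strings"
	"testing"
)

// TestCloudInitUsesYandexMirrorAndIPv4 (illustrative sketch) checks that the
// rendered user data carries the mirror override and the ForceIPv4 drop-in.
func TestCloudInitUsesYandexMirrorAndIPv4(t *testing.T) {
	data := generateCloudInitUserData("node-0", "ssh-ed25519 AAAATESTKEY e2e@example")

	for _, want := range []string{
		"http://mirror.yandex.ru/ubuntu",   // primary + security mirror
		"/etc/apt/apt.conf.d/99force-ipv4", // written via write_files
		`Acquire::ForceIPv4 "true";`,
		"package_update: true",
	} {
		if !strings.Contains(data, want) {
			t.Errorf("cloud-init user data is missing %q", want)
		}
	}
}
```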
diff --git a/pkg/kubernetes/cephblockpool.go b/pkg/kubernetes/cephblockpool.go new file mode 100644 index 0000000..8ad2dfc --- /dev/null +++ b/pkg/kubernetes/cephblockpool.go @@ -0,0 +1,225 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephBlockPoolGVR is the GroupVersionResource of Rook's CephBlockPool. +var CephBlockPoolGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephblockpools", +} + +// CephBlockPoolConfig describes a minimal replicated or erasure-coded Ceph +// RBD pool managed by Rook. Exactly one of ReplicaSize or ErasureCoded must +// be set; leaving both zero defaults to a single-replica pool suitable for +// single-node test clusters. +type CephBlockPoolConfig struct { + // Name of the CephBlockPool CR (also becomes the Ceph pool name). + Name string + + // Namespace the Rook operator watches (typically "d8-sds-elastic"). + Namespace string + + // FailureDomain is the CRUSH failure domain: "host" or "osd" (default: "host"). + FailureDomain string + + // --- Replicated pool knobs (used when ErasureCoded is nil) --- + + // ReplicaSize is the number of object copies. Default: 1. + ReplicaSize int + + // RequireSafeReplicaSize toggles Ceph's safeguard against single-replica + // pools. When nil, it is set to `false` for ReplicaSize==1 (unsafe single + // replica, accepted for e2e test clusters) and left unset otherwise. + RequireSafeReplicaSize *bool + + // --- Erasure-coded pool knobs --- + + // ErasureCoded, when non-nil, produces an EC pool instead of a replicated + // one. Its fields map to `spec.erasureCoded.{dataChunks,codingChunks}`. + ErasureCoded *CephBlockPoolErasureCoded +} + +// CephBlockPoolErasureCoded configures a Ceph erasure-coded RBD pool. +type CephBlockPoolErasureCoded struct { + DataChunks int + CodingChunks int +} + +// CreateCephBlockPool creates (or updates, if already present) a CephBlockPool +// in the given namespace from the provided configuration. It is idempotent and +// safe to call on every test run. 
+func CreateCephBlockPool(ctx context.Context, kubeconfig *rest.Config, cfg CephBlockPoolConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephBlockPool name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephBlockPool namespace is required") + } + if cfg.ErasureCoded == nil && cfg.ReplicaSize <= 0 { + cfg.ReplicaSize = 1 + } + if cfg.FailureDomain == "" { + cfg.FailureDomain = "host" + } + + spec := map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + } + + if cfg.ErasureCoded != nil { + if cfg.ErasureCoded.DataChunks <= 0 || cfg.ErasureCoded.CodingChunks <= 0 { + return fmt.Errorf("ErasureCoded pool requires positive dataChunks and codingChunks") + } + spec["erasureCoded"] = map[string]interface{}{ + "dataChunks": int64(cfg.ErasureCoded.DataChunks), + "codingChunks": int64(cfg.ErasureCoded.CodingChunks), + } + } else { + replicated := map[string]interface{}{ + "size": int64(cfg.ReplicaSize), + } + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && cfg.ReplicaSize == 1 { + f := false + requireSafe = &f + } + if requireSafe != nil { + replicated["requireSafeReplicaSize"] = *requireSafe + } + spec["replicated"] = replicated + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephBlockPool", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephBlockPool %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephBlockPool %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephBlockPool %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephBlockPool", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephBlockPool %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephBlockPoolReady blocks until the CephBlockPool reports +// `status.phase == "Ready"`. Rook transitions the pool from Progressing to +// Ready once the Ceph OSDs have accepted the new pool and its CRUSH rule. +// +// Per-call deadlines and loud (WARN) logging on consecutive network failures +// are inherited from pollResourceUntilReady, so a dropped SSH tunnel surfaces +// in seconds instead of after the parent timeout. 
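For orientation, a hypothetical helper wiring the create and the readiness wait together might look like this. The pool name, package name, and timeout are illustrative; the readiness wait used here is the function defined just below.

```go
package example // hypothetical helper package

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// ensureTestPool creates a single-replica RBD pool (CreateCephBlockPool
// defaults ReplicaSize=0 to 1 with requireSafeReplicaSize=false) and waits
// until Rook reports it Ready.
func ensureTestPool(ctx context.Context, kubeconfig *rest.Config) error {
	cfg := kubernetes.CephBlockPoolConfig{
		Name:      "e2e-rbd-pool",   // hypothetical pool name
		Namespace: "d8-sds-elastic", // the namespace Rook watches
	}
	if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, cfg); err != nil {
		return err
	}
	return kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.Name, 5*time.Minute)
}
```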
+func WaitForCephBlockPoolReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + func(obj *unstructured.Unstructured) (bool, string) { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + return true, "phase=Ready" + } + logger.Debug("CephBlockPool %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) +} + +// DeleteCephBlockPool deletes a CephBlockPool. Safe to call if the pool does +// not exist. NOTE: this is fire-and-forget — the API call returns as soon as +// the apiserver accepts the request, but Rook may still be running its +// finalizer (`cephblockpool.ceph.rook.io`) for a few minutes afterwards. If +// you want to be certain the CR is fully gone before continuing, follow up +// with WaitForCephBlockPoolGone. +func DeleteCephBlockPool(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephBlockPoolGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephBlockPool %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephBlockPool %s/%s", namespace, name) + return nil +} + +// CephBlockPoolGoneTimeout is the default budget for WaitForCephBlockPoolGone. +// Rook removes the underlying RBD pool from Ceph before lifting the +// finalizer; with one OSD the pool delete normally completes in seconds but +// can take a few minutes if the cluster is unhealthy. +const CephBlockPoolGoneTimeout = 5 * time.Minute + +// WaitForCephBlockPoolGone polls until the CephBlockPool is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephBlockPool to +// be sure the parent CephCluster won't be blocked by `ObjectHasDependents` +// when it gets deleted next. +func WaitForCephBlockPoolGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephBlockPoolGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephBlockPoolGVR, namespace, name, + timeout, PollTickInterval, "CephBlockPool", + ) +} diff --git a/pkg/kubernetes/cephcluster.go b/pkg/kubernetes/cephcluster.go new file mode 100644 index 0000000..501d8d8 --- /dev/null +++ b/pkg/kubernetes/cephcluster.go @@ -0,0 +1,411 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephClusterGVR is the GroupVersionResource of Rook's CephCluster. +var CephClusterGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephclusters", +} + +// Defaults shared between CephClusterConfig and the testkit-level helper. +const ( + DefaultRookNamespace = "d8-sds-elastic" + DefaultCephClusterName = "ceph-cluster" + DefaultCephImage = "quay.io/ceph/ceph:v18.2.7" + DefaultDataDirHostPath = "/var/lib/rook" + DefaultOSDStorageClassSize = "10Gi" +) + +// CephClusterConfig describes a Rook-managed Ceph cluster suitable for e2e +// testing. It is intentionally narrower than Rook's native CephCluster CRD: +// knobs that don't matter for our scenarios are hidden behind hard-coded +// defaults (mirroring the values from the internal Flant wiki instruction +// on deploying sds-elastic + Rook + Ceph on LVM). +type CephClusterConfig struct { + // Name of the CephCluster (default: "ceph-cluster"). + Name string + + // Namespace where Rook watches (default: "d8-sds-elastic"). + Namespace string + + // CephImage is the Ceph container image tag. + // Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // AllowUnsupportedCephVersion flips spec.cephVersion.allowUnsupported. + // Default: true (e2e clusters are allowed to run any version Ceph ships). + AllowUnsupportedCephVersion *bool + + // MonCount / MgrCount are the Rook mon/mgr replica counts. Defaults: + // 1 / 1, which is appropriate for single-node / tiny test clusters. + MonCount int + MgrCount int + + // AllowMultipleMonPerNode allows multiple mons on the same node + // (required for single-node clusters). Default: true. + AllowMultipleMonPerNode *bool + + // DataDirHostPath is where Rook persists mon/OSD data on each node. + // Default: "/var/lib/rook". + DataDirHostPath string + + // NetworkProvider selects the Rook networking mode. Supported values: + // "" — default CNI pod network (suitable for in-cluster e2e); + // "host" — host networking (matches the Flant wiki production layout). + NetworkProvider string + + // PublicNetworkCIDRs / ClusterNetworkCIDRs are the public/cluster CIDRs + // plumbed into `spec.network.addressRanges` when NetworkProvider is + // non-empty. They are ignored for the default (CNI) mode. + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // --- OSD backing --- + + // OSDStorageClass is the name of a k8s StorageClass able to hand out + // block-mode PVCs. Those PVCs are used by Rook's + // `storage.storageClassDeviceSets` to back OSDs. + OSDStorageClass string + + // OSDCount is the number of OSDs to provision (default: 1). + OSDCount int + + // OSDSize is the size of each OSD PVC (default: "10Gi"). + OSDSize string + + // OSDDeviceSetName is the `storageClassDeviceSets[].name` (default: + // "set1"). Changing it is useful mostly for debugging. 
+ OSDDeviceSetName string +} + +func (c *CephClusterConfig) applyDefaults() { + if c.Name == "" { + c.Name = DefaultCephClusterName + } + if c.Namespace == "" { + c.Namespace = DefaultRookNamespace + } + if c.CephImage == "" { + c.CephImage = DefaultCephImage + } + if c.AllowUnsupportedCephVersion == nil { + t := true + c.AllowUnsupportedCephVersion = &t + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.AllowMultipleMonPerNode == nil { + t := true + c.AllowMultipleMonPerNode = &t + } + if c.DataDirHostPath == "" { + c.DataDirHostPath = DefaultDataDirHostPath + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = DefaultOSDStorageClassSize + } + if c.OSDDeviceSetName == "" { + c.OSDDeviceSetName = "set1" + } +} + +// CreateCephCluster creates (or updates) a CephCluster in the given namespace. +// It is idempotent: if the resource already exists, its spec is overwritten +// with the freshly-rendered one so callers can tweak `CephClusterConfig` and +// re-apply without manual cleanup. +func CreateCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConfig) error { + cfg.applyDefaults() + + if cfg.OSDStorageClass == "" { + return fmt.Errorf("CephCluster requires OSDStorageClass (backing StorageClass for OSD PVCs)") + } + + spec := buildCephClusterSpec(cfg) + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "ceph.rook.io/v1", + "kind": "CephCluster", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephCluster %s/%s (image=%s, mon=%d, mgr=%d, osd=%d x %s on SC %s)", + cfg.Namespace, cfg.Name, cfg.CephImage, cfg.MonCount, cfg.MgrCount, cfg.OSDCount, cfg.OSDSize, cfg.OSDStorageClass) + + _, err = dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephCluster %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephCluster %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephCluster", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephClusterGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephCluster %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// buildCephClusterSpec renders the spec portion of a CephCluster object. 
The +// choice of fields follows the Flant internal wiki instruction for +// sds-elastic + Rook + Ceph, stripped down to the parts that matter in e2e: +// - mon/mgr counts come from the config (1/1 by default for single-node); +// - network.provider=host is opt-in via NetworkProvider; +// - OSDs are backed by one `storageClassDeviceSets[0]` entry that points +// to a user-supplied StorageClass capable of issuing block-mode PVCs. +func buildCephClusterSpec(cfg CephClusterConfig) map[string]interface{} { + spec := map[string]interface{}{ + "cephVersion": map[string]interface{}{ + "image": cfg.CephImage, + "allowUnsupported": *cfg.AllowUnsupportedCephVersion, + }, + "dataDirHostPath": cfg.DataDirHostPath, + "skipUpgradeChecks": false, + "continueUpgradeAfterChecksEvenIfNotHealthy": false, + "mon": map[string]interface{}{ + "count": int64(cfg.MonCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + }, + "mgr": map[string]interface{}{ + "count": int64(cfg.MgrCount), + "allowMultiplePerNode": *cfg.AllowMultipleMonPerNode, + "modules": []interface{}{ + map[string]interface{}{ + "name": "pg_autoscaler", + "enabled": true, + }, + }, + }, + "dashboard": map[string]interface{}{ + "enabled": false, + "ssl": false, + }, + "crashCollector": map[string]interface{}{ + "disable": false, + }, + "logCollector": map[string]interface{}{ + "enabled": true, + "periodicity": "daily", + "maxLogSize": "100M", + }, + "priorityClassNames": map[string]interface{}{ + "mon": "system-node-critical", + "osd": "system-node-critical", + "mgr": "system-cluster-critical", + }, + "disruptionManagement": map[string]interface{}{ + "managePodBudgets": true, + "osdMaintenanceTimeout": int64(30), + "pgHealthCheckTimeout": int64(0), + }, + "storage": map[string]interface{}{ + "useAllNodes": true, + "useAllDevices": false, + "storageClassDeviceSets": []interface{}{ + map[string]interface{}{ + "name": cfg.OSDDeviceSetName, + "count": int64(cfg.OSDCount), + "portable": false, + "tuneDeviceClass": true, + "volumeClaimTemplates": []interface{}{ + map[string]interface{}{ + "metadata": map[string]interface{}{ + "name": "data", + }, + "spec": map[string]interface{}{ + "resources": map[string]interface{}{ + "requests": map[string]interface{}{ + "storage": cfg.OSDSize, + }, + }, + "storageClassName": cfg.OSDStorageClass, + "volumeMode": "Block", + "accessModes": []interface{}{"ReadWriteOnce"}, + }, + }, + }, + }, + }, + }, + } + + if cfg.NetworkProvider != "" { + network := map[string]interface{}{ + "provider": cfg.NetworkProvider, + "connections": map[string]interface{}{ + "encryption": map[string]interface{}{"enabled": false}, + "compression": map[string]interface{}{"enabled": false}, + "requireMsgr2": false, + }, + } + + addrs := map[string]interface{}{} + if len(cfg.PublicNetworkCIDRs) > 0 { + addrs["public"] = toInterfaceSlice(cfg.PublicNetworkCIDRs) + } + if len(cfg.ClusterNetworkCIDRs) > 0 { + addrs["cluster"] = toInterfaceSlice(cfg.ClusterNetworkCIDRs) + } + if len(addrs) > 0 { + network["addressRanges"] = addrs + } + spec["network"] = network + } + + return spec +} + +// toInterfaceSlice converts a []string to a []interface{} so it can be +// embedded into an `unstructured.Unstructured`'s object tree. +func toInterfaceSlice(in []string) []interface{} { + out := make([]interface{}, len(in)) + for i, v := range in { + out[i] = v + } + return out +} + +// WaitForCephClusterReady blocks until the CephCluster status reports that +// Ceph is up and healthy. 
Rook exposes the cluster state through two status +// fields: +// - `status.state` — overall lifecycle phase ("Creating", "Created", +// "Updating", "Error"); +// - `status.ceph.health` — the Ceph health summary ("HEALTH_OK", +// "HEALTH_WARN", "HEALTH_ERR"). On a single-OSD test cluster Ceph often +// sits in HEALTH_WARN (PGs undersized, no replicas), which we still treat +// as "good enough" as long as `status.state == "Created"`. +// +// We return success once `state == "Created"`. HEALTH_ERR is reported in the +// log and does not short-circuit (Rook may recover). +// +// Network errors are logged loud (WARN) after a few consecutive failures so a +// dropped SSH tunnel surfaces in seconds instead of getting buried in Debug +// output. See pollResourceUntilReady for the per-call deadline rationale. +func WaitForCephClusterReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, 10*time.Second, "CephCluster", + func(obj *unstructured.Unstructured) (bool, string) { + state, _, _ := unstructured.NestedString(obj.Object, "status", "state") + health, _, _ := unstructured.NestedString(obj.Object, "status", "ceph", "health") + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + + if state == "Created" || phase == "Ready" { + return true, fmt.Sprintf("state=%s phase=%s ceph health: %s", state, phase, health) + } + logger.Debug("CephCluster %s/%s state=%q phase=%q health=%q", + obj.GetNamespace(), obj.GetName(), state, phase, health) + return false, "" + }, + ) +} + +// DeleteCephCluster removes a CephCluster. Tearing down the cluster this way +// is a *destructive* operation — Rook will leave OSD data on host disks under +// `dataDirHostPath` and operator-managed PVCs will not be garbage-collected +// automatically. The operation is still idempotent: a NotFound error is +// swallowed. +// +// NOTE: this is fire-and-forget. The apiserver returns success as soon as it +// records the delete intent; Rook then runs its `cephcluster.ceph.rook.io` +// finalizer for several minutes, removing pools, mon/mgr/osd pods, and so +// on. If any dependent CR (CephBlockPool, CephFilesystem, ...) is still +// alive, Rook records `DeletionIsBlocked / ObjectHasDependents` and the CR +// stays in `phase=Deleting` indefinitely. Always tear down dependents first +// (and call WaitForCephBlockPoolGone / WaitForCephFilesystemGone on them) +// before invoking DeleteCephCluster, then follow up with +// WaitForCephClusterGone. +func DeleteCephCluster(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephClusterGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephCluster %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephCluster %s/%s", namespace, name) + return nil +} + +// CephClusterGoneTimeout is the default budget for WaitForCephClusterGone. +// Rook needs to drain mon/mgr/osd pods, remove the CRUSH map, and unset +// finalizers — easily 5+ minutes on a single-OSD cluster, longer on +// degraded ones. 
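Putting the ordering rules from the comments above into one place, a hypothetical lifecycle helper could look like the sketch below. Names, timeouts, and the backing StorageClass are illustrative; passing 0 to the *Gone waiters selects the package-level default budgets defined alongside them.

```go
package example // hypothetical lifecycle helper

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

const (
	rookNS      = "d8-sds-elastic" // namespace the Rook operator watches
	clusterName = "ceph-cluster"
	poolName    = "e2e-rbd-pool" // hypothetical dependent CephBlockPool
)

// bringUpCeph creates the CephCluster (backed by block-mode PVCs from
// osdStorageClass) and blocks until Rook reports the cluster as Created.
func bringUpCeph(ctx context.Context, kubeconfig *rest.Config, osdStorageClass string) error {
	err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{
		Name:            clusterName,
		Namespace:       rookNS,
		OSDStorageClass: osdStorageClass, // required; everything else has defaults
	})
	if err != nil {
		return err
	}
	return kubernetes.WaitForCephClusterReady(ctx, kubeconfig, rookNS, clusterName, 30*time.Minute)
}

// tearDownCeph removes the dependent pool first and waits for it to be fully
// gone, so the CephCluster delete is not blocked by ObjectHasDependents.
func tearDownCeph(ctx context.Context, kubeconfig *rest.Config) error {
	if err := kubernetes.DeleteCephBlockPool(ctx, kubeconfig, rookNS, poolName); err != nil {
		return err
	}
	if err := kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, rookNS, poolName, 0); err != nil {
		return err
	}
	if err := kubernetes.DeleteCephCluster(ctx, kubeconfig, rookNS, clusterName); err != nil {
		return err
	}
	return kubernetes.WaitForCephClusterGone(ctx, kubeconfig, rookNS, clusterName, 0)
}
```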
+const CephClusterGoneTimeout = 10 * time.Minute + +// WaitForCephClusterGone polls until the CephCluster is fully GC'd by +// Kubernetes (GET returns NotFound). The poller logs the +// deletionTimestamp/finalizers progress periodically so a stuck finalizer +// (typical e2e failure: orphan dependent CR, broken Ceph health) is +// immediately visible in the test log instead of being hidden behind a +// silent timeout. +func WaitForCephClusterGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterGVR, namespace, name, + timeout, PollTickInterval, "CephCluster", + ) +} diff --git a/pkg/kubernetes/cephclusterconnection.go b/pkg/kubernetes/cephclusterconnection.go new file mode 100644 index 0000000..f8117db --- /dev/null +++ b/pkg/kubernetes/cephclusterconnection.go @@ -0,0 +1,313 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// GVRs of the csi-ceph cluster-scoped CRs. We use unstructured to avoid +// pulling github.com/deckhouse/csi-ceph/api into go.mod just for these +// tiny types. +var ( + CephClusterConnectionGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterconnections", + } + CephClusterAuthenticationGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephclusterauthentications", + } +) + +// CephClusterAuthenticationConfig describes CephX credentials that csi-ceph +// reuses for every StorageClass that references the authentication. +type CephClusterAuthenticationConfig struct { + // Name of the CephClusterAuthentication CR. + Name string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterAuthentication creates (or updates) a +// CephClusterAuthentication CR with the given CephX credentials. 
+func CreateCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterAuthenticationConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterAuthentication name is required") + } + if cfg.UserID == "" { + return fmt.Errorf("CephClusterAuthentication UserID is required") + } + if cfg.UserKey == "" { + return fmt.Errorf("CephClusterAuthentication UserKey is required") + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterAuthentication", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterAuthentication %s (userID=%s)", cfg.Name, cfg.UserID) + _, err = dynamicClient.Resource(CephClusterAuthenticationGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterAuthentication %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterAuthentication %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterAuthentication %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephClusterAuthentication", cfg.Name); err != nil { + return err + } + existing.Object["spec"] = obj.Object["spec"] + if _, err := dynamicClient.Resource(CephClusterAuthenticationGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterAuthentication %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterAuthentication removes a CephClusterAuthentication. +// NotFound is treated as success. Pair with WaitForCephClusterAuthenticationGone +// when teardown order matters. +func DeleteCephClusterAuthentication(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterAuthenticationGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterAuthentication %s: %w", name, err) + } + logger.Info("Deleted CephClusterAuthentication %s", name) + return nil +} + +// CephClusterAuthenticationGoneTimeout is the default budget for +// WaitForCephClusterAuthenticationGone. The CR has no heavy finalizer. +const CephClusterAuthenticationGoneTimeout = 1 * time.Minute + +// WaitForCephClusterAuthenticationGone polls until the CephClusterAuthentication +// is fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterAuthenticationGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterAuthenticationGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterAuthenticationGVR, "", name, + timeout, PollTickInterval, "CephClusterAuthentication", + ) +} + +// CephClusterConnectionConfig describes a csi-ceph CephClusterConnection CR. 
+// Its spec.clusterID (== Ceph fsid) is immutable once created. +type CephClusterConnectionConfig struct { + // Name of the CephClusterConnection CR. + Name string + // ClusterID is the Ceph fsid. Immutable after creation. + ClusterID string + // Monitors is the list of `ip:port` monitor endpoints. + Monitors []string + // UserID is the Ceph user (typically "admin"). + UserID string + // UserKey is the CephX key of UserID. + UserKey string +} + +// CreateCephClusterConnection creates (or updates) a CephClusterConnection CR. +// If the resource already exists we do *not* attempt to update spec.clusterID +// (which the CRD marks immutable) — only Monitors/UserID/UserKey are synced. +func CreateCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, cfg CephClusterConnectionConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephClusterConnection name is required") + } + if cfg.ClusterID == "" { + return fmt.Errorf("CephClusterConnection ClusterID (fsid) is required") + } + if len(cfg.Monitors) == 0 { + return fmt.Errorf("CephClusterConnection Monitors is required") + } + + monitors := make([]interface{}, len(cfg.Monitors)) + for i, m := range cfg.Monitors { + monitors[i] = m + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephClusterConnection", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": map[string]interface{}{ + "clusterID": cfg.ClusterID, + "monitors": monitors, + "userID": cfg.UserID, + "userKey": cfg.UserKey, + }, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephClusterConnection %s (clusterID=%s, mons=%d)", cfg.Name, cfg.ClusterID, len(cfg.Monitors)) + _, err = dynamicClient.Resource(CephClusterConnectionGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephClusterConnection %s: %w", cfg.Name, err) + } + + logger.Info("CephClusterConnection %s already exists, syncing monitors/userID/userKey", cfg.Name) + existing, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephClusterConnection %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephClusterConnection", cfg.Name); err != nil { + return err + } + if err := unstructured.SetNestedSlice(existing.Object, monitors, "spec", "monitors"); err != nil { + return fmt.Errorf("set monitors: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserID, "spec", "userID"); err != nil { + return fmt.Errorf("set userID: %w", err) + } + if err := unstructured.SetNestedField(existing.Object, cfg.UserKey, "spec", "userKey"); err != nil { + return fmt.Errorf("set userKey: %w", err) + } + if _, err := dynamicClient.Resource(CephClusterConnectionGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephClusterConnection %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephClusterConnection removes a CephClusterConnection. +// NotFound is treated as success. Pair with WaitForCephClusterConnectionGone +// when teardown order matters. 
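A hypothetical wiring sketch tying these CRs to the credentials Rook publishes might look like the following. The CR names and timeouts are illustrative; the credential discovery (`WaitForCephCredentials`) and the Created-phase wait are defined further down in this same change.

```go
package example // hypothetical wiring helper

import (
	"context"
	"time"

	"k8s.io/client-go/rest"

	"github.com/deckhouse/storage-e2e/pkg/kubernetes"
)

// wireCSICeph reads the fsid/monitors/admin key that Rook published and turns
// them into the two csi-ceph CRs a CephStorageClass will later reference.
func wireCSICeph(ctx context.Context, kubeconfig *rest.Config) error {
	creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, "d8-sds-elastic", 10*time.Minute)
	if err != nil {
		return err
	}

	if err := kubernetes.CreateCephClusterAuthentication(ctx, kubeconfig, kubernetes.CephClusterAuthenticationConfig{
		Name:    "e2e-ceph-auth", // hypothetical CR name
		UserID:  creds.AdminUser,
		UserKey: creds.AdminKey,
	}); err != nil {
		return err
	}

	if err := kubernetes.CreateCephClusterConnection(ctx, kubeconfig, kubernetes.CephClusterConnectionConfig{
		Name:      "e2e-ceph-conn", // hypothetical CR name
		ClusterID: creds.FSID,      // immutable after creation
		Monitors:  creds.Monitors,
		UserID:    creds.AdminUser,
		UserKey:   creds.AdminKey,
	}); err != nil {
		return err
	}

	// csi-ceph flips the connection to phase=Created once it has verified
	// the supplied fsid/monitors/credentials against the live Ceph cluster.
	return kubernetes.WaitForCephClusterConnectionCreated(ctx, kubeconfig, "e2e-ceph-conn", 5*time.Minute)
}
```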
+func DeleteCephClusterConnection(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephClusterConnectionGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephClusterConnection %s: %w", name, err) + } + logger.Info("Deleted CephClusterConnection %s", name) + return nil +} + +// CephClusterConnectionGoneTimeout is the default budget for +// WaitForCephClusterConnectionGone. The CR has no heavy finalizer. +const CephClusterConnectionGoneTimeout = 1 * time.Minute + +// WaitForCephClusterConnectionGone polls until the CephClusterConnection is +// fully GC'd by Kubernetes (GET returns NotFound). +func WaitForCephClusterConnectionGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephClusterConnectionGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephClusterConnectionGVR, "", name, + timeout, PollTickInterval, "CephClusterConnection", + ) +} + +// WaitForCephClusterConnectionCreated polls until the CephClusterConnection +// status reports phase=Created. csi-ceph's controller flips the status from +// Pending to Created once it has verified the supplied fsid / monitors / +// CephX credentials against the real Ceph cluster. +func WaitForCephClusterConnectionCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephClusterConnection %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephClusterConnectionGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephClusterConnection %s is Created", name) + return nil + } + logger.Debug("CephClusterConnection %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephClusterConnection %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephClusterConnection %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/cephcredentials.go b/pkg/kubernetes/cephcredentials.go new file mode 100644 index 0000000..11f68ec --- /dev/null +++ b/pkg/kubernetes/cephcredentials.go @@ -0,0 +1,183 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// Well-known Rook resources that hold Ceph connection data. +const ( + // RookMonSecretName is the Secret that the Rook operator populates with + // admin credentials and cluster fsid once the CephCluster is bootstrapped. + RookMonSecretName = "rook-ceph-mon" + + // RookMonEndpointsConfigMapName is the ConfigMap the operator keeps in + // sync with the current set of Ceph monitors. + RookMonEndpointsConfigMapName = "rook-ceph-mon-endpoints" +) + +// CephCredentials holds the information a Ceph CSI client needs to connect +// to a cluster bootstrapped by Rook. +type CephCredentials struct { + // FSID is the Ceph cluster unique identifier. + FSID string + + // AdminUser is the Ceph user name (typically "admin"). + AdminUser string + + // AdminKey is the CephX key for AdminUser. + AdminKey string + + // Monitors is the list of monitor endpoints in "IP:PORT" form, sorted + // alphabetically to make the output stable across runs. + Monitors []string +} + +// WaitForCephCredentials blocks until all pieces of information required to +// connect to the Rook-managed Ceph cluster are populated: +// - Secret `rook-ceph-mon` exists and has `fsid`, `ceph-username`, `ceph-secret`. +// - ConfigMap `rook-ceph-mon-endpoints` exists and has at least one reachable monitor. +// +// The returned CephCredentials is suitable for wiring csi-ceph CRs +// (CephClusterConnection, CephClusterAuthentication). +func WaitForCephCredentials(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) (*CephCredentials, error) { + if namespace == "" { + return nil, fmt.Errorf("namespace is required") + } + + logger.Debug("Waiting for Ceph credentials in %s (timeout: %v)", namespace, timeout) + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, RookMonSecretName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + logger.Debug("Failed to get Secret %s/%s: %v", namespace, RookMonSecretName, err) + } + + cm, cmErr := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookMonEndpointsConfigMapName, metav1.GetOptions{}) + if cmErr != nil && !apierrors.IsNotFound(cmErr) { + logger.Debug("Failed to get ConfigMap %s/%s: %v", namespace, RookMonEndpointsConfigMapName, cmErr) + } + + if err == nil && cmErr == nil { + creds, extractErr := extractCephCredentials(secret.Data, cm.Data) + if extractErr == nil { + logger.Success("Ceph credentials ready in %s (fsid=%s, %d monitor(s))", namespace, creds.FSID, len(creds.Monitors)) + return creds, nil + } + logger.Debug("Rook credentials not complete yet: %v", extractErr) + } + + select { + case <-ctx.Done(): + return nil, fmt.Errorf("timeout waiting for Ceph credentials in %s: %w", namespace, ctx.Err()) + case <-ticker.C: + } + } +} + +// extractCephCredentials parses the Rook-managed Secret/ConfigMap payloads +// into a CephCredentials struct. 
It returns an error if any required field +// is missing so the caller can keep polling until the operator has populated +// everything. +func extractCephCredentials(secretData map[string][]byte, cmData map[string]string) (*CephCredentials, error) { + fsid := strings.TrimSpace(string(secretData["fsid"])) + if fsid == "" { + return nil, fmt.Errorf("Secret %s is missing `fsid`", RookMonSecretName) + } + + adminUser := strings.TrimSpace(string(secretData["ceph-username"])) + if adminUser == "" { + adminUser = "client.admin" + } + adminUser = strings.TrimPrefix(adminUser, "client.") + + adminKey := strings.TrimSpace(string(secretData["ceph-secret"])) + if adminKey == "" { + return nil, fmt.Errorf("Secret %s is missing `ceph-secret`", RookMonSecretName) + } + + raw, ok := cmData["data"] + if !ok { + return nil, fmt.Errorf("ConfigMap %s is missing `data`", RookMonEndpointsConfigMapName) + } + monitors, err := parseMonEndpoints(raw) + if err != nil { + return nil, err + } + if len(monitors) == 0 { + return nil, fmt.Errorf("ConfigMap %s has no populated monitor endpoints", RookMonEndpointsConfigMapName) + } + + return &CephCredentials{ + FSID: fsid, + AdminUser: adminUser, + AdminKey: adminKey, + Monitors: monitors, + }, nil +} + +// parseMonEndpoints parses the Rook-maintained monitor endpoints string. +// +// Rook stores the current mon list in the `data` key of the +// `rook-ceph-mon-endpoints` ConfigMap as a comma-separated list of +// `=:` pairs, for example: +// +// a=10.0.0.1:6789,b=10.0.0.2:6789,c=10.0.0.3:6789 +// +// This helper returns just the `:` portion of every entry, sorted +// alphabetically for stable output. +func parseMonEndpoints(raw string) ([]string, error) { + out := []string{} + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + // Strip the "=" prefix if present. + if idx := strings.Index(part, "="); idx >= 0 { + part = part[idx+1:] + } + if part == "" { + continue + } + out = append(out, part) + } + sort.Strings(out) + return out, nil +} diff --git a/pkg/kubernetes/cephfilesystem.go b/pkg/kubernetes/cephfilesystem.go new file mode 100644 index 0000000..91fab14 --- /dev/null +++ b/pkg/kubernetes/cephfilesystem.go @@ -0,0 +1,274 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephFilesystemGVR is the GroupVersionResource of Rook's CephFilesystem. +var CephFilesystemGVR = schema.GroupVersionResource{ + Group: "ceph.rook.io", + Version: "v1", + Resource: "cephfilesystems", +} + +// CephFilesystemConfig describes a minimal Rook CephFilesystem with one +// metadata pool and exactly one data pool. 
Defaults are tuned for tiny +// single-node test clusters and mirror CephBlockPoolConfig conventions. +type CephFilesystemConfig struct { + // Name of the CephFilesystem CR. + Name string + + // Namespace the Rook operator watches (typically "d8-sds-elastic"). + Namespace string + + // FailureDomain is the CRUSH failure domain: "host" or "osd" + // (default: "osd" when MetadataPoolReplicas == DataPoolReplicas == 1, + // "host" otherwise). + FailureDomain string + + // MetadataPoolReplicas is the metadata pool replication factor. Default: 1. + MetadataPoolReplicas int + + // DataPoolName is the (Rook-side) data pool name. The full Ceph pool + // name is "-" — see CephFSDataPoolFullName. + // Default: "data0". + DataPoolName string + + // DataPoolReplicas is the data pool replication factor. Default: 1. + DataPoolReplicas int + + // MetadataServerActiveCount is the number of active MDS daemons. + // Default: 1. + MetadataServerActiveCount int + + // RequireSafeReplicaSize toggles Ceph's safeguard against single-replica + // pools. When nil, it is set to false for replicas==1 (unsafe single + // replica, accepted for e2e test clusters) and left unset otherwise. + RequireSafeReplicaSize *bool +} + +// CephFSDataPoolFullName returns the full Ceph pool name that ends up +// referenced from CephStorageClass.spec.cephFS.pool. Rook composes the +// per-filesystem pool name as "-". +func CephFSDataPoolFullName(fsName, dataPoolName string) string { + return fmt.Sprintf("%s-%s", fsName, dataPoolName) +} + +// CreateCephFilesystem creates (or updates, if already present) a +// CephFilesystem in the given namespace from the provided configuration. It +// is idempotent and safe to call on every test run. +func CreateCephFilesystem(ctx context.Context, kubeconfig *rest.Config, cfg CephFilesystemConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephFilesystem name is required") + } + if cfg.Namespace == "" { + return fmt.Errorf("CephFilesystem namespace is required") + } + if cfg.MetadataPoolReplicas <= 0 { + cfg.MetadataPoolReplicas = 1 + } + if cfg.DataPoolReplicas <= 0 { + cfg.DataPoolReplicas = 1 + } + if cfg.DataPoolName == "" { + cfg.DataPoolName = "data0" + } + if cfg.MetadataServerActiveCount <= 0 { + cfg.MetadataServerActiveCount = 1 + } + if cfg.FailureDomain == "" { + if cfg.MetadataPoolReplicas == 1 && cfg.DataPoolReplicas == 1 { + cfg.FailureDomain = "osd" + } else { + cfg.FailureDomain = "host" + } + } + + requireSafe := cfg.RequireSafeReplicaSize + if requireSafe == nil && (cfg.MetadataPoolReplicas == 1 || cfg.DataPoolReplicas == 1) { + f := false + requireSafe = &f + } + + metadataReplicated := map[string]interface{}{ + "size": int64(cfg.MetadataPoolReplicas), + } + dataReplicated := map[string]interface{}{ + "size": int64(cfg.DataPoolReplicas), + } + if requireSafe != nil { + metadataReplicated["requireSafeReplicaSize"] = *requireSafe + dataReplicated["requireSafeReplicaSize"] = *requireSafe + } + + spec := map[string]interface{}{ + "metadataPool": map[string]interface{}{ + "failureDomain": cfg.FailureDomain, + "replicated": metadataReplicated, + }, + "dataPools": []interface{}{ + map[string]interface{}{ + "name": cfg.DataPoolName, + "failureDomain": cfg.FailureDomain, + "replicated": dataReplicated, + }, + }, + "preserveFilesystemOnDelete": false, + "metadataServer": map[string]interface{}{ + "activeCount": int64(cfg.MetadataServerActiveCount), + "activeStandby": false, + }, + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": 
"ceph.rook.io/v1", + "kind": "CephFilesystem", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "namespace": cfg.Namespace, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephFilesystem %s/%s", cfg.Namespace, cfg.Name) + _, err = dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + logger.Success("CephFilesystem %s/%s created", cfg.Namespace, cfg.Name) + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + + logger.Info("CephFilesystem %s/%s already exists, updating spec", cfg.Namespace, cfg.Name) + existing, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch existing CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + if err := errIfTerminating(existing, "CephFilesystem", formatRef(cfg.Namespace, cfg.Name)); err != nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephFilesystemGVR).Namespace(cfg.Namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephFilesystem %s/%s: %w", cfg.Namespace, cfg.Name, err) + } + return nil +} + +// WaitForCephFilesystemReady blocks until the CephFilesystem reports +// `status.phase == "Ready"`. As a fallback (some Rook revisions populate +// `status.conditions` first) the function also accepts a Ready=True +// condition. +// +// Per-call deadlines and loud (WARN) logging on consecutive network failures +// are inherited from pollResourceUntilReady. +func WaitForCephFilesystemReady(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + return pollResourceUntilReady( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + func(obj *unstructured.Unstructured) (bool, string) { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + if phase == "Ready" { + return true, "status.phase" + } + if cephFilesystemReadyByCondition(obj.Object) { + return true, "status.conditions[Ready]=True" + } + logger.Debug("CephFilesystem %s/%s phase: %q, waiting...", obj.GetNamespace(), obj.GetName(), phase) + return false, "" + }, + ) +} + +func cephFilesystemReadyByCondition(obj map[string]interface{}) bool { + conditions, found, err := unstructured.NestedSlice(obj, "status", "conditions") + if err != nil || !found { + return false + } + for _, raw := range conditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + ctype, _, _ := unstructured.NestedString(cond, "type") + cstatus, _, _ := unstructured.NestedString(cond, "status") + if ctype == "Ready" && cstatus == "True" { + return true + } + } + return false +} + +// DeleteCephFilesystem deletes a CephFilesystem. Safe to call if the +// filesystem does not exist. NOTE: fire-and-forget — Rook's +// `cephfilesystem.ceph.rook.io` finalizer takes time to detach the MDS +// daemons and remove the metadata/data pools. Pair with +// WaitForCephFilesystemGone if you need to know the CR has actually been +// GC'd before doing something else (e.g. deleting the parent CephCluster). 
+func DeleteCephFilesystem(ctx context.Context, kubeconfig *rest.Config, namespace, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + if err := dynamicClient.Resource(CephFilesystemGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephFilesystem %s/%s: %w", namespace, name, err) + } + logger.Info("Deleted CephFilesystem %s/%s", namespace, name) + return nil +} + +// CephFilesystemGoneTimeout is the default budget for WaitForCephFilesystemGone. +// MDS shutdown + pool removal usually settles in 1-2 minutes; we allow more +// to absorb operator restarts and slow Ceph mons. +const CephFilesystemGoneTimeout = 5 * time.Minute + +// WaitForCephFilesystemGone polls until the CephFilesystem is fully GC'd by +// Kubernetes (GET returns NotFound). Use this after DeleteCephFilesystem to +// be sure the parent CephCluster's deletion won't be blocked by +// `ObjectHasDependents`. +func WaitForCephFilesystemGone(ctx context.Context, kubeconfig *rest.Config, namespace, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephFilesystemGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephFilesystemGVR, namespace, name, + timeout, PollTickInterval, "CephFilesystem", + ) +} diff --git a/pkg/kubernetes/cephstorageclass.go b/pkg/kubernetes/cephstorageclass.go new file mode 100644 index 0000000..942dd49 --- /dev/null +++ b/pkg/kubernetes/cephstorageclass.go @@ -0,0 +1,252 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// CephStorageClassGVR points at csi-ceph's CephStorageClass CR (not to be +// confused with Rook's CephCluster / CephBlockPool). +var CephStorageClassGVR = schema.GroupVersionResource{ + Group: "storage.deckhouse.io", + Version: "v1alpha1", + Resource: "cephstorageclasses", +} + +// Supported CephStorageClass types, mirroring csi-ceph's CRD enum. +const ( + CephStorageClassTypeRBD = "RBD" + CephStorageClassTypeCephFS = "CephFS" +) + +// CephStorageClassConfig is an intentionally narrow shape tailored for the +// e2e scenarios we care about today — an RBD StorageClass backed by a single +// block pool. CephFS variant is supported but requires FSName+FSPool to be +// set by the caller. +type CephStorageClassConfig struct { + // Name of the CephStorageClass CR (becomes the k8s StorageClass name). + Name string + + // ClusterConnectionName points at a CephClusterConnection CR. 
+ ClusterConnectionName string + + // ClusterAuthenticationName points at a CephClusterAuthentication CR. + ClusterAuthenticationName string + + // ReclaimPolicy mirrors StorageClass.ReclaimPolicy ("Delete" / "Retain"). + // Default: "Delete". + ReclaimPolicy string + + // Type is "RBD" (default) or "CephFS". + Type string + + // --- RBD options (Type == "RBD") --- + + // RBDPool is the Ceph pool name (e.g. "ceph-rbd-r1"). + RBDPool string + + // RBDDefaultFSType picks the filesystem mkfs on volume attach. + // Default: "ext4". + RBDDefaultFSType string + + // --- CephFS options (Type == "CephFS") --- + CephFSName string // Name of the CephFilesystem. + CephFSPool string // Pool to use inside that filesystem. +} + +// CreateCephStorageClass creates (or updates) a CephStorageClass CR. On +// success the csi-ceph controller provisions a corresponding core +// storage.k8s.io/v1 StorageClass in the cluster. +func CreateCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + if cfg.Name == "" { + return fmt.Errorf("CephStorageClass name is required") + } + if cfg.ClusterConnectionName == "" { + return fmt.Errorf("CephStorageClass ClusterConnectionName is required") + } + if cfg.ClusterAuthenticationName == "" { + return fmt.Errorf("CephStorageClass ClusterAuthenticationName is required") + } + if cfg.Type == "" { + cfg.Type = CephStorageClassTypeRBD + } + if cfg.ReclaimPolicy == "" { + cfg.ReclaimPolicy = "Delete" + } + + spec := map[string]interface{}{ + "clusterConnectionName": cfg.ClusterConnectionName, + "clusterAuthenticationName": cfg.ClusterAuthenticationName, + "reclaimPolicy": cfg.ReclaimPolicy, + "type": cfg.Type, + } + + switch cfg.Type { + case CephStorageClassTypeRBD: + if cfg.RBDPool == "" { + return fmt.Errorf("CephStorageClass of type RBD requires RBDPool") + } + if cfg.RBDDefaultFSType == "" { + cfg.RBDDefaultFSType = "ext4" + } + spec["rbd"] = map[string]interface{}{ + "defaultFSType": cfg.RBDDefaultFSType, + "pool": cfg.RBDPool, + } + case CephStorageClassTypeCephFS: + if cfg.CephFSName == "" || cfg.CephFSPool == "" { + return fmt.Errorf("CephStorageClass of type CephFS requires CephFSName and CephFSPool") + } + spec["cephFS"] = map[string]interface{}{ + "fsName": cfg.CephFSName, + "pool": cfg.CephFSPool, + } + default: + return fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + + obj := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "storage.deckhouse.io/v1alpha1", + "kind": "CephStorageClass", + "metadata": map[string]interface{}{ + "name": cfg.Name, + }, + "spec": spec, + }, + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + logger.Info("Creating CephStorageClass %s (type=%s, conn=%s, auth=%s)", + cfg.Name, cfg.Type, cfg.ClusterConnectionName, cfg.ClusterAuthenticationName) + _, err = dynamicClient.Resource(CephStorageClassGVR).Create(ctx, obj, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CephStorageClass %s: %w", cfg.Name, err) + } + + logger.Info("CephStorageClass %s already exists, updating spec", cfg.Name) + existing, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, cfg.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to fetch CephStorageClass %s: %w", cfg.Name, err) + } + if err := errIfTerminating(existing, "CephStorageClass", cfg.Name); err 
!= nil { + return err + } + existing.Object["spec"] = spec + if _, err := dynamicClient.Resource(CephStorageClassGVR).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update CephStorageClass %s: %w", cfg.Name, err) + } + return nil +} + +// DeleteCephStorageClass removes a CephStorageClass. NotFound is treated as +// success. The underlying k8s StorageClass is removed by the csi-ceph +// controller as a side effect. Use WaitForCephStorageClassGone to confirm +// the CR is fully GC'd. +func DeleteCephStorageClass(ctx context.Context, kubeconfig *rest.Config, name string) error { + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + if err := dynamicClient.Resource(CephStorageClassGVR).Delete(ctx, name, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete CephStorageClass %s: %w", name, err) + } + logger.Info("Deleted CephStorageClass %s", name) + return nil +} + +// CephStorageClassGoneTimeout is the default budget for +// WaitForCephStorageClassGone. CephStorageClass has no heavyweight finalizer +// (csi-ceph just deletes the backing k8s StorageClass), so this typically +// completes in seconds. +const CephStorageClassGoneTimeout = 1 * time.Minute + +// WaitForCephStorageClassGone polls until the CephStorageClass is fully GC'd +// by Kubernetes (GET returns NotFound). +func WaitForCephStorageClassGone(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if timeout <= 0 { + timeout = CephStorageClassGoneTimeout + } + return pollResourceUntilGone( + ctx, kubeconfig, CephStorageClassGVR, "", name, + timeout, PollTickInterval, "CephStorageClass", + ) +} + +// WaitForCephStorageClassCreated polls until the CephStorageClass status +// reports phase=Created (the csi-ceph controller flips this once the backing +// k8s StorageClass has been provisioned). 
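+//
+// Create-then-wait sketch (names and timeout are illustrative):
+//
+//	err := CreateCephStorageClass(ctx, kubeconfig, CephStorageClassConfig{
+//		Name:                      "ceph-rbd-sc",
+//		ClusterConnectionName:     "e2e-ceph-conn",
+//		ClusterAuthenticationName: "e2e-ceph-auth",
+//		Type:                      CephStorageClassTypeRBD,
+//		RBDPool:                   "ceph-rbd-r1",
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	if err := WaitForCephStorageClassCreated(ctx, kubeconfig, "ceph-rbd-sc", 5*time.Minute); err != nil {
+//		return err
+//	}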
+func WaitForCephStorageClassCreated(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + if name == "" { + return fmt.Errorf("name is required") + } + + logger.Debug("Waiting for CephStorageClass %s phase=Created (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(3 * time.Second) + defer ticker.Stop() + + for { + obj, err := dynamicClient.Resource(CephStorageClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + phase, _, _ := unstructured.NestedString(obj.Object, "status", "phase") + reason, _, _ := unstructured.NestedString(obj.Object, "status", "reason") + if phase == "Created" { + logger.Success("CephStorageClass %s is Created", name) + return nil + } + logger.Debug("CephStorageClass %s phase=%q reason=%q", name, phase, reason) + } else if !apierrors.IsNotFound(err) { + logger.Debug("Error getting CephStorageClass %s: %v", name, err) + } + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout waiting for CephStorageClass %s: %w", name, ctx.Err()) + case <-ticker.C: + } + } +} diff --git a/pkg/kubernetes/modules.go b/pkg/kubernetes/modules.go index 3b4cedf..94490a7 100644 --- a/pkg/kubernetes/modules.go +++ b/pkg/kubernetes/modules.go @@ -252,9 +252,18 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC settings = moduleConfig.Settings } - // Retry logic for webhook connection errors and network timeouts - maxRetries := 10 + // Retry logic for webhook connection errors and network timeouts. + // On freshly-bootstrapped Deckhouse clusters the validating-webhook-handler + // pod (or the d8-system Service endpoint backing it) can be unready for + // several minutes while the control plane converges. Our previous cap of + // 10 retries with exponential backoff topped out at ~3.7 minutes total + // which was not enough for the SAN stand — we'd fail Step 18 with + // "connection refused" during the first ModuleConfig write. Bumping to 60 + // attempts with delays capped at 30s gives us up to ~30 minutes of + // soft-retries, which easily outlives any realistic webhook cold start. + maxRetries := 60 retryDelay := 2 * time.Second + const maxRetryDelay = 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { @@ -282,8 +291,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped so we don't sleep forever + // between retries on a slow-to-converge cluster. retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } @@ -307,8 +320,12 @@ func configureModuleConfig(ctx context.Context, kubeconfig *rest.Config, moduleC case <-ctx.Done(): return ctx.Err() case <-time.After(retryDelay): - // Exponential backoff + // Exponential backoff, capped (see create branch above + // for the rationale — same webhook cold-start). 
retryDelay = time.Duration(float64(retryDelay) * 1.5) + if retryDelay > maxRetryDelay { + retryDelay = maxRetryDelay + } continue } } diff --git a/pkg/kubernetes/pod_exec.go b/pkg/kubernetes/pod_exec.go new file mode 100644 index 0000000..92297a0 --- /dev/null +++ b/pkg/kubernetes/pod_exec.go @@ -0,0 +1,388 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "bytes" + "context" + "crypto/rand" + "encoding/hex" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" +) + +// DefaultDebugImage is the image ReadFileFromDistrolessPod injects as the +// short-lived ephemeral container. busybox ships cat, sleep and a +// minimal sh — exactly the toolset we need to read /proc/1/root/ +// in the target container's filesystem. Tests against an air-gapped +// registry can override this via ReadFileOptions.DebugImage. +const DefaultDebugImage = "busybox:1.36" + +// DefaultEphemeralStartupTimeout caps the wait for the injected +// ephemeral container to transition into Running. Image pull from a +// warm registry usually takes a couple of seconds; 60 s is a generous +// upper bound that still surfaces ImagePullBackOff/ErrImagePull early. +const DefaultEphemeralStartupTimeout = 60 * time.Second + +// DefaultDistrolessSessionTTL is the lifetime of the `sleep` process +// inside the injected ephemeral container when used as a long-lived +// reader session (OpenDistrolessReader / DistrolessReader.ReadFile). +// 30 minutes comfortably outlasts any single test cell while still +// guaranteeing eventual self-cleanup if the caller crashes. +const DefaultDistrolessSessionTTL = 30 * time.Minute + +// ephemeralPollInterval is how often we re-Get the pod when waiting for +// the ephemeral container to start. 500 ms is a deliberate compromise: +// fast enough that the typical 1-3 s pull is observed promptly, slow +// enough that we don't hammer the apiserver. +const ephemeralPollInterval = 500 * time.Millisecond + +// ReadFileOptions tunes ReadFileFromDistrolessPod and OpenDistrolessReader. +type ReadFileOptions struct { + // DebugImage overrides the ephemeral container image. Defaults to + // DefaultDebugImage. Use this on air-gapped clusters to point at an + // internal mirror. + DebugImage string + // StartupTimeout caps the wait for the ephemeral container to reach + // state.Running. Defaults to DefaultEphemeralStartupTimeout. + StartupTimeout time.Duration + // SessionTTL controls how long the injected ephemeral container's + // `sleep` process stays alive. Defaults to DefaultDistrolessSessionTTL. + // Used by OpenDistrolessReader; ReadFileFromDistrolessPod does not + // rely on this value (the entry's status flip after the cat exits + // has no effect on the pod). 
+ SessionTTL time.Duration +} + +// ExecInPod runs cmd inside container of pod namespace/pod via the +// apiserver's pods/exec subresource and returns stdout and stderr +// separately, plus any transport- or exec-level error. +// +// The container must ship every binary referenced by cmd; ExecInPod does +// NOT inject any helper. For distroless containers without cat / sh, +// see ReadFileFromDistrolessPod. +func ExecInPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, container string, + cmd []string, +) (stdout, stderr string, err error) { + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return "", "", fmt.Errorf("create clientset: %w", err) + } + + req := clientset.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(pod). + Namespace(namespace). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: container, + Command: cmd, + Stdout: true, + Stderr: true, + }, scheme.ParameterCodec) + + executor, err := remotecommand.NewSPDYExecutor(kubeconfig, "POST", req.URL()) + if err != nil { + return "", "", fmt.Errorf("create SPDY executor for %s/%s[%s]: %w", + namespace, pod, container, err) + } + + var stdoutBuf, stderrBuf bytes.Buffer + err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: &stdoutBuf, + Stderr: &stderrBuf, + }) + stdout = stdoutBuf.String() + stderr = stderrBuf.String() + if err != nil { + return stdout, stderr, fmt.Errorf("exec %v in %s/%s[%s]: %w (stderr=%q)", + cmd, namespace, pod, container, err, stderr) + } + return stdout, stderr, nil +} + +// ReadFileFromPod cat's `path` from inside `container` of pod +// `namespace/pod`. Equivalent to `kubectl exec -c container -- cat +// path`, with stderr surfaced as part of the error if non-empty. +// +// Requires the container image to ship cat. For distroless / scratch +// images, use ReadFileFromDistrolessPod. +func ReadFileFromPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, container, path string, +) (string, error) { + stdout, stderr, err := ExecInPod(ctx, kubeconfig, namespace, pod, container, []string{"cat", path}) + if err != nil { + return stdout, err + } + if stderr != "" { + return stdout, fmt.Errorf("cat %s in %s/%s[%s] reported stderr: %s", + path, namespace, pod, container, stderr) + } + return stdout, nil +} + +// ReadFileFromDistrolessPod reads `path` from inside `targetContainer` +// of pod `namespace/pod` even when targetContainer ships no shell, no +// cat and no tar — i.e. a distroless or scratch image like +// csi-controller. It does so by injecting a short-lived ephemeral +// container (TargetContainerName=targetContainer, which gives it a +// shared PID namespace with the target) and then catting +// /proc/1/root. /proc/1 is PID 1 inside the target container's +// PID namespace, and /proc//root is the well-known kernel-exposed +// view of that process's filesystem root. +// +// Why this does NOT restart the target pod or any of its containers: +// +// - Ephemeral containers are added through the dedicated +// /pods//ephemeralcontainers subresource (UpdateEphemeralContainers +// in client-go). The apiserver explicitly allows this mutation on a +// running pod; the ordinary pod PUT/PATCH path that would trigger +// re-creation is bypassed entirely. Without this dedicated path, +// adding a container to a live pod would be flat-out forbidden. +// - metadata.generation, spec.containers, the pod sandbox UID and the +// ReplicaSet/DaemonSet observation all stay intact. 
The kubelet +// simply launches the new container in the existing pod sandbox +// without disturbing existing containers. Workload-controller +// rollouts and pod-template `checksum/...` annotations are not +// affected, so e2e suites that subsequently assert on rollout +// state see a clean signal — the FS read does not contaminate it. +// - Ephemeral containers are forbidden from declaring ports, probes, +// lifecycle hooks or resources, which guarantees the inject is a +// cheap no-op for the pod's lifecycle. +// +// Caveat: ephemeral containers cannot be removed once added. The cat +// process exits with the container after `sleep`, but the entry remains +// in pod.spec.ephemeralContainers and +// pod.status.ephemeralContainerStatuses (state=Terminated). For +// long-running suites those entries simply pile up until the next pod +// recycle. Each invocation here generates a unique container name, so +// repeat calls against the same pod are safe. +// +// For polling loops or any scenario that reads the same pod multiple +// times, prefer OpenDistrolessReader: each ReadFileFromDistrolessPod +// call pays the full ephemeral-container cold-start cost (~10–20 s for +// kubelet to launch a new container in the existing pod sandbox), and +// that cost dominates the runtime of a Eventually-style poll. +func ReadFileFromDistrolessPod( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer, path string, + opts ReadFileOptions, +) (string, error) { + r, err := OpenDistrolessReader(ctx, kubeconfig, namespace, pod, targetContainer, opts) + if err != nil { + return "", err + } + return r.ReadFile(ctx, path) +} + +// DistrolessReader is a long-lived ephemeral-container reader session +// against a single distroless pod. Open one with OpenDistrolessReader, +// then call ReadFile as many times as you need — each ReadFile is just +// an exec into the already-running ephemeral container (cheap), so a +// polling loop pays the ephemeral-container cold start ONCE instead of +// per-iteration. +// +// The session expires when the ephemeral container's `sleep` +// (opts.SessionTTL, default DefaultDistrolessSessionTTL) elapses; there +// is no Close — Kubernetes does not allow removing an ephemeral +// container — but the inert "Terminated" status entry has no effect on +// the pod. Callers that need fresh sessions across pod identities +// (e.g. after a workload rollout) should re-open against the new pod. +type DistrolessReader struct { + kubeconfig *rest.Config + namespace string + podName string + targetContainer string + ephemeralName string +} + +// PodName returns the name of the pod this reader is bound to. Useful +// for callers that need to detect rollouts (the pod name changes when +// the workload-controller recycles the pod) and re-open the session. +func (r *DistrolessReader) PodName() string { return r.podName } + +// EphemeralName returns the auto-generated name of the injected +// ephemeral container, mostly for logging. +func (r *DistrolessReader) EphemeralName() string { return r.ephemeralName } + +// ReadFile cat's `path` from inside the target container's filesystem +// (resolved through the ephemeral container's view of /proc/1/root). +// Cheap — just a pods/exec round-trip; no apiserver mutations. 
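+//
+// Open-once / read-many sketch (namespace, pod, container and path are
+// illustrative):
+//
+//	reader, err := OpenDistrolessReader(ctx, kubeconfig, "d8-csi-ceph", podName, "csi-controller", ReadFileOptions{})
+//	if err != nil {
+//		return err
+//	}
+//	// Each ReadFile below is a plain exec round-trip; no new ephemeral container.
+//	conf, err := reader.ReadFile(ctx, "/etc/ceph/ceph.conf")
+//	if err != nil {
+//		return err
+//	}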
+func (r *DistrolessReader) ReadFile(ctx context.Context, path string) (string, error) { + stdout, stderr, err := ExecInPod(ctx, r.kubeconfig, r.namespace, r.podName, r.ephemeralName, + []string{"cat", "/proc/1/root" + path}) + if err != nil { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: %w", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, err) + } + if stderr != "" { + return stdout, fmt.Errorf("read %s from %s/%s[%s] via ephemeral %s: stderr=%s", + path, r.namespace, r.podName, r.targetContainer, r.ephemeralName, stderr) + } + return stdout, nil +} + +// OpenDistrolessReader injects a long-lived ephemeral container into +// the target pod and waits for it to become Running. The returned +// DistrolessReader can then be used for arbitrarily many cheap +// ReadFile calls until opts.SessionTTL elapses (default 30 minutes). +// +// Failure modes (returned as errors): pod not found, ephemeral +// container terminates before Running, image pull failure, startup +// timeout. On any of these no usable reader is returned. +// +// See ReadFileFromDistrolessPod for the rationale on why this does +// not restart the target pod or any of its existing containers. +func OpenDistrolessReader( + ctx context.Context, + kubeconfig *rest.Config, + namespace, pod, targetContainer string, + opts ReadFileOptions, +) (*DistrolessReader, error) { + if opts.DebugImage == "" { + opts.DebugImage = DefaultDebugImage + } + if opts.StartupTimeout <= 0 { + opts.StartupTimeout = DefaultEphemeralStartupTimeout + } + if opts.SessionTTL <= 0 { + opts.SessionTTL = DefaultDistrolessSessionTTL + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return nil, fmt.Errorf("create clientset: %w", err) + } + pods := clientset.CoreV1().Pods(namespace) + + ecName, err := randomEphemeralName("filereader-") + if err != nil { + return nil, fmt.Errorf("generate ephemeral container name: %w", err) + } + + livePod, err := pods.Get(ctx, pod, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("get pod %s/%s: %w", namespace, pod, err) + } + sleepSeconds := int64(opts.SessionTTL.Seconds()) + if sleepSeconds < 1 { + sleepSeconds = 1 + } + livePod.Spec.EphemeralContainers = append(livePod.Spec.EphemeralContainers, corev1.EphemeralContainer{ + EphemeralContainerCommon: corev1.EphemeralContainerCommon{ + Name: ecName, + Image: opts.DebugImage, + Command: []string{"sleep", fmt.Sprintf("%d", sleepSeconds)}, + ImagePullPolicy: corev1.PullIfNotPresent, + TerminationMessagePolicy: corev1.TerminationMessageReadFile, + }, + TargetContainerName: targetContainer, + }) + if _, err := pods.UpdateEphemeralContainers(ctx, pod, livePod, metav1.UpdateOptions{}); err != nil { + return nil, fmt.Errorf("inject ephemeral container %q into %s/%s: %w", + ecName, namespace, pod, err) + } + + if err := waitEphemeralContainerRunning(ctx, pods, pod, ecName, opts.StartupTimeout); err != nil { + return nil, err + } + + return &DistrolessReader{ + kubeconfig: kubeconfig, + namespace: namespace, + podName: pod, + targetContainer: targetContainer, + ephemeralName: ecName, + }, nil +} + +// waitEphemeralContainerRunning polls pod.status.ephemeralContainerStatuses +// until the container with name ecName reports state.Running != nil. +// Returns immediately on Terminated / hard pull failures so tests don't +// have to sit through the full timeout when the debug image is +// unreachable. 
+func waitEphemeralContainerRunning( + ctx context.Context, + pods typedcorev1.PodInterface, + podName, ecName string, + timeout time.Duration, +) error { + deadlineCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(ephemeralPollInterval) + defer ticker.Stop() + + for { + p, getErr := pods.Get(deadlineCtx, podName, metav1.GetOptions{}) + switch { + case apierrors.IsNotFound(getErr): + return fmt.Errorf("pod %s disappeared while waiting for ephemeral container %q", + podName, ecName) + case getErr == nil: + for _, st := range p.Status.EphemeralContainerStatuses { + if st.Name != ecName { + continue + } + if st.State.Running != nil { + return nil + } + if st.State.Terminated != nil { + return fmt.Errorf("ephemeral container %q in pod %s terminated before exec: reason=%s exitCode=%d", + ecName, podName, + st.State.Terminated.Reason, st.State.Terminated.ExitCode) + } + if w := st.State.Waiting; w != nil && (w.Reason == "ImagePullBackOff" || w.Reason == "ErrImagePull") { + return fmt.Errorf("ephemeral container %q in pod %s cannot start: %s: %s", + ecName, podName, w.Reason, w.Message) + } + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout (%s) waiting for ephemeral container %q in pod %s to be Running", + timeout, ecName, podName) + case <-ticker.C: + } + } +} + +// randomEphemeralName returns prefix + 8 hex chars from crypto/rand. +// Sufficient entropy for uniqueness across a single test run; we don't +// need cryptographic strength but crypto/rand keeps us out of math/rand +// seeding pitfalls. +func randomEphemeralName(prefix string) (string, error) { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + return "", err + } + return prefix + hex.EncodeToString(b[:]), nil +} diff --git a/pkg/kubernetes/poll.go b/pkg/kubernetes/poll.go new file mode 100644 index 0000000..4fc833f --- /dev/null +++ b/pkg/kubernetes/poll.go @@ -0,0 +1,339 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// PollGetTimeout caps a single Get call inside readiness pollers. Without +// this cap a hung TCP connect (e.g. SSH tunnel that died after a Wi-Fi flap +// on the developer's laptop) eats the entire parent timeout silently — the +// poller appears to "hang" until the per-resource ReadyTimeout fires 15-20 +// minutes later. With a 30s cap each Get fails fast, so we surface the +// network problem early via the WARN log emitted by pollResourceUntilReady. +const PollGetTimeout = 30 * time.Second + +// PollTickInterval is the default tick interval between Get attempts when +// waiting for a Kubernetes resource to reach a ready state. 
+const PollTickInterval = 5 * time.Second + +// pollResourceUntilReady polls a single namespaced unstructured resource +// until isReady returns (true, "") or the parent timeout expires. +// +// It centralizes three behaviors that all of our Wait*Ready helpers want: +// - per-call deadline (PollGetTimeout) on every Get, so a dead network +// surfaces in seconds instead of after the readiness timeout; +// - WARN logs with a counter when consecutive network errors happen — silent +// pollers were the root cause of "test hangs forever after Wi-Fi flap"; +// - tolerance of NotFound (the resource may not have been seen by the +// watch cache yet) and of `isReady=false` (still progressing). +// +// Parameters: +// +// - kubeconfig: rest config used to construct the dynamic client. +// - gvr: GroupVersionResource of the resource being polled. +// - namespace, name: scope of the resource. Must both be non-empty. +// - readyTimeout: overall budget. Returns timeout error after this. +// - tickInterval: gap between Get attempts. Pass PollTickInterval if +// unsure; resources with slow reconcilers can use longer intervals. +// - resourceLabel: string used in log lines (e.g. "CephCluster"). Keep +// short — the namespace/name is appended for context. +// - isReady: decider over the unstructured object. Returns +// (ready, humanReason). If ready is true, pollResourceUntilReady +// prints a Success log including the reason and returns nil. +func pollResourceUntilReady( + ctx context.Context, + kubeconfig *rest.Config, + gvr schema.GroupVersionResource, + namespace, name string, + readyTimeout time.Duration, + tickInterval time.Duration, + resourceLabel string, + isReady func(obj *unstructured.Unstructured) (ready bool, reason string), +) error { + if name == "" { + return fmt.Errorf("name is required") + } + if isReady == nil { + return fmt.Errorf("isReady is required") + } + if tickInterval <= 0 { + tickInterval = PollTickInterval + } + + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to become Ready (timeout: %v)", resourceLabel, ref, readyTimeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadlineCtx, cancel := context.WithTimeout(ctx, readyTimeout) + defer cancel() + + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + var consecutiveErrs int + for { + obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout) + switch { + case err == nil: + consecutiveErrs = 0 + // Refuse to wait for Ready on a Terminating object. Without this + // short-circuit a stale `Deleting` CR (e.g. CephCluster left over + // by a previous run that didn't finish teardown) would keep us + // polling for the full readyTimeout: phase=Deleting never matches + // any "Ready" condition. Failing fast here gives the operator a + // chance to clean up (or strip finalizers) instead of hiding the + // real state of the cluster behind a 15-20 minute timeout. 
+ if dt := obj.GetDeletionTimestamp(); dt != nil { + return fmt.Errorf( + "%s %s is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "refusing to wait for Ready on a Terminating object", + resourceLabel, ref, + dt.Format(time.RFC3339), obj.GetFinalizers(), + ) + } + if ready, reason := isReady(obj); ready { + if reason != "" { + logger.Success("%s %s is Ready (%s)", resourceLabel, ref, reason) + } else { + logger.Success("%s %s is Ready", resourceLabel, ref) + } + return nil + } + case apierrors.IsNotFound(err): + // Resource hasn't propagated yet. Treat as "still progressing" + // without warning so we don't spam logs on healthy clusters that + // just haven't observed the create yet. + consecutiveErrs = 0 + logger.Debug("%s %s not found yet", resourceLabel, ref) + default: + consecutiveErrs++ + // Quiet the first two failures (spurious 5xx, leader re-election), + // loud after that. Loud == WARN at every iteration so the user + // can see the cluster connection is dying instead of waiting for + // the readyTimeout to fire. + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) + } + } + + select { + case <-deadlineCtx.Done(): + return fmt.Errorf("timeout waiting for %s %s: %w", resourceLabel, ref, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// PollGoneProgressEvery controls how often pollResourceUntilGone emits a +// progress INFO line while the resource is still alive. We don't want a log +// per tick (chatty) but we also don't want long stretches of silence when a +// finalizer is stuck for minutes — every ~30s strikes a balance. +const PollGoneProgressEvery = 30 * time.Second + +// pollResourceUntilGone polls a single namespaced unstructured resource +// until a GET returns NotFound (i.e. the API server has GC'd the object) or +// the parent timeout expires. +// +// Mirrors pollResourceUntilReady but with inverted success criterion. Three +// behaviors worth calling out: +// - per-call deadline (PollGetTimeout) on every Get; +// - WARN logs after a few consecutive non-NotFound errors so a dropped +// SSH tunnel surfaces in seconds rather than at the timeout; +// - periodic INFO progress log including the object's deletionTimestamp +// and finalizers — that's exactly the diagnostic info you need to know +// why Rook hasn't finished tearing the resource down. We avoid logging +// this on every tick (chatty) and instead emit at most once per +// PollGoneProgressEvery. 
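+//
+// Every public Wait*Gone helper in this package is a thin wrapper over this
+// function; WaitForCephFilesystemGone, for example, reduces to:
+//
+//	return pollResourceUntilGone(
+//		ctx, kubeconfig, CephFilesystemGVR, namespace, name,
+//		timeout, PollTickInterval, "CephFilesystem",
+//	)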
+func pollResourceUntilGone( + ctx context.Context, + kubeconfig *rest.Config, + gvr schema.GroupVersionResource, + namespace, name string, + goneTimeout time.Duration, + tickInterval time.Duration, + resourceLabel string, +) error { + if name == "" { + return fmt.Errorf("name is required") + } + if tickInterval <= 0 { + tickInterval = PollTickInterval + } + + ref := formatRef(namespace, name) + logger.Debug("Waiting for %s %s to be gone (timeout: %v)", resourceLabel, ref, goneTimeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadlineCtx, cancel := context.WithTimeout(ctx, goneTimeout) + defer cancel() + + ticker := time.NewTicker(tickInterval) + defer ticker.Stop() + + var ( + consecutiveErrs int + lastProgress time.Time + lastFinalizers []string + lastDeletionTS string + ) + for { + obj, err := getWithTimeout(deadlineCtx, dynamicClient, gvr, namespace, name, PollGetTimeout) + switch { + case apierrors.IsNotFound(err): + logger.Success("%s %s is gone", resourceLabel, ref) + return nil + case err == nil: + consecutiveErrs = 0 + finalizers := obj.GetFinalizers() + deletionTS := "" + if dt := obj.GetDeletionTimestamp(); dt != nil { + deletionTS = dt.Format(time.RFC3339) + } + // Surface progress periodically OR whenever the visible state + // changes (finalizers list shrunk, deletionTimestamp finally + // appeared after a Delete request was missed, ...). + stateChanged := deletionTS != lastDeletionTS || !sameFinalizers(finalizers, lastFinalizers) + if stateChanged || time.Since(lastProgress) >= PollGoneProgressEvery { + if deletionTS == "" { + logger.Info("%s %s still alive (no deletionTimestamp yet, finalizers=%v)", + resourceLabel, ref, finalizers) + } else { + logger.Info("%s %s still terminating (deletionTimestamp=%s, finalizers=%v)", + resourceLabel, ref, deletionTS, finalizers) + } + lastProgress = time.Now() + lastFinalizers = append(lastFinalizers[:0], finalizers...) + lastDeletionTS = deletionTS + } + default: + consecutiveErrs++ + if consecutiveErrs >= 3 { + logger.Warn( + "%s %s GET failed for %d consecutive iterations: %v", + resourceLabel, ref, consecutiveErrs, err, + ) + } else { + logger.Debug("Error getting %s %s: %v", resourceLabel, ref, err) + } + } + + select { + case <-deadlineCtx.Done(): + // Surface the last observed state in the timeout error so the + // caller (and the dev reading the test log) can immediately tell + // whether they're stuck on a finalizer, on a missing + // deletionTimestamp, or on a network issue. + lastSeen := "no observation yet" + if lastDeletionTS != "" || len(lastFinalizers) > 0 { + lastSeen = fmt.Sprintf("deletionTimestamp=%q, finalizers=%v", lastDeletionTS, lastFinalizers) + } + return fmt.Errorf("timeout waiting for %s %s to be gone (%s): %w", + resourceLabel, ref, lastSeen, deadlineCtx.Err()) + case <-ticker.C: + } + } +} + +// formatRef renders a resource reference as either "name" (cluster-scoped) +// or "namespace/name" (namespaced) for log lines and error messages. +func formatRef(namespace, name string) string { + if namespace == "" { + return name + } + return namespace + "/" + name +} + +// errIfTerminating returns a descriptive error if obj has a non-nil +// metadata.deletionTimestamp. 
Used by Create* helpers to fail-fast in the +// IsAlreadyExists branch when an existing CR is in `Terminating` state — +// updating its spec would be a no-op (the controller is busy unwinding the +// finalizer), and a follow-up Wait*Ready would hang forever because phase +// transitions never reach a Ready state on a Terminating object. +// +// `kind` is the human-readable kind ("CephCluster") and `ref` is the +// formatted "[namespace/]name" identifier. +func errIfTerminating(obj *unstructured.Unstructured, kind, ref string) error { + dt := obj.GetDeletionTimestamp() + if dt == nil { + return nil + } + return fmt.Errorf( + "%s %s exists but is being deleted (deletionTimestamp=%s, finalizers=%v); "+ + "wait for it to disappear or remove finalizers manually before re-running", + kind, ref, dt.Format(time.RFC3339), obj.GetFinalizers(), + ) +} + +// sameFinalizers returns true when both slices contain the same strings in +// the same order. Used by pollResourceUntilGone to decide if the visible +// state has changed. +func sameFinalizers(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// getWithTimeout wraps dynamicClient.Get with a per-call deadline derived +// from the parent context. The wrapper avoids leaking goroutines blocked on +// a dead TCP connection. An empty namespace selects the cluster-scoped +// path (used by csi-ceph CRs like CephClusterConnection). +func getWithTimeout( + parent context.Context, + dynamicClient dynamic.Interface, + gvr schema.GroupVersionResource, + namespace, name string, + perCallTimeout time.Duration, +) (*unstructured.Unstructured, error) { + callCtx, cancel := context.WithTimeout(parent, perCallTimeout) + defer cancel() + if namespace == "" { + return dynamicClient.Resource(gvr).Get(callCtx, name, metav1.GetOptions{}) + } + return dynamicClient.Resource(gvr).Namespace(namespace).Get(callCtx, name, metav1.GetOptions{}) +} diff --git a/pkg/kubernetes/rookconfigoverride.go b/pkg/kubernetes/rookconfigoverride.go new file mode 100644 index 0000000..dab8aad --- /dev/null +++ b/pkg/kubernetes/rookconfigoverride.go @@ -0,0 +1,140 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "sort" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +// RookConfigOverrideName is the well-known ConfigMap name Rook reads Ceph +// config overrides from (see Rook docs: "Advanced Configuration – Custom +// ceph.conf Settings"). Rook watches this ConfigMap in its operator namespace +// and injects the `config` key into `/etc/ceph/ceph.conf` of every Ceph daemon. 
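+//
+// Usage sketch via SetRookConfigOverride (namespace and the override
+// key/value are illustrative; any [global] ceph.conf setting works):
+//
+//	err := SetRookConfigOverride(ctx, kubeconfig, "d8-sds-elastic", map[string]string{
+//		"osd_pool_default_size": "1",
+//	})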
+const RookConfigOverrideName = "rook-config-override" + +// SetRookConfigOverride creates or updates the `rook-config-override` ConfigMap +// in the given Rook operator namespace so that Ceph daemons pick up the +// provided global settings. +// +// The ConfigMap format expected by Rook is: +// +// apiVersion: v1 +// kind: ConfigMap +// metadata: +// name: rook-config-override +// namespace: +// data: +// config: | +// [global] +// key1 = value1 +// key2 = value2 +// +// `globals` is rendered under `[global]`. Keys are sorted for a stable output. +// Passing an empty/nil `globals` map produces an empty `[global]` section, +// which effectively clears previously-set overrides. +func SetRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string, globals map[string]string) error { + if namespace == "" { + return fmt.Errorf("namespace is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + cfg := RenderCephGlobalConfig(globals) + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: RookConfigOverrideName, + Namespace: namespace, + }, + Data: map[string]string{ + "config": cfg, + }, + } + + existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, RookConfigOverrideName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + logger.Info("Creating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals)) + if _, err := clientset.CoreV1().ConfigMaps(namespace).Create(ctx, cm, metav1.CreateOptions{}); err != nil { + return fmt.Errorf("failed to create ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + return nil + } + return fmt.Errorf("failed to get ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + + logger.Info("Updating ConfigMap %s/%s with Ceph global overrides (%d keys)", namespace, RookConfigOverrideName, len(globals)) + existing.Data = cm.Data + if _, err := clientset.CoreV1().ConfigMaps(namespace).Update(ctx, existing, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + return nil +} + +// DeleteRookConfigOverride removes the `rook-config-override` ConfigMap. It +// is safe to call when the ConfigMap does not exist. +func DeleteRookConfigOverride(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + if namespace == "" { + return fmt.Errorf("namespace is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + if err := clientset.CoreV1().ConfigMaps(namespace).Delete(ctx, RookConfigOverrideName, metav1.DeleteOptions{}); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to delete ConfigMap %s/%s: %w", namespace, RookConfigOverrideName, err) + } + logger.Info("Deleted ConfigMap %s/%s", namespace, RookConfigOverrideName) + return nil +} + +// RenderCephGlobalConfig renders a `[global]` section for ceph.conf from the +// provided key/value pairs. Keys are sorted so the rendered output is stable +// across calls with logically-equivalent maps (avoids unnecessary CM updates). 
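+//
+// For example (illustrative), a call like
+//
+//	RenderCephGlobalConfig(map[string]string{"osd_pool_default_size": "1", "ms_crc_data": "false"})
+//
+// renders:
+//
+//	[global]
+//	ms_crc_data = false
+//	osd_pool_default_size = 1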
+func RenderCephGlobalConfig(globals map[string]string) string { + var b strings.Builder + b.WriteString("[global]\n") + + keys := make([]string, 0, len(globals)) + for k := range globals { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprintf(&b, "%s = %s\n", k, globals[k]) + } + return b.String() +} diff --git a/pkg/kubernetes/storageclass_manage.go b/pkg/kubernetes/storageclass_manage.go new file mode 100644 index 0000000..bb7fb94 --- /dev/null +++ b/pkg/kubernetes/storageclass_manage.go @@ -0,0 +1,100 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + storagev1 "k8s.io/api/storage/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +type StorageClassCreateConfig struct { + Name string + Provisioner string + Parameters map[string]string + VolumeBindingMode storagev1.VolumeBindingMode + ReclaimPolicy corev1.PersistentVolumeReclaimPolicy + AllowExpansion bool + MakeDefault bool + AdditionalLabels map[string]string + AdditionalAnnot map[string]string +} + +func CreateStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg StorageClassCreateConfig) error { + if cfg.Name == "" { + return fmt.Errorf("storage class name is required") + } + if cfg.Provisioner == "" { + return fmt.Errorf("provisioner is required") + } + + clientset, err := NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + annotations := map[string]string{} + for k, v := range cfg.AdditionalAnnot { + annotations[k] = v + } + if cfg.MakeDefault { + annotations["storageclass.kubernetes.io/is-default-class"] = "true" + annotations["storageclass.beta.kubernetes.io/is-default-class"] = "true" + } + + labels := map[string]string{} + for k, v := range cfg.AdditionalLabels { + labels[k] = v + } + + sc := &storagev1.StorageClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "StorageClass", + APIVersion: "storage.k8s.io/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: cfg.Name, + Labels: labels, + Annotations: annotations, + }, + Provisioner: cfg.Provisioner, + Parameters: cfg.Parameters, + ReclaimPolicy: &cfg.ReclaimPolicy, + AllowVolumeExpansion: &cfg.AllowExpansion, + VolumeBindingMode: &cfg.VolumeBindingMode, + } + + logger.Info("Creating StorageClass %s (provisioner=%s)", cfg.Name, cfg.Provisioner) + _, err = clientset.StorageV1().StorageClasses().Create(ctx, sc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("StorageClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create StorageClass %s: %w", cfg.Name, err) + } + logger.Success("StorageClass %s created", cfg.Name) + return nil +} + diff --git a/pkg/kubernetes/volumesnapshotclass.go b/pkg/kubernetes/volumesnapshotclass.go new file mode 100644 
index 0000000..9307615 --- /dev/null +++ b/pkg/kubernetes/volumesnapshotclass.go @@ -0,0 +1,125 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubernetes + +import ( + "context" + "fmt" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" +) + +var VolumeSnapshotClassGVR = schema.GroupVersionResource{ + Group: "snapshot.storage.k8s.io", + Version: "v1", + Resource: "volumesnapshotclasses", +} + +type VolumeSnapshotClassConfig struct { + Name string + Driver string + DeletionPolicy string // "Delete" or "Retain" + Parameters map[string]string + MakeDefault bool +} + +func CreateVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, cfg VolumeSnapshotClassConfig) error { + if cfg.Name == "" { + return fmt.Errorf("volume snapshot class name is required") + } + if cfg.Driver == "" { + return fmt.Errorf("driver is required") + } + if cfg.DeletionPolicy == "" { + cfg.DeletionPolicy = "Delete" + } + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + annotations := map[string]interface{}{} + if cfg.MakeDefault { + annotations["snapshot.storage.kubernetes.io/is-default-class"] = "true" + } + + parameters := map[string]interface{}{} + for k, v := range cfg.Parameters { + parameters[k] = v + } + + vsc := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "snapshot.storage.k8s.io/v1", + "kind": "VolumeSnapshotClass", + "metadata": map[string]interface{}{ + "name": cfg.Name, + "annotations": annotations, + }, + "driver": cfg.Driver, + "deletionPolicy": cfg.DeletionPolicy, + "parameters": parameters, + }, + } + + logger.Info("Creating VolumeSnapshotClass %s (driver=%s, deletionPolicy=%s)", cfg.Name, cfg.Driver, cfg.DeletionPolicy) + _, err = dynamicClient.Resource(VolumeSnapshotClassGVR).Create(ctx, vsc, metav1.CreateOptions{}) + if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Info("VolumeSnapshotClass %s already exists, skipping create", cfg.Name) + return nil + } + return fmt.Errorf("failed to create VolumeSnapshotClass %s: %w", cfg.Name, err) + } + logger.Success("VolumeSnapshotClass %s created", cfg.Name) + return nil +} + +func WaitForVolumeSnapshotClass(ctx context.Context, kubeconfig *rest.Config, name string, timeout time.Duration) error { + logger.Debug("Waiting for VolumeSnapshotClass %s to become available (timeout: %v)", name, timeout) + + dynamicClient, err := NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + deadline := time.Now().Add(timeout) + for { + if ctx.Err() != nil { + return ctx.Err() + } + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for VolumeSnapshotClass %s", 
name) + } + + _, err := dynamicClient.Resource(VolumeSnapshotClassGVR).Get(ctx, name, metav1.GetOptions{}) + if err == nil { + logger.Success("VolumeSnapshotClass %s is available", name) + return nil + } + + time.Sleep(5 * time.Second) + } +} diff --git a/pkg/testkit/ceph.go b/pkg/testkit/ceph.go new file mode 100644 index 0000000..7427967 --- /dev/null +++ b/pkg/testkit/ceph.go @@ -0,0 +1,622 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/infrastructure/ssh" + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// Re-exports of the supported CephStorageClass types so callers don't have +// to import the lower-level pkg/kubernetes package just to set cfg.Type. +const ( + CephStorageClassTypeRBD = kubernetes.CephStorageClassTypeRBD + CephStorageClassTypeCephFS = kubernetes.CephStorageClassTypeCephFS +) + +// CephStorageClassConfig controls the end-to-end provisioning of a +// Rook-managed Ceph cluster plus a csi-ceph-backed k8s StorageClass: +// +// 1. Enables Deckhouse modules required for the stack: +// sds-node-configurator, sds-elastic (Rook), csi-ceph. +// 2. (Optional) Falls back to EnsureDefaultStorageClass to produce a +// sds-local-volume StorageClass for backing OSD PVCs. +// 3. Seeds `rook-config-override` with per-test global Ceph settings +// (e.g. `ms_crc_data = false` for the PR #131 scenario). +// 4. Creates a CephCluster (Rook) and waits until it is Created. +// 5. Creates a CephBlockPool and waits until it is Ready. +// 6. Reads fsid / monitors / CephX admin key from Rook-managed secrets +// and wires them into CephClusterConnection + CephClusterAuthentication +// CRs so csi-ceph can talk to the cluster. +// 7. Creates a CephStorageClass CR and waits for the csi-ceph controller +// to materialize a core storage.k8s.io/v1 StorageClass. +// +// Only StorageClassName is strictly required; everything else has sensible +// defaults tuned for single-node / tiny test clusters. +type CephStorageClassConfig struct { + // --- Top-level identity --- + + // StorageClassName is the name of the CephStorageClass CR (and of the + // resulting k8s StorageClass). Required. + StorageClassName string + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // --- sds-elastic / Rook CephCluster --- + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image tag. Default: "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (good for 1..3 node test clusters). + MonCount int + MgrCount int + + // NetworkProvider: "" for CNI (default), "host" for host networking. 
+ NetworkProvider string + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // GlobalCephConfigOverrides populates `rook-config-override` under + // `[global]`, e.g. {"ms_crc_data": "false"}. A nil / empty map still writes + // the ConfigMap, just with an empty `[global]` section (clearing any + // previously-set overrides). + GlobalCephConfigOverrides map[string]string + + // --- OSD backing --- + + // OSDStorageClass is a block-capable StorageClass used to back OSD PVCs. + // When empty, EnsureDefaultStorageClass is invoked with + // OSDBackingStorageClass* to provision a sds-local-volume SC. + OSDStorageClass string + + // OSDCount is the number of OSDs. Default: 1. + OSDCount int + + // OSDSize is the size of each OSD PVC. Default: "10Gi". + OSDSize string + + // --- Fallback SC provisioning via sds-local-volume (when OSDStorageClass is empty) --- + + // OSDBackingStorageClassName names the sds-local-volume SC that we + // auto-provision for OSDs. Default: "sds-local-volume-thick-ceph-osd". + OSDBackingStorageClassName string + + // OSDBackingLVMType is passed to EnsureDefaultStorageClass ("Thick"/"Thin"). + // Default: "Thick" (simpler for block-mode PVCs used as Ceph OSDs). + OSDBackingLVMType string + + // OSDBackingIncludeMasters exposes EnsureDefaultStorageClass.IncludeMasters. + OSDBackingIncludeMasters bool + + // OSDBackingBaseKubeconfig/VMNamespace/BaseStorageClassName are plumbed + // through to EnsureDefaultStorageClass to enable automatic VirtualDisk + // attachment on nested-VM clusters. + OSDBackingBaseKubeconfig *rest.Config + OSDBackingVMNamespace string + OSDBackingBaseStorageClassName string + + // MasterSSH is optional SSH access to the control plane. Not used by + // EnsureCephStorageClass in this revision; callers may set it for + // follow-up bootstrap or diagnostics hooks. + MasterSSH ssh.SSHClient + + // --- CephBlockPool --- + + // PoolName is the Rook CephBlockPool name (also becomes the Ceph pool + // name referenced by CephStorageClass.spec.rbd.pool). + // Default: "ceph-rbd-r<ReplicaSize>". + PoolName string + + // ReplicaSize is the CephBlockPool replication factor. Default: 1. + ReplicaSize int + + // FailureDomain is the CRUSH failure domain: "host" or "osd". + // Default: "osd" when ReplicaSize==1, "host" otherwise. + FailureDomain string + + // --- Pool kind --- + + // Type selects the backing Ceph primitive: "RBD" (default) provisions a + // CephBlockPool; "CephFS" provisions a CephFilesystem. The resulting + // csi-ceph CephStorageClass CR mirrors this choice via spec.type. + Type string + + // --- CephFilesystem (used only when Type == "CephFS") --- + + // CephFSName is the Rook CephFilesystem name. Default: "ceph-fs". + CephFSName string + + // CephFSDataPoolName is the per-filesystem data pool name (Rook-side, + // not the full Ceph pool name). Default: "data0". + CephFSDataPoolName string + + // CephFSMetadataReplicas is the metadata pool replication factor. + // Default: ReplicaSize. + CephFSMetadataReplicas int + + // CephFSDataReplicas is the data pool replication factor. + // Default: ReplicaSize. + CephFSDataReplicas int + + // CephFSActiveMDSCount is the number of active MDS daemons. Default: 1. + CephFSActiveMDSCount int + + // --- csi-ceph wiring --- + + // ClusterConnectionName and ClusterAuthenticationName point at the + // CephClusterConnection / CephClusterAuthentication CRs we create. + // Defaults: both "<StorageClassName>-conn". + ClusterConnectionName string + ClusterAuthenticationName string + + // RBDDefaultFSType picks the mkfs used on attach. Default: "ext4".
+ RBDDefaultFSType string + + // --- Modules --- + + // SkipModuleEnablement disables the module-enable step (useful when the + // caller has already configured ModuleConfig on the cluster). + SkipModuleEnablement bool + + // SkipClusterTeardown leaves the underlying Rook CephCluster and the + // rook-config-override ConfigMap in place during TeardownCephStorageClass. + // Use it when several StorageClasses share a single CephCluster — the + // "owning" call should leave the flag false and tear the cluster down + // last, while every other teardown sets it to true and only removes its + // SC-specific resources (CephStorageClass / connection / auth / pool / + // filesystem). + SkipClusterTeardown bool + + // SdsElasticSettings overrides `spec.settings` of the sds-elastic + // ModuleConfig. Defaults to the minimal set that makes sense on a + // single-node test cluster. + SdsElasticSettings map[string]interface{} + + // CsiCephSettings overrides `spec.settings` of the csi-ceph ModuleConfig. + CsiCephSettings map[string]interface{} + + // CsiCephModulePullOverride pins a specific csi-ceph image tag (dev + // registry only). Useful for testing PRs that haven't been released yet. + CsiCephModulePullOverride string + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m + CephFilesystemReadyTimeout time.Duration // default 10m + CredentialsTimeout time.Duration // default 10m + CSICephPhaseTimeout time.Duration // default 5m + StorageClassWaitTimeout time.Duration // default 2m +} + +func (c *CephStorageClassConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ClusterConnectionName == "" { + c.ClusterConnectionName = c.StorageClassName + "-conn" + } + if c.ClusterAuthenticationName == "" { + c.ClusterAuthenticationName = c.StorageClassName + "-conn" + } + if c.RBDDefaultFSType == "" { + c.RBDDefaultFSType = "ext4" + } + if c.Type == "" { + c.Type = kubernetes.CephStorageClassTypeRBD + } + if c.CephFSName == "" { + c.CephFSName = "ceph-fs" + } + if c.CephFSDataPoolName == "" { + c.CephFSDataPoolName = "data0" + } + if c.CephFSMetadataReplicas <= 0 { + c.CephFSMetadataReplicas = c.ReplicaSize + } + if c.CephFSDataReplicas <= 0 { + c.CephFSDataReplicas = c.ReplicaSize + } + if c.CephFSActiveMDSCount <= 0 { + c.CephFSActiveMDSCount = 1 + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } + if 
c.CephFilesystemReadyTimeout == 0 { + c.CephFilesystemReadyTimeout = 10 * time.Minute + } + if c.CredentialsTimeout == 0 { + c.CredentialsTimeout = 10 * time.Minute + } + if c.CSICephPhaseTimeout == 0 { + c.CSICephPhaseTimeout = 5 * time.Minute + } + if c.StorageClassWaitTimeout == 0 { + c.StorageClassWaitTimeout = 2 * time.Minute + } +} + +// EnsureCephStorageClass is the high-level entry point that turns an empty +// cluster into one with a working csi-ceph StorageClass. See +// CephStorageClassConfig for the step-by-step flow. +// +// The function is idempotent: re-running it picks up the existing Rook +// CephCluster / pool / csi-ceph CRs and only fills in whatever is still +// missing. Returns the name of the resulting k8s StorageClass. +func EnsureCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + cfg.applyDefaults() + + if cfg.StorageClassName == "" { + return "", fmt.Errorf("StorageClassName is required") + } + + logger.Step(1, "Enabling Deckhouse modules for csi-ceph (sds-node-configurator, sds-elastic, csi-ceph)") + if !cfg.SkipModuleEnablement { + if err := ensureCephModules(ctx, kubeconfig, cfg); err != nil { + return "", fmt.Errorf("enable ceph modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC, err := ensureOSDBackingStorageClass(ctx, kubeconfig, &cfg) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + 
logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + case kubernetes.CephStorageClassTypeCephFS: + logger.Step(5, "Creating CephFilesystem %s/%s (metadata replica=%d, data pool %q replica=%d, failureDomain=%s, activeMDS=%d)", + cfg.Namespace, cfg.CephFSName, + cfg.CephFSMetadataReplicas, cfg.CephFSDataPoolName, cfg.CephFSDataReplicas, + cfg.FailureDomain, cfg.CephFSActiveMDSCount) + if err := kubernetes.CreateCephFilesystem(ctx, kubeconfig, kubernetes.CephFilesystemConfig{ + Name: cfg.CephFSName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + MetadataPoolReplicas: cfg.CephFSMetadataReplicas, + DataPoolName: cfg.CephFSDataPoolName, + DataPoolReplicas: cfg.CephFSDataReplicas, + MetadataServerActiveCount: cfg.CephFSActiveMDSCount, + }); err != nil { + return "", fmt.Errorf("create CephFilesystem: %w", err) + } + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, cfg.CephFilesystemReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephFilesystem: %w", err) + } + logger.StepComplete(5, "CephFilesystem %s/%s is Ready", cfg.Namespace, cfg.CephFSName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + + logger.Step(6, "Extracting Rook-managed Ceph credentials (fsid, monitors, admin key)") + creds, err := kubernetes.WaitForCephCredentials(ctx, kubeconfig, cfg.Namespace, cfg.CredentialsTimeout) + if err != nil { + return "", fmt.Errorf("wait ceph credentials: %w", err) + } + logger.StepComplete(6, "Ceph credentials: fsid=%s, user=%s, %d monitor(s): %v", + creds.FSID, creds.AdminUser, len(creds.Monitors), creds.Monitors) + + logger.Step(7, "Wiring csi-ceph: CephClusterAuthentication %q + CephClusterConnection %q", + cfg.ClusterAuthenticationName, cfg.ClusterConnectionName) + if err := kubernetes.CreateCephClusterAuthentication(ctx, kubeconfig, kubernetes.CephClusterAuthenticationConfig{ + Name: cfg.ClusterAuthenticationName, + UserID: creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterAuthentication: %w", err) + } + if err := kubernetes.CreateCephClusterConnection(ctx, kubeconfig, kubernetes.CephClusterConnectionConfig{ + Name: cfg.ClusterConnectionName, + ClusterID: creds.FSID, + Monitors: creds.Monitors, + UserID: creds.AdminUser, + UserKey: creds.AdminKey, + }); err != nil { + return "", fmt.Errorf("create CephClusterConnection: %w", err) + } + if err := kubernetes.WaitForCephClusterConnectionCreated(ctx, kubeconfig, cfg.ClusterConnectionName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephClusterConnection: %w", err) + } + logger.StepComplete(7, "csi-ceph wired against Ceph cluster %s", creds.FSID) + + logger.Step(8, "Creating CephStorageClass %q (type=%s) → StorageClass", cfg.StorageClassName, cfg.Type) + cscCfg := kubernetes.CephStorageClassConfig{ + Name: cfg.StorageClassName, + ClusterConnectionName: cfg.ClusterConnectionName, + ClusterAuthenticationName: cfg.ClusterAuthenticationName, + Type: cfg.Type, + } + switch cfg.Type { + case kubernetes.CephStorageClassTypeRBD: + cscCfg.RBDPool = cfg.PoolName + cscCfg.RBDDefaultFSType = cfg.RBDDefaultFSType + case kubernetes.CephStorageClassTypeCephFS: + cscCfg.CephFSName = cfg.CephFSName + cscCfg.CephFSPool = kubernetes.CephFSDataPoolFullName(cfg.CephFSName, cfg.CephFSDataPoolName) + default: + return "", fmt.Errorf("unsupported CephStorageClass Type: %s", cfg.Type) + } + if err := kubernetes.CreateCephStorageClass(ctx, 
kubeconfig, cscCfg); err != nil { + return "", fmt.Errorf("create CephStorageClass: %w", err) + } + if err := kubernetes.WaitForCephStorageClassCreated(ctx, kubeconfig, cfg.StorageClassName, cfg.CSICephPhaseTimeout); err != nil { + return "", fmt.Errorf("wait CephStorageClass: %w", err) + } + if err := kubernetes.WaitForStorageClass(ctx, kubeconfig, cfg.StorageClassName, cfg.StorageClassWaitTimeout); err != nil { + return "", fmt.Errorf("wait core StorageClass: %w", err) + } + logger.StepComplete(8, "StorageClass %s is available", cfg.StorageClassName) + + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + filesystem %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.CephFSName, cfg.StorageClassName) + default: + logger.Success("Ceph e2e stack ready: CephCluster %s/%s + pool %s → StorageClass %s", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName, cfg.StorageClassName) + } + return cfg.StorageClassName, nil +} + +// TeardownCephStorageClass removes the csi-ceph wiring + Rook CephCluster + +// pool + rook-config-override produced by EnsureCephStorageClass. Safe to +// call on partial state (missing resources are skipped — the first error is +// returned but subsequent deletions are still attempted). +// +// Each Delete is followed by a Wait*Gone that waits for the apiserver to +// actually GC the CR. Without this synchronization the next test run (in +// alwaysUseExisting mode, or a fresh bootstrap that re-creates the same +// namespace) would race against Rook's finalizer and either: +// - find the CR still in Terminating and try to update its spec (no-op +// while the controller unwinds the finalizer); +// - delete the parent CephCluster while a child CephBlockPool / +// CephFilesystem is still alive — Rook then sets `DeletionIsBlocked / +// ObjectHasDependents` and the CephCluster sticks in `phase=Deleting` +// forever. +// +// On a Wait*Gone timeout we DO NOT auto-strip finalizers: the failure is +// surfaced as an aggregated error so the operator can investigate the +// cluster (typical reasons: HEALTH_ERR Ceph, stuck OSD prepare, dead mgr). +// +// It deliberately does NOT disable the Deckhouse modules: they may be owned +// by the cluster admin, and re-bootstrapping is cheaper than a full +// module-disable → module-enable cycle. +func TeardownCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + cfg.applyDefaults() + + var firstErr error + note := func(err error, what string) { + if err == nil { + return + } + logger.Warn("teardown: %s: %v", what, err) + if firstErr == nil { + firstErr = fmt.Errorf("%s: %w", what, err) + } + } + + logger.Info("Tearing down csi-ceph StorageClass %q (type=%s)", cfg.StorageClassName, cfg.Type) + + // 1. CephStorageClass: leaf, no finalizer dependency on the rest. + note(kubernetes.DeleteCephStorageClass(ctx, kubeconfig, cfg.StorageClassName), "delete CephStorageClass") + note(kubernetes.WaitForCephStorageClassGone(ctx, kubeconfig, cfg.StorageClassName, 0), "wait CephStorageClass gone") + + // 2. CephClusterConnection / CephClusterAuthentication: csi-ceph CRs. + // Order between conn and auth doesn't matter — neither depends on the + // other. 
+ note(kubernetes.DeleteCephClusterConnection(ctx, kubeconfig, cfg.ClusterConnectionName), "delete CephClusterConnection") + note(kubernetes.WaitForCephClusterConnectionGone(ctx, kubeconfig, cfg.ClusterConnectionName, 0), "wait CephClusterConnection gone") + + note(kubernetes.DeleteCephClusterAuthentication(ctx, kubeconfig, cfg.ClusterAuthenticationName), "delete CephClusterAuthentication") + note(kubernetes.WaitForCephClusterAuthenticationGone(ctx, kubeconfig, cfg.ClusterAuthenticationName, 0), "wait CephClusterAuthentication gone") + + // 3. Pool / Filesystem: must be fully gone before deleting CephCluster, + // otherwise Rook records DeletionIsBlocked / ObjectHasDependents. + switch cfg.Type { + case kubernetes.CephStorageClassTypeCephFS: + note(kubernetes.DeleteCephFilesystem(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName), "delete CephFilesystem") + note(kubernetes.WaitForCephFilesystemGone(ctx, kubeconfig, cfg.Namespace, cfg.CephFSName, 0), "wait CephFilesystem gone") + default: + note(kubernetes.DeleteCephBlockPool(ctx, kubeconfig, cfg.Namespace, cfg.PoolName), "delete CephBlockPool") + note(kubernetes.WaitForCephBlockPoolGone(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, 0), "wait CephBlockPool gone") + } + + // 4. CephCluster: only when this teardown call owns it (the other + // TeardownCephStorageClass call shares the same Rook cluster — see + // SkipClusterTeardown doc-comment). + if !cfg.SkipClusterTeardown { + note(kubernetes.DeleteCephCluster(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName), "delete CephCluster") + note(kubernetes.WaitForCephClusterGone(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, 0), "wait CephCluster gone") + note(kubernetes.DeleteRookConfigOverride(ctx, kubeconfig, cfg.Namespace), "delete rook-config-override") + } else { + logger.Info("Skipping CephCluster + rook-config-override teardown (SkipClusterTeardown=true)") + } + return firstErr +} + +// EnsureDefaultCephStorageClass is EnsureCephStorageClass + SetGlobalDefaultStorageClass. +// After this call new PVCs without an explicit storageClassName will use the +// freshly-provisioned Ceph RBD class. +func EnsureDefaultCephStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) (string, error) { + scName, err := EnsureCephStorageClass(ctx, kubeconfig, cfg) + if err != nil { + return "", err + } + if err := kubernetes.SetGlobalDefaultStorageClass(ctx, kubeconfig, scName); err != nil { + return "", fmt.Errorf("set %s as default in global ModuleConfig: %w", scName, err) + } + logger.Success("StorageClass %s set as cluster default", scName) + return scName, nil +} + +// ensureCephModules enables sds-node-configurator + sds-elastic + csi-ceph +// and waits for their Ready phase. 
+func ensureCephModules(ctx context.Context, kubeconfig *rest.Config, cfg CephStorageClassConfig) error { + sdsElasticSettings := cfg.SdsElasticSettings + if sdsElasticSettings == nil { + sdsElasticSettings = map[string]interface{}{} + } + + csiCephSettings := cfg.CsiCephSettings + if csiCephSettings == nil { + csiCephSettings = map[string]interface{}{} + } + + modules := []kubernetes.ModuleSpec{ + { + Name: "sds-node-configurator", + Version: 1, + Enabled: true, + }, + { + Name: "sds-elastic", + Version: 1, + Enabled: true, + Settings: sdsElasticSettings, + Dependencies: []string{"sds-node-configurator"}, + }, + { + Name: "csi-ceph", + Version: 1, + Enabled: true, + Settings: csiCephSettings, + Dependencies: []string{"sds-elastic"}, + ModulePullOverride: cfg.CsiCephModulePullOverride, + }, + } + return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, cfg.ModulesReadyTimeout) +} + +// ensureOSDBackingStorageClass returns an already-existing SC name (if the +// caller supplied OSDStorageClass) or delegates to EnsureDefaultStorageClass +// to provision a sds-local-volume SC on the fly. +func ensureOSDBackingStorageClass(ctx context.Context, kubeconfig *rest.Config, cfg *CephStorageClassConfig) (string, error) { + if cfg.OSDStorageClass != "" { + logger.Info("Using pre-existing OSD backing StorageClass %s", cfg.OSDStorageClass) + return cfg.OSDStorageClass, nil + } + + localCfg := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + return EnsureDefaultStorageClass(ctx, kubeconfig, localCfg) +} diff --git a/pkg/testkit/ceph_cluster.go b/pkg/testkit/ceph_cluster.go new file mode 100644 index 0000000..cf683f2 --- /dev/null +++ b/pkg/testkit/ceph_cluster.go @@ -0,0 +1,295 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "time" + + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// RookCephClusterConfig configures EnsureCephCluster — the "just bring up +// a Rook-managed Ceph cluster + pool" variant of EnsureCephStorageClass. +// +// Unlike EnsureCephStorageClass, EnsureCephCluster does NOT: +// - enable the `csi-ceph` Deckhouse module; +// - create CephClusterConnection / CephClusterAuthentication CRs; +// - create a CephStorageClass CR / materialize a core StorageClass. +// +// It stops once the Rook CephCluster is Created and the CephBlockPool is +// Ready. Use this when the test suite needs a live Ceph backend to exercise +// (e.g. to run rbd / ceph CLI against it, or to hook some other client) but +// deliberately does NOT want csi-ceph in the picture. 
+type RookCephClusterConfig struct { + // --- Namespacing / naming --- + + // Namespace is the Rook / sds-elastic namespace. Default: "d8-sds-elastic". + Namespace string + + // CephClusterName is the Rook CephCluster name. Default: "ceph-cluster". + CephClusterName string + + // CephImage is the Ceph container image. Default: + // "quay.io/ceph/ceph:v18.2.7". + CephImage string + + // MonCount / MgrCount are the Rook mon/mgr replica counts. + // Defaults: 1 / 1 (appropriate for 1..3-node test clusters). + MonCount int + MgrCount int + + // NetworkProvider: "" for CNI (default), "host" for host networking. + NetworkProvider string + PublicNetworkCIDRs []string + ClusterNetworkCIDRs []string + + // GlobalCephConfigOverrides populates `rook-config-override` under + // `[global]`, e.g. {"ms_crc_data": "false"} for the csi-ceph + // msCrcData matrix. nil leaves the ConfigMap otherwise empty. + GlobalCephConfigOverrides map[string]string + + // --- OSD backing --- + + // OSDStorageClass is a block-capable StorageClass used to back OSD PVCs. + // When empty, EnsureDefaultStorageClass is invoked with + // OSDBacking* to provision a sds-local-volume SC on the fly. + OSDStorageClass string + + // OSDCount is the number of OSDs. Default: 1. + OSDCount int + + // OSDSize is the size of each OSD PVC. Default: kubernetes.DefaultOSDStorageClassSize. + OSDSize string + + // --- Fallback SC provisioning via sds-local-volume --- + + // OSDBackingStorageClassName names the sds-local-volume SC we auto- + // provision for OSDs. Default: "sds-local-volume-thick-ceph-osd". + OSDBackingStorageClassName string + + // OSDBackingLVMType ("Thick"/"Thin"). Default: "Thick". + OSDBackingLVMType string + + OSDBackingIncludeMasters bool + OSDBackingBaseKubeconfig *rest.Config + OSDBackingVMNamespace string + OSDBackingBaseStorageClassName string + + // --- CephBlockPool --- + + // PoolName is the Rook CephBlockPool name. Default: + // "ceph-rbd-r". + PoolName string + + // ReplicaSize is the CephBlockPool replication factor. Default: 1. + ReplicaSize int + + // FailureDomain: "host" or "osd". Default: "osd" when ReplicaSize==1, + // "host" otherwise. + FailureDomain string + + // --- Modules --- + + // SkipModuleEnablement disables the module-enable step (useful when + // the caller has already enabled sds-node-configurator + sds-elastic + // through other means). + SkipModuleEnablement bool + + // SdsElasticSettings overrides `spec.settings` of the sds-elastic + // ModuleConfig. Defaults to an empty map. 
+ SdsElasticSettings map[string]interface{} + + // --- Timeouts --- + + ModulesReadyTimeout time.Duration // default 15m + CephClusterReadyTimeout time.Duration // default 20m + CephPoolReadyTimeout time.Duration // default 10m +} + +func (c *RookCephClusterConfig) applyDefaults() { + if c.Namespace == "" { + c.Namespace = kubernetes.DefaultRookNamespace + } + if c.CephClusterName == "" { + c.CephClusterName = kubernetes.DefaultCephClusterName + } + if c.CephImage == "" { + c.CephImage = kubernetes.DefaultCephImage + } + if c.MonCount <= 0 { + c.MonCount = 1 + } + if c.MgrCount <= 0 { + c.MgrCount = 1 + } + if c.OSDCount <= 0 { + c.OSDCount = 1 + } + if c.OSDSize == "" { + c.OSDSize = kubernetes.DefaultOSDStorageClassSize + } + if c.OSDBackingStorageClassName == "" { + c.OSDBackingStorageClassName = "sds-local-volume-thick-ceph-osd" + } + if c.OSDBackingLVMType == "" { + c.OSDBackingLVMType = "Thick" + } + if c.ReplicaSize <= 0 { + c.ReplicaSize = 1 + } + if c.PoolName == "" { + c.PoolName = fmt.Sprintf("ceph-rbd-r%d", c.ReplicaSize) + } + if c.FailureDomain == "" { + if c.ReplicaSize == 1 { + c.FailureDomain = "osd" + } else { + c.FailureDomain = "host" + } + } + if c.ModulesReadyTimeout == 0 { + c.ModulesReadyTimeout = 15 * time.Minute + } + if c.CephClusterReadyTimeout == 0 { + c.CephClusterReadyTimeout = 20 * time.Minute + } + if c.CephPoolReadyTimeout == 0 { + c.CephPoolReadyTimeout = 10 * time.Minute + } +} + +// EnsureCephCluster brings up (or reuses) a Rook-managed Ceph cluster plus +// a CephBlockPool via sds-elastic — without touching csi-ceph. +// +// Flow: +// 1. Enable Deckhouse modules: sds-node-configurator + sds-elastic. +// 2. Resolve an OSD backing StorageClass (re-using EnsureDefaultStorageClass +// when none is pre-provided). +// 3. Seed `rook-config-override` with per-test global Ceph settings. +// 4. Create the Rook CephCluster and wait until it is Created. +// 5. Create the CephBlockPool and wait until it is Ready. +// +// Idempotent: re-running picks up existing resources. Returns the pool +// name (same one callers would reference as Ceph pool, e.g. for a +// subsequent `rbd create`/`CephStorageClass.rbd.pool`). 
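+//
+// A minimal call (illustrative sketch; assumes kubeconfig is a working
+// *rest.Config for the target cluster):
+//
+//	poolName, err := EnsureCephCluster(ctx, kubeconfig, RookCephClusterConfig{
+//		GlobalCephConfigOverrides: map[string]string{"ms_crc_data": "false"},
+//	})
+//	if err != nil {
+//		return err
+//	}
+//	// poolName is "ceph-rbd-r1" with the default ReplicaSize of 1.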
+func EnsureCephCluster(ctx context.Context, kubeconfig *rest.Config, cfg RookCephClusterConfig) (string, error) { + cfg.applyDefaults() + + logger.Step(1, "Enabling Deckhouse modules for Rook (sds-node-configurator, sds-elastic)") + if !cfg.SkipModuleEnablement { + if err := ensureRookModules(ctx, kubeconfig, cfg.SdsElasticSettings, cfg.ModulesReadyTimeout); err != nil { + return "", fmt.Errorf("enable rook modules: %w", err) + } + } + logger.StepComplete(1, "Modules enabled") + + logger.Step(2, "Resolving OSD backing StorageClass") + osdSC := cfg.OSDStorageClass + if osdSC == "" { + local := DefaultStorageClassConfig{ + StorageClassName: cfg.OSDBackingStorageClassName, + LVMType: cfg.OSDBackingLVMType, + IncludeMasters: cfg.OSDBackingIncludeMasters, + BaseKubeconfig: cfg.OSDBackingBaseKubeconfig, + VMNamespace: cfg.OSDBackingVMNamespace, + BaseStorageClassName: cfg.OSDBackingBaseStorageClassName, + } + name, err := EnsureDefaultStorageClass(ctx, kubeconfig, local) + if err != nil { + return "", fmt.Errorf("resolve OSD backing StorageClass: %w", err) + } + osdSC = name + } else { + logger.Info("Using pre-existing OSD backing StorageClass %s", osdSC) + } + logger.StepComplete(2, "OSD backing StorageClass: %s", osdSC) + + logger.Step(3, "Seeding rook-config-override ConfigMap") + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, cfg.Namespace, cfg.GlobalCephConfigOverrides); err != nil { + return "", fmt.Errorf("set rook-config-override: %w", err) + } + logger.StepComplete(3, "rook-config-override ready (%d global key(s))", len(cfg.GlobalCephConfigOverrides)) + + logger.Step(4, "Creating Rook CephCluster %s/%s", cfg.Namespace, cfg.CephClusterName) + if err := kubernetes.CreateCephCluster(ctx, kubeconfig, kubernetes.CephClusterConfig{ + Name: cfg.CephClusterName, + Namespace: cfg.Namespace, + CephImage: cfg.CephImage, + MonCount: cfg.MonCount, + MgrCount: cfg.MgrCount, + NetworkProvider: cfg.NetworkProvider, + PublicNetworkCIDRs: cfg.PublicNetworkCIDRs, + ClusterNetworkCIDRs: cfg.ClusterNetworkCIDRs, + OSDStorageClass: osdSC, + OSDCount: cfg.OSDCount, + OSDSize: cfg.OSDSize, + }); err != nil { + return "", fmt.Errorf("create CephCluster: %w", err) + } + if err := kubernetes.WaitForCephClusterReady(ctx, kubeconfig, cfg.Namespace, cfg.CephClusterName, cfg.CephClusterReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephCluster: %w", err) + } + logger.StepComplete(4, "CephCluster %s/%s is Created", cfg.Namespace, cfg.CephClusterName) + + logger.Step(5, "Creating CephBlockPool %s/%s (replica=%d, failureDomain=%s)", + cfg.Namespace, cfg.PoolName, cfg.ReplicaSize, cfg.FailureDomain) + if err := kubernetes.CreateCephBlockPool(ctx, kubeconfig, kubernetes.CephBlockPoolConfig{ + Name: cfg.PoolName, + Namespace: cfg.Namespace, + FailureDomain: cfg.FailureDomain, + ReplicaSize: cfg.ReplicaSize, + }); err != nil { + return "", fmt.Errorf("create CephBlockPool: %w", err) + } + if err := kubernetes.WaitForCephBlockPoolReady(ctx, kubeconfig, cfg.Namespace, cfg.PoolName, cfg.CephPoolReadyTimeout); err != nil { + return "", fmt.Errorf("wait CephBlockPool: %w", err) + } + logger.StepComplete(5, "CephBlockPool %s/%s is Ready", cfg.Namespace, cfg.PoolName) + + logger.Success("Ceph cluster ready: CephCluster %s/%s + pool %s (no csi-ceph wiring)", + cfg.Namespace, cfg.CephClusterName, cfg.PoolName) + return cfg.PoolName, nil +} + +// ensureRookModules enables sds-node-configurator + sds-elastic (and nothing +// else). 
Used by EnsureCephCluster and as the Rook-only step of +// EnsureCephStorageClass's module list. +func ensureRookModules(ctx context.Context, kubeconfig *rest.Config, sdsElasticSettings map[string]interface{}, readyTimeout time.Duration) error { + if sdsElasticSettings == nil { + sdsElasticSettings = map[string]interface{}{} + } + modules := []kubernetes.ModuleSpec{ + { + Name: "sds-node-configurator", + Version: 1, + Enabled: true, + }, + { + Name: "sds-elastic", + Version: 1, + Enabled: true, + Settings: sdsElasticSettings, + Dependencies: []string{"sds-node-configurator"}, + }, + } + return kubernetes.EnableModulesAndWait(ctx, kubeconfig, nil, nil, modules, readyTimeout) +} diff --git a/pkg/testkit/ceph_crc.go b/pkg/testkit/ceph_crc.go new file mode 100644 index 0000000..39fb9a7 --- /dev/null +++ b/pkg/testkit/ceph_crc.go @@ -0,0 +1,347 @@ +/* +Copyright 2026 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testkit + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/rest" + + "github.com/deckhouse/storage-e2e/internal/logger" + "github.com/deckhouse/storage-e2e/pkg/kubernetes" +) + +// EnableServerCRC is the readable counterpart of +// `SetMsCrcDataOnServer(..., ptr.To(true))`. It writes +// `ms_crc_data = true` into rook-config-override and rolling-restarts +// mon/mgr/osd so the override is live on every daemon before returning. +// +// Useful for tests that want the Ceph cluster in an explicit CRC-on state +// (the default Ceph behaviour, but pinned in the ConfigMap so the test +// can assert on it). +func EnableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + enabled := true + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled) +} + +// DisableServerCRC flips Ceph into the "CRC off" state: +// `ms_crc_data = false` in rook-config-override + rolling-restart of +// mon/mgr/osd. Paired with a csi-ceph client that still defaults to +// `msCrcData=true`, this reproduces the msCrcData matrix mismatch case. +func DisableServerCRC(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + enabled := false + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, &enabled) +} + +// ResetServerCRCToDefault removes `ms_crc_data` from rook-config-override +// (rendered `[global]` section becomes empty). Ceph falls back to its +// compile-time default (ms_crc_data = true), matching a freshly-installed +// cluster. Convenient for AfterAll / AfterEach restoration. +func ResetServerCRCToDefault(ctx context.Context, kubeconfig *rest.Config, namespace string) error { + return SetMsCrcDataOnServer(ctx, kubeconfig, namespace, nil) +} + +// SetMsCrcDataOnServer rewrites `rook-config-override` so that only +// `ms_crc_data = ` ends up under `[global]` (nil removes the key +// entirely, falling back to Ceph's compile-time default = true). 
+// +// After flipping the ConfigMap, it force-restarts mon/mgr/osd Deployments +// in the Rook namespace and waits for them to converge. Idempotent: when +// the ConfigMap already encodes the desired state, nothing is restarted. +// +// Prefer EnableServerCRC / DisableServerCRC / ResetServerCRCToDefault at +// call sites for readability; this lower-level primitive exists so a +// boolean test parameter (e.g. a CRC compatibility matrix) doesn't have to branch. +func SetMsCrcDataOnServer(ctx context.Context, kubeconfig *rest.Config, namespace string, enabled *bool) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + + overrides := renderMsCrcDataOverrides(enabled) + wantConfig := kubernetes.RenderCephGlobalConfig(overrides) + + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + existing, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, kubernetes.RookConfigOverrideName, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("get %s/%s: %w", namespace, kubernetes.RookConfigOverrideName, err) + } + currentConfig := "" + if existing != nil { + currentConfig = existing.Data["config"] + } + + if currentConfig == wantConfig { + logger.Info("rook-config-override already has ms_crc_data=%s, skipping daemon restart", + msCrcDataString(enabled)) + return nil + } + + logger.Info("Setting server-side ms_crc_data=%s in rook-config-override", msCrcDataString(enabled)) + if err := kubernetes.SetRookConfigOverride(ctx, kubeconfig, namespace, overrides); err != nil { + return fmt.Errorf("set rook-config-override: %w", err) + } + + // Rook operator notices CM changes on its next reconcile loop; force + // a rolling restart of the core Ceph daemons so the new + // `/etc/ceph/ceph.conf` takes effect right now. + if err := RestartCephDaemons(ctx, kubeconfig, namespace, 10*time.Minute); err != nil { + return fmt.Errorf("restart ceph daemons: %w", err) + } + + // The operator pod is itself a Ceph admin client: it talks to mons + // to update CephCluster.status, evaluate CephFilesystem health, + // etc. Its in-pod ceph.conf was rendered at startup, so until it + // restarts it keeps using the old `ms_crc_data` value and can't + // connect to the freshly-bounced mons. Symptom: cephcluster CR + // flips to phase=Ready/state=Error with `failed to get status. . + // timed out` until the next reconcile after operator pod recycle. + // Bounce it now so the operator's view of the cluster lines up + // with reality before we return. + if err := RestartRookOperator(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("restart rook-ceph-operator: %w", err) + } + + // Final sanity check: any CephFilesystem in the namespace must be + // Ready before we consider the flip "live". This is the gate that + // catches the MDS-stuck-on-old-CRC class of bug — if the MDS + // daemons we just bounced fail to rejoin the mons, the CR will + // linger in a non-Ready phase and we'd rather surface that here + // than have a downstream csi-cephfs PVC hang for minutes. 
+ if err := waitCephFilesystemsReady(ctx, kubeconfig, namespace, 5*time.Minute); err != nil { + return fmt.Errorf("wait CephFilesystem ready after CRC flip: %w", err) + } + + logger.Success("Server-side ms_crc_data=%s is now live on all Ceph daemons", msCrcDataString(enabled)) + return nil +} + +// RestartRookOperator rollout-restarts the rook-operator Deployment +// in the given namespace and waits for the new pod to become Ready. +// +// The operator runs as a Ceph admin client (uses the cluster admin +// keyring + a baked-in ceph.conf to query mon/osd state). When tests +// flip a global wire-protocol knob like `ms_crc_data` and bounce the +// daemons, the operator's existing connections become invalid — but +// without a pod restart it'll keep retrying with the stale ceph.conf +// and the cephcluster CR ends up reporting `HEALTH_ERR` / +// `state: Error` until the next operator reconcile cycle. +// +// Deckhouse packages the rook-operator binary inside a Deployment +// named after the Helm release, which conventionally equals the +// namespace minus the leading `d8-` prefix (`d8-sds-elastic` → +// `sds-elastic`, `d8-sds-replicated-volume` → `sds-replicated-volume`, +// etc.). storage-e2e targets that flavor exclusively — vanilla Rook +// (`rook-ceph-operator` Deployment in `rook-ceph` namespace) is not +// supported here. +func RestartRookOperator(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + operatorName, ok := strings.CutPrefix(namespace, "d8-") + if !ok || operatorName == "" { + return fmt.Errorf("namespace %q is not a deckhouse module namespace (expected d8- prefix); cannot derive rook-operator Deployment name", namespace) + } + if _, err := clientset.AppsV1().Deployments(namespace).Get(ctx, operatorName, metav1.GetOptions{}); err != nil { + return fmt.Errorf("get rook-operator Deployment %s/%s: %w", namespace, operatorName, err) + } + + logger.Info("Rolling-restarting %s/%s so its Ceph admin client picks up the new ceph.conf", namespace, operatorName) + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, operatorName, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, operatorName, err) + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, operatorName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, operatorName, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas + } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + logger.Success("%s/%s is Ready after rollout", namespace, operatorName) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for Deployment %s/%s to become ready", timeout, namespace, operatorName) + case <-ticker.C: + } + } 
+} + +// waitCephFilesystemsReady lists every CephFilesystem CR in +// `namespace` and waits for each to reach `status.phase=Ready` (or a +// matching Ready condition). If the namespace has no CephFilesystem +// CRs (RBD-only cluster), the function is a no-op. +func waitCephFilesystemsReady(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + dynamicClient, err := kubernetes.NewDynamicClientWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create dynamic client: %w", err) + } + + list, err := dynamicClient.Resource(kubernetes.CephFilesystemGVR).Namespace(namespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("list CephFilesystem in %s: %w", namespace, err) + } + if len(list.Items) == 0 { + return nil + } + + for i := range list.Items { + name := list.Items[i].GetName() + if err := kubernetes.WaitForCephFilesystemReady(ctx, kubeconfig, namespace, name, timeout); err != nil { + return fmt.Errorf("CephFilesystem %s/%s did not become Ready after CRC flip: %w", namespace, name, err) + } + } + return nil +} + +// RestartCephDaemons rollout-restarts every Rook-managed Ceph daemon +// Deployment that consumes `/etc/ceph/ceph.conf` (mon, mgr, osd, mds, +// rgw) and waits for each to reach its desired Ready replica count. +// +// Why all five roles, not just mon/mgr/osd: a global ConfigMap knob +// like `ms_crc_data` lives in ceph.conf, which means every daemon +// needs to be restarted for it to take effect. If only mon/mgr/osd +// are bounced and an MDS keeps running with the old value, the +// resulting CRC mismatch silently severs the MDS↔mon messenger +// channel, CephFS goes degraded, and any csi-cephfs PVC hangs in +// Pending until somebody (often the human running the test) bounces +// MDS by hand. Including `rook-ceph-mds` here is what unblocks the +// CephFS half of the msCrcData matrix. +// +// The selector also covers `rook-ceph-rgw` for forward-compat with +// future S3 tests; if no rgw Deployments exist in the cluster, the +// match list is just smaller and the function continues. Operator +// restart is intentionally out of scope here — see RestartRookOperator. +func RestartCephDaemons(ctx context.Context, kubeconfig *rest.Config, namespace string, timeout time.Duration) error { + if namespace == "" { + namespace = kubernetes.DefaultRookNamespace + } + clientset, err := kubernetes.NewClientsetWithRetry(ctx, kubeconfig) + if err != nil { + return fmt.Errorf("failed to create clientset: %w", err) + } + + // Rook labels each Ceph daemon Deployment with `app=rook-ceph-`. 
+ labelSel := "app in (rook-ceph-mon,rook-ceph-mgr,rook-ceph-osd,rook-ceph-mds,rook-ceph-rgw)" + deployList, err := clientset.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSel}) + if err != nil { + return fmt.Errorf("list ceph daemon Deployments (%s): %w", labelSel, err) + } + if len(deployList.Items) == 0 { + return fmt.Errorf("no Ceph daemon Deployments matched %q in namespace %s — is Rook running?", labelSel, namespace) + } + + names := make([]string, 0, len(deployList.Items)) + for i := range deployList.Items { + names = append(names, deployList.Items[i].Name) + } + logger.Info("Rolling-restarting %d Ceph daemon Deployment(s): %v", len(names), names) + + stamp := time.Now().UTC().Format(time.RFC3339Nano) + patch := []byte(fmt.Sprintf( + `{"spec":{"template":{"metadata":{"annotations":{"storage-e2e/restarted-at":%q}}}}}`, stamp)) + + for _, name := range names { + if _, err := clientset.AppsV1().Deployments(namespace).Patch( + ctx, name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("annotate Deployment %s/%s for rollout: %w", namespace, name, err) + } + } + + waitCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + ready := 0 + for _, name := range names { + d, err := clientset.AppsV1().Deployments(namespace).Get(waitCtx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get Deployment %s/%s: %w", namespace, name, err) + } + desired := int32(1) + if d.Spec.Replicas != nil { + desired = *d.Spec.Replicas + } + if d.Status.ObservedGeneration >= d.Generation && d.Status.UpdatedReplicas >= desired && d.Status.AvailableReplicas >= desired { + ready++ + } + } + if ready == len(names) { + logger.Success("All %d Ceph daemon Deployment(s) report Ready after rollout", len(names)) + return nil + } + select { + case <-waitCtx.Done(): + return fmt.Errorf("timed out after %s waiting for %d Ceph daemon Deployments to become ready (%d/%d)", + timeout, len(names), ready, len(names)) + case <-ticker.C: + } + } +} + +// renderMsCrcDataOverrides turns a *bool into the minimal rook-config-override +// key/value map used by the msCrcData test matrix. +func renderMsCrcDataOverrides(enabled *bool) map[string]string { + if enabled == nil { + return nil + } + return map[string]string{ + "ms_crc_data": strconv.FormatBool(*enabled), + } +} + +func msCrcDataString(enabled *bool) string { + if enabled == nil { + return "" + } + return strconv.FormatBool(*enabled) +}
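+
+// Typical msCrcData-matrix flow built from the helpers above (illustrative
+// sketch; assumes kubeconfig targets a cluster provisioned via
+// EnsureCephStorageClass):
+//
+//	// Flip the server side to CRC off; csi-ceph clients keep their default.
+//	if err := DisableServerCRC(ctx, kubeconfig, kubernetes.DefaultRookNamespace); err != nil {
+//		return err
+//	}
+//	// ... provision a PVC against the Ceph-backed StorageClass and run IO ...
+//
+//	// Restore the compile-time default so later specs start from a clean state.
+//	if err := ResetServerCRCToDefault(ctx, kubeconfig, kubernetes.DefaultRookNamespace); err != nil {
+//		return err
+//	}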